# Source code for learners.cpdlearners
'''
The parameters of a PRM model are the conditional probability distributions (CPDs or `local distributions`), they are learned using the module :mod:`!learners.cpdlearners`.
This module contains classes that are used to learn conditional probability distributions (CPDs) for attributes.
The basic CPD class is :class:`prm.localdistribution.CPD` which can be implemented for different CPD representations.
:class:`prm.localdistribution.CPDTabular` stores the CPD as a matrix, whereas :class:`prm.localdistribution.CPDTree`
stores the CPD in a decision tree (not implemented yet).
'''
from analytics.performance import time_analysis
from prm.localdistribution import CPDTabular
#from pylab import *
import numpy as N
class CPDLearner():
    '''
    Abstract class that is used to learn the conditional probability distributions for
    a PRM using the data linked in the data interface.
    '''
    def __init__(self):
        '''
        Creates the CPDLearner. It still has to be configured using the self.configure() method.
        '''
        # Imported lazily so that merely importing this module does not
        # require a configured PRM / data interface.
        import prm.prm as PRM
        import data.datainterface as DI
        # the PRM whose local distributions are to be learned
        self.prmToLearn = PRM
        # the data interface providing the training data
        self.di = DI

    def __repr__(self):
        return "%s - PRM %s - DI %s" % (self.__class__.__name__, self.prmToLearn.name, self.di.name)

    def learnCPDs(self, saveDistributions=False, forceLearning=False):
        '''
        Abstract method; concrete learners (e.g. :class:`CPDTabularLearner`) implement it.

        :arg saveDistributions: If `True`, saves the learned CPDs to disk
        :arg forceLearning: If `True`, the CPDs are learned even if distributions could be loaded from disk
        :raises NotImplementedError: always, unless overridden in a subclass
        '''
        # NotImplementedError is the idiomatic signal for an abstract method;
        # it is a subclass of Exception, so existing callers that catch
        # Exception keep working.
        raise NotImplementedError("learnCPDs not implemented in the CPDLearner instance")
class CPDTabularLearner(CPDLearner):
    '''
    The CPDTabularLearner learns the local distributions for all probabilistic attributes and stores them
    in tabular form :class:`prm.localdistribution.CPDTabular`. It loads all the necessary data for each
    attribute in one query using the data interface.
    '''
    def __init__(self):
        CPDLearner.__init__(self)

    def learnCPDs(self, saveDistributions=False, forceLearning=False):
        '''
        Depending on which method is standard for learning CPDs, either
        :meth:`.learnCPDsCount` or :meth:`.learnCPDsFull` is executed.

        :arg saveDistributions: If `True`, saves the learned CPDs to disk
        :arg forceLearning: If `True`, the CPDs are learned even if there are distributions that could be loaded from disk
        '''
        # learnCPDsFull is the current default; the COUNT-based variant is
        # kept as an alternative.
        #self.learnCPDsCount(saveDistributions, forceLearning)
        self.learnCPDsFull(saveDistributions, forceLearning)

    #@time_analysis
    def learnCPDsCount(self, saveDistributions=False, forceLearning=False):
        '''
        Learns the conditional probability distributions for all probabilistic attributes
        by counting the occurences on the data side. If the data interface connects
        to a SQL based database this can easily be done using the `COUNT` statement.
        The data is retrieved by calling the data interface method :meth:`data.sqliteinterface.loadCountCPDdata`

        :arg saveDistributions: If `True`, saves the learned CPDs to disk and prints the XML line that needs to be added to the PRM specification to the standard output
        :arg forceLearning: If `True`, the CPDs are learned even if there are distributions that could be loaded from disk
        '''
        # iterate over all attributes; only probabilistic ones carry a CPD
        for attr in self.prmToLearn.attributes.values():
            if not attr.probabilistic:
                continue
            if attr.CPD is not None and not forceLearning:
                # only learn attributes that don't have a CPD yet (it could
                # also have been specified in the PRM or loaded from disk)
                print("... CPD for attribute '%s' already loaded" % (attr.name))
                continue
            print("... Learning CPD for attribute '%s'" % (attr.fullname))
            # create a tabular CPD instance for the attribute
            attr.CPD = CPDTabular(attr)
            # occurrence count of each parent assignment (one row per parent configuration)
            counter = N.zeros((attr.CPD.cpdMatrix.shape[0], 1))
            # number of parent configurations; loop-invariant, hoisted out of the row loop
            npa = attr.CPD.cpdMatrix.shape[0]
            # the distributions are learned over all training sets (no cross validation)
            for dsi in self.di.DSI:
                # load the aggregated count data for the attribute
                # (dsi.loadFullCPDdata(attr) would load the raw rows instead)
                dsi.loadCountCPDdata(attr)
                for i, currentRow in enumerate(dsi.resultSet()):
                    # recover the matrix indices from the running row index;
                    # assumes the parent configuration varies fastest in the
                    # result set — TODO confirm against the data interface ordering
                    attrIndex = i // npa   # '//' makes the Py2 int-division explicit and is Py3-safe
                    parentIndex = i % npa
                    # count of this (parent, attribute value) assignment
                    attr.CPD.cpdMatrix[parentIndex, attrIndex] = currentRow[0]
                    # accumulate the parent assignment total
                    counter[parentIndex] += currentRow[0]
            # add fake counts
            # TODO
            # normalise the counts row-wise into conditional probabilities
            attr.CPD.cpdMatrix = attr.CPD.cpdMatrix / counter
            # precompute the cumulative and log distributions
            attr.CPD.computeCumulativeDist()
            attr.CPD.computeLogDists()
            if saveDistributions:
                # finally save the distribution to file if desired
                # (bug fix: this previously read attr.CDP.save(), which
                # raised AttributeError whenever saveDistributions was True)
                attr.CPD.save()

    #@time_analysis
    def learnCPDsFull(self, saveDistributions=False, forceLearning=False):
        '''
        Learns the conditional probability distributions for all probabilistic attributes
        by iterating over a big table counting the occurences on the way. If the data interface
        connects to a SQL based database, the result set is a big table in the form
        [valAttr, valPa1, valPa2, etc.].
        The data is retrieved by calling the data interface method :meth:`data.sqliteinterface.loadFullCPDdata`

        :arg saveDistributions: If `True`, saves the learned CPDs to disk and prints the XML line that needs to be added to the PRM specification to the standard output
        :arg forceLearning: If `True`, the CPDs are learned even if there are distributions that could be loaded from disk
        '''
        print("Learning CPD for attributes '%s'" % (','.join([attr.fullname for attr in self.prmToLearn.attributes.values() if attr.probabilistic])))
        # iterate over all attributes; only probabilistic ones carry a CPD
        for attr in self.prmToLearn.attributes.values():
            if not attr.probabilistic:
                continue
            if attr.CPD is not None and not forceLearning:
                # only learn attributes that don't have a CPD yet (it could
                # also have been specified in the PRM or loaded from disk)
                print("... CPD for attribute '%s' already loaded" % (attr.name))
                continue
            #print("... Learning CPD for attribute '%s'" % (attr.fullname))
            # create a tabular CPD instance for the attribute
            attr.CPD = CPDTabular(attr)
            # occurrence count of each parent assignment (one row per parent configuration)
            counter = N.zeros((attr.CPD.cpdMatrix.shape[0], 1))
            # the distributions are learned over all training sets (no cross validation)
            for dsi in self.di.DSI:
                # load the full data for the attribute
                # (dsi.loadFullAggCPDdata(attr) would load aggregated data instead)
                dsi.loadFullCPDdata(attr)
                for currentRow in dsi.resultSet():
                    # compute the matrix index for the observed values
                    [indexRow, indexColumn] = attr.CPD.indexingCPD(currentRow)
                    # count the attribute assignment ...
                    attr.CPD.cpdMatrix[indexRow, indexColumn] += 1
                    # ... and the parent assignment
                    counter[indexRow] += 1
            # additive smoothing: nF fake counts per cell keep unseen
            # configurations at a non-zero probability
            nF = 1
            attr.CPD.cpdMatrix += nF
            counter += attr.CPD.cpdMatrix.shape[1] * nF
            # normalise the counts row-wise into conditional probabilities
            attr.CPD.cpdMatrix = attr.CPD.cpdMatrix / counter
            # precompute the cumulative and log distributions
            attr.CPD.computeCumulativeDist()
            attr.CPD.computeLogDists()
            if saveDistributions:
                # finally save the distribution to file if desired
                attr.CPD.save()

    def loglikelihood(self):
        '''
        Computes the log likelihood for the learned CPDs. As aggregation is possibly required, it uses the method
        :meth:`data.sqliteinterface.loadFullAggCPDdata` to retrieve the data.

        :returns: the summed log probability of all observed instances
        '''
        loglik = 0
        for attr in self.prmToLearn.attributes.values():
            if not attr.probabilistic:
                continue
            for dsi in self.di.DSI:
                # load the aggregated data for the attribute
                # (dsi.loadFullCPDdata(attr) would load the raw rows instead)
                dsi.loadFullAggCPDdata(attr)
                for currentRow in dsi.resultSet():
                    # matrix index of the observed (attribute, parents) assignment
                    [indexRow, indexColumn] = attr.CPD.indexingCPD(currentRow)
                    # add the log probability of the instance that we have seen
                    loglik += attr.CPD.cpdLogMatrix[indexRow, indexColumn]
        return loglik
class CPDTreeLearner(CPDLearner):
    '''
    Placeholder for a learner that stores CPDs as decision trees
    (:class:`prm.localdistribution.CPDTree`); not implemented yet.
    '''
    pass