OBOValidation.py

#!/usr/bin/env python3
""" 
Functions for validating the proceedures in OBOModelling and OBOPartitioning
"""

from numpy.random import uniform
import numpy as np
import pandas as pd
import random
from partitionsets import partition
import CategoryToNumberAssignment as c2n
import OBOModelling as oboM
import OBOPartitioning as oboP

#Sampling algorythm found on the interwebs. A better one would be good...
#http://www.adamlaiacano.com/post/14987215771/python-function-for-sampling-from-an-arbitrary 
def slice_sampler(px, N = 1, x = None):
    """
    Provides samples from a user-defined distribution.
    
    slice_sampler(px, N = 1, x = None)
    
    Inputs:
    px = A discrete probability distribution.
    N  = Number of samples to return, default is 1
    x  = Optional list/array of observation values to return, where prob(x) = px.

    Outputs:
    If x=None (default) or if len(x) != len(px), it will return an array of integers
    between 0 and len(px)-1. If x is supplied, it will return the
    samples from x according to the distribution px.    
    """
    values = np.zeros(N, dtype=np.int)
    samples = np.arange(len(px))
    px = np.array(px) / (1.*sum(px))
    u = uniform(0, max(px))
    for n in range(N):
        included = px>=u
        choice = random.sample(range(np.sum(included)), 1)[0]
        values[n] = samples[included][choice]
        u = uniform(0, px[included][choice])
    if x:
        if len(x) == len(px):
            x=np.array(x)
            values = x[values]
        else:
            print("px and x are different lengths. Returning index locations for px.")
    if N == 1:
        return values[0]
    return values

def discreteRandomSamples(px, N=10000):
    """Return an array of counts for N samples from the discrete distribution px.
    
    Parameters
    ----------
    px : list, Discrete distribution, where sum(px)=1
    N : int, number of samples
    
    Returns
    -------
    list, same lengh as px with sample frequencies from N trials
    """
    return np.bincount(slice_sampler(px, N))

def generateDummyDataFrame(Delta=5):
    """ Generate a DataFrame of dummy data to mimic an Emperical Categories frame.
    We want to produce dummy data to test our model selection. We have two trials in one:
    Detect the optimal delta, and detect the optimal partitionings.
    We do this with one data set where we have some small number of slightly different
    distributions, repeated the number of times for our.
    We only need distinct distributions for offences, repeated for all 7 punishments
    
    Arbitrarily we will choose deltat = 5 and three distinct partitions.
     One of size one,
     One of size two,
     One of size seve
    """
    offenceDistributions = [
        [0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05], # C,A,C,C,B,C,C,B,C
        [0.25,0.05,0.05,0.05,0.25,0.05,0.05,0.2,0.05], # B,C,C,C,B,C,C,A,C
        [0.05,0.25,0.25,0.05,0.05,0.05,0.05,0.05,0.2], # C,B,B,C,C,C,C,C,A
        [0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05]  # C,A,C,C,B,C,C,B,C
        ]
    
    probabilities = []
    for offDis in offenceDistributions:
        probabilities.append([x for x in [z/7 for z in offDis] for y in [1,2,3,4,5,6,7]])
    
    dummyEmp = pd.DataFrame(index=list(range(1674,(1674+4*Delta))), columns=c2n.generateCategories()[1:64])
    
    for offenDist in range(4):
        for delta in range(Delta):
            dummyEmp.iloc[offenDist*Delta+delta] =  discreteRandomSamples(probabilities[offenDist])
    
    return dummyEmp

    #return probabilities

#Modified version of AIC generation for our dummy emperical frame
def validateAICModelSelection(DummyEmp):
    """Create a DataFrame containing AIC scores of all standard models. Only NoGender at this stage.
    
    Parameters
    ----------
    DumyEmp : pandas DataFrame, Dummy emperical data frame as generated by generateDummyDataFrame()
    
    Returns:
    pandas DataFrame, Sorted AIC scores for different Delta's and model types
    """
    AICFrame = pd.DataFrame(columns = ['Delta','k','ll','AIC'])
    CatEmp = DummyEmp.mul(1)
    
    Deltas = [1,2,3,4,5,10]
    #Generate all the refactorings:
    CatEmps = [ oboM.changePeriod(CatEmp,Delta) for Delta in Deltas]*2 #*2 as one for Dep and one for Indep
    CatModsDep = [ oboM.generateDependentModelLaplace(CatEmp, Delta=Delta) for Delta in Deltas ]
    CatModsIndep = [ oboM.generateIndependentModel(CatEmp, Delta=Delta) for Delta in Deltas ]
    CatMods = CatModsDep + CatModsIndep
    
    #For all generated model types on all deltas
    for c,CatMod in enumerate(CatMods):
        ModType = 'Dependent' if (c < len(CatMods)/2) else 'Independent'
        
        if 'Dependent' in ModType:
            k = (9-1)*(7-1)*CatMod.shape[0]
        elif 'Independent' in ModType:
            k = (9+7-2)*CatMod.shape[0]
        else:
            print('Oh deary me')
            return
        ll = oboM.loglikilyhood(CatEmps[c], CatMod)
        AIC = 2*k - 2*ll
        AICFrame.loc[c] = [ ModType + ' Delta = ' + str(Deltas[int(c%(len(CatMods)/2))]), k, ll, AIC]
            ##print('{}, years: {}, model index {}, k {}, log-likelihood {}, AIC {}'.format(file,fileYearDelta,DeltasIndex,k,ll, AIC))
            ##print('{} loaded'.format(file))
        #except:
            #print('{} failed to load'.format(file))
    ##Return the frame
    return AICFrame.sort('AIC')
    
def listMul(list, Multiplier):
    """Multiply the elements of a list by Multiplier"""
    return [ x*Multiplier for x in list]
    
def validatePartitioning():
    """ Validate the partition searching.
    
    Parameters
    ----------
    DumyEmp : pandas DataFrame, Dummy emperical data frame as generated by generateDummyDataFrame()
    BestPartition: bool, Whether to return the partition AIC scores or the best partition.
    
    Returns:
    pandas DataFrame, Partition AIC scores, or the best partition 
    
    Try also:
    x = [15,85,44,56,49,51,46,54,50,50,50,50,56,44,46,54,4,96] NNNNope
    

    """
    #Death Or Not test
    #For partitions of typ: A, B, A, C, A, A, A, C, A, where we should get
    # Which is [[0,2,4,5,6,8],[1],[3,7]]
    #A = [ 'breakingPeace','deception', 'miscellaneous', 'royalOffences', 'sexual', 'violentTheft']
    #B = [ 'damage' ]
    #C = [ 'kill', 'theft']
    # We also ensure that the occurence of the offences are different within the same partition
    TestRow = [40, 160, 70, 30, 20, 80, 50, 50, 20, 80, 20, 80, 10, 40, 100, 100, 20, 80]
    
    #For not Death or Not, have 7 punishmnets
    A = [ 10, 20, 30, 40, 50, 60, 70 ]
    B = [ 70, 60, 50, 40, 30, 20, 10 ]
    C = [ 20, 10, 40, 30, 60, 50, 70 ]
    #For 9 offences
    #A, B, A, C, A, A, A, C, A as above
    TestFullRowA = A+listMul(B,2)+listMul(A,3)+C+A+A+listMul(A,2)+C+A
    #C, A, A, B, B, C, A, A, C
    TestFullRowB = C+A+listMul(A,2)+B+listMul(B,3)+C+A+listMul(A,3)+C
    
    TestFrame = pd.DataFrame([TestRow,TestRow], columns=list(range(18)), index=[0,1])
    TestFullFrame = pd.DataFrame([TestFullRowA,TestFullRowA,TestFullRowB,TestFullRowB,TestFullRowB], columns=list(range(63)), index=list(range(5)))
    
    partitions = partition.Partition([c2n.upcaseFirstLetter(x) for x in c2n.offcat])
    print('Testing Death Or Not partitioning')
    
    DeathAICtable = oboP.generateAICtable(TestFrame)
    DeathAICmin = DeathAICtable.idxmin(axis=1).apply(lambda x: partitions[int(x)])
    print('Found minimal partitions:')
    print(DeathAICmin)
    
    print('Testing full partitioning')
    AICtable = oboP.generateAICtable(TestFullFrame)
    AICmin = AICtable.idxmin(axis=1).apply(lambda x: partitions[int(x)])
    print('Found minimal partitions:')
    print(AICmin)