-
Notifications
You must be signed in to change notification settings - Fork 3
/
OBOValidation.py
197 lines (163 loc) · 7.51 KB
/
OBOValidation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
"""
Functions for validating the proceedures in OBOModelling and OBOPartitioning
"""
from numpy.random import uniform
import numpy as np
import pandas as pd
import random
from partitionsets import partition
import CategoryToNumberAssignment as c2n
import OBOModelling as oboM
import OBOPartitioning as oboP
#Sampling algorythm found on the interwebs. A better one would be good...
#http://www.adamlaiacano.com/post/14987215771/python-function-for-sampling-from-an-arbitrary
def slice_sampler(px, N = 1, x = None):
"""
Provides samples from a user-defined distribution.
slice_sampler(px, N = 1, x = None)
Inputs:
px = A discrete probability distribution.
N = Number of samples to return, default is 1
x = Optional list/array of observation values to return, where prob(x) = px.
Outputs:
If x=None (default) or if len(x) != len(px), it will return an array of integers
between 0 and len(px)-1. If x is supplied, it will return the
samples from x according to the distribution px.
"""
values = np.zeros(N, dtype=np.int)
samples = np.arange(len(px))
px = np.array(px) / (1.*sum(px))
u = uniform(0, max(px))
for n in range(N):
included = px>=u
choice = random.sample(range(np.sum(included)), 1)[0]
values[n] = samples[included][choice]
u = uniform(0, px[included][choice])
if x:
if len(x) == len(px):
x=np.array(x)
values = x[values]
else:
print("px and x are different lengths. Returning index locations for px.")
if N == 1:
return values[0]
return values
def discreteRandomSamples(px, N=10000):
"""Return an array of counts for N samples from the discrete distribution px.
Parameters
----------
px : list, Discrete distribution, where sum(px)=1
N : int, number of samples
Returns
-------
list, same lengh as px with sample frequencies from N trials
"""
return np.bincount(slice_sampler(px, N))
def generateDummyDataFrame(Delta=5):
""" Generate a DataFrame of dummy data to mimic an Emperical Categories frame.
We want to produce dummy data to test our model selection. We have two trials in one:
Detect the optimal delta, and detect the optimal partitionings.
We do this with one data set where we have some small number of slightly different
distributions, repeated the number of times for our.
We only need distinct distributions for offences, repeated for all 7 punishments
Arbitrarily we will choose deltat = 5 and three distinct partitions.
One of size one,
One of size two,
One of size seve
"""
offenceDistributions = [
[0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05], # C,A,C,C,B,C,C,B,C
[0.25,0.05,0.05,0.05,0.25,0.05,0.05,0.2,0.05], # B,C,C,C,B,C,C,A,C
[0.05,0.25,0.25,0.05,0.05,0.05,0.05,0.05,0.2], # C,B,B,C,C,C,C,C,A
[0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05] # C,A,C,C,B,C,C,B,C
]
probabilities = []
for offDis in offenceDistributions:
probabilities.append([x for x in [z/7 for z in offDis] for y in [1,2,3,4,5,6,7]])
dummyEmp = pd.DataFrame(index=list(range(1674,(1674+4*Delta))), columns=c2n.generateCategories()[1:64])
for offenDist in range(4):
for delta in range(Delta):
dummyEmp.iloc[offenDist*Delta+delta] = discreteRandomSamples(probabilities[offenDist])
return dummyEmp
#return probabilities
#Modified version of AIC generation for our dummy emperical frame
def validateAICModelSelection(DummyEmp):
"""Create a DataFrame containing AIC scores of all standard models. Only NoGender at this stage.
Parameters
----------
DumyEmp : pandas DataFrame, Dummy emperical data frame as generated by generateDummyDataFrame()
Returns:
pandas DataFrame, Sorted AIC scores for different Delta's and model types
"""
AICFrame = pd.DataFrame(columns = ['Delta','k','ll','AIC'])
CatEmp = DummyEmp.mul(1)
Deltas = [1,2,3,4,5,10]
#Generate all the refactorings:
CatEmps = [ oboM.changePeriod(CatEmp,Delta) for Delta in Deltas]*2 #*2 as one for Dep and one for Indep
CatModsDep = [ oboM.generateDependentModelLaplace(CatEmp, Delta=Delta) for Delta in Deltas ]
CatModsIndep = [ oboM.generateIndependentModel(CatEmp, Delta=Delta) for Delta in Deltas ]
CatMods = CatModsDep + CatModsIndep
#For all generated model types on all deltas
for c,CatMod in enumerate(CatMods):
ModType = 'Dependent' if (c < len(CatMods)/2) else 'Independent'
if 'Dependent' in ModType:
k = (9-1)*(7-1)*CatMod.shape[0]
elif 'Independent' in ModType:
k = (9+7-2)*CatMod.shape[0]
else:
print('Oh deary me')
return
ll = oboM.loglikilyhood(CatEmps[c], CatMod)
AIC = 2*k - 2*ll
AICFrame.loc[c] = [ ModType + ' Delta = ' + str(Deltas[int(c%(len(CatMods)/2))]), k, ll, AIC]
##print('{}, years: {}, model index {}, k {}, log-likelihood {}, AIC {}'.format(file,fileYearDelta,DeltasIndex,k,ll, AIC))
##print('{} loaded'.format(file))
#except:
#print('{} failed to load'.format(file))
##Return the frame
return AICFrame.sort('AIC')
def listMul(list, Multiplier):
"""Multiply the elements of a list by Multiplier"""
return [ x*Multiplier for x in list]
def validatePartitioning():
""" Validate the partition searching.
Parameters
----------
DumyEmp : pandas DataFrame, Dummy emperical data frame as generated by generateDummyDataFrame()
BestPartition: bool, Whether to return the partition AIC scores or the best partition.
Returns:
pandas DataFrame, Partition AIC scores, or the best partition
Try also:
x = [15,85,44,56,49,51,46,54,50,50,50,50,56,44,46,54,4,96] NNNNope
"""
#Death Or Not test
#For partitions of typ: A, B, A, C, A, A, A, C, A, where we should get
# Which is [[0,2,4,5,6,8],[1],[3,7]]
#A = [ 'breakingPeace','deception', 'miscellaneous', 'royalOffences', 'sexual', 'violentTheft']
#B = [ 'damage' ]
#C = [ 'kill', 'theft']
# We also ensure that the occurence of the offences are different within the same partition
TestRow = [40, 160, 70, 30, 20, 80, 50, 50, 20, 80, 20, 80, 10, 40, 100, 100, 20, 80]
#For not Death or Not, have 7 punishmnets
A = [ 10, 20, 30, 40, 50, 60, 70 ]
B = [ 70, 60, 50, 40, 30, 20, 10 ]
C = [ 20, 10, 40, 30, 60, 50, 70 ]
#For 9 offences
#A, B, A, C, A, A, A, C, A as above
TestFullRowA = A+listMul(B,2)+listMul(A,3)+C+A+A+listMul(A,2)+C+A
#C, A, A, B, B, C, A, A, C
TestFullRowB = C+A+listMul(A,2)+B+listMul(B,3)+C+A+listMul(A,3)+C
TestFrame = pd.DataFrame([TestRow,TestRow], columns=list(range(18)), index=[0,1])
TestFullFrame = pd.DataFrame([TestFullRowA,TestFullRowA,TestFullRowB,TestFullRowB,TestFullRowB], columns=list(range(63)), index=list(range(5)))
partitions = partition.Partition([c2n.upcaseFirstLetter(x) for x in c2n.offcat])
print('Testing Death Or Not partitioning')
DeathAICtable = oboP.generateAICtable(TestFrame)
DeathAICmin = DeathAICtable.idxmin(axis=1).apply(lambda x: partitions[int(x)])
print('Found minimal partitions:')
print(DeathAICmin)
print('Testing full partitioning')
AICtable = oboP.generateAICtable(TestFullFrame)
AICmin = AICtable.idxmin(axis=1).apply(lambda x: partitions[int(x)])
print('Found minimal partitions:')
print(AICmin)