DescisionTree.py

import csv
import math

class Node:
    # A tree node: stores the attribute this node splits on, the data subsets
    # produced by the split, a per-branch-value dictionary used at test time,
    # and the class probability distribution of the samples at this node.
    def __init__(self, best_attribute, best_list_set, best_attribute_val_dict, best_prob_distribution):
        self.best_attribute = best_attribute
        self.best_list_set = best_list_set
        self.best_attribute_val_dict = best_attribute_val_dict
        self.leafnode = False
        self.newnode = None
        self.prob_distribution = best_prob_distribution

class DescisionTree:
    noOfClasses = 2
    dict_discrete_continous = {}  # maps column index -> "discrete" or "continuous"

    def __init__(self):
        pass
    # Shannon entropy of the class labels: H = -sum_c p(c) * log2(p(c)).
    def cal_entropy(self, data):
        set_class_values = set()
        N = len(data)
        for i in range(len(data)):
            set_class_values.add(data[i][-1])  # collect the distinct class labels
        entropy = 0
        for set_val in set_class_values:
            class_label_count = 0
            for k in range(len(data)):
                if data[k][-1] == set_val:
                    class_label_count = class_label_count + 1
            val = float(class_label_count) / N
            entropy = entropy - (val * math.log(val, 2))
        return entropy
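    # Example: for data = {0: [..., 0], 1: [..., 0], 2: [..., 1], 3: [..., 1]}
    # each class has probability 0.5, so the entropy is
    # -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit.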
    # Returns (True, label) if every sample in data belongs to the same class;
    # the label is reused at test time. Returns (False, None) otherwise.
    def check_distribution(self, data):
        if not data:  # the dictionary may be empty
            return False, None
        val = data[0][-1]
        count = 0
        for k in range(len(data)):
            if data[k][-1] == val:
                count = count + 1
        if count == len(data):
            return True, val
        else:
            return False, None
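    # Example: check_distribution({0: ["x", 1], 1: ["y", 1]}) -> (True, 1),
    # while check_distribution({0: ["x", 1], 1: ["y", 0]}) -> (False, None).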
    # Class probability distribution over the samples at a node,
    # returned as a list indexed by class label.
    def get_prob_distribution(self, data):
        dict_prob = {i: 0 for i in range(self.noOfClasses)}
        for j in range(len(data)):
            class_val = data[j][-1]
            if class_val in dict_prob:
                dict_prob[class_val] = dict_prob[class_val] + 1
        list_prob = []
        for k in dict_prob:
            list_prob.append(float(dict_prob[k]) / len(data))
        return list_prob
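    # Example: with noOfClasses = 2 and three of four samples labelled 0,
    # get_prob_distribution returns [0.75, 0.25].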
    # C4.5-style gain ratio for splitting `data` on `column`:
    # GainRatio = (Info(T) - Info_x(T)) / SplitInfo(T).
    # Continuous attributes get a binary <=/> split at a threshold near the
    # median; discrete attributes get one branch per distinct value.
    def cal_gain_ratio(self, data, column):
        list_sets = []  # stores the data subsets produced by the split
        check_distribution = {}
        Info_T = self.cal_entropy(data)  # entropy of the full set
        prob_distribution = self.get_prob_distribution(data)
        N = len(data)
        # Continuous attributes
        if self.dict_discrete_continous[column] == "continuous":
            cal_to_split_cont = None
            sorted_column = [data[i][column] for i in data]
            sorted_column.sort()
            if N % 2 == 1 or N == 1:  # odd length: the middle element
                mid_point = sorted_column[N // 2]
            else:  # even length: average of the two middle elements
                mid_point = (sorted_column[N // 2 - 1] + sorted_column[N // 2]) / 2.0
            for iter_i in range(len(sorted_column)):
                if sorted_column[iter_i] > mid_point:
                    # The list is sorted, so take the largest value that does
                    # not exceed the midpoint as the split threshold.
                    cal_to_split_cont = sorted_column[iter_i - 1]
                    break
            if cal_to_split_cont is None:
                cal_to_split_cont = mid_point
            sub_data_1 = {}
            sub_data_2 = {}
            count_1 = 0
            count_2 = 0
            for k in range(len(data)):
                if data[k][column] <= cal_to_split_cont:
                    sub_data_1[count_1] = data[k]
                    count_1 = count_1 + 1
                else:
                    sub_data_2[count_2] = data[k]
                    count_2 = count_2 + 1
            list_sets.append(sub_data_1)
            list_sets.append(sub_data_2)
            # Each entry holds (is_pure, label), a placeholder for the child
            # node (filled in by buildDecisionTree), and the threshold.
            check_distribution["<="] = [self.check_distribution(sub_data_1), Node, cal_to_split_cont]
            check_distribution[">"] = [self.check_distribution(sub_data_2), Node, cal_to_split_cont]
            sub_data_temp = (sub_data_1, sub_data_2)
            Info_x = 0
            split_ratio = 0
            for i in [0, 1]:
                sub_data = sub_data_temp[i]
                if not sub_data:  # skip an empty side of the split
                    continue
                sub_N = len(sub_data)
                sub_N_val = float(sub_N) / float(N)
                Info_x = Info_x + (sub_N_val * self.cal_entropy(sub_data))
                split_ratio = split_ratio + (-sub_N_val) * math.log(sub_N_val, 2)
            gain = Info_T - Info_x
            if split_ratio == 0:
                gain_ratio = gain
            else:
                gain_ratio = gain / split_ratio
        # Discrete (nominal) attributes
        else:
            # Entropy of the split, i.e. Info_x(T)
            set_distinct_values = set(data[row][column] for row in data)  # distinct values
            Info_x = 0
            split_ratio = 0
            for col_val in set_distinct_values:
                sub_data = {}
                count_sub_data = 0
                for k in data:
                    if data[k][column] == col_val:
                        sub_data[count_sub_data] = data[k]
                        count_sub_data = count_sub_data + 1
                # (is_pure, label) plus a placeholder for the child node.
                check_distribution[col_val] = [self.check_distribution(sub_data), Node]
                list_sets.append(sub_data)
                sub_N = len(sub_data)
                sub_N_val = float(sub_N) / float(N)
                Info_x = Info_x + (sub_N_val * self.cal_entropy(sub_data))
                split_ratio = split_ratio + (-sub_N_val) * math.log(sub_N_val, 2)
            gain = Info_T - Info_x
            if split_ratio == 0:
                gain_ratio = gain
            else:
                gain_ratio = gain / split_ratio
        return gain_ratio, list_sets, check_distribution, prob_distribution
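    # Worked example: four samples split by a binary discrete attribute into
    # two pure halves (one all class 0, one all class 1). Info(T) = 1.0 and
    # Info_x(T) = 0, so gain = 1.0; SplitInfo = -(0.5*log2(0.5))*2 = 1.0,
    # giving a gain ratio of 1.0.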
    ##################### BuildTree ######################
    def buildDecisionTree(self, data, list_col):
        # list_col holds the columns still available to split the data on
        if len(list_col) == 0 or len(data) == 0:  # all attributes/columns used up
            node = Node(None, None, None, None)
            node.leafnode = True
            return node
        else:
            list_col_gng_frwd = []
            best_gain_ratio = 0
            best_attribute = None
            best_list_set = []
            best_attribute_val_dict = {}  # used at test time to route a sample down a branch
            best_prob_distribution = []
            for column in list_col:
                list_col_gng_frwd.append(column)
                gain_ratio, list_sets, check_distribution, prob_distribution = self.cal_gain_ratio(data, column)
                if gain_ratio >= best_gain_ratio:
                    best_gain_ratio = gain_ratio
                    best_attribute = column
                    best_list_set = list_sets
                    best_attribute_val_dict = check_distribution
                    best_prob_distribution = prob_distribution
            # Drop the chosen attribute so the subtrees do not split on it again
            list_col_gng_frwd.remove(best_attribute)
            # Build the node, then recurse into each branch
            node = Node(best_attribute, best_list_set, best_attribute_val_dict, best_prob_distribution)
            if self.dict_discrete_continous[best_attribute] == "continuous":
                for i, j in enumerate(best_attribute_val_dict):
                    # Each entry is [(is_pure, label), child placeholder, threshold]
                    check_bool_distribution, newnode, threshold = best_attribute_val_dict[j]
                    if not check_bool_distribution[0]:
                        newdata = best_list_set[i]
                        if len(newdata) != 0:
                            newnode = self.buildDecisionTree(newdata, list_col_gng_frwd)
                            best_attribute_val_dict[j][1] = newnode
                        else:
                            best_attribute_val_dict[j][1] = None
                    else:
                        # Pure subset: recurse with no columns left, producing a leaf
                        list_gng_frwd_leaf = []
                        newdata = best_list_set[i]
                        newnode = self.buildDecisionTree(newdata, list_gng_frwd_leaf)
                        best_attribute_val_dict[j][1] = newnode
            else:
                for i, j in enumerate(best_attribute_val_dict):
                    check_bool_distribution, newnode = best_attribute_val_dict[j]
                    if not check_bool_distribution[0]:
                        newdata = best_list_set[i]
                        if len(newdata) != 0:
                            newnode = self.buildDecisionTree(newdata, list_col_gng_frwd)
                            best_attribute_val_dict[j][1] = newnode
                        else:
                            best_attribute_val_dict[j][1] = None
                    else:
                        # Pure subset: recurse with no columns left, producing a leaf
                        list_gng_frwd_leaf = []
                        newdata = best_list_set[i]
                        newnode = self.buildDecisionTree(newdata, list_gng_frwd_leaf)
                        best_attribute_val_dict[j][1] = newnode
            return node
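    # Example: with dict_discrete_continous = {0: "discrete"} and
    # data = {0: ["a", 1], 1: ["a", 1], 2: ["b", 0]}, buildDecisionTree(data, [0])
    # splits on column 0 and ends in pure branches: "a" -> class 1, "b" -> class 0.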
    def classify(self, test_data, ROOT):
        # Route each test row from the root down to a pure branch,
        # falling back to the most probable class of the last node seen.
        # A bare leaf (no split dictionary) keeps the default label 0.
        pred_label = {i: 0 for i in range(len(test_data))}
        for i in range(len(test_data)):
            root = ROOT
            row = test_data[i]
            while root.best_attribute_val_dict:
                col = root.best_attribute
                if self.dict_discrete_continous[col] == "continuous":
                    check_distribution, new_root, threshold = root.best_attribute_val_dict["<="]
                    if row[col] <= threshold:
                        if check_distribution[0]:
                            pred_label[i] = check_distribution[1]
                            break
                        else:
                            prob_distribution = root.prob_distribution
                            root = new_root
                            # Null child: fall back to the class with the
                            # highest probability at the parent node
                            if root is None:
                                pred_label[i] = prob_distribution.index(max(prob_distribution))
                                break
                    else:
                        check_distribution, new_root, threshold = root.best_attribute_val_dict[">"]
                        if check_distribution[0]:
                            pred_label[i] = check_distribution[1]
                            break
                        else:
                            prob_distribution = root.prob_distribution
                            root = new_root
                            # Null child: fall back to the class with the
                            # highest probability at the parent node
                            if root is None:
                                pred_label[i] = prob_distribution.index(max(prob_distribution))
                                break
                # Discrete attributes
                else:
                    val = row[col]
                    prob_distribution = root.prob_distribution
                    # If the value was seen during training, follow its branch
                    if val in root.best_attribute_val_dict:
                        check_distribution, new_root = root.best_attribute_val_dict[val]
                        if check_distribution[0]:
                            pred_label[i] = check_distribution[1]
                            break
                        else:
                            root = new_root
                            # Null child: fall back to the class with the
                            # highest probability at the parent node
                            if root is None:
                                pred_label[i] = prob_distribution.index(max(prob_distribution))
                                break
                    # Unseen value: return the most probable class at this node
                    else:
                        pred_label[i] = prob_distribution.index(max(prob_distribution))
                        break
        return pred_label
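    # Example: classify({0: row_a, 1: row_b}, root) returns a dict such as
    # {0: 1, 1: 0} mapping each test index to its predicted class label.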

# def train(self, training_set):
# def classify(self, test_instance):


def run_decision_tree():
    # Load the dataset
    with open("hw4-task1-data.csv", "r") as file_csv:
        data = [tuple(line) for line in csv.reader(file_csv, delimiter="\t")]
    print("Total number of entries: %d" % len(data))
    d_tree = DescisionTree()
    d_tree.dict_discrete_continous[0] = "continuous"   # age
    d_tree.dict_discrete_continous[1] = "discrete"     # workclass
    d_tree.dict_discrete_continous[2] = "continuous"   # fnlwgt
    d_tree.dict_discrete_continous[3] = "discrete"     # education
    d_tree.dict_discrete_continous[4] = "continuous"   # education-num
    d_tree.dict_discrete_continous[5] = "discrete"     # marital-status
    d_tree.dict_discrete_continous[6] = "discrete"     # occupation
    d_tree.dict_discrete_continous[7] = "discrete"     # relationship
    d_tree.dict_discrete_continous[8] = "discrete"     # race
    d_tree.dict_discrete_continous[9] = "discrete"     # sex
    d_tree.dict_discrete_continous[10] = "continuous"  # capital-gain
    d_tree.dict_discrete_continous[11] = "continuous"  # capital-loss
    d_tree.dict_discrete_continous[12] = "continuous"  # hours-per-week
    d_tree.dict_discrete_continous[13] = "discrete"    # native-country
    d_tree.dict_discrete_continous[14] = "discrete"    # class label
    columns = 15
    X_train = {}
    y_train = {}
    for i in range(len(data)):
        line = data[i][0]
        list_data = []
        line_list = line.split(",")
        if line_list[-1] == ">50K":
            y_train[i] = 1
        else:
            y_train[i] = 0
        for j in range(columns - 1):
            if d_tree.dict_discrete_continous[j] == "continuous":
                list_data.append(float(line_list[j]))
            else:
                list_data.append(line_list[j])
        list_data.append(y_train[i])
        X_train[i] = list_data
    # Columns available for splitting (all features, excluding the label)
    list_col = list(range(len(X_train[0]) - 1))
    print(list_col)
    sample_dict = {i: X_train[i] for i in range(10)}
    # d_tree.buildDecisionTree(sample_dict, list_col)


if __name__ == "__main__":
    run_decision_tree()
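
# A minimal end-to-end sketch (hypothetical continuation; the original script
# stops after parsing). With d_tree, sample_dict, and list_col as built in
# run_decision_tree(), training and prediction might look like:
#
#   root = d_tree.buildDecisionTree(sample_dict, list_col)
#   preds = d_tree.classify(sample_dict, root)
#   accuracy = sum(preds[i] == sample_dict[i][-1] for i in preds) / float(len(preds))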