-
Notifications
You must be signed in to change notification settings - Fork 0
/
gender_predictor.py
94 lines (72 loc) · 2.72 KB
/
gender_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from nltk import NaiveBayesClassifier, classify
import name_loader
import random
class genderPredictor():
    """Naive Bayes classifier that labels a first name as 'M' or 'F'.

    Training data is obtained from ``name_loader.getNameList()`` and the
    model is an NLTK ``NaiveBayesClassifier``. The trained model is stored
    on ``self.classifier`` by ``train`` (and therefore by ``trainAndTest``);
    ``classify``, ``test`` and ``getMostInformativeFeatures`` require that
    one of those has been called first.
    """

    def getFeatures(self):
        """Build the labelled featureset for every loaded name.

        Returns:
            A list of ``(features, label)`` pairs, where ``features`` is the
            dict from ``_nameFeatures`` extended with ``male_prob`` and
            ``female_prob``, and ``label`` is ``'M'`` or ``'F'``. Male names
            come first, then female names.
        """
        maleNames, femaleNames = self._loadNames()
        featureset = self._labelledFeatures(maleNames, 'M')
        featureset.extend(self._labelledFeatures(femaleNames, 'F'))
        return featureset

    def _labelledFeatures(self, nameTuples, label):
        """Return ``(features, label)`` pairs for each tuple in *nameTuples*."""
        labelled = []
        for nameTuple in nameTuples:
            features = self._nameFeatures(nameTuple[0])
            male_prob, female_prob = self._getProbDistr(nameTuple)
            # NOTE(review): these two features are derived from the labelled
            # counts and are not supplied by classify(); they may inflate the
            # accuracy reported by trainAndTest() -- confirm this is intended.
            features['male_prob'] = male_prob
            features['female_prob'] = female_prob
            labelled.append((features, label))
        return labelled

    def trainAndTest(self, trainingPercent=0.80):
        """Shuffle the featureset, train on one part and score on the rest.

        Args:
            trainingPercent: Fraction (0.0-1.0) of the shuffled featureset
                used for training; the remainder is the test set.

        Returns:
            The accuracy reported by ``test`` on the held-out portion.
        """
        featureset = self.getFeatures()
        random.shuffle(featureset)

        cut_point = int(len(featureset) * trainingPercent)
        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        self.train(train_set)
        return self.test(test_set)

    def classify(self, name):
        """Return the label the trained classifier assigns to *name*.

        Only the suffix features from ``_nameFeatures`` are used here; the
        ``male_prob``/``female_prob`` features added during training are not
        available for an arbitrary name.
        """
        feats = self._nameFeatures(name)
        return self.classifier.classify(feats)

    def train(self, train_set):
        """Train a Naive Bayes model on *train_set*, store and return it."""
        self.classifier = NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        """Return the classifier's accuracy on the labelled *test_set*."""
        return classify.accuracy(self.classifier, test_set)

    def _getProbDistr(self, nameTuple):
        """Return ``(male_prob, female_prob)`` for one name record.

        ``male_prob`` is ``nameTuple[1] / (nameTuple[1] + nameTuple[2])``
        (presumably the male and female occurrence counts -- verify against
        ``name_loader``). Exact 1.0 and 0.0 are clamped to 0.99 and 0.01.
        When both counts are zero there is no evidence either way, so an
        even ``(0.5, 0.5)`` split is returned instead of dividing by zero.
        """
        male_count = nameTuple[1]
        total = male_count + nameTuple[2]
        if total == 0:
            return (0.5, 0.5)

        male_prob = (male_count * 1.0) / total
        if male_prob == 1.0:
            male_prob = 0.99
        elif male_prob == 0.0:
            male_prob = 0.01

        female_prob = 1.0 - male_prob
        return (male_prob, female_prob)

    def getMostInformativeFeatures(self, n=5):
        """Return the classifier's *n* most informative features."""
        return self.classifier.most_informative_features(n)

    def _loadNames(self):
        """Return the ``(maleNames, femaleNames)`` pair from ``name_loader``."""
        return name_loader.getNameList()

    def _nameFeatures(self, name):
        """Extract suffix-based features from *name* (case-insensitive).

        Returns:
            A dict with the upper-cased last one, two and three characters
            and a flag telling whether the last character is one of
            ``AEIOUY``. An empty name yields empty-string suffixes and
            ``last_is_vowel == False`` rather than raising ``IndexError``.
        """
        name = name.upper()
        # Slicing (not indexing) keeps this safe for the empty string.
        last_letter = name[-1:]
        return {
            'last_letter': last_letter,
            'last_two': name[-2:],
            'last_three': name[-3:],
            # '' is "in" every string, so the emptiness check is required.
            'last_is_vowel': last_letter != '' and last_letter in 'AEIOUY',
        }
def main():
    """Train and evaluate the predictor, then classify names interactively.

    Prints the held-out accuracy and the ten most informative features, then
    prompts for names until the user types ``quit`` or stdin reaches EOF.
    """
    gp = genderPredictor()
    accuracy = gp.trainAndTest()
    print('Accuracy: %f' % accuracy)
    print('Most Informative Features')
    for feat in gp.getMostInformativeFeatures(10):
        print('\t%s = %s' % feat)

    while True:
        try:
            name = input('Enter name to classify: ').strip()
        except EOFError:
            # stdin closed (Ctrl-D / piped input exhausted): exit quietly.
            break
        if name == 'quit':
            # Leave before classifying, so "quit" is not treated as a name.
            break
        if not name:
            # Nothing to classify; ask again instead of crashing on ''.
            continue
        print('\n%s is classified as %s' % (name, gp.classify(name)))


if __name__ == "__main__":
    main()