TrainFunctions.py
import nltk


def getCasing(word):
    """ Returns the casing of a word """
    if len(word) == 0:
        return 'other'
    elif word.isdigit():  # Is a digit
        return 'numeric'
    elif word.islower():  # All lower case
        return 'allLower'
    elif word.isupper():  # All upper case
        return 'allUpper'
    elif word[0].isupper():  # Initial char upper case (remainder may be mixed)
        return 'initialUpper'

    return 'other'
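# Illustrative mapping (added as a sketch; the example inputs are hypothetical,
# not from the original file):
#   getCasing("2023")  -> 'numeric'
#   getCasing("hello") -> 'allLower'
#   getCasing("NASA")  -> 'allUpper'
#   getCasing("Paris") -> 'initialUpper'
#   getCasing("iPad")  -> 'other'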
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is, for example, all upper case, it is rejected """
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False

    return True
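# Illustrative behaviour (hypothetical example tokens, not from the original
# file): a sentence whose dominant casing class is not 'allLower' is rejected,
# so checkSentenceSanity(["THIS", "IS", "A", "HEADLINE"]) would return False,
# while checkSentenceSanity(["The", "cat", "sat", "on", "the", "mat"]) returns True.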
def updateDistributionsFromSentences(text, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the NLTK frequency distributions based on a list of sentences.

    text: Array of sentences. Each sentence must be an array of tokens.
    """
    # :: Create unigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(1, len(sentence)):  # Skip the first token; its casing is dictated by sentence position
            word = sentence[tokenIdx]
            uniDist[word] += 1

            if word.lower() not in wordCasingLookup:
                wordCasingLookup[word.lower()] = set()

            wordCasingLookup[word.lower()].add(word)

    # :: Create backward + forward bigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(2, len(sentence)):  # Start at 2 so the sentence-initial token is never used as context
            word = sentence[tokenIdx]
            wordLower = word.lower()

            if wordLower in wordCasingLookup and len(wordCasingLookup[wordLower]) >= 2:  # Only if there are multiple casing options
                prevWord = sentence[tokenIdx - 1]
                backwardBiDist[prevWord + "_" + word] += 1

                if tokenIdx < len(sentence) - 1:
                    nextWord = sentence[tokenIdx + 1].lower()
                    forwardBiDist[word + "_" + nextWord] += 1

    # :: Create trigram lookup ::
    for sentence in text:
        if not checkSentenceSanity(sentence):
            continue

        for tokenIdx in range(2, len(sentence) - 1):  # Start at 2 so the sentence-initial token is never used as context
            prevWord = sentence[tokenIdx - 1]
            curWord = sentence[tokenIdx]
            curWordLower = curWord.lower()
            nextWordLower = sentence[tokenIdx + 1].lower()

            if curWordLower in wordCasingLookup and len(wordCasingLookup[curWordLower]) >= 2:  # Only if there are multiple casing options
                trigramDist[prevWord + "_" + curWord + "_" + nextWordLower] += 1
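# Usage sketch (illustrative, not part of the original file): the caller owns
# the lookup structures and passes them in to be filled. With NLTK's Brown
# corpus (available after nltk.download('brown')) this could look like:
#
#   from nltk.corpus import brown
#   wordCasingLookup = {}
#   uniDist = nltk.FreqDist()
#   backwardBiDist = nltk.FreqDist()
#   forwardBiDist = nltk.FreqDist()
#   trigramDist = nltk.FreqDist()
#   updateDistributionsFromSentences(brown.sents(), wordCasingLookup, uniDist,
#                                    backwardBiDist, forwardBiDist, trigramDist)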
def updateDistributionsFromNgrams(bigramFile, trigramFile, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Updates the frequency distributions based on n-gram files,
    e.g. the n-gram files from http://www.ngrams.info/download_coca.asp
    """
    # :: Bigram file: one tab-separated "count word1 word2" entry per line ::
    with open(bigramFile) as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            cnt, word1, word2 = splits
            cnt = int(cnt)

            # Unigrams
            if word1.lower() not in wordCasingLookup:
                wordCasingLookup[word1.lower()] = set()
            wordCasingLookup[word1.lower()].add(word1)

            if word2.lower() not in wordCasingLookup:
                wordCasingLookup[word2.lower()] = set()
            wordCasingLookup[word2.lower()].add(word2)

            uniDist[word1] += cnt
            uniDist[word2] += cnt

            # Bigrams
            backwardBiDist[word1 + "_" + word2] += cnt
            forwardBiDist[word1 + "_" + word2.lower()] += cnt

    # :: Trigram file: one tab-separated "count word1 word2 word3" entry per line ::
    with open(trigramFile) as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            cnt, word1, word2, word3 = splits
            cnt = int(cnt)
            trigramDist[word1 + "_" + word2 + "_" + word3.lower()] += cnt
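

if __name__ == '__main__':
    # Minimal demo (added as a sketch, not part of the original file):
    # 'bigrams.txt' and 'trigrams.txt' are hypothetical names for
    # tab-separated "count\tword1\tword2[\tword3]" files such as the
    # COCA n-gram downloads referenced above.
    wordCasingLookup = {}
    uniDist = nltk.FreqDist()
    backwardBiDist = nltk.FreqDist()
    forwardBiDist = nltk.FreqDist()
    trigramDist = nltk.FreqDist()

    updateDistributionsFromNgrams('bigrams.txt', 'trigrams.txt', wordCasingLookup,
                                  uniDist, backwardBiDist, forwardBiDist, trigramDist)

    print("Distinct lowercased words:", len(wordCasingLookup))
    print("Most frequent unigrams:", uniDist.most_common(5))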