-
Notifications
You must be signed in to change notification settings - Fork 3
/
1_generate_lexicon.py
149 lines (130 loc) · 4.15 KB
/
1_generate_lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ivector import ivector
import time
import os,glob
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module has to be reloaded before the default str<->unicode codec can be
# switched from ASCII to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Original scores. This input file is read-only reference data and does not
# need to be changed. The handle is closed as soon as the content is read
# (the name `scorefile_original` stays bound, to the closed file object).
with open('inputfiles/airscores.txt', 'r') as scorefile_original:
    origscores1 = scorefile_original.read()

# Whitespace-separated score tokens; main() indexes this list with the same
# line number it uses for the input text.
origscores2 = origscores1.split()
class lexicon:
    """Incrementally built word -> score lexicon persisted under ``lexicons/``.

    The lexicon is kept as three parallel lists (``wordx``, ``scorex``,
    ``occurx``): entry ``i`` holds a word, the running mean of the scores it
    has been given, and how many scores went into that mean. A private dict
    maps each word to its list position so lookups do not scan the lists.

    On construction any previously saved lexicon is loaded from
    ``lexicons/lexicon.txt`` together with the resume position stored in
    ``lexicons/status.txt``. The lexicon file is only opened for writing
    inside :meth:`writelexicon`, so an interrupted run leaves the previous
    file intact.
    """

    # Index of the first input line that has not been processed yet.
    startfrom = 0

    def __init__(self):
        self.wordx = []    # words, in insertion order
        self.scorex = []   # running mean score of each word
        self.occurx = []   # number of scores averaged into each mean
        self._index = {}   # word -> position of its first entry in the lists
        self.loadlexicon()

    def listtextfiles(self, foldername):
        """Return the names of all ``*.txt`` files directly inside *foldername*.

        The working directory is changed temporarily so that glob yields bare
        file names; it is always restored, even when globbing raises.
        Raises OSError if *foldername* cannot be entered.
        """
        owd = os.getcwd()
        os.chdir(foldername + "/")
        try:
            return glob.glob("*.txt")
        finally:
            os.chdir(owd)

    def loadlexicon(self):
        """Load a previously written lexicon and its resume position, if any.

        When ``lexicons/lexicon.txt`` does not exist the lexicon starts empty
        and ``startfrom`` keeps its default of 0. Otherwise the second field
        of ``lexicons/status.txt`` becomes ``startfrom`` and every non-blank
        line ``word score count`` of the lexicon file is appended.
        """
        if 'lexicon.txt' not in self.listtextfiles('lexicons'):
            print("Initializing!...")
            return

        with open('lexicons/status.txt', 'r') as statusfile:
            fields = statusfile.read().split()
        # status.txt holds "<start> <end>" of the last run; resume at <end>.
        self.startfrom = int(fields[1])

        with open("lexicons/lexicon.txt", 'r') as infile:
            for line in infile:
                stuff = line.split()
                if not stuff:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                # Parse the whole line before touching the lists so a
                # malformed line cannot leave them with unequal lengths.
                word, score, count = stuff[0], float(stuff[1]), int(stuff[2])
                # setdefault keeps the first position if a word is repeated.
                self._index.setdefault(word, len(self.wordx))
                self.wordx.append(word)
                self.scorex.append(score)
                self.occurx.append(count)

    def getstart(self):
        """Return the input line index at which processing should resume."""
        return self.startfrom

    def checkpresent(self, word):
        """Look *word* up in the lexicon.

        Returns ``(1, score, index)`` when the word is present and
        ``(-1, -1, -1)`` when it is not.
        """
        indx = self._index.get(word)
        if indx is None:
            return -1, -1, -1
        return 1, self.scorex[indx], indx

    def addword(self, word, score):
        """Append *word* as a new entry with *score* and an occurrence count of 1."""
        self._index.setdefault(word, len(self.wordx))
        self.wordx.append(word)
        self.scorex.append(score)
        self.occurx.append(1)

    def modifyword(self, word, score, indx):
        """Fold *score* into the running mean stored at position *indx*.

        *word* is accepted for interface compatibility but not used; the
        entry is addressed by *indx* alone.
        """
        prevscr = self.scorex[indx]
        prevocc = self.occurx[indx]
        newocc = prevocc + 1
        # Incremental mean: undo the previous division, add the new sample.
        self.scorex[indx] = (prevscr * prevocc + score) / float(newocc)
        self.occurx[indx] = newocc

    def processword(self, word, score):
        """Record one scored occurrence of *word*, adding it if it is new."""
        found, _, indx = self.checkpresent(word)
        if found == -1:  # word is not present
            self.addword(word, score)
        else:
            self.modifyword(word, score, indx)

    def writelexicon(self):
        """Write the lexicon to ``lexicons/lexicon.txt``, one ``word score count`` per line.

        The file is opened (and truncated) only here and closed before
        returning, so the data is on disk once this method returns.
        """
        with open("lexicons/lexicon.txt", 'w') as ofile:
            for word, score, count in zip(self.wordx, self.scorex, self.occurx):
                ofile.write(word + " " + str(score) + " " + str(count) + "\n")

    def getlexicon(self):
        """Return the three parallel lists ``(words, scores, occurrences)``."""
        return self.wordx, self.scorex, self.occurx
def drawProgressBar(percent, barLen = 50):
    """Redraw a one-line text progress bar on standard output.

    percent -- completed fraction (0.0 for nothing done, 1.0 for finished).
    barLen  -- width of the bar in characters.

    The line starts with a carriage return so that successive calls
    overwrite each other, and the stream is flushed after every redraw.
    """
    sys.stdout.write("\r")
    # One character per slot: '=' for the completed part, ' ' for the rest.
    bar = "".join(
        "=" if slot < int(barLen * percent) else " "
        for slot in range(barLen)
    )
    sys.stdout.write("[ %s ] %.2f%%" % (bar, percent * 100))
    sys.stdout.flush()
def main(dofor=1000):
    """Extend the on-disk lexicon with the next batch of input lines.

    dofor -- maximum number of lines to process in this run.

    Processing resumes at the position stored by the previous run and stops
    after *dofor* lines or at the end of the available data, whichever comes
    first. For every token with a non-zero importance weight, the product of
    that weight and the line's original score is folded into the lexicon.
    Afterwards the lexicon and the new resume position are written under
    ``lexicons/``.
    """
    start_time = time.time()

    # Load the input file; the handle is released as soon as it is read.
    with open('inputfiles/airline.txt') as f:
        linex = f.readlines()

    lex = lexicon()
    startx = lex.getstart()

    # Never run past the last line that has both text and a score, and never
    # move the resume position backwards.
    available = min(len(linex), len(origscores2))
    endx = max(startx, min(startx + dofor, available))
    print("creating lexicon from %s till %s" % (startx, endx))
    numx = endx - startx

    for p in range(startx, endx):
        thisline = linex[p]
        vec = ivector.getweights(thisline)  # get the importance vector.
        # NOTE(review): assumes vec has one weight per token of the
        # lower-cased line as produced by word_tokenize -- confirm in ivector.
        wordss = word_tokenize(thisline.lower())
        thisscore = float(origscores2[p])
        for g, word in enumerate(wordss):
            if vec[g] != 0.0:
                lex.processword(word, vec[g] * thisscore)
        drawProgressBar((p - startx) / float(numx))
    drawProgressBar(1)

    lex.writelexicon()
    # Record "<start> <end>"; the next run resumes at <end>. Using endx
    # instead of the loop variable keeps this valid when the batch is empty.
    with open('lexicons/status.txt', 'w') as statfile:
        statfile.write(str(startx) + ' ' + str(endx))
    print("\n\n--- %s seconds ---" % (time.time() - start_time))
# Run the lexicon generation only when this file is executed as a script.
if __name__ == "__main__":
    main()