-
Notifications
You must be signed in to change notification settings - Fork 1
/
setup.py
37 lines (32 loc) · 1.33 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from nltk.corpus import wordnet as wn
import pickle
# Hand-seeded lexicon: maps a category key to a set of words.
# Key order is kept stable because the whole dict is pickled later.
lex = {
    'adj': set(),  # adjectives
    'non': {'nonya'},  # nouns
    'vrb': set(),  # verbs
    'dup': set(),  # lexical reduplication
    # NOTE(review): 'belaja' may be a typo for 'belajar' -- confirm before changing.
    'ber': {'berdiri', 'belaja', 'bersama'},
    'all': {'setidaknya'},  # all the words
    # Prepositions collected from the web.  Members that were listed twice
    # in the original literal appear once here; the resulting set is equal.
    'prep': {
        'atas', 'setelah', 'sekitar', 'di', 'karena', 'sebelum', 'samping',
        'antara', 'tapi', 'dekat ke', 'turun', 'selama', 'untuk', 'dari',
        'depan', 'dalam', 'daripada', 'seperti', 'dekat', 'berdekatan',
        'puncak', 'keluar', 'luar', 'diatas', 'seberang', 'tentang', 'sejak',
        'ke', 'bawah', 'sampai', 'naik', 'tanpa',
    },
}

# Fold every non-empty seed category into the catch-all set.
lex['all'].update(lex['ber'], lex['non'], lex['prep'])
# Which lexicon bucket each WordNet part-of-speech tag feeds.  Both 'a' and
# 's' go to the adjective bucket; lemmas with any other tag are recorded only
# in lex['all'] (and lex['dup'] when hyphenated).
pos_to_bucket = {'a': 'adj', 's': 'adj', 'v': 'vrb', 'n': 'non'}

print('Reading Indonesian Wordnet (through OMW)')
for synset in wn.all_synsets():
    bucket = pos_to_bucket.get(synset.pos())
    for lemma in synset.lemmas(lang='ind'):
        # Multi-word lemma names use '_' as the separator; turn it into a
        # space so multi-word entries can be detected and skipped.
        word = lemma.name().replace('_', ' ')
        if ' ' not in word:  # we only need single words
            lex['all'].add(word)
            if '-' in word:
                # Hyphenated entries are collected as reduplications.
                lex['dup'].add(word)
            if bucket is not None:
                lex[bucket].add(word)
# Write the lexicon dict to lex_id.pickle in binary pickle format.
print('Pickling Lex to lex_id.pickle')
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes (or fails); the previous bare open() never closed the handle.
with open('lex_id.pickle', 'wb') as f:
    pickle.dump(lex, f)
print('Done!')