-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.py
61 lines (43 loc) · 1.41 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
###
### train a perceptron-based POS tagger for Indonesian
###
import nltk
from nltk import ConditionalFreqDist, FreqDist
from utils import read_tagged
# read a corpus
# from https://github.com/famrashel/idn-tagged-corpus
#
# Load the tagged corpus; each sentence is a sequence of (word, tag) pairs.
ind_tagged = 'idn-tagged-corpus/Indonesian_Manually_Tagged_Corpus.tsv'
sents = read_tagged(ind_tagged)

# cfd: for every tag, how often each word carries it.
# fd:  how often each tag occurs overall.
cfd = ConditionalFreqDist((t, w) for s in sents for (w, t) in s)
fd = FreqDist(t for s in sents for (w, t) in s)

# Write a tab-separated tagset summary: one row per tag (sorted by name)
# with its frequency and its three most common words.
# The encoding is explicit so the output does not depend on the platform's
# default locale when corpus words contain non-ASCII characters.
with open("tagset.tsv", 'w', encoding='utf-8') as out:
    out.write("Tag\tFreq\tExamples\n")
    for tag in sorted(fd):
        examples = "; ".join("{} ({:,d})".format(w, f)
                             for (w, f) in cfd[tag].most_common(3))
        out.write("{}\t{:6,d}\t{}\n".format(tag, fd[tag], examples))
##
## train and save the perceptron tagger
##
# Train a fresh perceptron tagger (load=False: do not load a pretrained
# model) on the complete corpus, then write its model to disk.
# NOTE(review): model.save() may persist only the perceptron weights, not
# the tagger's tag dictionary/classes -- confirm that whatever loads this
# file expects that format.
model_path = 'averaged_perceptron_tagger_id.pickle'
tp = nltk.tag.perceptron.PerceptronTagger(load=False)
tp.train(sents)
tp.model.save(model_path)
#exit()
###
### check everything works
###
# Hold out the last 5% of the sentences for testing; train on the first 95%.
size = int(len(sents) * 0.95)
train_sents = sents[:size]
test_sents = sents[size:]

# Baseline: a bigram tagger that backs off to a unigram tagger, which in
# turn backs off to tagging everything as 'NNP'.
t0 = nltk.DefaultTagger('NNP')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
# Evaluate once and print (the original evaluated twice and discarded the
# first result).
print(t2.evaluate(test_sents))
# 0.953646173969915  (score noted in the original script, presumably from an earlier run)

# Perceptron tagger trained on the same split, scored on the held-out
# sentences. The original passed the undefined name `test_nts` here, which
# raised NameError after training had already completed.
tp = nltk.tag.perceptron.PerceptronTagger(load=False)
tp.train(train_sents)
print(tp.evaluate(test_sents))
# 0.9715500327011118  (score noted in the original script, presumably from an earlier run)