-
Notifications
You must be signed in to change notification settings - Fork 8
/
dump_training_data.py
42 lines (33 loc) · 1.36 KB
/
dump_training_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
__author__ = 'Gongyu'
import json
def dump_training_data(origin_file, format_file):
infile = open(origin_file, 'r')
outfile = open(format_file, 'w')
cnt = 1
for line in infile:
if len(line.strip()) == 0:
continue
if cnt % 1000 == 0:
print 'cnt: ' + str(cnt)
data = json.loads(line.strip())
if (data['cor_type'] == 'cor') or (data['cor_type'] == 'spe_cor'):
continue
if data['match_type'] == 'precise':
outfile.write(data['word'] + '\t' + data['key'] + '\t' + str(data['cnt']) + '\n')
elif data['match_type'] == 'predict':
if data['cor_type'] == '':
outfile.write(data['word'] + '\t' + data['word'] + '\t' + str(data['cnt']) + '\n')
elif data['cor_type'] == 'spe':
error_word = data['spell_info']['spell_in']
for i in xrange(len(data['word'])):
if i < len(data['spell_info']['spell_out']):
continue
else:
error_word = error_word + data['word'][i]
outfile.write(data['word'] + '\t' + error_word + '\t' + str(data['cnt']) + '\n')
outfile.flush()
cnt += 1
infile.close()
outfile.close()
if __name__ == '__main__':
dump_training_data('final.out', 'training_data.txt')