-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocess-corpus.py
36 lines (31 loc) · 1.05 KB
/
preprocess-corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import glob, codecs, re, gzip
import jieba_fast as jieba
# Build one gzip-compressed, word-segmented corpus file from a tree of
# GB18030-encoded text files. Every sufficiently long sentence fragment
# becomes one output line of space-separated jieba tokens.

# Recursive glob for the input files, and the combined output archive.
pdpath = '/home/fqx/Documents/pd-corpus/**/*.txt'
corpuspath = 'corpus/pd-aio.txt.gz'

# A line is first split on paragraph separators, then each paragraph is
# split on sentence-ending punctuation.
paragraphbreak = re.compile('[‖ ]')
linebreak = re.compile('[【】。!?… ]')

# Extra dictionary entries for the tokenizer, then jieba's parallel mode
# with 4 worker processes.
jieba.load_userdict('names.txt')
jieba.enable_parallel(4)

# `with` guarantees the gzip stream is flushed and closed even if an
# unexpected error aborts the run part-way through; otherwise the archive
# could be left truncated.
with gzip.open(corpuspath, 'wt', encoding='utf-8') as aiofile:
    pdfiles = glob.glob(pdpath, recursive=True)
    for addr in pdfiles:
        print('Processing %s' % addr)
        try:
            # The context manager closes the input handle on the decode-error
            # path too; an explicit close() after readlines() is skipped when
            # readlines() raises.
            with codecs.open(addr, 'r', 'GB18030') as file:
                lines = file.readlines()
        except UnicodeDecodeError:
            # Best effort: a file that is not valid GB18030 is skipped whole.
            print('Decoding Error!')
            continue
        # NOTE(review): readlines() keeps line terminators, so the last
        # fragment of each line still carries its line ending and that
        # counts toward the length check below - confirm this is intended.
        for line in lines:
            for para in paragraphbreak.split(line):
                for realline in linebreak.split(para):
                    # Only fragments of 20 or more characters are kept.
                    if len(realline) > 19:
                        words = jieba.cut(realline)
                        aiofile.write('%s\n' % ' '.join(words))