-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
107 lines (85 loc) · 3.24 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# coding: utf-8
import sys
sys.path.append('/home/plum/kimo_emo')
import pymongo
from Tokenizer import Tokenizer
def toTraditionalChinese(dbName, collectionName, mongo_addr='doraemon.iis.sinica.edu.tw'):
    """
    Convert the ``content`` field of every document with ``jianfan.jtof``
    and store the result back on the same document as ``cht_contents``.

    Documents that have no ``content`` field, or whose ``content`` is empty,
    are skipped and left untouched. If the conversion raises, the original
    ``content`` value is written to ``cht_contents`` unchanged, so every
    processed document ends up with that field.

    Parameters
    ==========
    dbName
        name of the database in mongodb
    collectionName
        name of the collection in mongodb
    mongo_addr: str
        address to mongodb; defaults to the host this function previously
        had hard-coded, so existing two-argument calls behave as before

    Returns
    =======
    no return value, update mongo directly
    """
    # jianfan is only needed by this function, so it is imported here
    # (as in the original code) rather than at module level.
    import jianfan

    db = pymongo.Connection(mongo_addr)[dbName]
    co = db[collectionName]

    # Total is only used for the progress messages below.
    total = co.count()

    for i, mdoc in enumerate(co.find()):
        # Nothing to convert: field missing or empty.
        if "content" not in mdoc or not mdoc["content"]:
            continue

        try:
            cht_content = jianfan.jtof(mdoc["content"])
        except Exception:
            # Best effort: keep the unconverted text rather than aborting the
            # whole run. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit still stop the loop.
            # Single-argument print(...) parses identically on Python 2 and 3.
            print('(%d/%d) fail to convert' % (i + 1, total))
            cht_content = mdoc["content"]
        else:
            print('(%d/%d) convert successfully' % (i + 1, total))

        co.update({'_id': mdoc['_id']}, {'$set': {'cht_contents': cht_content}})
def tokenize_all(**kwargs):
    """
    Tokenize all sentences in mongodb.

    For every document in the collection that has ``source_field`` but not
    yet ``target_field``, each element of ``source_field`` is tokenized and
    the resulting list is stored on the document under ``target_field``.
    An element whose tokenization raises is kept in its raw form, so the
    stored list always has one entry per source element. Progress and
    problems are reported on stderr.

    Parameters
    ==========
    mongo_addr: str
        address to mongodb (default: 'doraemon.iis.sinica.edu.tw')
    mongo_db: str
        db_name in mongodb (default: 'espanol')
    mongo_co: str
        collection_name in mongodb (default: 'bk.posts')
    source_field: str
        the source field name in mongo, e.g., 'content' (the default)
    target_field: str
        the target field name in mongo, e.g., 'parsed' (the default)

    Example
    =======
    >> from preprocessing import tokenize_all
    >> tokenize_all(mongo_addr="localhost", mongo_db="espanol", mongo_co="bk.posts")
    >> tokenize_all(mongo_co='qy.posts', source_field='cht_contents', target_field="parsed")

    Returns
    =======
    no return value, insert back to mongo directly
    """
    mongo_addr = kwargs.get('mongo_addr', 'doraemon.iis.sinica.edu.tw')
    mongo_db = kwargs.get('mongo_db', 'espanol')
    mongo_co = kwargs.get('mongo_co', 'bk.posts')
    source_field = kwargs.get('source_field', 'content')
    target_field = kwargs.get('target_field', 'parsed')

    ## connect to mongo
    db = pymongo.Connection(mongo_addr)[mongo_db]
    co = db[mongo_co]

    ## init a CKIP tokenizer
    tok = Tokenizer()

    # Total is only used for the progress messages below.
    total = co.count()

    for ith, mdoc in enumerate(co.find()):
        # sys.stderr.write(... + newline) emits the same text as the former
        # ``print >> sys.stderr`` and is valid on both Python 2 and 3.
        if target_field in mdoc:
            sys.stderr.write('> %d/%d doc already parsed. skip\n' % (ith + 1, total))
            continue

        if source_field not in mdoc:
            sys.stderr.write('> %d/%d has no field named %s\n' % (ith + 1, total, source_field))
            continue

        parsed_sent_lst = []
        for raw_sent in mdoc[source_field]:
            ## sent: 你好... (u'\u4f60\u597d...')
            ## ckip accept '\xe4\xbd\xa0\xe5\xa5\xbd' as input,
            ## i.e. utf-8 encoded bytes; the result is decoded back afterwards
            try:
                parsed_sent = tok.tokenizeStr(raw_sent.encode('utf-8')).decode('utf-8')
            except Exception:
                ## tokenize error: keep the raw sentence.
                ## ``Exception`` (not a bare ``except``) so that
                ## KeyboardInterrupt / SystemExit can still stop the run.
                parsed_sent = raw_sent
                sys.stderr.write('! tokenize error\n')
            parsed_sent_lst.append(parsed_sent)

        ## update
        sys.stderr.write('> %d/%d doc parsed. update\n' % (ith + 1, total))
        co.update({"_id": mdoc["_id"]}, {"$set": {target_field: parsed_sent_lst}})