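"""
filter.py

Cleans raw PTT crawler output into a chatbot corpus: loads stopword and
stop-tag lists, drops malformed, duplicate, and reply ("Re:"/"Fw:") articles,
splits tags such as "[問卦]" off titles, filters responses by user, length,
and stopwords, and writes the cleaned titles and responses under data/.
"""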
import json
import logging
import os
def main():
    article_filter = ArticleFilter()
    article_filter.load_processed_corpus()
    # article_filter.process_raw_data("data/raw/", is_dir=True, to_one_file=True, one_file_name="CorpusPatch2.json")
    # article_filter.merge_corpus()
    article_filter.print_titles()
    article_filter.print_response()
    # article_filter.print_user_info()  # TODO?
class ArticleFilter(object):

    def __init__(self):
        self.stopwords = None
        self.stoptags = None
        self.raw_data = None
        self.corpus = []
        self.order_response = []
        self.order_titles = []
        self.article_count = 0
        self.titles = set()
        self.users_info = {}
        self.init_load_stopwords()
        logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    def init_load_stopwords(self):
        """
        Load the response stopword lists: PTT-specific phrases such as
        "5樓", "幫補血", "紅明顯" that are unsuitable as chatbot responses.
        """
        with open('data/stopwords/ptt_words.txt', 'r', encoding='utf-8') as sw:
            self.stopwords = [word.strip('\n') for word in sw]
        with open('data/stopwords/gossiping.tag', 'r', encoding='utf-8') as sw:
            self.stoptags = [word.strip('\n') for word in sw]
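    # Both stopword files are assumed to hold one entry per line (the code
    # above strips a trailing newline from each line):
    #   data/stopwords/ptt_words.txt  -- phrases to drop from responses
    #   data/stopwords/gossiping.tag  -- article tags to drop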
    def process_raw_data(self, path, is_dir=False, to_one_file=False, one_file_name="corpus.json"):
        total = []
        count = 0
        if is_dir:
            dirname = path
            filenames = [name for name in os.listdir(path) if not name.startswith(".")]
        else:
            # A single file: split off the directory so the join below
            # does not duplicate the path.
            dirname, basename = os.path.split(path)
            filenames = [basename]
        for filename in filenames:
            count += 1
            if count % 100 == 0:
                logging.info("Processed %d article pages, %d of them valid" % (count, self.article_count))
            with open(os.path.join(dirname, filename), 'r', encoding="utf-8") as f:
                res = self.generate_corpus(json.load(f))
            if to_one_file:
                total += res
            else:
                with open("data/processed/" + filename, 'w', encoding='utf-8') as op:
                    op.write(json.dumps(res, indent=4, ensure_ascii=False))
                logging.info("Processed " + filename)
        if to_one_file:
            with open("data/processed/" + one_file_name, 'w', encoding='utf-8') as op:
                op.write(json.dumps(total, indent=4, ensure_ascii=False))
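    # Output layout: one cleaned JSON file per input page under
    # data/processed/, or a single combined file named one_file_name
    # when to_one_file=True.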
    def reclean_corpus(self):
        pass
    def merge_corpus(self, path="data/processed/"):
        logging.info("Starting corpus merge")
        corpus_names = [name for name in os.listdir(path)
                        if not name.startswith(".")
                        and name != "reply"]
        all_corpus = []
        for corpus_name in corpus_names:
            with open(os.path.join(path, corpus_name), 'r', encoding='utf-8') as data:
                c = json.load(data)
                logging.info("Loaded " + corpus_name)
                all_corpus += c
        with open(os.path.join(path, "all_corpus.json"), 'w', encoding='utf-8') as op:
            op.write(json.dumps(all_corpus, indent=4, ensure_ascii=False))
        logging.info("Merge complete, written to " + os.path.join(path, "all_corpus.json"))
    def load_processed_corpus(self, path="data/processed/"):
        corpus_names = [name for name in os.listdir(path)
                        if not name.startswith(".") and os.path.isfile(os.path.join(path, name))]
        logging.info("Loading existing corpus")
        for corpus_name in corpus_names:
            with open(os.path.join(path, corpus_name), 'r', encoding='utf-8') as data:
                c = json.load(data)
                logging.info("Loaded " + corpus_name)
                self.corpus += c
        logging.info("Read %d articles" % len(self.corpus))
        logging.info("Extracting titles and responses")
        for article in self.corpus:
            self.titles.add(article["Title"])
            self.order_titles.append(article["Title"])
            self.order_response.append(article["Responses"])
        logging.info("Title and response extraction complete")
    def generate_corpus(self, articles, drop_response=True, negative_tag=None, no_content=True, min_length=6):
        """
        Select the articles that meet the corpus requirements.
        Args:
            - articles: list of article dicts; see PTT-Crawler for the format
            - drop_response: whether to drop reply articles ("Re:"/"Fw:")
            - negative_tag: tags to filter out (defaults to self.stoptags)
            - no_content: whether to drop the article body
            - min_length: keep only titles longer than min_length
        Return:
            - clean_article: a list of the articles that meet the requirements
        """
        if negative_tag is None:
            negative_tag = self.stoptags
        clean_article = []
        for article in articles:
            # -------------- Drop unstructured articles --------------
            try:
                title = article["Title"]
                clean_responses = self.clean_responses(article["Responses"])
                if len(clean_responses) == 0:
                    continue  # skip articles without responses
                article["Responses"] = clean_responses
            except Exception:
                # Article does not match the expected format; skip it.
                continue
            # -------------- Article filtering options --------------
            if title in self.titles or len(title) < min_length:
                # Drop titles already in the corpus and titles that are too short.
                continue
            if drop_response:
                # Drop reply and forwarded articles, i.e. "Re:" and "Fw:".
                if title.startswith("Re") or title.startswith("Fw"):
                    continue
            if no_content:
                article.pop("Content", None)
            # -------------- Tag extraction --------------
            tag, clean_title = self.get_tag(title)  # split the tag off the title
            if tag in negative_tag:
                continue
            article["Tag"] = tag
            article["Title"] = clean_title
            self.titles.add(clean_title)
            self.order_titles.append(clean_title)
            self.order_response.append(clean_responses)
            self.article_count += 1
            clean_article.append(article)
        return clean_article
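    # Expected article shape, inferred from the keys accessed above (see the
    # PTT-Crawler project for the authoritative format):
    #   {"Title": "[問卦] ...", "Content": "...",
    #    "Responses": [{"User": "...", "Content": "...", "Vote": "推"}, ...]}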
    def clean_responses(self, responses, negative_user=None, min_length=6, stopwords=None):
        """
        Filter out bad responses based on blacklisted users, response
        length, and the presence of stopwords.
        Args:
            - responses: list of response dicts
            - negative_user: set of users whose responses are dropped
            - min_length: drop responses shorter than min_length
            - stopwords: drop responses containing any of these words
        Return:
            - clean_responses: the responses with the bad ones removed
        """
        if negative_user is None:
            negative_user = set()
        if stopwords is None:
            stopwords = self.stopwords
        clean_responses = []
        for response in responses:
            # self._update_users_history(response)  # update the user's vote history
            drop = False
            # Drop responses that are too short or come from blacklisted users.
            if response["User"] in negative_user or len(response["Content"]) < min_length:
                drop = True
            else:
                # Drop responses that contain a stopword.
                for w in stopwords:
                    if w in response["Content"]:
                        drop = True
                        break
            if not drop:
                response["Content"] = response["Content"].strip()
                clean_responses.append(response)
        return clean_responses
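    # Example with hypothetical data: a response {"User": "foo", "Content": "幫補血"}
    # is dropped for length (3 < min_length), and would also match the stopword
    # list if "幫補血" appears in data/stopwords/ptt_words.txt.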
    def _update_users_history(self, response):
        """
        Record the user's counts of 推/噓/箭頭 (upvote/downvote/neutral arrow).
        """
        user = response["User"]
        if user not in self.users_info:
            self.users_info[user] = {
                "推": 0,
                "噓": 0,
                "箭頭": 0
            }
        if response["Vote"] == "推":
            self.users_info[user]["推"] += 1
        elif response["Vote"] == "噓":
            self.users_info[user]["噓"] += 1
        else:
            self.users_info[user]["箭頭"] += 1
    def get_tag(self, title):
        """
        Return the article's tag and the cleaned title.
        """
        try:
            tag, title = title.split("]", 1)
        except ValueError:
            # No "]" in the title, so there is no tag to split off.
            return None, title
        title = title.lstrip()
        return tag[1:], title
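    # Example: get_tag("[問卦] 有沒有八卦") -> ("問卦", "有沒有八卦"),
    # while an untagged title yields (None, title) unchanged.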
    def print_titles(self):
        logging.info("Writing article titles")
        with open('data/Titles.txt', 'w', encoding='utf-8') as op:
            for title in self.order_titles:
                op.write(title + "\n")
        logging.info("Finished writing article titles")
    def print_user_info(self):
        with open('data/User_info.txt', 'w', encoding='utf-8') as op:
            op.write(json.dumps(self.users_info, indent=4, ensure_ascii=False))
    def print_response(self):
        logging.info("Writing responses")
        res_split = []
        sc = 0
        for response in self.order_response:
            sc += 1
            res_split.append(response)
            if sc % 1000 == 0:
                with open('data/processed/reply/' + str(sc // 1000 - 1) + '.json', 'w', encoding='utf-8') as tr:
                    tr.write(json.dumps(res_split, indent=4, ensure_ascii=False))
                res_split = []
                logging.info("Wrote %d responses" % sc)
        if res_split:
            # Flush the final batch of fewer than 1000 responses, which the
            # loop above would otherwise silently drop.
            with open('data/processed/reply/' + str(sc // 1000) + '.json', 'w', encoding='utf-8') as tr:
                tr.write(json.dumps(res_split, indent=4, ensure_ascii=False))
        logging.info("Finished writing responses")
if __name__ == '__main__':
    main()
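
# Minimal usage sketch (paths mirror those hard-coded above; the data/
# directories must exist beforehand):
#
#   article_filter = ArticleFilter()
#   article_filter.process_raw_data("data/raw/", is_dir=True)  # clean raw pages
#   article_filter.merge_corpus()       # -> data/processed/all_corpus.json
#   article_filter.print_titles()       # -> data/Titles.txt
#   article_filter.print_response()     # -> data/processed/reply/*.json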