disambig_basic.py
import typing
import pywikibot
import re
import opencc

s2t = opencc.OpenCC("s2t.json")  # simplified -> traditional Chinese converter
t2s = opencc.OpenCC("t2s.json")  # traditional -> simplified Chinese converter


class NoneProcess:
    """Synchronous stand-in for a worker/process pool: every task runs inline."""
    def start(self):
        pass

    def wait(self):
        pass

    def add(self, func: typing.Callable, *args, **kwargs):
        return func(*args, **kwargs)

    def print(self, *args, **kwargs):
        return print(*args, **kwargs)

    def action(self, *args, **kwargs):
        return disambig_linkshere_action(*args, **kwargs)

    def gen_redo(self, *args, **kwargs):
        return []

    def no_redo(self, *args, **kwargs):
        return True
def bot_save(page: pywikibot.Page, summary: str = "") -> None:
    if summary:
        summary += "。"
    # Use an interwiki prefix for the operator's talk page when not running on the 'zh' site.
    prefix = '' if page.site.code == 'zh' else 'zhmoe:'
    # Appended boilerplate: "This edit was made by a bot; if it is wrong, please
    # revert or correct it and contact the operator."
    summary += "本次编辑由机器人进行,如修改有误,请撤销或更正,并[[" + prefix + "User_talk:C8H17OH|联系操作者]]。"
    page.save(summary=summary, asynchronous=True, watch="nochange", minor=True, botflag=True, tags={"Bot"})


def short_url(page: pywikibot.Page) -> str:
    return page.site.base_url(page.site.articlepath.format('_?curid=' + str(page.pageid)))
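# Hypothetical usage of short_url (the exact URL form depends on the wiki's articlepath):
#   site = pywikibot.Site("zh", "wikipedia")
#   page = pywikibot.Page(site, "首页")
#   short_url(page)  # -> e.g. "https://zh.wikipedia.org/wiki/_?curid=..."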
def link_preproc(link: str) -> str:
    link = link.strip()  # strip leading/trailing whitespace
    link = re.escape(link)  # escape regex metacharacters
    link = re.sub(r"(?:\\ |_)", r"(?:\\ |_)", link)  # treat spaces and underscores as interchangeable
    if link[0].lower() != link[0].upper():
        link = r"[" + link[0].lower() + link[0].upper() + r"]" + link[1:]  # first letter case-insensitive
    ret = ""
    for char in link:
        if s2t.convert(char) != t2s.convert(char):
            ret += r"[" + s2t.convert(char) + t2s.convert(char) + r"]"  # simplified/traditional interchangeable
        else:
            ret += char
    return ret
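

# Minimal sanity check for link_preproc (assumes the OpenCC "s2t.json"/"t2s.json"
# configs loaded above): the generated pattern should accept space/underscore
# and simplified/traditional variants of the same title.
def _link_preproc_example() -> None:
    pattern = link_preproc("东方 Project")
    assert re.search(pattern, "东方 Project") is not None
    assert re.search(pattern, "東方_Project") is not None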
def template_and_redirects_regex(page: pywikibot.Page) -> str:
    return link_preproc(page.title()) + ''.join(('|' + link_preproc(redirect.title(with_ns=False))) for redirect in page.backlinks(filter_redirects=True))
def find_link(text: str, link: str) -> bool:
    pattern = r"\[\[[\ _]*" + link_preproc(link) + r"[\ _]*(\#[^\[\]]*?[\ _]*)?(\|.*?[\ _]*)?[\ _]*\]\]"
    return re.search(pattern, text) is not None
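

# Small illustration: find_link tolerates section anchors and captions,
# but does not match titles that merely share a prefix.
def _find_link_example() -> None:
    assert find_link("see [[Foo#History|foo]]", "Foo")
    assert not find_link("see [[Foobar]]", "Foo")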
def replace_link(text: str, oldlink: str, newlink: str, keep_no_caption: bool = False) -> str:
    oldlink = link_preproc(oldlink)
    # replace "[[oldlink|newlink]]" with "[[newlink]]"
    pattern = r"\[\[[ _]*" + oldlink + r"[ _]*\|[ _]*" + link_preproc(newlink) + r"[ _]*\]\]"
    repl = r"[[" + newlink + r"]]"
    text = re.sub(pattern, repl, text)
    # replace "[[oldlink#section|caption]]" with "[[newlink#section|caption]]"
    pattern = r"\[\[[ _]*" + oldlink + r"[ _]*(\#[^\[\]]*?[ _]*)?(\|[^\[\]]*?[ _]*)[ _]*\]\]"
    repl = r"[[" + newlink + r"\1\2]]"  # \1 is "#section", \2 is "|caption"
    text = re.sub(pattern, repl, text)
    # if keep_no_caption, replace "[[oldlink#section]]" with "[[newlink#section]]";
    # otherwise keep the old full title as the caption, i.e. replace
    # "[[oldlink#section]]" with "[[newlink#section|oldlink#section]]"
    pattern = r"\[\[[ _]*(" + oldlink + r"[ _]*(\#[^\[\]]*?)?)[ _]*\]\]"
    if keep_no_caption:
        repl = r"[[" + newlink + r"\2]]"  # \2 is "#section"
    else:
        repl = r"[[" + newlink + r"\2|\1]]"  # \1 is "oldlink#section", \2 is "#section"
    text = re.sub(pattern, repl, text)
    # replace "[[File:...|link=<oldlink>]]" (also "[[Image:...]]") with "[[File:...|link=<newlink>]]"
    pattern = r"\[\[[ _]*((?:[Ff][Ii][Ll][Ee]|[Ii][Mm][Aa][Gg][Ee])[ _]*:.*?\| *link=)[ _]*" + oldlink + r"[ _]*(\|.*?)?\]\]"
    repl = r"[[\g<1>" + newlink + r"\2]]"
    text = re.sub(pattern, repl, text)
    return text
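

# Quick check of the replacement cases handled above (plain ASCII titles only,
# so no simplified/traditional or space/underscore variants are involved).
def _replace_link_example() -> None:
    assert replace_link("[[Foo|Bar]]", "Foo", "Bar") == "[[Bar]]"
    assert replace_link("[[Foo#X|c]]", "Foo", "Bar") == "[[Bar#X|c]]"
    assert replace_link("[[Foo]]", "Foo", "Bar") == "[[Bar|Foo]]"
    assert replace_link("[[Foo]]", "Foo", "Bar", keep_no_caption=True) == "[[Bar]]"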
def remove_link(text: str, oldlink: str) -> str:
    pattern = r"\[\[[ _]*" + link_preproc(oldlink) + r"[ _]*(\#[^\[\]]*?[ _]*)?(\|[^\[\]]*?[ _]*)?[ _]*\]\]"
    return re.sub(pattern, "", text)
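

# Example: remove_link drops the whole link markup, caption included
# (surrounding whitespace in the text is left untouched).
def _remove_link_example() -> None:
    assert remove_link("a [[Foo|f]] b", "Foo") == "a  b"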
# def original(s):
#     return s
# def capitalize(s):
#     return s.capitalize()
# def minusculize(s):
#     return s[0].lower() + s[1:]
# def text_preproc_func(func, targ, tval, *args, **kwargs):
#     kwargs[targ] = tval
#     ret = func(*args, **kwargs)
#     for convert in (s2t.convert, t2s.convert):
#         for initial in (capitalize, minusculize):
#             proced = convert(initial(tval))
#             if proced != tval:
#                 kwargs[targ] = proced
#                 ret = ret or func(*args, **kwargs)
#     return ret
# def find_link(text, link):
#     ret = find_link_once(text, link)
#     for convert in (s2t.convert, t2s.convert):
#         for initial in (capitalize, minusculize):
#             proced = convert(initial(link))
#             if proced != link:
#                 ret = ret or find_link_once(text, proced)
#     return ret
# def replace_link(text, oldlink, newlink):
#     text = replace_link_once(text, oldlink, newlink)
#     for convert in (s2t.convert, t2s.convert):
#         for initial in (capitalize, minusculize):
#             proced = convert(initial(oldlink))
#             if proced != oldlink:
#                 text = replace_link_once(text, proced, newlink)
#     return text
def find_word(text: str, word: str) -> bool:
    return re.search(link_preproc(word), text) is not None
# def find_word(text, word):
#     ret = (text.find(word) >= 0)
#     for convert in (s2t.convert, t2s.convert):
#         for initial in (capitalize, minusculize):
#             proced = convert(initial(word))
#             if proced != word:
#                 ret = ret or (text.find(word) >= 0)
#     return ret
def disambig_linkshere_action(
    disambig: pywikibot.Page,
    autos: typing.List[typing.Tuple[pywikibot.Page, str, pywikibot.Page, typing.Set[str]]],
    manuals: typing.List[typing.Tuple[pywikibot.Page, str, pywikibot.Page, typing.Set[str]]],
    do_edit: bool = False,
    show_manual: bool = False
) -> str:
    passes = list()
    if not do_edit:
        while True:
            print(end="Action? (y[es] / [n]o / [p]ass some / [r]edo / [q]uit): ")
            order = input()
            if not order:
                pass
            elif order[0] == 'y':
                do_edit = True
                break
            elif order[0] == 'n':
                break
            elif order[0] == 'r':
                return "redo"
            elif order[0] == 'q':
                return "quit"
            elif order[0] == 'p':
                passes = order.split()[1:]
                if not passes:
                    print(end="Pass which ones? ")
                    passes = input().split()
                do_edit = True
                passes = [int(i) for i in passes]
                break
    if do_edit:
        index = 0
        for (backlink, redirect_title, article_link, article_relations) in autos:
            index += 1
            if index in passes:
                continue
            backlink.text = replace_link(backlink.get(force=True), redirect_title, article_link)
            # Edit summary: "disambiguation: [[redirect_title]] -> [[article_link]]"
            bot_save(backlink, summary="消歧义:[[" + redirect_title + "]]→[[" + article_link + "]]")
    if show_manual:
        print("====== manuals:", disambig.title(), "======")
        for (backlink, redirect_title, article_link, article_relations) in manuals:
            print(backlink.title(), backlink.full_url())
    return "done" if do_edit else "deny"
def disambig_basic_test():
    # Sample text mixes spaces/underscores and simplified/traditional forms of the same link.
    text = "隨故事发展逐渐展現对[[ I am_大老师_yes_]]的好感,曾在遊樂設施上要求[[I_am 大老師_yes ]]拯救她。"
    link = " I_am_大老师 yes"
    word = "大老師"
    repl = "比企谷八幡"
    print(replace_link(text, link, repl))
    print(find_word(text, word))


if __name__ == "__main__":
    disambig_basic_test()