-
Notifications
You must be signed in to change notification settings - Fork 0
/
list_disambig_articles.py
126 lines (109 loc) · 4.32 KB
/
list_disambig_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import sys
import typing
import pywikibot
import re
# from disambig_linkshere import disambig_linkshere
from disambig_task_process import TaskProcess
from disambig_basic import NoneProcess
class Link:
    """A wiki link split into prefix, core, suffix, section and caption parts.

    The parts correspond to ``[[prefix:core(suffix)#section|caption]]``.
    A part that is absent is expected to be an empty (falsy) string.

    The parts are deliberately stored as plain instance attributes (no
    ``__slots__``): ``list_disambig_articles`` builds its result dicts from
    the instance dictionary of each ``Link``.
    """

    def __init__(self, link_tuple: typing.Tuple[str, str, str, str, str]):
        """Unpack ``(prefix, core, suffix, section, caption)`` into attributes.

        Raises:
            ValueError: if ``link_tuple`` does not hold exactly five items.
        """
        (prefix, core, suffix, section, caption) = link_tuple
        self.prefix = prefix
        self.core = core
        self.suffix = suffix
        self.section = section
        self.caption = caption

    @property
    def title(self) -> str:
        """Return ``prefix:core(suffix)``, leaving out an empty prefix/suffix."""
        ret = f"{self.prefix}:" if self.prefix else ""
        ret += self.core
        if self.suffix:
            ret += f"({self.suffix})"
        return ret

    @property
    def link(self) -> str:
        """Return the title followed by ``#section`` when a section is set."""
        if self.section:
            return f"{self.title}#{self.section}"
        return self.title

    @property
    def showed_caption(self) -> str:
        """Return the caption, falling back to ``link`` when it is empty."""
        return self.caption if self.caption else self.link

    def __str__(self) -> str:
        """Return the link target (``link``); the caption is not included."""
        return self.link

    def __repr__(self) -> str:
        """Return the string form wrapped in single quotes."""
        return f"'{self}'"
def clean_zero_width_spaces(text: str) -> str:
    """Return ``text`` with every U+200E (LEFT-TO-RIGHT MARK) removed.

    NOTE(review): despite the name, U+200E is the only character stripped;
    other invisible characters (e.g. U+200B) are left untouched.
    """
    # Mapping a code point to None makes str.translate delete it.
    return text.translate({0x200E: None})
# Regex fragment for the inside of a link: "prefix:core(suffix)#section|caption".
# It defines five capture groups, in this order: prefix, core, suffix, section,
# caption. Every part except the core sits inside an optional group.
LINK_PATTERN = (
    r"(?:[\ _]*([^\[\]\|]*?)\:)?"        # 1: prefix, terminated by ":"
    r"([^\[\]\|]*?)"                     # 2: core
    r"(?:\(([^\[\]\|]*?)\))?"            # 3: suffix, wrapped in "(...)"
    r"(?:\#([^\[\]\|]*?)[\ _]*)?"        # 4: section, introduced by "#"
    r"(?:\|[\ _]*([^\[\]]*?)[\ _]*)?"    # 5: caption, introduced by "|"
)
def findlinks(text: str) -> typing.List[Link]:
    """Return a ``Link`` for every link-like match found in ``text``.

    Each match yields the five groups of ``LINK_PATTERN``
    ``(prefix, core, suffix, section, caption)``; every group is passed
    through ``clean_zero_width_spaces`` before the ``Link`` is built.

    Forms the pattern is written for:
        "[[prefix:core(suffix)#section|caption]]"
        "{{coloredlink|color|prefix:core(suffix)#section|caption}}"
        "{{dl|prefix:core(suffix)#section}}"

    NOTE(review): the pattern always ends with a literal ``]]``, so the
    ``{{...}}`` template forms above can only match when the link text is
    followed by ``]]`` -- confirm against real page markup.
    """
    # Opening delimiter: "[[" or the head of a coloredlink/dl template.
    opening = r"(?:\[\[|\{\{[\ _]*(?:coloredlink\|.*?|dl)[\ _]*\|[^\|]*\|)"
    closing = r"\]\]"
    matches = re.findall(opening + LINK_PATTERN + closing, text)

    links = []
    for parts in matches:
        cleaned = tuple(clean_zero_width_spaces(part) for part in parts)
        links.append(Link(cleaned))
    return links
def list_disambig_articles(
    disambig: pywikibot.Page,
    process: typing.Union[TaskProcess, NoneProcess] = NoneProcess(),
    article_except: typing.Optional[typing.List[str]] = None,
    dropout_multi_articles: bool = False,
    dropout_no_keyword: bool = False
) -> typing.List[typing.Dict[str, typing.Union[str, typing.Set[str]]]]:
    """Collect the articles listed on a disambiguation page with their keywords.

    Every line of ``disambig.text`` is split on ``"————"`` (or, failing that,
    on ``"——"``). Links found in the first part are the articles; links found
    in the second part supply keywords. Lines without a separator or without
    any article link are skipped.

    Args:
        disambig: The page to scan; its ``title()``, ``full_url()``,
            ``categories()`` and ``text`` are used.
        process: Object whose ``print`` method receives progress output.
        article_except: Article titles to leave out of the result. ``None``
            (the default) or an empty list excludes nothing.
        dropout_multi_articles: Skip lines that hold more than one article
            link.
        dropout_no_keyword: Skip articles for which no keyword was found.

    Returns:
        One dict per kept article with the keys ``prefix``, ``core``,
        ``suffix``, ``section``, ``caption``, ``title``, ``link`` and
        ``keywords`` (a set of strings).
    """
    excluded_prefixes = (
        "Template", "模板", "Category", "分类", "User", "用户", "zhwiki")
    articles = []

    process.print("==", disambig.title(), "==")
    process.print(disambig.full_url())
    # Materialise the categories once so the size check and the printout
    # share a single categories() call.
    categories = list(disambig.categories())
    if len(categories) > 2:
        process.print(categories)

    for line in disambig.text.splitlines():
        line_split = line.split("————")
        if len(line_split) < 2:
            line_split = line.split("——")
        if len(line_split) < 2:
            continue

        article_links = findlinks(line_split[0])
        if not article_links \
                or (dropout_multi_articles and len(article_links) > 1):
            continue
        process.print(line)

        # Keywords taken from the right-hand side are identical for every
        # article on this line, so gather them once per line.
        line_keywords = set()
        for keyword_link in findlinks(line_split[1]):
            line_keywords.add(keyword_link.title)
            if keyword_link.caption:
                line_keywords.add(keyword_link.caption)

        for article_link in article_links:
            if article_except and article_link.title in article_except:
                continue

            keywords = set(line_keywords)
            # A prefix outside the excluded list doubles as a keyword;
            # otherwise the parenthesised suffix is used when present.
            if article_link.prefix and article_link.prefix not in excluded_prefixes:
                keywords.add(article_link.prefix)
            elif article_link.suffix:
                keywords.add(article_link.suffix)
            if dropout_no_keyword and not keywords:
                continue

            # Copy the instance attributes rather than aliasing __dict__, so
            # the extra keys below are not written back onto the Link object.
            article = dict(vars(article_link))
            article["title"] = article_link.title
            article["link"] = article_link.link
            article["keywords"] = keywords
            articles.append(article)
    return articles
def list_disambig_articles_main():
    """Run ``list_disambig_articles`` for the page named on the command line.

    The first command-line argument is the page title; ``"Afterglow"`` is
    used when no argument is given. Lines with several article links and
    articles without keywords are dropped.
    """
    cli_args = sys.argv[1:]
    title = cli_args[0] if cli_args else "Afterglow"
    page = pywikibot.Page(pywikibot.Site(), title)
    list_disambig_articles(
        page,
        dropout_multi_articles=True,
        dropout_no_keyword=True,
    )
# Script entry point: run the command-line helper only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    list_disambig_articles_main()