This repository has been archived by the owner on Apr 24, 2020. It is now read-only.
forked from faraday/wikiprep-esa
-
Notifications
You must be signed in to change notification settings - Fork 2
/
addAnchors.py
133 lines (110 loc) · 4.68 KB
/
addAnchors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/python
'''
Copyright (C) 2010 Cagatay Calli <[email protected]>
Adds anchors from Wikiprep output to target Wikipedia articles.
Legacy input format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>
Input format: <Target page id> <Source page id> <Anchor location within text> <Anchor text (up to the end of the line)>
Output format: <Target page id> <Anchor text>
USAGE: addAnchors.py <anchor files from Wikiprep> <any writeable folder> --format=<Wikiprep dump format>
The folder is used by the script to create data files that are loaded into the database.
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then pass --format=zemanta-legacy or --format=zemanta-modern.
'''
import os
import sys
import MySQLdb
from optparse import OptionParser
from subprocess import Popen, PIPE
# Wikiprep dump format enum
# formats: 1) Gabrilovich 2) Zemanta-legacy 3) Zemanta-modern
F_GABRI = 0  # gabrilovich
F_ZLEGACY = 1  # zemanta legacy
F_ZMODERN = 2  # zemanta modern

# Number of anchor rows written to one data file / loaded into one temp table.
PARTITION_SIZE = 2000000

# Shared by the usage text and the "format missing/unknown" messages.
FORMAT_LIST = """Wikiprep dump formats:
1. Gabrilovich [gl, gabrilovich]
2. Zemanta legacy [zl, legacy, zemanta-legacy]
3. Zemanta modern [zm, modern, zemanta-modern]
"""

usage = """
USAGE: addAnchors.py <anchor files from Wikiprep> <any writeable folder> --format=<Wikiprep dump format>
""" + FORMAT_LIST

# Lower-cased --format aliases -> format constant.
_FORMAT_ALIASES = {
    'gl': F_GABRI,
    'gabrilovich': F_GABRI,
    'zl': F_ZLEGACY,
    'legacy': F_ZLEGACY,
    'zemanta-legacy': F_ZLEGACY,
    'zm': F_ZMODERN,
    'modern': F_ZMODERN,
    'zemanta-modern': F_ZMODERN,
}


def resolve_format(name):
    """Translate a --format value into one of the F_* constants.

    Matching is case-insensitive, so every spelling the script accepted
    before ('zm', 'Zemanta-Modern', 'Gabrilovich', ...) is still accepted.
    Returns None when the value is empty or not a known alias.
    """
    if not name:
        return None
    return _FORMAT_ALIASES.get(name.lower())


def anchor_field_index(fmt):
    """Return the tab-separated column that holds the anchor text.

    The Gabrilovich (legacy) layout is <target> <source> <anchor>; the
    Zemanta layouts insert an <anchor location> column before the text.
    """
    if fmt == F_GABRI:
        return 2
    return 3


def parse_anchor_line(line, field_pos):
    """Extract (target page id, anchor text) from one raw dump line.

    line is a bytes object as read from the decompressor pipe. Returns None
    for lines that should be skipped: too few columns (previously an
    IndexError), an empty anchor, or an empty target id.
    """
    fields = line.split(b'\t')
    if len(fields) <= field_pos:
        return None
    anc = fields[field_pos].strip()
    name = fields[0].strip()
    if not anc or not name:
        return None
    return name, anc


def read_anchor_rows(fname, field_pos):
    """Yield (target id, anchor) pairs from one compressed anchor file.

    The file is decompressed by an external `pigz -d -c` process. The pipe
    is always closed and the child reaped, and a warning is written to
    stderr if pigz reports a non-zero exit status.
    """
    proc = Popen(['pigz', '-d', '-c', fname], stdout=PIPE)
    stream = proc.stdout
    try:
        # The first three lines are discarded (presumably a header; the
        # original comment here was "skip header?").
        for _ in range(3):
            stream.readline()
        for line in stream:
            row = parse_anchor_line(line, field_pos)
            if row is not None:
                yield row
    finally:
        stream.close()
        status = proc.wait()
    if status != 0:
        sys.stderr.write("WARNING: pigz exited with status %d while reading %s; "
                         "anchors from this file may be incomplete\n" % (status, fname))


def iter_all_rows(fnames, field_pos):
    """Chain the rows of every input file, announcing each file on stderr."""
    for fname in fnames:
        sys.stderr.write(" -> Processing file %s\n" % fname)
        for row in read_anchor_rows(fname, field_pos):
            yield row


def write_partitions(rows, out_prefix, partition_size=PARTITION_SIZE):
    """Write rows as "<target>\\t<anchor>\\n" lines into numbered files.

    Files are named out_prefix + '0', out_prefix + '1', ... and a new file
    is started after every partition_size rows. File 0 is always created,
    and a trailing empty file may exist when the row count is an exact
    multiple of partition_size. Returns the number of files created.
    """
    index = 0
    count = 0
    out = open(out_prefix + '0', 'wb')
    try:
        for name, anc in rows:
            out.write(name + b'\t' + anc + b'\n')
            count += 1
            if count >= partition_size:
                count = 0
                index += 1
                out.close()
                out = open(out_prefix + str(index), 'wb')
    finally:
        out.close()
    return index + 1


def load_partitions(conn, out_prefix, partition_count):
    """Load every partition file into MySQL and prepend anchors to articles.

    For each partition: bulk-load the file into a scratch table zanchorN,
    collapse it to one row per target page in anchorListN, prepend that text
    to text.old_text of the matching page, then drop both scratch tables.
    """
    cursor = conn.cursor()
    try:
        for i in range(partition_count):
            si = str(i)
            cursor.execute("DROP TABLE IF EXISTS zanchor" + si)
            cursor.execute("CREATE TABLE zanchor" + si + " (target_id int(10) unsigned, anchor blob)")
            # The path is passed as a parameter so the driver quotes/escapes it.
            cursor.execute("LOAD DATA LOCAL INFILE %s INTO TABLE zanchor" + si, (out_prefix + si,))
            cursor.execute("CREATE INDEX idx_target_id ON zanchor" + si + " (target_id);")
            cursor.execute("DROP TABLE IF EXISTS anchorList" + si)
            cursor.execute("CREATE TABLE anchorList" + si + " (target_id int(10) unsigned, anchor_text mediumblob)")
            cursor.execute("INSERT anchorList" + si + " SELECT a.target_id,GROUP_CONCAT(a.anchor SEPARATOR ' \n ') AS anchor_text FROM zanchor" + si + " a WHERE a.anchor IS NOT NULL GROUP BY a.target_id")
            cursor.execute("DROP TABLE zanchor" + si)
            # add anchors after creating each partition
            cursor.execute("CREATE INDEX idx_target_id ON anchorList" + si + " (target_id);")
            cursor.execute("UPDATE text t, anchorList" + si + " a SET t.old_text = CONCAT(a.anchor_text,' \n',t.old_text) WHERE t.old_id = a.target_id AND a.anchor_text IS NOT NULL;")
            cursor.execute("DROP TABLE anchorList" + si)
        # DROP TABLE triggers an implicit commit in MySQL; commit explicitly
        # so the UPDATEs never depend on that side effect.
        conn.commit()
    finally:
        cursor.close()


def main(argv=None):
    """Command-line entry point.

    usage python addAnchors.py enwiki-latest-pages-articles.anchor_text.000* out_dir --format=zm

    argv defaults to sys.argv[1:]. Exits with status 2 on a usage error
    (missing arguments, missing or unknown --format) and status 1 on a
    MySQL error.
    """
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--format", dest="_format",
                      help="Wikiprep dump format (gl for Gabrilovich, zl for Zemanta-legacy, zm for Zemanta-modern)",
                      metavar="FORMAT")
    (options, args) = parser.parse_args(argv)

    if len(args) < 2:
        print(usage)
        sys.exit(2)

    if not options._format:
        print("\nWikiprep dump format not specified! "
              "Please select one from below with --format option:\n" + FORMAT_LIST)
        sys.exit(2)

    fmt = resolve_format(options._format)
    if fmt is None:
        # Previously this fell through and crashed with a NameError on FORMAT.
        print("\nUnknown Wikiprep dump format '%s'! "
              "Please select one from below with --format option:\n" % options._format + FORMAT_LIST)
        sys.exit(2)

    field_pos = anchor_field_index(fmt)

    # The last positional argument is the output folder; all others are inputs.
    out_prefix = os.path.join(args[-1], 'zanchor')
    partition_count = write_partitions(iter_all_rows(args[:-1], field_pos), out_prefix)

    # NOTE(review): connection parameters are hard-coded, including the
    # password; consider moving them to options or a config file.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456', db='wiki', charset="utf8", use_unicode=True)
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)

    try:
        try:
            load_partitions(conn, out_prefix, partition_count)
        finally:
            conn.close()
    except MySQLdb.Error as e:
        print("Error: %s" % e)
        sys.exit(1)


if __name__ == "__main__":
    main()