This repository has been archived by the owner on Oct 3, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathbuild_knowledge.py
146 lines (115 loc) · 4.47 KB
/
build_knowledge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Knowledge graph builder
@author TaoPR (github.com/starcolon)
---
Process the whole crawled dataset (MongoDB)
and build a knowledge graph from it (OrientDB)
"""
import re
import sys
import argparse
from termcolor import colored
from nltk.tokenize.punkt import PunktSentenceTokenizer
from pylib.knowledge.graph import Knowledge
from pylib.text import structure as TextStructure
from pylib.text.pos_tree import PatternCapture
from pylib.knowledge.datasource import MineDB
from pybloom_live import ScalableBloomFilter
# Command-line options accepted by the knowledge graph builder.
arguments = argparse.ArgumentParser()
arguments.add_argument(
    '--verbose',
    dest='verbose',
    action='store_true',
    help='Turn verbose output on.',
)
arguments.add_argument(
    '--start',
    type=int,
    default=0,
    help='Starting index of the crawling record to annotate.',
)
arguments.add_argument(
    '--root',
    type=str,
    default=None,
    help='Supply the OrientDB password for root account.',
)
arguments.add_argument(
    '--limit',
    type=int,
    default=100,
    help='Maximum number of topics we want to import',
)

# Parsed options as a plain dict (keys: verbose, start, root, limit).
# Without an explicit list, argparse reads the options from sys.argv[1:].
args = vars(arguments.parse_args())
def init_crawl_collection():
    """
    Initialise a lazy connection to the crawling record collection.

    Returns the `MineDB` object constructed with
    ('localhost', 'vor', 'crawl') -- presumably the host, database and
    collection names; confirm against `MineDB`.
    """
    return MineDB('localhost','vor','crawl')
def iter_topic(crawl_collection,start):
    """
    Generate one (title, tokens) pair per sentence of each pending wiki record.

    Records are read with `crawl_collection.query` (filter
    `{'downloaded': True}`, skipping the first `start` results).  A record
    is ignored when its 'content' is None or when it already carries an
    'added_to_graph' key.

    For every remaining record, each entry of `content['contents']` is
    split into sentences and every sentence is yielded together with the
    record title, the sentence itself being split on single spaces.

    Once all sentences of a record have been consumed, the record is
    updated with `added_to_graph = True`.  Progress is printed when the
    module-level `args['verbose']` flag is set.
    """
    # Naive sentence tokeniser, shared by all records
    tokeniser = PunktSentenceTokenizer()

    done = 0
    records = crawl_collection.query({'downloaded': True},field=None,skip=start)

    for wiki in records:
        # Nothing to do for empty records or records already in the graph
        if wiki['content'] is None or 'added_to_graph' in wiki:
            continue

        content = wiki['content']
        sentence_count = 0

        if args['verbose']:
            print(colored('[Extracting wiki] : ','cyan'), content['title'])

        # A wiki page may hold several pieces of content;
        # each one is exploded into its sentences
        for section in content['contents']:
            for sentence in tokeniser.sentences_from_text(section):
                sentence_count += 1
                yield (content['title'], sentence.split(' '))

        # Every sentence has been handed out:
        # flag the current wiki record as 'processed'
        crawl_collection.update(
            {'_id': wiki['_id']},
            {'$set':{'added_to_graph':True}},
        )

        done += 1
        if args['verbose']:
            print(content['title'] + " processed with {0} nodes.".format(sentence_count))
            print(colored("{0} wiki documents processed so far...".format(done),'blue'))
def ensure_viable(ns,stopwords):
    """
    Remove stopwords & ensure text encoder.

    Each node string in `ns` is cleaned as follows:

    1. whitespace-delimited words that are stopwords are dropped
       (whole words only, compared case-insensitively after stripping
       symbols, so "other" is untouched by the stopword "the");
    2. every ASCII character that is not a letter or a digit is removed
       (this includes spaces; non-ASCII characters are kept);
    3. surrounding whitespace is stripped.

    Args:
        ns: iterable of node strings.
        stopwords: iterable of stopword strings.  Entries may carry
            surrounding whitespace or a trailing newline (e.g. lines read
            straight from a file); blank entries are ignored.

    Returns:
        A list of the distinct cleaned nodes longer than two characters,
        in order of first appearance.
    """
    # ASCII control characters, space, punctuation and symbols
    symbols = re.compile(r"[\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+")

    def normalise(word):
        # Comparison key shared by stopwords and node words
        return symbols.sub("", word).strip().lower()

    stopset = {normalise(s) for s in stopwords}
    # A blank stopword must never match the whitespace separators below
    stopset.discard("")

    def clean(a):
        # Split on whitespace but keep the separators, so that a node
        # holding no stopword is rebuilt exactly as it came in
        pieces = re.split(r"(\s+)", a)
        kept = "".join(p for p in pieces if normalise(p) not in stopset)
        # Strip non-alphanumeric symbols (unicode symbols reserved)
        # TAOTODO: Clean up unicode of [a]
        return symbols.sub("", kept).strip()

    # dict.fromkeys de-duplicates while keeping first-seen order
    unique = dict.fromkeys(clean(n) for n in ns)
    return [n for n in unique if len(n) > 2]
if __name__ == '__main__':
    # Script entry point: read sentences from the crawl collection, turn
    # each one into knowledge nodes and link them to their topic in the
    # knowledge graph, stopping once more than --limit topics were seen.

    # Initialise a knowledge database
    print(colored('Initialising knowledge graph database...','cyan'))
    kb = Knowledge('localhost','vor','root',args['root'])
    # NOTE(review): presumably this wipes the existing graph on every run,
    # while crawl records flagged 'added_to_graph' by an earlier run are
    # still skipped by iter_topic -- confirm this is intended.
    kb.clear()

    # Load existing pos patterns
    print(colored('Loading POS patterns...','cyan'))
    patterns = PatternCapture()
    patterns.load('./pos-patterns')

    # Load list of stopwords
    print(colored('Loading stopwords...','cyan'))
    with open('./pos-stopwords') as f:
        # One stopword per line.  Line terminators are stripped (iterating
        # or readlines() keeps them, so no entry could ever match a word)
        # and blank lines are ignored.
        stopwords = [w for w in (line.strip() for line in f) if w]

    # Initialise a crawling dataset connection
    print(colored('Initialising wikipedia crawling collection...','cyan'))
    crawl_collection = init_crawl_collection()

    # Iterate through the crawling database
    n = 0
    print(colored('Iterating over crawling database...','cyan'))
    # Bloom filter remembering the topics seen so far
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for topic,sentence in iter_topic(crawl_collection,args['start']):
        # Clean topic string
        topic = topic.replace("'",'').replace('\n','')

        # Check if the number of processed topic exceed the limit?
        if topic not in bf:
            bf.add(topic)
            if len(bf) > args['limit']:
                print(colored('[Topics limit reached] ... BYE','cyan'))
                sys.exit(0)

        # Break the sentence into knowledge nodes
        pos = TextStructure.pos_tag(sentence)
        kb_nodes = patterns.capture(pos)

        # Clean up each of the nodes
        #   a) Remove stopwords
        #   b) Remove duplicates
        #   c) Ensure supported encoding
        kb_nodes = ensure_viable(kb_nodes, stopwords)

        if args['verbose']:
            print(kb_nodes)

        # Create a set of knowledge links
        kb.add(topic,kb_nodes,args['verbose'])

        # n is incremented once per sentence yielded by iter_topic
        n += 1
        if n % 100 == 0:
            print('... {} topics done so far.'.format(n))