forked from halfak/deltas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo_ply.py
51 lines (42 loc) · 916 Bytes
/
demo_ply.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import time
from ply.lex import lex
from mw import api
# Names of the token types this lexer can emit.  Each name has a matching
# ``t_<name>`` pattern defined in this module.
tokens = (
    "word",
    "number",
    "period",
    "qmark",
    "epoint",
    "comma",
    "colon",
    "scolon",
    "break",
    "whitespace",
    "etc",
)
# Regular-expression patterns, one per entry in ``tokens``.
# NOTE(review): PLY orders string-defined rules by decreasing pattern length
# and ties appear to follow definition order, so keep these assignments in
# this order (in particular, keep the catch-all ``t_etc`` last) -- confirm
# against the PLY version in use before reordering.

# Runs of word characters that are not digits, and runs of digits.
t_word = r"[^\W\d]+"
t_number = r"[\d]+"

# Individual punctuation marks.
t_period = r"\."
t_qmark = r"\?"
t_epoint = r"!"
t_comma = r","
t_colon = r":"
t_scolon = r";"

# A line ending, optional whitespace, then one or more further line endings
# (i.e. at least one blank line); any other whitespace run is plain whitespace.
t_break = r"(\n|\n\r|\r\n)\s*(\n|\n\r|\r\n)+"
t_whitespace = r"[\n\r\s]+"

# Catch-all: any single character (other than a newline) not matched above.
t_etc = r"."
def t_error(t):
    """Report the first unmatched character and skip over it.

    PLY's lexer calls the rule named ``t_error`` when no token pattern
    matches at the current position.  The offending character is printed
    and the lexer is advanced by one character so scanning can continue.

    Args:
        t: The token object supplied by the lexer; ``t.value[0]`` is the
            character that could not be matched and ``t.lexer`` is the
            lexer to advance.
    """
    offending_char = t.value[0]
    print("Illegal character '%s'" % offending_char)
    t.lexer.skip(1)
# Build the lexer from the ``tokens`` tuple and ``t_*`` rules of this module.
lexer = lex()

# Fetch the content of one English Wikipedia revision to use as the
# benchmark input text.
session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']

# Number of complete tokenization passes the reported time is averaged over.
ITERATIONS = 50

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(ITERATIONS):
    lexer.input(common1)
    # Drain the lexer: token() returns None once the input is exhausted.
    while lexer.token() is not None:
        pass
elapsed = time.perf_counter() - start

# Average seconds per full pass over the text.
# NOTE(review): the "text_split" label looks carried over from another
# benchmark; it is kept unchanged so the output stays comparable -- confirm.
print("Tokenizing (text_split):", elapsed / ITERATIONS)