regextokenizer_mod.py (forked from TurkuNLP/Turku-neural-parser-pipeline)
import sys
import io
import argparse
import re

# Sentence boundary: ., ! or ? followed by whitespace and an uppercase
# letter, or by end of input.
sent_regex = re.compile(r"""[.!?](?=\s+[A-Z]|$)""", re.U | re.M)
#token_regex=re.compile(r"""([,;:")\]]|[.'?!-]+)(?=\b)|(?<=\b)(['"(\[]+)""")
# Token boundary: plain whitespace, or punctuation split off from the word
# it touches: trailing ,:;' before whitespace or sentence-final punctuation,
# trailing .?! at end of input, and a leading ' after whitespace or at the
# start of input.
token_regex = re.compile(r"""\s|([,:;']+)(?=[\s]|[.?!]|$)|([.?!]+)(?=$)|(?<=\s)([']+)|(?<=^)([']+)""", re.U | re.M)
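# Because token_regex uses capturing groups, re.split() interleaves the
# matched punctuation with None for the groups that did not participate;
# tokens() below filters those out. A quick illustration (the sample string
# is my own, not from the repo):
#
#   >>> token_regex.split("Yes, please.")
#   ['Yes', ',', None, None, None, '', None, None, None, None,
#    'please', None, '.', None, None, '']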
def launch(args, q_in, q_out):
    # Pipeline worker: read (jobid, raw text) jobs from q_in and emit
    # (jobid, CoNLL-U skeleton) on q_out, with only ID and FORM filled in.
    counter = 0
    while True:
        jobid, txt = q_in.get()
        if jobid == "FINAL":
            # Sentinel job: pass it downstream and stop.
            q_out.put((jobid, txt))
            return
        cache = io.StringIO()
        for sent in sentences(txt):
            print("# sent_id =", counter, file=cache)
            counter += 1
            #print("# text =",sent.replace("\n"," "),file=cache)
            for i, token in enumerate(tokens(sent), start=1):
                # ID, FORM, then eight blank CoNLL-U columns.
                print(i, token, *(["_"] * 8), sep="\t", file=cache)
            print(file=cache)  # blank line terminates the sentence
        q_out.put((jobid, cache.getvalue()))
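# A minimal sketch of driving launch() by hand with a standard-library queue
# (illustrative only: in the real pipeline the queues, the args object and
# the "FINAL" sentinel are managed by the pipeline framework):
#
#   from queue import Queue
#   q_in, q_out = Queue(), Queue()
#   q_in.put((0, "Hello there. How are you?"))
#   q_in.put(("FINAL", ""))
#   launch(None, q_in, q_out)      # args is not used by this module
#   jobid, conllu = q_out.get()    # CoNLL-U skeleton for job 0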
def sentences(s):
    # Split raw text into sentences at the boundaries found by sent_regex.
    sents = []
    prev = 0
    for match in sent_regex.finditer(s):
        sents.append(s[prev:match.end()].strip())
        prev = match.end()
    # Keep whatever trails the last boundary as a final sentence.
    if s[prev:]:
        sents.append(s[prev:].strip())
    return sents
def tokens(sent):
    # Split a sentence on token_regex and drop the empty strings and Nones
    # that re.split() leaves behind for non-participating groups.
    parts = token_regex.split(sent)
    return [p for p in parts if p and p.strip()]
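# Quick sanity checks (sample strings are my own, not from the repo):
#
#   >>> sentences("Hello there. How are you?")
#   ['Hello there.', 'How are you?']
#   >>> tokens("Yes, please.")
#   ['Yes', ',', 'please', '.']
#   >>> tokens("Don't panic!")          # word-internal ' is kept
#   ["Don't", 'panic', '!']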
# Module-level argparser as in the other pipeline modules; this tokenizer
# defines no options beyond the description.
argparser = argparse.ArgumentParser(description='Quick hack regex tokenizer')
if __name__ == "__main__":
    # Standalone mode: read plain text from stdin and print each sentence
    # followed by its space-separated tokens.
    inp = sys.stdin.read()
    for s in sentences(inp.strip()):
        print(s)
        print(*tokens(s))
        print()
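# Example standalone run (expected output derived from the regexes above):
#
#   $ echo "Hello there. How are you?" | python3 regextokenizer_mod.py
#   Hello there.
#   Hello there .
#
#   How are you?
#   How are you ?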