forked from TurkuNLP/Finnish-dep-parser
-
Notifications
You must be signed in to change notification settings - Fork 1
/
txt_to_09.py
35 lines (30 loc) · 1.12 KB
/
txt_to_09.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Convert whitespace-tokenized text (one sentence per line on stdin) into
# CoNLL-09-style rows on stdout: one token per row, only the index and the
# surface form filled in, every remaining column set to "_".
#
# Optional "-d FILE" names a JSON file mapping hash strings to comment lines.
# An input line whose tokens, joined without spaces, equal one of those hashes
# is replaced by the stored comment instead of being emitted as a sentence.
#
# The code runs unchanged on Python 2 and Python 3: bytes are decoded/encoded
# as UTF-8 exactly once, at the stdin/stdout boundary.
import sys
import json
import os.path

try:
    import argparse
except ImportError:
    # Interpreters that do not ship argparse use the copy bundled in compat/.
    import compat.argparse as argparse

# One output row: token index, surface form, then twelve unfilled columns.
ROW_TEMPLATE = u"%d\t%s\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_"

# Input lines starting with one of these markers are dropped.
SKIP_PREFIXES = (u"###START", u"###END")


def load_hashes(path):
    """Return the hash -> comment mapping stored as JSON in *path*.

    Returns None when *path* is empty/None or does not name an existing file,
    which disables comment restoration.
    """
    if not path or not os.path.isfile(path):
        return None
    # Read bytes and decode explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, "rb") as f:
        return json.loads(f.read().decode("utf-8"))


def convert_lines(lines, hashes=None):
    """Yield output lines (text, without trailing newline) for *lines*.

    lines  -- iterable of already-decoded text lines, one sentence per line.
    hashes -- optional mapping from hash string to the comment it stands for.

    Blank lines and ###START / ###END marker lines are skipped.  Sentences are
    separated by one empty line; no empty line is produced before the first
    emitted item, nor directly after a restored comment.
    """
    emitted_any = False    # has anything been yielded so far?
    after_comment = False  # was the previously emitted item a comment?
    for line in lines:
        line = line.rstrip()
        if not line or line.startswith(SKIP_PREFIXES):
            continue
        tokens = line.split()
        # Separator decision is based on what was actually emitted, not on the
        # raw input line index, so skipped leading lines cannot cause a
        # spurious blank line at the top of the output.
        if emitted_any and not after_comment:
            yield u""
        emitted_any = True
        # Join the tokens before the lookup: do not trust the tokenizer to
        # leave a hash unsplit.
        key = u"".join(tokens)
        if hashes and key in hashes:
            yield hashes[key]
            after_comment = True
            continue
        for position, token in enumerate(tokens, 1):
            yield ROW_TEMPLATE % (position, token)
        after_comment = False


def main():
    """Parse the command line and stream stdin to stdout."""
    parser = argparse.ArgumentParser(description='Options')
    parser.add_argument('-d', help='Where to read the comments?')
    args = parser.parse_args()

    hashes = load_hashes(args.d)

    # Python 3 exposes the byte streams as .buffer; on Python 2 the standard
    # streams already carry bytes.
    byte_in = getattr(sys.stdin, "buffer", sys.stdin)
    byte_out = getattr(sys.stdout, "buffer", sys.stdout)

    decoded = (raw.decode("utf-8") for raw in byte_in)
    for out_line in convert_lines(decoded, hashes):
        byte_out.write(out_line.encode("utf-8") + b"\n")


if __name__ == "__main__":
    main()