forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 2
/
split_full.py
executable file
·58 lines (43 loc) · 1.24 KB
/
split_full.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
import sys
import re
sys.path.append('data')
import osis_tran
# Name of the module handed to osis_tran.load_osis_module(); its keys become
# the label at the start of every output line.
SRC_MOD = "2TGreek"

# Line-number span of each language in the numbered input file.
# Both endpoints are inclusive.
RANGES = dict(
    en=(0, 22874),
    fi=(22875, 24405),
    de=(24406, 25935),
)
# A single space directly in front of closing punctuation; the punctuation
# mark is captured so the substitution can keep it and drop the space.
RM_SPACE = re.compile(r" ([?.,!;':])")
# Escaped apostrophe entity together with the spaces on both sides of it.
# NOTE(review): presumably produced by Moses-style tokenizer escaping -- confirm.
APOS = re.compile(r' &apos; ')
# Escaped double-quote entity (surrounding spaces are not part of the match).
QUOT = re.compile(r'&quot;')
def postprocess(s):
    """Return *s* with the module-level clean-up patterns applied in order.

    Every APOS match becomes a bare apostrophe, the space that RM_SPACE finds
    in front of a punctuation mark is removed, and every QUOT match becomes a
    plain double quote.
    """
    with_apostrophes = APOS.sub("'", s)
    tightened = RM_SPACE.sub(r'\1', with_apostrophes)
    # The spacing around a double quote is left as-is: there is no way to
    # tell here whether it opens or closes, so it cannot be attached to
    # either neighbour.
    return QUOT.sub('"', tightened)
def main():
    """Split a numbered text file into one key-labelled file per language.

    The input path is the first command-line argument. Every non-blank line
    must have the form ``<line number><TAB><text>``. Lines are sorted by
    number; each line whose number lies inside one of the inclusive RANGES is
    written to ``<path>.<lang>`` as: the key of the SRC_MOD module at the
    line's offset within that range (left-justified to the longest key), one
    space, then the post-processed text. Lines outside every range are
    dropped.

    Raises:
        IndexError: no path was given on the command line, or a line's offset
            within its range is beyond the module's last key.
        ValueError: a non-blank line has no tab, or its number is not an
            integer.
    """
    fname = sys.argv[1]
    mod = osis_tran.load_osis_module(SRC_MOD)
    keys = list(mod.keys())
    key_len = max(len(x) for x in keys)

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(fname, encoding='utf-8') as inf:
        raw_lines = inf.read().splitlines()

    # maxsplit=1 keeps any further tabs as part of the text instead of
    # breaking the two-way unpack; blank lines (e.g. a trailing one) are
    # skipped rather than crashing the unpack.
    rows = [x.split('\t', 1) for x in raw_lines if x.strip()]
    lines = sorted((int(num), txt) for num, txt in rows)

    fps = {}
    try:
        for lang in RANGES:
            fps[lang] = open(fname + '.' + lang, 'w', encoding='utf-8')
        for lineno, text in lines:
            for lang, (start, end) in RANGES.items():
                if start <= lineno <= end:
                    # Every range indexes the key list from its own start,
                    # so the offset inside the range picks the key.
                    k = keys[lineno - start].ljust(key_len)
                    print('{} {}'.format(k, postprocess(text)), file=fps[lang])
    finally:
        # Close whatever was opened, even if a later open or a write failed.
        for fp in fps.values():
            fp.close()
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()