-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_wikinewsmax.py
84 lines (63 loc) · 2.47 KB
/
prepare_wikinewsmax.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import csv
from pathlib import Path
import sys
from typing import Iterable, List, Tuple
# Header names, in output order, of the TSV files this script produces.
OUTPUT_COLUMNS = ('word', 'gold_diac', 'gold_diac_alt')
def gen_entries(path: Path) -> Tuple[List[dict[str, str]], List[dict[str, str]]]:
    """Read a WikiNewsMax TSV data file and build two lists of entries.

    The file is parsed as tab-separated text with quoting disabled, so quote
    characters are ordinary data. Its header must provide the columns
    ``dediac``, ``original_gold`` and ``max_gold``; the ``max_gold_alt``
    column is optional, and so is its cell on any individual row.

    Args:
        path: Location of the UTF-8 encoded WikiNewsMax TSV file.

    Returns:
        A pair ``(orig_entries, max_entries)`` with one entry per data row.
        Every entry is a dict with the keys ``word``, ``gold_diac`` and
        ``gold_diac_alt``, all values stripped of surrounding whitespace.
        The first list uses the original WikiNews gold diacritization and
        always has an empty ``gold_diac_alt``; the second uses the Max gold
        diacritization together with the alternate Max gold diacritization
        (empty when the file does not supply one).

    Raises:
        KeyError: If a required column is absent from the header.
    """
    orig_entries = []
    max_entries = []
    # newline='' leaves line-ending handling to the csv module, which is how
    # its documentation says files passed to a reader should be opened.
    with path.open('r', encoding='utf-8', newline='') as fp:
        reader = csv.DictReader(fp,
                                dialect='excel-tab',
                                quotechar=None,
                                quoting=csv.QUOTE_NONE)
        for row in reader:
            dediac = row['dediac'].strip()
            orig_gold = row['original_gold'].strip()
            max_gold = row['max_gold'].strip()
            # DictReader fills cells missing from a short row with None, so a
            # .get() default alone is not enough: the key exists but maps to
            # None whenever the trailing alt cell is omitted.
            max_gold_alt = (row.get('max_gold_alt') or '').strip()
            orig_entries.append({
                'word': dediac,
                'gold_diac': orig_gold,
                'gold_diac_alt': '',
            })
            max_entries.append({
                'word': dediac,
                'gold_diac': max_gold,
                'gold_diac_alt': max_gold_alt,
            })
    return orig_entries, max_entries
def write_entries(path: Path, entries: Iterable[dict[str, str]]):
    """Write entries to a TSV file at a given path.

    The file is created (or overwritten) as UTF-8 text. It starts with a
    header row naming the OUTPUT_COLUMNS fields, followed by one
    tab-separated row per entry. Keys of an entry that are not listed in
    OUTPUT_COLUMNS are ignored. Quoting is disabled and no escape character
    is configured, so values are written verbatim.

    Args:
        path: Destination file path.
        entries: Dicts keyed by the OUTPUT_COLUMNS field names.
    """
    # newline='' keeps the text layer from translating the writer's '\n'
    # line terminator into the platform default (e.g. '\r\n' on Windows).
    with path.open('w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp,
                                dialect='excel-tab',
                                fieldnames=OUTPUT_COLUMNS,
                                quotechar=None,
                                quoting=csv.QUOTE_NONE,
                                extrasaction='ignore',
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(entries)
def main():
    """Split a WikiNewsMax TSV file into two gold-standard TSV files.

    Command-line usage: ``prepare_wikinewsmax.py INPUT_TSV OUTPUT_DIR``.

    Reads the input file named by the first argument and writes two files
    into the directory named by the second argument:
    ``dediac_orig_gold.tsv`` (original gold entries) and
    ``dediac_max_gold.tsv`` (Max gold entries). Any further arguments are
    ignored.

    Raises:
        SystemExit: With status 2, after printing a usage message to stderr,
            when fewer than two arguments are supplied.
    """
    if len(sys.argv) < 3:
        # Fail with a readable usage line instead of a bare IndexError.
        prog = sys.argv[0] if sys.argv else 'prepare_wikinewsmax.py'
        print(f'usage: {prog} INPUT_TSV OUTPUT_DIR', file=sys.stderr)
        sys.exit(2)

    input_path = Path(sys.argv[1])
    output_dir_path = Path(sys.argv[2])

    orig_entries, max_entries = gen_entries(input_path)

    orig_output_path = Path(output_dir_path, 'dediac_orig_gold.tsv')
    write_entries(orig_output_path, orig_entries)

    max_output_path = Path(output_dir_path, 'dediac_max_gold.tsv')
    write_entries(max_output_path, max_entries)


if __name__ == '__main__':
    main()