-
Notifications
You must be signed in to change notification settings - Fork 1
/
evaluate_conllx_driver.py
113 lines (96 loc) · 4.69 KB
/
evaluate_conllx_driver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Evaluates a parsed CoNLL file against gold or
a directory containing parsed CoNLL files against a gold directory.
Usage:
evaluate_conllx_driver ((-g <gold> | --gold=<gold>) (-p <parsed> | --parsed=<parsed>) | ((--gold_dir=<gold_dir>) (--parsed_dir=<parsed_dir>)))
[-x | --transliterate_pnx]
[-n | --transliterate_num]
[-a | --normalize_alef_yeh_ta]
evaluate_conllx_driver (-h | --help)
Options:
-g <gold> --gold=<gold>
The gold CoNLL file.
-p <parsed> --parsed=<parsed>
The parsed CoNLL file.
--gold_dir=<gold_dir>
The directory containing gold CoNLL files.
--parsed_dir=<parsed_dir>
The directory containing parsed CoNLL files.
-x --transliterate_pnx
Transliterate punctuation to Roman script (punctuation will always match regardless of script)
-n --transliterate_num
Transliterate numbers to Roman script (numbers will always match regardless of script)
-a --normalize_alef_yeh_ta
Normalizes alef, alef maksura, and teh marbuta
-h --help
Show this screen.
"""
import pathlib
from docopt import docopt
from pandas import DataFrame, Series
from conllx_scores import get_scores_means
from conllx_counts import get_sentence_list_counts
from class_conllx import Conllx
from char_map import bw2ar_map_lines
from normalization import normalize_alef_yeh_ta
from utils import get_file_names
arguments = docopt(__doc__)
def get_synced_file_names(gold_file_names, parsed_file_names):
tuple_list = []
for gold_file in gold_file_names:
# a_start = a_file
parsed_file = [x for x in parsed_file_names if x == gold_file][0]
tuple_list.append((gold_file, parsed_file))
return tuple_list
def get_file_path_details(file_path):
full_path = pathlib.Path(file_path)
dir_path = full_path.parent
file_name = full_path.name
return dir_path, file_name
if __name__ == '__main__':
if arguments["--gold"] and arguments["--parsed"]:
gold_dir_path, gold_file_name = get_file_path_details(arguments["--gold"])
parsed_dir_path, parsed_file_name = get_file_path_details(arguments["--parsed"])
print('comparing two files')
tuple_list = [(gold_file_name, parsed_file_name)]
elif arguments["--gold_dir"] and arguments["--parsed_dir"]:
print('comparing two directories')
gold_dir_path = pathlib.Path(arguments["--gold_dir"])
parsed_dir_path = pathlib.Path(arguments["--parsed_dir"])
gold_file_names = get_file_names(arguments["--gold_dir"], '.conllx')
parsed_file_names = get_file_names(arguments["--parsed_dir"], '.conllx')
# matching files
tuple_list = get_synced_file_names(gold_file_names, parsed_file_names)
else:
raise ValueError('Invalid arguments')
# reading files and storing scores for each file
tree_counts_list = []
tree_matches_list = []
alignment_numbers_list = []
num_sentences_list = []
for gold_file, parsed_file in tuple_list:
gold_conllx = Conllx(file_name=gold_file, file_path=gold_dir_path)
gold_conllx.read_file()
parsed_conllx = Conllx(file_name=parsed_file, file_path=parsed_dir_path)
parsed_conllx.read_file()
if arguments['--transliterate_pnx']:
gold_conllx.file_data = bw2ar_map_lines(gold_conllx.file_data, 'punctuation')
parsed_conllx.file_data = bw2ar_map_lines(parsed_conllx.file_data, 'punctuation')
if arguments['--transliterate_num']:
gold_conllx.file_data = bw2ar_map_lines(gold_conllx.file_data, 'numbers')
parsed_conllx.file_data = bw2ar_map_lines(parsed_conllx.file_data, 'numbers')
if arguments['--normalize_alef_yeh_ta']:
gold_conllx.file_data = normalize_alef_yeh_ta(gold_conllx.file_data)
parsed_conllx.file_data = normalize_alef_yeh_ta(parsed_conllx.file_data)
conllx_file_statistics = get_sentence_list_counts(gold_conllx.conllx_to_sentence_list(), parsed_conllx.conllx_to_sentence_list())
tree_counts_list.append(conllx_file_statistics.tree_counts)
tree_matches_list.append(conllx_file_statistics.tree_matches)
alignment_numbers_list.append(conllx_file_statistics.alignment_numbers)
num_sentences_list.append(conllx_file_statistics.sentence_number)
subcorpus_scores = get_scores_means(tree_matches_list, tree_counts_list, num_sentences_list)
# print("\t".join(map(str, list(Series(subcorpus_scores.__dict__.copy()).values))))
# print(DataFrame(alignment_numbers_list).sum())
print(Series(subcorpus_scores.__dict__.copy()))
print()
results = DataFrame(alignment_numbers_list).sum()
results.to_csv('results.tsv', sep='\t', index=False)