#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# Evaluation script for the Cross-Domain Authorship Attribution task @PAN2019.
We use the F1 metric (macro-average) as implemented in scikit-learn:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
We include the following ad hoc rules:
- If authors are predicted that were not seen during training,
  these predictions count as false predictions (the '<UNK>' class)
  and negatively affect performance.
- If texts are left unattributed, they are assigned to the '<UNK>'
  class and negatively affect performance.
- The <UNK> class is excluded from the macro-average across classes.
- If multiple test attributions are given for a single unknown document,
only the first one will be taken into consideration.
Dependencies:
- Python 2.7 or 3.6 (we recommend the Anaconda Python distribution)
- scikit-learn
Usage from the command line:
>>> python pan19-cdaa-evaluator.py -i COLLECTION -a ANSWERS -o OUTPUT
where
COLLECTION is the path to the main folder of the evaluation collection
ANSWERS is the path to the answers folder of a submitted method
OUTPUT is the path to the folder where the results of the evaluation will be saved
Example:
>>> python pan19-cdaa-evaluator.py -i "/mydata/pan19-cdaa-development-corpus" -a "/mydata/pan19-answers" -o "/mydata/pan19-evaluation"
# References:
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
import argparse
import os
import json
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
def eval_measures(gt, pred):
"""Compute macro-averaged F1-scores, macro-averaged precision,
    macro-averaged recall, and micro-averaged accuracy according to the ad hoc
rules discussed at the top of this file.
Parameters
----------
gt : dict
Ground truth, where keys indicate text file names
(e.g. `unknown00002.txt`), and values represent
author labels (e.g. `candidate00003`)
pred : dict
Predicted attribution, where keys indicate text file names
(e.g. `unknown00002.txt`), and values represent
author labels (e.g. `candidate00003`)
Returns
-------
f1 : float
Macro-averaged F1-score
precision : float
Macro-averaged precision
recall : float
Macro-averaged recall
accuracy : float
        Micro-averaged accuracy
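    Example
    -------
    A minimal illustration (hypothetical file and author names): a text
    that is left unattributed is scored as a false '<UNK>' prediction
    and lowers the macro-averaged scores.

    >>> gt = {'unknown00001.txt': 'candidate00001',
    ...       'unknown00002.txt': 'candidate00002'}
    >>> pred = {'unknown00001.txt': 'candidate00001'}
    >>> f1, precision, recall, accuracy = eval_measures(gt, pred)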
"""
actual_authors = list(gt.values())
encoder = LabelEncoder().fit(['<UNK>'] + actual_authors)
text_ids, gold_authors, silver_authors = [], [], []
for text_id in sorted(gt):
text_ids.append(text_id)
gold_authors.append(gt[text_id])
try:
silver_authors.append(pred[text_id])
except KeyError:
# missing attributions get <UNK>:
silver_authors.append('<UNK>')
assert len(text_ids) == len(gold_authors)
assert len(text_ids) == len(silver_authors)
# replace non-existent silver authors with '<UNK>':
silver_authors = [a if a in encoder.classes_ else '<UNK>'
for a in silver_authors]
gold_author_ints = encoder.transform(gold_authors)
silver_author_ints = encoder.transform(silver_authors)
# get F1 for individual classes (and suppress warnings):
with warnings.catch_warnings():
warnings.simplefilter('ignore')
        # Exclude the <UNK> class from the macro-average:
        unk_int = encoder.transform(['<UNK>'])[0]
        labels = [label for label in set(gold_author_ints) if label != unk_int]
        f1 = f1_score(gold_author_ints,
                      silver_author_ints,
                      labels=labels,
                      average='macro')
        precision = precision_score(gold_author_ints,
                                    silver_author_ints,
                                    labels=labels,
                                    average='macro')
        recall = recall_score(gold_author_ints,
                              silver_author_ints,
                              labels=labels,
                              average='macro')
        accuracy = accuracy_score(gold_author_ints,
                                  silver_author_ints)
    return f1, precision, recall, accuracy
def evaluate(ground_truth_file, predictions_file):
# Calculates evaluation measures for a single attribution problem
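    # Note added for clarity: the JSON layouts below are inferred from the
    # parsing code in this function; the field names ('ground_truth',
    # 'unknown-text', 'true-author', 'predicted-author') come from the code,
    # the example values are hypothetical.
    #   ground_truth_file: {"ground_truth": [{"unknown-text": "unknown00001.txt",
    #                                         "true-author": "candidate00001"}, ...]}
    #   predictions_file:  [{"unknown-text": "unknown00001.txt",
    #                        "predicted-author": "candidate00001"}, ...]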
gt = {}
with open(ground_truth_file, 'r') as f:
for attrib in json.load(f)['ground_truth']:
gt[attrib['unknown-text']] = attrib['true-author']
pred = {}
with open(predictions_file, 'r') as f:
for attrib in json.load(f):
if attrib['unknown-text'] not in pred:
pred[attrib['unknown-text']] = attrib['predicted-author']
    # eval_measures also returns the micro-averaged accuracy; only the three
    # macro-averaged scores are reported per problem
    f1, precision, recall, _accuracy = eval_measures(gt, pred)
    return round(f1, 3), round(precision, 3), round(recall, 3)
def evaluate_all(path_collection, path_answers, path_out):
    # Calculates evaluation measures for a PAN-19 collection of attribution problems
infocollection = path_collection + os.sep + 'collection-info.json'
problems = []
data = []
with open(infocollection, 'r') as f:
for attrib in json.load(f):
problems.append(attrib['problem-name'])
    scores = []
for problem in problems:
f1, precision, recall = evaluate(path_collection + os.sep + problem + os.sep + 'ground-truth.json',
path_answers + os.sep + 'answers-' + problem + '.json')
scores.append(f1)
data.append({'problem-name': problem, 'macro-f1': round(f1, 3), 'macro-precision': round(precision, 3),
'macro-recall': round(recall, 3)})
print(str(problem), 'Macro-F1:', round(f1, 3))
overall_score = sum(scores) / len(scores)
# Saving data to output files (out.json and evaluation.prototext)
with open(path_out + os.sep + 'out.json', 'w') as f:
json.dump({'problems': data, 'overall_score': round(overall_score, 3)}, f, indent=4, sort_keys=True)
print('Overall score:', round(overall_score, 3))
prototext = 'measure {\n key: "mean macro-f1"\n value: "' + str(round(overall_score, 3)) + '"\n}\n'
with open(path_out + os.sep + 'evaluation.prototext', 'w') as f:
f.write(prototext)
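# Note added for clarity: with a hypothetical problem name and placeholder
# scores, the out.json written by evaluate_all() above has the following shape
# (the prototext file simply reports the mean macro-F1 under "mean macro-f1"):
#   {"overall_score": 0.5,
#    "problems": [{"macro-f1": 0.5, "macro-precision": 0.5,
#                  "macro-recall": 0.5, "problem-name": "problem00001"}]}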
def main():
parser = argparse.ArgumentParser(description='Evaluation script AA@PAN2019')
parser.add_argument('-i', type=str,
help='Path to evaluation collection')
parser.add_argument('-a', type=str,
help='Path to answers folder')
parser.add_argument('-o', type=str,
help='Path to output files')
args = parser.parse_args()
if not args.i:
print('ERROR: The collection path is required')
parser.exit(1)
if not args.a:
print('ERROR: The answers folder is required')
parser.exit(1)
if not args.o:
print('ERROR: The output path is required')
parser.exit(1)
evaluate_all(args.i, args.a, args.o)
if __name__ == '__main__':
    main()