-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[!133][GENDER] Add INES evaluation script
# Why is the change needed? INES-eval.py is a new feature: an evaluation script associated with the new de-en INES resource for MT. It calculates gender inclusivity and term coverage. To work, it requires the [INES corpus](https://drive.google.com/drive/u/1/folders/15c3ZW9gxbCKZnvHf3nOulQ_0Ke0bz5ti) as a definition file in .tsv format and the model output as .txt # What changes does the patch introduce? This is a new feature, so all of the code is new. # How was this patch tested? Added UTs. For a realistic scenario, it was tested on the (tokenized) [output](https://drive.google.com/file/d/1ctwpf_6MQe3_fHWac1xm1r4Pk97h19rc/view?usp=sharing) of an MT model. Sentence-level scores, and hence global scores, were manually checked to confirm the correct functioning of INES-eval.py. In addition, the following output sentences and corresponding definition file were created as a stress test on difficult positive and negative cases. These include: - cases of non-consecutive inclusive/not_inclusive tokens that must not be matched as inclusive/not_inclusive terms. - cases where both the inclusive and not_inclusive terms match. - cases where the inclusive or not_inclusive term appears twice and should be matched only once. [AIRC.de-en_tok-stress.txt](/uploads/e29def8c2c2af64b5ad663f3436efd5b/AIRC.de-en_tok-stress.txt) [INES-stress.tsv](/uploads/755ce38fe9cc9f6f146cc62ba8569338/INES-stress.tsv)
- Loading branch information
Showing
2 changed files
with
372 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2023 FBK | ||
|
||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License | ||
|
||
import argparse | ||
import csv | ||
from collections import namedtuple | ||
import os | ||
import logging | ||
|
||
|
||
# Result record for the global accuracy metrics:
# - term_coverage: fraction of definition rows with any term match
# - inclusivity_accuracy: inclusive share of all matched terms
InesAccuracy = namedtuple("InesAccuracy", ["term_coverage", "inclusivity_accuracy"])

# Log level is taken from the LOGLEVEL environment variable (default INFO).
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),)
LOGGER = logging.getLogger("INES_eval")
|
||
|
||
def full_match(generated_terms, eval_tokens):
    """Return True iff eval_tokens occurs as a contiguous subsequence of generated_terms."""
    window = len(eval_tokens)
    return any(
        generated_terms[start:start + window] == eval_tokens
        for start in range(len(generated_terms) - window + 1))
|
||
|
||
def sentence_level_scores(in_f, tsv_f):
    """Score each hypothesis line against its INES definition row.

    Args:
        in_f: path to the (tokenized) MT output, one sentence per line.
        tsv_f: path to the INES definition TSV whose 'EVAL-TERMS-en' column
            holds an 'inclusive;not_inclusive' term pair (plus an 'ID' column).

    Returns:
        One dict per sentence with keys 'num_terms_found', 'num_inclusive'
        and 'num_not_inclusive' (each 0 or 1).
    """
    per_sentence = []
    with open(in_f) as hyp_file, open(tsv_f) as def_file:
        definitions = csv.DictReader(def_file, delimiter='\t')
        for hyp_line, definition in zip(hyp_file, definitions):
            hyp_tokens = hyp_line.strip().lower().split()
            term_pair = definition['EVAL-TERMS-en'].strip().lower().split(";")
            inclusive_tokens = term_pair[0].split()
            not_inclusive_tokens = term_pair[1].split()

            has_inclusive = full_match(hyp_tokens, inclusive_tokens)
            has_not_inclusive = full_match(hyp_tokens, not_inclusive_tokens)
            if has_inclusive and has_not_inclusive:
                # Flag ambiguous sentences that contain both term variants.
                line_number = definition['ID']
                LOGGER.info(f"Both inclusive and not inclusive terms found at line {line_number}: "
                            f"'{hyp_line.strip()}'")

            # Each counter is at most 1 per sentence, regardless of repeats.
            per_sentence.append({
                "num_terms_found": int(has_inclusive or has_not_inclusive),
                "num_inclusive": int(has_inclusive),
                "num_not_inclusive": int(has_not_inclusive)})

        # Both files must be fully consumed, otherwise their lengths differ.
        assert next(hyp_file, None) is None and next(def_file, None) is None, \
            "INES TSV and hypothesis should have the same length"
    return per_sentence
|
||
|
||
def write_sentence_scores(out_f, sentence_scores):
    """Write the per-sentence score dicts to out_f as TSV with a header row."""
    fieldnames = ["num_terms_found", "num_inclusive", "num_not_inclusive"]
    with open(out_f, 'w') as handle:
        tsv_writer = csv.DictWriter(handle, fieldnames, delimiter='\t')
        tsv_writer.writeheader()
        for row in sentence_scores:
            tsv_writer.writerow(row)
|
||
|
||
def global_inclusivity_index(sentence_scores):
    """Return the global inclusivity index.

    The index is 1 minus the proportion of sentences in which the
    not-inclusive term was generated; higher means more inclusive output.

    Args:
        sentence_scores: per-sentence dicts as produced by sentence_level_scores().

    Returns:
        A float in [0, 1].

    Raises:
        Exception: if sentence_scores is empty.
    """
    tot_terms = len(sentence_scores)
    # Guard before doing any work, consistent with global_accuracy.
    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    tot_not_inclusive = sum(score["num_not_inclusive"] for score in sentence_scores)
    return 1 - tot_not_inclusive / tot_terms
|
||
|
||
def global_accuracy(sentence_scores):
    """Compute the global term coverage and inclusivity accuracy.

    Args:
        sentence_scores: per-sentence dicts as produced by sentence_level_scores().

    Returns:
        InesAccuracy(term_coverage, inclusivity_accuracy), where term_coverage
        is the fraction of sentences with any term match and inclusivity
        accuracy is the inclusive share of all matched terms (0.0 when no
        term was matched at all).

    Raises:
        Exception: if sentence_scores is empty.
    """
    tot_terms = len(sentence_scores)
    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    tot_found = sum(score["num_terms_found"] for score in sentence_scores)
    tot_inclusive = sum(score["num_inclusive"] for score in sentence_scores)
    tot_not_inclusive = sum(score["num_not_inclusive"] for score in sentence_scores)

    term_cov = tot_found / tot_terms
    if tot_inclusive + tot_not_inclusive > 0:
        inclusivity_acc = tot_inclusive / (tot_inclusive + tot_not_inclusive)
    else:
        # No term of either kind was generated: accuracy is undefined, report 0.0.
        inclusivity_acc = 0.0

    # Single debug guard (the original checked isEnabledFor twice) with lazy
    # %-style arguments so formatting is skipped when DEBUG is off.
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug("Evaluated %d sentences...", tot_terms)
        LOGGER.debug("Global: all->%d; found->%d; inclusive->%d; not_inclusive->%d",
                     tot_terms, tot_found, tot_inclusive, tot_not_inclusive)

    return InesAccuracy(term_cov, inclusivity_acc)
|
||
|
||
def print_index_scores(out_scores):
    """Print the global inclusivity index under a fixed header."""
    print("Global Inclusivity Index")
    print("------------------------")
    print(out_scores)
|
||
|
||
def print_acc_scores(out_scores):
    """Print term coverage and inclusivity accuracy as one tab-separated row."""
    print("Term Coverage\tInclusivity Accuracy")
    print("-------------------------------------------------")
    print(f"{out_scores.term_coverage}\t{out_scores.inclusivity_accuracy}")
|
||
|
||
if __name__ == '__main__':
    """
    Scripts for the evaluation of gender-inclusive language in MT on INES.
    Given pairs of target inclusive/not-inclusive terms, the evaluation
    scripts calculates:
    - *inclusivity-index*, as the proportion of not_inclusive generated
    by a system. The lower the proportion, the higher the level of
    inclusivity.
    As complementary metrics, given the --acc-scores argument, the scripts
    can also return:
    - *term coverage*, as the proportion of either inclusive/non-inclusive
    terms generated by a system.
    - *inclusivity accuracy*, as the proportion of desirable inclusive terms
    among all inclusive/not-inclusive terms generated by a system.
    Example usage:
    python INES-eval.py --input your_MT_output.txt --tsv-definition INES.tsv
    Version: 1.0
    """
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument(
        '--input', required=True, type=str, metavar='FILE',
        help='Input file to be used to compute scores (it must be tokenized).')
    cli_parser.add_argument(
        '--tsv-definition', required=True, type=str, metavar='FILE',
        help='TSV INES definitions file.')
    cli_parser.add_argument(
        '--sentence-scores', required=False, default=None, type=str, metavar='FILE',
        help='If set, sentence level scores are written into this file.')
    cli_parser.add_argument(
        '--acc-scores', required=False, action='store_true', default=False,
        help='If set, print global accuracy and term coverage.')
    cli_args = cli_parser.parse_args()

    # Sentence-level scores feed both the optional dump and the global metrics.
    sl_scores = sentence_level_scores(cli_args.input, cli_args.tsv_definition)
    if cli_args.sentence_scores:
        write_sentence_scores(cli_args.sentence_scores, sl_scores)
    print_index_scores(global_inclusivity_index(sl_scores))
    if cli_args.acc_scores:
        print_acc_scores(global_accuracy(sl_scores))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
# Copyright 2023 FBK | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License | ||
import unittest | ||
import tempfile | ||
|
||
from examples.speech_to_text.scripts.gender.INES_eval import sentence_level_scores, global_inclusivity_index, global_accuracy | ||
|
||
|
||
class InesEvalTestCase(unittest.TestCase):
    """Unit tests for the INES evaluation functions.

    All sentence-level tests score one hypothesis against the same INES
    definition row: inclusive term "average person", not-inclusive term
    "average man".
    """

    # Shared TSV definition used by every sentence-level test.
    TSV_DEFINITION = "ID\tEVAL-TERMS-en\n1\taverage person;average man"

    # Shared fixture for the global-score tests: 6 of 8 sentences have a
    # match, with 4 inclusive and 4 not-inclusive hits overall.
    SENTENCE_SCORES = [
        {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0},
        {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
        {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
        {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0}]

    def _single_sentence_scores(self, hypothesis):
        # Write the hypothesis and the shared definition to temporary files
        # (sentence_level_scores reads from paths) and return its output.
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            hypos_file.write(hypothesis)
            tsv_file.write(self.TSV_DEFINITION)
            hypos_file.flush()
            tsv_file.flush()
            return sentence_level_scores(hypos_file.name, tsv_file.name)

    def _assert_scores(self, hypothesis, num_found, num_inclusive, num_not_inclusive):
        # Score a single hypothesis and check the three sentence-level counters.
        out = self._single_sentence_scores(hypothesis)
        self.assertDictEqual(out[0], {
            "num_terms_found": num_found,
            "num_inclusive": num_inclusive,
            "num_not_inclusive": num_not_inclusive})

    def test_sentence_level_scores_base_not_inclusive(self):
        self._assert_scores(
            "The average man spends about eight hours a day with sleep .", 1, 0, 1)

    def test_sentence_level_scores_base_inclusive(self):
        self._assert_scores(
            "The average person spends about eight hours a day with sleep .", 1, 1, 0)

    def test_sentence_level_scores_partial_match(self):
        # "average male" must not be counted as a match of "average man".
        self._assert_scores("I am the average male .", 0, 0, 0)

    def test_sentence_level_scores_inconsecutive_tokens(self):
        # Non-consecutive "average ... person" tokens must not match.
        self._assert_scores(
            "I am the average male , while you are a good person .", 0, 0, 0)

    def test_sentence_level_scores_consecutive_tokens(self):
        self._assert_scores("The average dog for the average man .", 1, 0, 1)

    def test_sentence_level_scores_both_term_in(self):
        # Both terms present: both counters set, but only one term found.
        self._assert_scores("The average person is an average man .", 1, 1, 1)

    def test_sentence_level_scores_both_term_ni(self):
        self._assert_scores("The average man is an average person .", 1, 1, 1)

    def test_sentence_level_scores_duplicate_term(self):
        # A term appearing twice must be matched only once.
        self._assert_scores("The average person is an average person .", 1, 1, 0)

    def test_global_inclusivity_index_empty(self):
        with self.assertRaises(Exception) as e:
            global_inclusivity_index([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy_empty(self):
        with self.assertRaises(Exception) as e:
            global_accuracy([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy(self):
        global_score = global_accuracy(self.SENTENCE_SCORES)
        self.assertEqual(global_score.term_coverage, 0.75)
        self.assertEqual(global_score.inclusivity_accuracy, 0.5)

    def test_inclusivity_index(self):
        self.assertEqual(global_inclusivity_index(self.SENTENCE_SCORES), 0.5)
|
||
|
||
if __name__ == '__main__':
    # Run the INES evaluation test suite when this file is executed directly.
    unittest.main()