From 35dc97b75c5c6975bcced314b37ed89392a6e9a6 Mon Sep 17 00:00:00 2001 From: Beatrice Savoldi Date: Fri, 8 Sep 2023 22:07:32 +0200 Subject: [PATCH] [!133][GENDER] Add INES evaluation script # Why is the change needed? INES-eval.py is a new feature. It is an evaluation script to be associated with the newly added de-en resource for MT INES. It calculates gender inclusivity and term coverage. To work, it requires the [INES corpus](https://drive.google.com/drive/u/1/folders/15c3ZW9gxbCKZnvHf3nOulQ_0Ke0bz5ti) as a definition file in .tsv and the model output as .txt # What changes does the patch introduce? None, this is a new feature so all of it is new. # How was this patch tested? Added UTs. For a realistic scenario, it was tested on the (tokenized) [output](https://drive.google.com/file/d/1ctwpf_6MQe3_fHWac1xm1r4Pk97h19rc/view?usp=sharing) of an MT model. Sentence-level scores, and hence global scores, were manually checked to confirm the correct functioning of INES-eval.py. Also, the following output sentences and corresponding definition file were created as a stress test on difficult positive and negative cases. These include: - cases of non-consecutive inclusive/not_inclusive tokens not to be matched as inclusive/not_inclusive terms. - cases with match of both inclusive/not_inclusive terms. - cases where the inclusive or not_inclusive terms appear twice, and should be only matched once. 
[AIRC.de-en_tok-stress.txt](/uploads/e29def8c2c2af64b5ad663f3436efd5b/AIRC.de-en_tok-stress.txt) [INES-stress.tsv](/uploads/755ce38fe9cc9f6f146cc62ba8569338/INES-stress.tsv) --- .../scripts/gender/INES_eval.py | 190 ++++++++++++++++++ fbk_uts/scripts/test_ines_eval.py | 182 +++++++++++++++++ 2 files changed, 372 insertions(+) create mode 100644 examples/speech_to_text/scripts/gender/INES_eval.py create mode 100644 fbk_uts/scripts/test_ines_eval.py diff --git a/examples/speech_to_text/scripts/gender/INES_eval.py b/examples/speech_to_text/scripts/gender/INES_eval.py new file mode 100644 index 00000000..e08f0294 --- /dev/null +++ b/examples/speech_to_text/scripts/gender/INES_eval.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# Copyright 2023 FBK + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License

import argparse
import csv
import logging
import os
from collections import namedtuple


# Pair of global scores: the fraction of annotated terms that the system
# generated at all, and the fraction of inclusive choices among them.
InesAccuracy = namedtuple("InesAccuracy", ["term_coverage", "inclusivity_accuracy"])

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),)
LOGGER = logging.getLogger("INES_eval")


def full_match(generated_terms, eval_tokens):
    """
    Return True if `eval_tokens` occurs as a contiguous subsequence of
    `generated_terms`, i.e. the whole multi-token term appears in order
    with no intervening tokens.
    """
    for i in range(len(generated_terms) - len(eval_tokens) + 1):
        if generated_terms[i:i + len(eval_tokens)] == eval_tokens:
            return True
    return False


def sentence_level_scores(in_f, tsv_f):
    """
    Calculate sentence-level INES scores.

    Args:
        in_f: path to the (tokenized) system output, one sentence per line.
        tsv_f: path to the INES TSV definition file; its "EVAL-TERMS-en"
            column holds "inclusive_term;not_inclusive_term" pairs.

    Returns:
        A list with one dict per sentence, with keys "num_terms_found",
        "num_inclusive" and "num_not_inclusive" (each 0 or 1).

    Raises:
        AssertionError: if the two files do not have the same number of lines.
    """
    sentences = []
    with open(in_f) as i_f, open(tsv_f) as t_f:
        tsv_reader = csv.DictReader(t_f, delimiter='\t')
        hypo_iter = iter(i_f)
        for terms_f in tsv_reader:
            # Pull the hypothesis explicitly instead of zip()-ing the two
            # iterators: zip() silently consumed one extra hypothesis line when
            # the TSV was shorter, so the final length check wrongly passed
            # when the hypothesis file had exactly one line more than the TSV.
            i_line = next(hypo_iter, None)
            assert i_line is not None, \
                "INES TSV and hypothesis should have the same length"
            sentence_inclusive = 0
            sentence_not_inclusive = 0
            sentence_found = 0
            generated_terms = i_line.strip().lower().split()
            eval_terms = terms_f['EVAL-TERMS-en'].strip().lower().split(";")
            inclusive_term = eval_terms[0]
            not_inclusive_term = eval_terms[1]

            inclusive_tokens = inclusive_term.split()
            not_inclusive_tokens = not_inclusive_term.split()

            found_inclusive = full_match(generated_terms, inclusive_tokens)
            found_not_inclusive = full_match(generated_terms, not_inclusive_tokens)

            if found_inclusive:
                sentence_inclusive += 1
            if found_not_inclusive:
                sentence_not_inclusive += 1
            if found_inclusive or found_not_inclusive:
                sentence_found += 1
            # Report ambiguous sentences that contain both variants.
            if found_inclusive and found_not_inclusive:
                line_number = terms_f['ID']
                LOGGER.info(f"Both inclusive and not inclusive terms found at line {line_number}: "
                            f"'{i_line.strip()}'")

            sentences.append({
                "num_terms_found": sentence_found,
                "num_inclusive": sentence_inclusive,
                "num_not_inclusive": sentence_not_inclusive
            })

        # asserting both files have been completed
        assert next(hypo_iter, None) is None, \
            "INES TSV and hypothesis should have the same length"
    return sentences


def write_sentence_scores(out_f, sentence_scores):
    """Write sentence-level scores to `out_f` as a TSV with a header row."""
    with open(out_f, 'w') as f_w:
        writer = csv.DictWriter(
            f_w, ["num_terms_found", "num_inclusive", "num_not_inclusive"], delimiter='\t')
        writer.writeheader()
        writer.writerows(sentence_scores)


def global_inclusivity_index(sentence_scores):
    """
    Calculate the global inclusivity index: 1 minus the proportion of
    not_inclusive terms generated by the system over all INES entries.
    Higher is more inclusive.

    Raises:
        ValueError: if `sentence_scores` is empty.
    """
    tot_terms = len(sentence_scores)
    if tot_terms == 0:
        raise ValueError("Cannot evaluate with empty INES TSV")

    tot_not_inclusive = sum(score["num_not_inclusive"] for score in sentence_scores)
    return 1 - float(tot_not_inclusive) / tot_terms


def global_accuracy(sentence_scores):
    """
    Calculate the global term coverage and inclusivity accuracy.

    Returns:
        An InesAccuracy namedtuple. Inclusivity accuracy is 0.0 when neither
        inclusive nor not_inclusive terms were generated.

    Raises:
        ValueError: if `sentence_scores` is empty.
    """
    tot_terms = len(sentence_scores)
    if tot_terms == 0:
        raise ValueError("Cannot evaluate with empty INES TSV")
    tot_found = 0
    tot_inclusive = 0
    tot_not_inclusive = 0

    for score in sentence_scores:
        tot_found += score["num_terms_found"]
        tot_inclusive += score["num_inclusive"]
        tot_not_inclusive += score["num_not_inclusive"]

    term_cov = tot_found / tot_terms
    if tot_inclusive + tot_not_inclusive > 0:
        inclusivity_acc = tot_inclusive / (tot_inclusive + tot_not_inclusive)
    else:
        inclusivity_acc = 0.0
    overall_scores = InesAccuracy(term_cov, inclusivity_acc)

    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug("Evaluated {} sentences...".format(len(sentence_scores)))
        LOGGER.debug("Global: all->{}; found->{}; inclusive->{}; not_inclusive->{}".format(
            tot_terms, tot_found, tot_inclusive, tot_not_inclusive))

    return overall_scores


def print_index_scores(out_scores):
    """Print the global inclusivity index to stdout."""
    print("Global Inclusivity Index")
    print("------------------------")
    print("{}".format(out_scores))


def print_acc_scores(out_scores):
    """Print term coverage and inclusivity accuracy to stdout."""
    print("Term Coverage\tInclusivity Accuracy")
    print("-------------------------------------------------")
    print("{}\t{}".format(out_scores.term_coverage, out_scores.inclusivity_accuracy))


if __name__ == '__main__':
    """
    Script for the evaluation of gender-inclusive language in MT on INES.
    Given pairs of target inclusive/not-inclusive terms, the evaluation
    script calculates:
    - *inclusivity-index*, as the proportion of not_inclusive terms generated
    by a system. The lower the proportion, the higher the level of
    inclusivity.

    As complementary metrics, given the --acc-scores argument, the script
    can also return:
    - *term coverage*, as the proportion of either inclusive/non-inclusive
    terms generated by a system.
    - *inclusivity accuracy*, as the proportion of desirable inclusive terms
    among all inclusive/not-inclusive terms generated by a system.

    Example usage:
    python INES-eval.py --input your_MT_output.txt --tsv-definition INES.tsv

    Version: 1.0
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, type=str, metavar='FILE',
                        help='Input file to be used to compute scores (it must be tokenized).')
    parser.add_argument('--tsv-definition', required=True, type=str, metavar='FILE',
                        help='TSV INES definitions file.')
    parser.add_argument('--sentence-scores', required=False, default=None, type=str, metavar='FILE',
                        help='If set, sentence level scores are written into this file.')
    parser.add_argument('--acc-scores', required=False, action='store_true', default=False,
                        help='If set, print global accuracy and term coverage.')

    args = parser.parse_args()

    sl_scores = sentence_level_scores(args.input, args.tsv_definition)
    if args.sentence_scores:
        write_sentence_scores(args.sentence_scores, sl_scores)
    scores = global_inclusivity_index(sl_scores)
    print_index_scores(scores)
    if args.acc_scores:
        accuracy_scores = global_accuracy(sl_scores)
        print_acc_scores(accuracy_scores)

# --- patch continues: diff --git a/fbk_uts/scripts/test_ines_eval.py
# --- b/fbk_uts/scripts/test_ines_eval.py (new file mode 100644,
# --- index 00000000..ae70d57c, --- /dev/null, +++ b/fbk_uts/scripts/test_ines_eval.py)
# Copyright 2023 FBK
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import tempfile
import unittest

from examples.speech_to_text.scripts.gender.INES_eval import \
    sentence_level_scores, global_inclusivity_index, global_accuracy

# INES definition shared by every sentence-level test: the inclusive term is
# "average person" and the not_inclusive term is "average man".
TSV_CONTENT = "ID\tEVAL-TERMS-en\n1\taverage person;average man"

# Shared fixture for the global-score tests:
# 6 sentences with a term found, 4 inclusive, 4 not inclusive.
SENTENCE_SCORES = [
    {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
    {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0},
    {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
    {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
    {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
    {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
    {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
    {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0}]


class InesEvalTestCase(unittest.TestCase):

    def _sentence_scores(self, hypos):
        # Write the hypotheses and the shared INES definition into temporary
        # files (sentence_level_scores expects file paths) and score them.
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            hypos_file.write('\n'.join(hypos))
            tsv_file.write(TSV_CONTENT)
            hypos_file.flush()
            tsv_file.flush()
            return sentence_level_scores(hypos_file.name, tsv_file.name)

    def _assert_scores(self, hypo, found, inclusive, not_inclusive):
        # Score a single hypothesis and check its sentence-level counters.
        out = self._sentence_scores([hypo])
        self.assertDictEqual(out[0], {
            "num_terms_found": found,
            "num_inclusive": inclusive,
            "num_not_inclusive": not_inclusive})

    def test_sentence_level_scores_base_not_inclusive(self):
        self._assert_scores(
            "The average man spends about eight hours a day with sleep .", 1, 0, 1)

    def test_sentence_level_scores_base_inclusive(self):
        self._assert_scores(
            "The average person spends about eight hours a day with sleep .", 1, 1, 0)

    def test_sentence_level_scores_partial_match(self):
        # "average male" matches neither term in full.
        self._assert_scores("I am the average male .", 0, 0, 0)

    def test_sentence_level_scores_inconsecutive_tokens(self):
        # "average ... person" with intervening tokens must not match.
        self._assert_scores(
            "I am the average male , while you are a good person .", 0, 0, 0)

    def test_sentence_level_scores_consecutive_tokens(self):
        self._assert_scores("The average dog for the average man .", 1, 0, 1)

    def test_sentence_level_scores_both_term_in(self):
        # Both terms present: counted once each, found once.
        self._assert_scores("The average person is an average man .", 1, 1, 1)

    def test_sentence_level_scores_both_term_ni(self):
        self._assert_scores("The average man is an average person .", 1, 1, 1)

    def test_sentence_level_scores_duplicate_term(self):
        # The same term twice must only be matched once.
        self._assert_scores("The average person is an average person .", 1, 1, 0)

    def test_global_inclusivity_index_empty(self):
        with self.assertRaises(Exception) as e:
            global_inclusivity_index([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy_empty(self):
        with self.assertRaises(Exception) as e:
            global_accuracy([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy(self):
        global_score = global_accuracy(SENTENCE_SCORES)
        self.assertEqual(global_score.term_coverage, 0.75)
        self.assertEqual(global_score.inclusivity_accuracy, 0.5)

    def test_inclusivity_index(self):
        global_score = global_inclusivity_index(SENTENCE_SCORES)
        self.assertEqual(global_score, 0.5)


if __name__ == '__main__':
    unittest.main()