-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[!133][GENDER] Add INES evaluation script
# Why is the change needed? INES-eval.py is a new feature: an evaluation script associated with the new de-en INES resource for MT. It calculates gender inclusivity and term coverage. To work, it requires the [INES corpus](https://drive.google.com/drive/u/1/folders/15c3ZW9gxbCKZnvHf3nOulQ_0Ke0bz5ti) as a definition file in .tsv format and the model output as .txt # What changes does the patch introduce? This is a new feature, so all of the code is new. # How was this patch tested? Added UTs. For a realistic scenario, it was tested on the (tokenized) [output](https://drive.google.com/file/d/1ctwpf_6MQe3_fHWac1xm1r4Pk97h19rc/view?usp=sharing) of an MT model. Sentence-level scores, and hence global scores, were manually checked to confirm the correct functioning of INES-eval.py. In addition, the following output sentences and corresponding definition file were created as a stress test on difficult positive and negative cases. These include: - cases of non-consecutive inclusive/not_inclusive tokens that must not be matched as inclusive/not_inclusive terms. - cases where both the inclusive and not_inclusive terms match. - cases where the inclusive or not_inclusive term appears twice and should be matched only once. [AIRC.de-en_tok-stress.txt](/uploads/e29def8c2c2af64b5ad663f3436efd5b/AIRC.de-en_tok-stress.txt) [INES-stress.tsv](/uploads/755ce38fe9cc9f6f146cc62ba8569338/INES-stress.tsv)
- Loading branch information
Showing
2 changed files
with
372 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2023 FBK | ||
|
||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License | ||
|
||
import argparse | ||
import csv | ||
from collections import namedtuple | ||
import os | ||
import logging | ||
|
||
|
||
# Result record for the global accuracy metrics:
# - term_coverage: fraction of definition rows with any term match
# - inclusivity_accuracy: inclusive share of all matched terms
InesAccuracy = namedtuple("InesAccuracy", ["term_coverage", "inclusivity_accuracy"])

# Log level is taken from the LOGLEVEL environment variable (default INFO).
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),)
LOGGER = logging.getLogger("INES_eval")
|
||
|
||
def full_match(generated_terms, eval_tokens):
    """Return True iff eval_tokens occurs as a contiguous subsequence of generated_terms."""
    window = len(eval_tokens)
    return any(
        generated_terms[start:start + window] == eval_tokens
        for start in range(len(generated_terms) - window + 1))
|
||
|
||
def sentence_level_scores(in_f, tsv_f):
    """Score each hypothesis line against its INES definition row.

    Args:
        in_f: path to the (tokenized) MT output, one sentence per line.
        tsv_f: path to the INES definition TSV whose 'EVAL-TERMS-en' column
            holds an 'inclusive;not_inclusive' term pair (plus an 'ID' column).

    Returns:
        One dict per sentence with keys 'num_terms_found', 'num_inclusive'
        and 'num_not_inclusive' (each 0 or 1).
    """
    per_sentence = []
    with open(in_f) as hyp_file, open(tsv_f) as def_file:
        definitions = csv.DictReader(def_file, delimiter='\t')
        for hyp_line, definition in zip(hyp_file, definitions):
            hyp_tokens = hyp_line.strip().lower().split()
            term_pair = definition['EVAL-TERMS-en'].strip().lower().split(";")
            inclusive_tokens = term_pair[0].split()
            not_inclusive_tokens = term_pair[1].split()

            has_inclusive = full_match(hyp_tokens, inclusive_tokens)
            has_not_inclusive = full_match(hyp_tokens, not_inclusive_tokens)
            if has_inclusive and has_not_inclusive:
                # Flag ambiguous sentences that contain both term variants.
                line_number = definition['ID']
                LOGGER.info(f"Both inclusive and not inclusive terms found at line {line_number}: "
                            f"'{hyp_line.strip()}'")

            # Each counter is at most 1 per sentence, regardless of repeats.
            per_sentence.append({
                "num_terms_found": int(has_inclusive or has_not_inclusive),
                "num_inclusive": int(has_inclusive),
                "num_not_inclusive": int(has_not_inclusive)})

        # Both files must be fully consumed, otherwise their lengths differ.
        assert next(hyp_file, None) is None and next(def_file, None) is None, \
            "INES TSV and hypothesis should have the same length"
    return per_sentence
|
||
|
||
def write_sentence_scores(out_f, sentence_scores):
    """Write the per-sentence score dicts to out_f as TSV with a header row."""
    fieldnames = ["num_terms_found", "num_inclusive", "num_not_inclusive"]
    with open(out_f, 'w') as handle:
        tsv_writer = csv.DictWriter(handle, fieldnames, delimiter='\t')
        tsv_writer.writeheader()
        for row in sentence_scores:
            tsv_writer.writerow(row)
|
||
|
||
def global_inclusivity_index(sentence_scores):
    """Return the global inclusivity index.

    The index is 1 minus the proportion of sentences in which the
    not-inclusive term was generated; higher means more inclusive output.

    Args:
        sentence_scores: per-sentence dicts as produced by sentence_level_scores().

    Returns:
        A float in [0, 1].

    Raises:
        Exception: if sentence_scores is empty.
    """
    tot_terms = len(sentence_scores)
    # Guard before doing any work, consistent with global_accuracy.
    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    tot_not_inclusive = sum(score["num_not_inclusive"] for score in sentence_scores)
    return 1 - tot_not_inclusive / tot_terms
|
||
|
||
def global_accuracy(sentence_scores):
    """Compute the global term coverage and inclusivity accuracy.

    Args:
        sentence_scores: per-sentence dicts as produced by sentence_level_scores().

    Returns:
        InesAccuracy(term_coverage, inclusivity_accuracy), where term_coverage
        is the fraction of sentences with any term match and inclusivity
        accuracy is the inclusive share of all matched terms (0.0 when no
        term was matched at all).

    Raises:
        Exception: if sentence_scores is empty.
    """
    tot_terms = len(sentence_scores)
    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    tot_found = sum(score["num_terms_found"] for score in sentence_scores)
    tot_inclusive = sum(score["num_inclusive"] for score in sentence_scores)
    tot_not_inclusive = sum(score["num_not_inclusive"] for score in sentence_scores)

    term_cov = tot_found / tot_terms
    if tot_inclusive + tot_not_inclusive > 0:
        inclusivity_acc = tot_inclusive / (tot_inclusive + tot_not_inclusive)
    else:
        # No term of either kind was generated: accuracy is undefined, report 0.0.
        inclusivity_acc = 0.0

    # Single debug guard (the original checked isEnabledFor twice) with lazy
    # %-style arguments so formatting is skipped when DEBUG is off.
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug("Evaluated %d sentences...", tot_terms)
        LOGGER.debug("Global: all->%d; found->%d; inclusive->%d; not_inclusive->%d",
                     tot_terms, tot_found, tot_inclusive, tot_not_inclusive)

    return InesAccuracy(term_cov, inclusivity_acc)
|
||
|
||
def print_index_scores(out_scores):
    """Print the global inclusivity index under a fixed header."""
    print("Global Inclusivity Index")
    print("------------------------")
    print(out_scores)
|
||
|
||
def print_acc_scores(out_scores):
    """Print term coverage and inclusivity accuracy as one tab-separated row."""
    print("Term Coverage\tInclusivity Accuracy")
    print("-------------------------------------------------")
    print(f"{out_scores.term_coverage}\t{out_scores.inclusivity_accuracy}")
|
||
|
||
if __name__ == '__main__':
    """
    Scripts for the evaluation of gender-inclusive language in MT on INES.
    Given pairs of target inclusive/not-inclusive terms, the evaluation
    scripts calculates:
    - *inclusivity-index*, as the proportion of not_inclusive generated
    by a system. The lower the proportion, the higher the level of
    inclusivity.
    As complementary metrics, given the --acc-scores argument, the scripts
    can also return:
    - *term coverage*, as the proportion of either inclusive/non-inclusive
    terms generated by a system.
    - *inclusivity accuracy*, as the proportion of desirable inclusive terms
    among all inclusive/not-inclusive terms generated by a system.
    Example usage:
    python INES-eval.py --input your_MT_output.txt --tsv-definition INES.tsv
    Version: 1.0
    """
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument(
        '--input', required=True, type=str, metavar='FILE',
        help='Input file to be used to compute scores (it must be tokenized).')
    cli_parser.add_argument(
        '--tsv-definition', required=True, type=str, metavar='FILE',
        help='TSV INES definitions file.')
    cli_parser.add_argument(
        '--sentence-scores', required=False, default=None, type=str, metavar='FILE',
        help='If set, sentence level scores are written into this file.')
    cli_parser.add_argument(
        '--acc-scores', required=False, action='store_true', default=False,
        help='If set, print global accuracy and term coverage.')
    cli_args = cli_parser.parse_args()

    # Sentence-level scores feed both the optional dump and the global metrics.
    sl_scores = sentence_level_scores(cli_args.input, cli_args.tsv_definition)
    if cli_args.sentence_scores:
        write_sentence_scores(cli_args.sentence_scores, sl_scores)
    print_index_scores(global_inclusivity_index(sl_scores))
    if cli_args.acc_scores:
        print_acc_scores(global_accuracy(sl_scores))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
# Copyright 2023 FBK | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License | ||
import unittest | ||
import tempfile | ||
|
||
from examples.speech_to_text.scripts.gender.INES_eval import sentence_level_scores, global_inclusivity_index, global_accuracy | ||
|
||
|
||
class InesEvalTestCase(unittest.TestCase):
    """Unit tests for the INES evaluation functions.

    All sentence-level tests score one hypothesis against the same INES
    definition row: inclusive term "average person", not-inclusive term
    "average man".
    """

    # Shared TSV definition used by every sentence-level test.
    TSV_DEFINITION = "ID\tEVAL-TERMS-en\n1\taverage person;average man"

    # Shared fixture for the global-score tests: 6 of 8 sentences have a
    # match, with 4 inclusive and 4 not-inclusive hits overall.
    SENTENCE_SCORES = [
        {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0},
        {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
        {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
        {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
        {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0}]

    def _single_sentence_scores(self, hypothesis):
        # Write the hypothesis and the shared definition to temporary files
        # (sentence_level_scores reads from paths) and return its output.
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            hypos_file.write(hypothesis)
            tsv_file.write(self.TSV_DEFINITION)
            hypos_file.flush()
            tsv_file.flush()
            return sentence_level_scores(hypos_file.name, tsv_file.name)

    def _assert_scores(self, hypothesis, num_found, num_inclusive, num_not_inclusive):
        # Score a single hypothesis and check the three sentence-level counters.
        out = self._single_sentence_scores(hypothesis)
        self.assertDictEqual(out[0], {
            "num_terms_found": num_found,
            "num_inclusive": num_inclusive,
            "num_not_inclusive": num_not_inclusive})

    def test_sentence_level_scores_base_not_inclusive(self):
        self._assert_scores(
            "The average man spends about eight hours a day with sleep .", 1, 0, 1)

    def test_sentence_level_scores_base_inclusive(self):
        self._assert_scores(
            "The average person spends about eight hours a day with sleep .", 1, 1, 0)

    def test_sentence_level_scores_partial_match(self):
        # "average male" must not be counted as a match of "average man".
        self._assert_scores("I am the average male .", 0, 0, 0)

    def test_sentence_level_scores_inconsecutive_tokens(self):
        # Non-consecutive "average ... person" tokens must not match.
        self._assert_scores(
            "I am the average male , while you are a good person .", 0, 0, 0)

    def test_sentence_level_scores_consecutive_tokens(self):
        self._assert_scores("The average dog for the average man .", 1, 0, 1)

    def test_sentence_level_scores_both_term_in(self):
        # Both terms present: both counters set, but only one term found.
        self._assert_scores("The average person is an average man .", 1, 1, 1)

    def test_sentence_level_scores_both_term_ni(self):
        self._assert_scores("The average man is an average person .", 1, 1, 1)

    def test_sentence_level_scores_duplicate_term(self):
        # A term appearing twice must be matched only once.
        self._assert_scores("The average person is an average person .", 1, 1, 0)

    def test_global_inclusivity_index_empty(self):
        with self.assertRaises(Exception) as e:
            global_inclusivity_index([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy_empty(self):
        with self.assertRaises(Exception) as e:
            global_accuracy([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy(self):
        global_score = global_accuracy(self.SENTENCE_SCORES)
        self.assertEqual(global_score.term_coverage, 0.75)
        self.assertEqual(global_score.inclusivity_accuracy, 0.5)

    def test_inclusivity_index(self):
        self.assertEqual(global_inclusivity_index(self.SENTENCE_SCORES), 0.5)
|
||
|
||
if __name__ == '__main__':
    # Run the INES evaluation test suite when this file is executed directly.
    unittest.main()