[!133][GENDER] Add INES evaluation script
# Why is the change needed?

INES_eval.py is a new feature: an evaluation script associated with the new de-en resource for MT, INES.
It calculates gender inclusivity and term coverage.
To work, it requires the [INES corpus](https://drive.google.com/drive/u/1/folders/15c3ZW9gxbCKZnvHf3nOulQ_0Ke0bz5ti) as a .tsv definition file and the model output as a .txt file.
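For reference, a minimal sketch of the definition-file layout the script reads ('ID' and 'EVAL-TERMS-en' are the column names used in the code; the row below is illustrative, with the inclusive and not_inclusive terms separated by ';'):

```
ID	EVAL-TERMS-en
1	average person;average man
```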

# What changes does the patch introduce?
This is a new feature, so all of the code introduced by the patch is new.

# How was this patch tested?
Added UTs.

For a realistic scenario, it was tested on the (tokenized) [output](https://drive.google.com/file/d/1ctwpf_6MQe3_fHWac1xm1r4Pk97h19rc/view?usp=sharing) of an MT model.
Sentence-level scores, and hence global scores, were manually checked to confirm the correct functioning of INES_eval.py.

Also, the following output sentences and corresponding definition file were created as a stress test on difficult positive and negative cases (a minimal sketch of these cases follows the list).
These include:

- cases of non-consecutive inclusive/not_inclusive tokens, which must not be matched as inclusive/not_inclusive terms.
- cases where both the inclusive and the not_inclusive term are matched.
- cases where the inclusive or not_inclusive term appears twice, and should be matched only once.
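
A minimal sketch of how the matcher treats these cases (it mirrors the unit tests below and assumes the repository root is on `PYTHONPATH`; the sentences are illustrative, not taken from the linked files):

```python
# full_match is the subsequence matcher used by INES_eval.py (terms are lowercased before matching).
from examples.speech_to_text.scripts.gender.INES_eval import full_match

# Both terms present: the sentence counts once as inclusive and once as not_inclusive.
hyp = "the average person is an average man .".split()
print(full_match(hyp, "average person".split()))  # True
print(full_match(hyp, "average man".split()))     # True

# Non-consecutive tokens: "average ... person" is not matched as "average person".
hyp = "i am the average male , while you are a good person .".split()
print(full_match(hyp, "average person".split()))  # False
```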

[AIRC.de-en_tok-stress.txt](/uploads/e29def8c2c2af64b5ad663f3436efd5b/AIRC.de-en_tok-stress.txt)

[INES-stress.tsv](/uploads/755ce38fe9cc9f6f146cc62ba8569338/INES-stress.tsv)
bsavoldi authored and mgaido91 committed Oct 18, 2023
1 parent 5316873 commit 35dc97b
Showing 2 changed files with 372 additions and 0 deletions.
190 changes: 190 additions & 0 deletions examples/speech_to_text/scripts/gender/INES_eval.py
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
# Copyright 2023 FBK

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import argparse
import csv
from collections import namedtuple
import os
import logging


InesAccuracy = namedtuple("InesAccuracy", ["term_coverage", "inclusivity_accuracy"])

logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
)
LOGGER = logging.getLogger("INES_eval")


def full_match(generated_terms, eval_tokens):
    # Check if the sequence of eval tokens fully matches a subsequence in generated terms
    for i in range(len(generated_terms) - len(eval_tokens) + 1):
        if generated_terms[i:i + len(eval_tokens)] == eval_tokens:
            return True
    return False


def sentence_level_scores(in_f, tsv_f):
    # Calculate sentence-level scores
    sentences = []
    with open(in_f) as i_f, open(tsv_f) as t_f:
        tsv_reader = csv.DictReader(t_f, delimiter='\t')
        for (i_line, terms_f) in zip(i_f, tsv_reader):
            sentence_inclusive = 0
            sentence_not_inclusive = 0
            sentence_found = 0
            generated_terms = i_line.strip().lower().split()
            eval_terms = terms_f['EVAL-TERMS-en'].strip().lower().split(";")
            inclusive_term = eval_terms[0]
            not_inclusive_term = eval_terms[1]

            inclusive_tokens = inclusive_term.split()
            not_inclusive_tokens = not_inclusive_term.split()

            found_inclusive = full_match(generated_terms, inclusive_tokens)
            found_not_inclusive = full_match(generated_terms, not_inclusive_tokens)

            if found_inclusive:
                sentence_inclusive += 1
            if found_not_inclusive:
                sentence_not_inclusive += 1
            if found_inclusive or found_not_inclusive:
                sentence_found += 1
            # check if both inclusive and not_inclusive are matched
            if found_inclusive and found_not_inclusive:
                line_number = terms_f['ID']
                LOGGER.info(f"Both inclusive and not inclusive terms found at line {line_number}: "
                            f"'{i_line.strip()}'")

            sentences.append({
                "num_terms_found": sentence_found,
                "num_inclusive": sentence_inclusive,
                "num_not_inclusive": sentence_not_inclusive
            })

        # asserting both files have been completed
        assert next(i_f, None) is None and next(t_f, None) is None, \
            "INES TSV and hypothesis should have the same length"
    return sentences


def write_sentence_scores(out_f, sentence_scores):
    # Write sentence-level scores to a file
    with open(out_f, 'w') as f_w:
        writer = csv.DictWriter(
            f_w, ["num_terms_found", "num_inclusive", "num_not_inclusive"], delimiter='\t')
        writer.writeheader()
        writer.writerows(sentence_scores)


def global_inclusivity_index(sentence_scores):
    # Global inclusivity index: 1 minus the proportion of generated not_inclusive terms
    tot_terms = len(sentence_scores)
    tot_not_inclusive = 0

    for score in sentence_scores:
        tot_not_inclusive += score["num_not_inclusive"]

    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    return 1 - float(tot_not_inclusive) / tot_terms


def global_accuracy(sentence_scores):
    # Calculate global evaluation scores for term coverage and inclusivity accuracy
    tot_terms = len(sentence_scores)
    if tot_terms == 0:
        raise Exception("Cannot evaluate with empty INES TSV")
    tot_found = 0
    tot_inclusive = 0
    tot_not_inclusive = 0

    for score in sentence_scores:
        tot_found += score["num_terms_found"]
        tot_inclusive += score["num_inclusive"]
        tot_not_inclusive += score["num_not_inclusive"]

    term_cov = tot_found / tot_terms
    if tot_inclusive + tot_not_inclusive > 0:
        inclusivity_acc = tot_inclusive / (tot_inclusive + tot_not_inclusive)
    else:
        inclusivity_acc = 0.0
    overall_scores = InesAccuracy(term_cov, inclusivity_acc)

    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug("Evaluated {} sentences...".format(len(sentence_scores)))
        LOGGER.debug("Global: all->{}; found->{}; inclusive->{}; not_inclusive->{}".format(
            tot_terms, tot_found, tot_inclusive, tot_not_inclusive))

    return overall_scores


def print_index_scores(out_scores):
    # Print global evaluation scores for inclusivity index
    print("Global Inclusivity Index")
    print("------------------------")
    print("{}".format(out_scores))


def print_acc_scores(out_scores):
    # Print global evaluation scores
    print("Term Coverage\tInclusivity Accuracy")
    print("-------------------------------------------------")
    print("{}\t{}".format(out_scores.term_coverage, out_scores.inclusivity_accuracy))


if __name__ == '__main__':
    """
    Script for the evaluation of gender-inclusive language in MT on INES.
    Given pairs of target inclusive/not-inclusive terms, the evaluation
    script calculates:
     - *inclusivity-index*, as 1 minus the proportion of not_inclusive terms
       generated by a system. The lower that proportion, the higher the
       index and the level of inclusivity.
    As complementary metrics, given the --acc-scores argument, the script
    can also return:
     - *term coverage*, as the proportion of either inclusive/not-inclusive
       terms generated by a system.
     - *inclusivity accuracy*, as the proportion of desirable inclusive terms
       among all inclusive/not-inclusive terms generated by a system.
    Example usage:
        python INES_eval.py --input your_MT_output.txt --tsv-definition INES.tsv
    Version: 1.0
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, type=str, metavar='FILE',
                        help='Input file to be used to compute scores (it must be tokenized).')
    parser.add_argument('--tsv-definition', required=True, type=str, metavar='FILE',
                        help='TSV INES definitions file.')
    parser.add_argument('--sentence-scores', required=False, default=None, type=str, metavar='FILE',
                        help='If set, sentence level scores are written into this file.')
    parser.add_argument('--acc-scores', required=False, action='store_true', default=False,
                        help='If set, print global accuracy and term coverage.')

    args = parser.parse_args()

    sl_scores = sentence_level_scores(args.input, args.tsv_definition)
    if args.sentence_scores:
        write_sentence_scores(args.sentence_scores, sl_scores)
    scores = global_inclusivity_index(sl_scores)
    print_index_scores(scores)
    if args.acc_scores:
        accuracy_scores = global_accuracy(sl_scores)
        print_acc_scores(accuracy_scores)
182 changes: 182 additions & 0 deletions fbk_uts/scripts/test_ines_eval.py
@@ -0,0 +1,182 @@
# Copyright 2023 FBK
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import unittest
import tempfile

from examples.speech_to_text.scripts.gender.INES_eval import \
    sentence_level_scores, global_inclusivity_index, global_accuracy


class InesEvalTestCase(unittest.TestCase):

    def test_sentence_level_scores_base_not_inclusive(self):
        hypos = ["The average man spends about eight hours a day with sleep ."]
        # Create temporary files so that the INES_eval functions can read the inputs by path
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 0,
                "num_not_inclusive": 1})

    def test_sentence_level_scores_base_inclusive(self):
        hypos = ["The average person spends about eight hours a day with sleep ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 1,
                "num_not_inclusive": 0})

    def test_sentence_level_scores_partial_match(self):
        hypos = ["I am the average male ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 0,
                "num_inclusive": 0,
                "num_not_inclusive": 0})

    def test_sentence_level_scores_inconsecutive_tokens(self):
        hypos = ["I am the average male , while you are a good person ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 0,
                "num_inclusive": 0,
                "num_not_inclusive": 0})

    def test_sentence_level_scores_consecutive_tokens(self):
        hypos = ["The average dog for the average man ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 0,
                "num_not_inclusive": 1})

    def test_sentence_level_scores_both_term_in(self):
        hypos = ["The average person is an average man ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 1,
                "num_not_inclusive": 1})

    def test_sentence_level_scores_both_term_ni(self):
        hypos = ["The average man is an average person ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 1,
                "num_not_inclusive": 1})

    def test_sentence_level_scores_duplicate_term(self):
        hypos = ["The average person is an average person ."]
        with tempfile.NamedTemporaryFile(mode="w") as hypos_file, \
                tempfile.NamedTemporaryFile(mode="w") as tsv_file:
            # here write the hypos to hypos_file and the tsv_file
            hypos_file.write('\n'.join(hypos))
            tsv_file.write("ID\tEVAL-TERMS-en\n1\taverage person;average man")
            hypos_file.flush()
            tsv_file.flush()
            out = sentence_level_scores(hypos_file.name, tsv_file.name)
            self.assertDictEqual(out[0], {
                "num_terms_found": 1,
                "num_inclusive": 1,
                "num_not_inclusive": 0})

    def test_global_inclusivity_index_empty(self):
        with self.assertRaises(Exception) as e:
            out = global_inclusivity_index([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy_empty(self):
        with self.assertRaises(Exception) as e:
            out = global_accuracy([])
        self.assertEqual(str(e.exception), "Cannot evaluate with empty INES TSV")

    def test_global_accuracy(self):
        sentence_scores = [
            {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0},
            {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
            {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
            {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0}]
        global_score = global_accuracy(sentence_scores)
        self.assertEqual(global_score.term_coverage, 0.75)
        self.assertEqual(global_score.inclusivity_accuracy, 0.5)

    def test_inclusivity_index(self):
        sentence_scores = [
            {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0},
            {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
            {"num_terms_found": 0, "num_inclusive": 0, "num_not_inclusive": 0},
            {"num_terms_found": 1, "num_inclusive": 0, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 1},
            {"num_terms_found": 1, "num_inclusive": 1, "num_not_inclusive": 0}]
        global_score = global_inclusivity_index(sentence_scores)
        self.assertEqual(global_score, 0.5)


if __name__ == '__main__':
    unittest.main()
