From dbf2a444df57f2920723b335d07459e5a78f39d6 Mon Sep 17 00:00:00 2001
From: nmowmita
Date: Wed, 14 Feb 2024 11:50:01 +0100
Subject: [PATCH 01/11] task crowspairs de

---
 lm_eval/tasks/__init__.py     |   5 ++
 lm_eval/tasks/crowspairsde.py | 138 ++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 lm_eval/tasks/crowspairsde.py

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 06cab49b9f..59628331be 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -69,6 +69,9 @@
 from .mlmm import multilingual_mmlu
 from .mlmm import multilingual_truthfulqa

+from . import crowspairsde
+
+
 ########################################
 # Translation tasks
@@ -101,6 +104,8 @@

 TASK_REGISTRY = {
+    "crowspairsde": crowspairsde.CrowsPairsDE,
+
     # GLUE
     "cola": glue.CoLA,
     "mnli": glue.MNLI,
diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py
new file mode 100644
index 0000000000..10604240ff
--- /dev/null
+++ b/lm_eval/tasks/crowspairsde.py
@@ -0,0 +1,138 @@
+
"""
CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
https://aclanthology.org/2020.emnlp-main.154/


CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency
to generate biased outputs. CrowS-Pairs comes in two languages, and the English subset has
a newer version that fixes some issues with the original version.

Homepage: https://github.com/nyu-mll/crows-pairs
"""

from lm_eval.base import rf, Task
from lm_eval.metrics import mean

_CITATION = """
@inproceedings{nangia-etal-2020-crows,
    title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
    author = "Nangia, Nikita and
      Vania, Clara and
      Bhalerao, Rasika and
      Bowman, Samuel R.",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.emnlp-main.154",
    doi = "10.18653/v1/2020.emnlp-main.154",
    pages = "1953--1967",
    abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
}
"""


class CrowsPairsDE(Task):
    VERSION = 0
    DATASET_PATH = "lamarr-org/crows_pairs_de"
    DATASET_NAME = None
    BIAS_TYPE = None

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None):
        super().__init__()
        self.download(data_dir, cache_dir, download_mode)
        self._training_docs = None
        self._fewshot_docs = None
        self.non_unknown_outputs = 0
        self.biased_answers = 0
        self.final_accuracy = None
        self.bias_disambiguate = None
    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            test_dataset = self.dataset["test"]
            if self.BIAS_TYPE is not None:
                test_dataset = test_dataset.filter(
                    lambda example: example["bias_type"].startswith(self.BIAS_TYPE)
                )
            return test_dataset

    def fewshot_context(
        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
    ):
        # CrowS-Pairs is evaluated zero-shot; few-shot examples are not supported.
        assert num_fewshot == 0
        assert (
            rnd is not None
        ), "A `random.Random` generator argument must be provided to `rnd`"
        assert not provide_description, (
            "The `provide_description` arg will be removed in future versions. To prepend "
            "a custom description to the context, supply the corresponding string via the "
            "`description` arg."
        )
        if provide_description is not None:
            # nudge people to not specify it at all
            print(
                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
            )

        return ""

    def test_docs(self):
        return self.dataset["test"]

    def _process_doc(self, doc):
        return doc

    def doc_to_text(self, doc):
        return ""

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        # Use the same dataset fields that construct_requests scores below.
        return doc["sent_more"] + " " + doc["sent_less"]

    def doc_to_target(self, doc):
        return " "

    def construct_requests(self, doc, ctx):
        assert not ctx

        # Calculate the loglikelihood for the more and the less stereotypical sentence. 
+ # Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token + return [ + rf.loglikelihood("", doc["sent_more"]), + rf.loglikelihood("", doc["sent_less"]), + ] + + def process_results(self, doc, results): + likelihood1, likelihood2 = results + + # Calculate the absolute difference in loglikelihoods + diff = abs(likelihood1[0] - likelihood2[0]) + + # if stereotypical sentence more likely (loglikelihood higher) + # treat this as predicting stereotyped sentence + acc = 1.0 if likelihood1[0] > likelihood2[0] else 0.0 + + return {"likelihood_difference": diff, "pct_stereotype": acc} + + def aggregation(self): + return {"likelihood_difference": mean, "pct_stereotype": mean} + + def higher_is_better(self): + # For all metrics lower is better + return {"likelihood_difference": False, "pct_stereotype": False} From 0298a15031850b3ae5dea252f5818f071bd64444 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Wed, 14 Feb 2024 16:35:32 +0100 Subject: [PATCH 02/11] error line fix --- lm_eval/tasks/crowspairsde.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 10604240ff..3c2e3c34f2 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -49,6 +49,7 @@ def __init__(self, data_dir=None, cache_dir=None, download_mode=None): self.biased_answers = 0 self.final_accuracy = None self.bias_disambiguate = None + def has_training_docs(self): return True From d80daa0b89af83ea920c574d491f93b00b131f18 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Wed, 14 Feb 2024 16:39:11 +0100 Subject: [PATCH 03/11] error line fix --- lm_eval/tasks/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 59628331be..ae96893e4e 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -70,13 +70,9 @@ from .mlmm import multilingual_truthfulqa from . import crowspairsde - - - ######################################## # Translation tasks ######################################## - # 6 total gpt3_translation_benchmarks = { "wmt14": ["en-fr", "fr-en"], # French From 96dda902d077946f11fae65acd4bbb3dd92db293 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:06:07 +0100 Subject: [PATCH 04/11] error line fix --- lm_eval/tasks/crowspairsde.py | 36 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 3c2e3c34f2..392eeb4511 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -14,24 +14,21 @@ from lm_eval.base import rf, Task from lm_eval.metrics import mean -_CITATION = """ -@inproceedings{nangia-etal-2020-crows, - title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models", - author = "Nangia, Nikita and - Vania, Clara and - Bhalerao, Rasika and - Bowman, Samuel R.", - booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", - month = nov, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2020.emnlp-main.154", - doi = "10.18653/v1/2020.emnlp-main.154", - pages = "1953--1967", - abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. 
However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", -} -""" +_CITATION = """@inproceedings{nangia-etal-2020-crows, title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring +Social Biases in Masked Language Models", author = "Nangia, Nikita and Vania, Clara and Bhalerao, Rasika and +Bowman, Samuel R.", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language +Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational +Linguistics", url = "https://aclanthology.org/2020.emnlp-main.154", doi = "10.18653/v1/2020.emnlp-main.154", +pages = "1953--1967", abstract = "Pretrained language models, especially masked language models (MLMs) have seen +success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are +undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To +measure some forms of social bias in language models against protected demographic groups in the US, we introduce the +Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing +with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one +that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically +disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we +evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building +less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ class CrowsPairsDE(Task): @@ -86,7 +83,8 @@ def fewshot_context( if provide_description is not None: # nudge people to not specify it at all print( - "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" + "WARNING: provide_description is deprecated and will be removed in a future version in favor of " + "description_dict " ) return "" From 2bf4c39d786f084dfdf7985ae49aeaa76de9c941 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:09:37 +0100 Subject: [PATCH 05/11] error line fix --- lm_eval/tasks/__init__.py | 2 +- lm_eval/tasks/crowspairsde.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index ae96893e4e..20167ee52e 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -70,6 +70,7 @@ from .mlmm import multilingual_truthfulqa from . import crowspairsde + ######################################## # Translation tasks ######################################## @@ -101,7 +102,6 @@ TASK_REGISTRY = { "crowspairsde": crowspairsde.CrowsPairsDE, - # GLUE "cola": glue.CoLA, "mnli": glue.MNLI, diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 392eeb4511..3478c80b9c 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -1,4 +1,3 @@ - """ CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models https://aclanthology.org/2020.emnlp-main.154/ From 329ea57c47bfd7c71a6e3385f2f9c7c58d88c623 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:14:05 +0100 Subject: [PATCH 06/11] error line fix --- lm_eval/tasks/crowspairsde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 3478c80b9c..68c0237385 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -22,7 +22,7 @@ success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the -Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing +Crowdsources Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we From 9cf8ff786d2cbcbabebd07e7b72d68b836c4cfa9 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:15:24 +0100 Subject: [PATCH 07/11] error line fix --- lm_eval/tasks/crowspairsde.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 68c0237385..8c70686323 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -26,8 +26,7 @@ with nine types of bias, like race, religion, and age. 
In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we -evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building -less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ +evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ class CrowsPairsDE(Task): From ab95f5cfad1dd92ecad01af2b7b0f0530c8e7e95 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:17:37 +0100 Subject: [PATCH 08/11] error line fix --- lm_eval/tasks/crowspairsde.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 8c70686323..68c0237385 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -26,7 +26,8 @@ with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we -evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ +evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building +less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ class CrowsPairsDE(Task): From 305156647e65f12c1e516c219deb2f57c163e668 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:18:59 +0100 Subject: [PATCH 09/11] error line fix --- lm_eval/tasks/crowspairsde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 68c0237385..df100ff7c5 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -26,7 +26,7 @@ with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we -evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building +evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ From e3efb390ca8b6adbc8e6d79ed830483cb285d277 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:20:33 +0100 Subject: [PATCH 10/11] error line fix --- lm_eval/tasks/crowspairsde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index df100ff7c5..68c0237385 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -26,7 +26,7 @@ with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we -evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building +evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ From 4e4aaae91fa82a5ffc2411a3f6e15feab4474891 Mon Sep 17 00:00:00 2001 From: nmowmita Date: Fri, 16 Feb 2024 17:25:03 +0100 Subject: [PATCH 11/11] error line fix --- lm_eval/tasks/crowspairsde.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/lm_eval/tasks/crowspairsde.py b/lm_eval/tasks/crowspairsde.py index 68c0237385..cb14a828cf 100644 --- a/lm_eval/tasks/crowspairsde.py +++ b/lm_eval/tasks/crowspairsde.py @@ -13,21 +13,24 @@ from lm_eval.base import rf, Task from lm_eval.metrics import mean -_CITATION = """@inproceedings{nangia-etal-2020-crows, title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring -Social Biases in Masked Language Models", author = "Nangia, Nikita and Vania, Clara and Bhalerao, Rasika and -Bowman, Samuel R.", booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language -Processing (EMNLP)", month = nov, year = "2020", address = "Online", publisher = "Association for Computational -Linguistics", url = "https://aclanthology.org/2020.emnlp-main.154", doi = "10.18653/v1/2020.emnlp-main.154", -pages = "1953--1967", abstract = "Pretrained language models, especially masked language models (MLMs) have seen -success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are -undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To -measure some forms of social bias in language models against protected demographic groups in the US, we introduce the -Crowdsources Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing -with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one -that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically -disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we -evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. 
As work on building -less biased models advances, this dataset can be used as a benchmark to evaluate progress.", } """ +_CITATION = """ +@inproceedings{nangia-etal-2020-crows, + title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models", + author = "Nangia, Nikita and + Vania, Clara and + Bhalerao, Rasika and + Bowman, Samuel R.", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + month = nov, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.emnlp-main.154", + doi = "10.18653/v1/2020.emnlp-main.154", + pages = "1953--1967", + abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.", +} +""" class CrowsPairsDE(Task):