Integrated various German tasks #97

Closed
wants to merge 4 commits into from
8 changes: 8 additions & 0 deletions lm_eval/tasks/opengptx/all_tasks_registry.py
@@ -1,15 +1,19 @@
# OpenGPT-X tasks
from . import arc_de
from . import german_europarl_ppl
from . import german_ler_ppl
from . import germanquad
from . import germeval2017
from . import germeval2018
from . import gnad10
from . import hellaswag_de
from . import hendrycks_test_de
from . import mlqa
from . import mlsum
from . import oscar_ppl
from . import pawsx
from . import stereoset
from . import truthfulqa_de
from . import wino_x
from . import xcsr
from . import xlwic
@@ -20,19 +24,23 @@

TASK_REGISTRY_TMP = {
# OpenGPT-X tasks
"arc_challenge_de": arc_de.ARCChallengeDE,
"german_europarl_ppl": german_europarl_ppl.GermanEuroparlPerplexity,
"german_ler_ppl": german_ler_ppl.GermanLERPerplexity,
"germanquad": germanquad.GermanQuAD,
"germeval2017": germeval2017.GermEval2017,
"germeval2018_coarse": germeval2018.GermEval2018,
"germeval2018_fine": germeval2018.GermEval2018_fine,
"gnad10": gnad10.GNAD10,
"hellaswag_de": hellaswag_de.HellaSwagDE,
**hendrycks_test_de.create_all_tasks(),
**mlqa.construct_tasks(),
**mlsum.construct_tasks(),
"oscar_ppl_de": oscar_ppl.OscarPerplexityGerman,
**pawsx.construct_tasks(),
**stereoset.construct_tasks(),
**xcsr.construct_tasks(),
"truthful_qa_de": truthfulqa_de.TruthfulQADEMultipleChoice,
"wino_de": wino_x.WinograndeXDe,
"xlwic_de": xlwic.WordsInContextDe,
"xlwic_it": xlwic.WordsInContextIt,
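The registry above maps a task name either to a single task class (e.g. `arc_challenge_de`) or, via the `construct_tasks()`/`create_all_tasks()` helpers, to a whole dictionary of per-language tasks. Below is a minimal sketch of how a name could be resolved against such a mapping; `resolve_task` and `SKETCH_REGISTRY` are hypothetical names for illustration, not part of this PR or the harness.

```python
# Minimal sketch (not part of this PR): looking up and instantiating a task
# from a name-to-class mapping shaped like TASK_REGISTRY_TMP above.
# `resolve_task` is a hypothetical helper; the harness has its own lookup code.
from lm_eval.tasks.opengptx import arc_de, hellaswag_de

SKETCH_REGISTRY = {
    "arc_challenge_de": arc_de.ARCChallengeDE,
    "hellaswag_de": hellaswag_de.HellaSwagDE,
}


def resolve_task(name):
    """Return an instantiated task for `name`, or raise KeyError if unknown."""
    return SKETCH_REGISTRY[name]()


# task = resolve_task("arc_challenge_de")   # instantiating loads the HF dataset
# print(next(iter(task.validation_docs()))["query"])
```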
69 changes: 69 additions & 0 deletions lm_eval/tasks/opengptx/arc_de.py
@@ -0,0 +1,69 @@
"""
Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
https://arxiv.org/pdf/1803.05457.pdf
The ARC dataset consists of 7,787 science exam questions drawn from a variety
of sources, including science questions provided under license by a research
partner affiliated with AI2. These are text-only, English language exam questions
that span several grade levels as indicated in the files. Each question has a
multiple choice structure (typically 4 answer options). The questions are sorted
into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and
a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.
Homepage: https://allenai.org/data/arc
NOTE: This German version is lifted without change from
https://github.com/bjoernpl/lm-evaluation-harness-de/blob/mmlu_de/lm_eval/tasks/arc_de.py.
"""
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@article{Clark2018ThinkYH,
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
journal={ArXiv},
year={2018},
volume={abs/1803.05457}
}
"""


class ARCChallengeDE(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "bjoernp/arc_challenge_de"
DATASET_NAME = None

def has_training_docs(self):
return False

def has_validation_docs(self):
return True

def has_test_docs(self):
return True

def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])

def test_docs(self):
return map(self._process_doc, self.dataset["test"])

def _process_doc(self, doc):
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
out_doc = {
"id": doc["id"],
"query": "Frage: " + doc["question_de"] + "\nAntwort:",
"choices": doc["choices_de"]["text"],
"gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
}
return out_doc

def doc_to_text(self, doc):
return doc["query"]

def should_decontaminate(self):
return True

def doc_to_decontamination_query(self, doc):
return doc["query"]
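To make the answer-key normalization in `_process_doc` concrete, here is a self-contained sketch of the same transformation applied to a made-up record; the field values are invented for illustration, only the field names mirror the dataset schema.

```python
# Standalone illustration of ARCChallengeDE._process_doc above; the record
# below is invented, only its field names mirror the dataset schema.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

doc = {
    "id": "example-001",
    "question_de": "Welches Gas nehmen Pflanzen bei der Photosynthese auf?",
    "choices_de": {"text": ["Sauerstoff", "Kohlendioxid", "Stickstoff", "Wasserstoff"]},
    "answerKey": "2",  # numeric answer key, as in some dataset records
}

# Map numeric keys back to letters, then build the query/choices/gold triple.
answer_key = num_to_letter.get(doc["answerKey"], doc["answerKey"])  # -> "B"
out_doc = {
    "id": doc["id"],
    "query": "Frage: " + doc["question_de"] + "\nAntwort:",
    "choices": doc["choices_de"]["text"],
    "gold": ["A", "B", "C", "D", "E"].index(answer_key),  # -> 1
}
print(out_doc["choices"][out_doc["gold"]])  # Kohlendioxid
```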
79 changes: 79 additions & 0 deletions lm_eval/tasks/opengptx/hellaswag_de.py
@@ -0,0 +1,79 @@
"""
HellaSwag: Can a Machine Really Finish Your Sentence?
https://arxiv.org/pdf/1905.07830.pdf

Hellaswag is a commonsense inference challenge dataset. Though its questions are
trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is
achieved via Adversarial Filtering (AF), a data collection paradigm wherein a
series of discriminators iteratively select an adversarial set of machine-generated
wrong answers. AF proves to be surprisingly robust. The key insight is to scale up
the length and complexity of the dataset examples towards a critical 'Goldilocks'
zone wherein generated text is ridiculous to humans, yet often misclassified by
state-of-the-art models.

Homepage: https://rowanzellers.com/hellaswag/
NOTE: This German version is lifted without change from
https://github.com/bjoernpl/lm-evaluation-harness-de/blob/mmlu_de/lm_eval/tasks/hellaswag_de.py.
"""
import re
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
"""


class HellaSwagDE(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "bjoernp/hellaswag_de"
DATASET_NAME = None

def has_training_docs(self):
return True

def has_validation_docs(self):
return True

def has_test_docs(self):
return False

def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs

def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])

def _process_doc(self, doc):
ctx = doc["ctx_de"]
out_doc = {
"query": self.preprocess(doc["activity_label_de"] + ": " + ctx),
"choices": [self.preprocess(ending) for ending in doc["endings_de"]],
"gold": int(doc["label"]),
}
return out_doc

@classmethod
def preprocess(cls, text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace("  ", " ")
return text

def doc_to_text(self, doc):
return doc["query"]

def should_decontaminate(self):
return True

def doc_to_decontamination_query(self, doc):
return doc["query"]
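A quick standalone check of the bracket cleanup in `preprocess` above, run on an invented string with WikiHow-style `[title]`/`[step]` markers:

```python
import re


# Same cleanup steps as HellaSwagDE.preprocess above, applied to an
# invented example string with WikiHow-style "[title]"/"[step]" artifacts.
def preprocess(text):
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
    return text


sample = "Wie man Brot backt [title] Mehl und Wasser mischen [step] Den Teig kneten"
print(preprocess(sample))
# -> "Wie man Brot backt. Mehl und Wasser mischen Den Teig kneten"
```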