OpenGPTX · jjbuschhoff · Jan 10, 2024 · Jan 10, 2024 · Jan 11, 2024 · Jan 10, 2024
diff --git a/lm_eval/tasks/opengptx/all_tasks_registry.py b/lm_eval/tasks/opengptx/all_tasks_registry.py
@@ -1,16 +1,21 @@
 # OpenGPT-X tasks
 from . import flores200
+from . import arcx
 from . import german_europarl_ppl
 from . import german_ler_ppl
 from . import germanquad
 from . import germeval2017
 from . import germeval2018
+from . import hellaswagx
 from . import gnad10
+from . import gsm8kx
 from . import mlqa
 from . import mlsum
+from . import mmlux
 from . import oscar_ppl
 from . import pawsx
 from . import stereoset
+from . import truthfulqax
 from . import wino_x
 from . import xcsr
 from . import xlwic
@@ -24,18 +29,23 @@
 
 TASK_REGISTRY_TMP = {
     # OpenGPT-X tasks
+    **arcx.construct_all_tasks(),
     "german_europarl_ppl": german_europarl_ppl.GermanEuroparlPerplexity,
     "german_ler_ppl": german_ler_ppl.GermanLERPerplexity,
     "germanquad": germanquad.GermanQuAD,
     "germeval2017": germeval2017.GermEval2017,
     "germeval2018_coarse": germeval2018.GermEval2018,
     "germeval2018_fine": germeval2018.GermEval2018_fine,
+    **hellaswagx.construct_all_tasks(),
     "gnad10": gnad10.GNAD10,
+    **gsm8kx.construct_all_tasks(),
     **mlqa.construct_tasks(),
     **mlsum.construct_tasks(),
+    **mmlux.create_all_tasks(),
     "oscar_ppl_de": oscar_ppl.OscarPerplexityGerman,
     **pawsx.construct_tasks(),
     **stereoset.construct_tasks(),
+    **truthfulqax.construct_all_tasks(),
     **xcsr.construct_tasks(),
     "wino_de": wino_x.WinograndeXDe,
     "xlwic_de": xlwic.WordsInContextDe,

diff --git a/lm_eval/tasks/opengptx/arcx.py b/lm_eval/tasks/opengptx/arcx.py
@@ -0,0 +1,176 @@
+"""
+Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
+https://arxiv.org/pdf/1803.05457.pdf
+
+The ARC dataset consists of 7,787 science exam questions drawn from a variety
+of sources, including science questions provided under license by a research
+partner affiliated with AI2. These are text-only, English language exam questions
+that span several grade levels as indicated in the files. Each question has a
+multiple choice structure (typically 4 answer options). The questions are sorted
+into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and
+a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.
+
+This module loads the `openGPT-X/arcx` dataset and defines one task per
+language code in `LANGS` and per difficulty split ("easy", "challenge").
+
+Homepage: https://allenai.org/data/arc
+"""
+from lm_eval.base import MultipleChoiceTask
+
+
+_CITATION = """
+@article{Clark2018ThinkYH,
+  title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+  author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+  journal={ArXiv},
+  year={2018},
+  volume={abs/1803.05457}
+}
+"""
+# Language codes; each is combined with a split to form the dataset
+# configuration name and, lower-cased, the task name.
+LANGS = [
+    "BG",
+    "DA",
+    "DE",
+    "ET",
+    "FI",
+    "FR",
+    "EL",
+    "IT",
+    "LV",
+    "LT",
+    "NL",
+    "PL",
+    "PT-PT",
+    "RO",
+    "SV",
+    "SK",
+    "SL",
+    "ES",
+    "CS",
+    "HU",
+]
+
+# Maps a language code to its (question word, answer word) pair, which
+# `ARCBase._process_doc` uses to label the prompt.
+# NOTE(review): "Vastaa" (FI) looks like a verb form; the noun would be
+# "Vastaus". Left unchanged because editing it changes the prompt -- confirm.
+PROMPT_WORDS = {
+    "BG": ("Въпрос", "Отговор"),
+    "DA": ("Spørgsmål", "Svar"),
+    "DE": ("Frage", "Antwort"),
+    "ET": ("Küsimus", "Vastus"),
+    "FI": ("Kysymys", "Vastaa"),
+    "FR": ("Question", "Réponse"),
+    "EL": ("Ερώτηση", "Απάντηση"),
+    "IT": ("Domanda", "Risposta"),
+    "LV": ("Jautājums", "Atbilde"),
+    "LT": ("Klausimas", "Atsakymas"),
+    "NL": ("Vraag", "Antwoord"),
+    "PL": ("Pytanie", "Odpowiedź"),
+    "PT-PT": ("Questão", "Resposta"),
+    "RO": ("Întrebare", "Răspuns"),
+    "SV": ("Fråga", "Svar"),
+    "SK": ("Otázka", "Odpoveď"),
+    "SL": ("Vprašanje", "Odgovor"),
+    "ES": ("Pregunta", "Respuesta"),
+    "CS": ("Otázka", "Odpověď"),
+    "HU": ("Kérdés", "Válasz"),
+}
+
+
+def construct_task(lang: str, split: str):
+    """Create the ARC task class for one language and one difficulty split.
+
+    :param lang: str
+        Language code such as "DE"; selects the prompt words and forms the
+        suffix of the dataset configuration name.
+    :param split: str
+        Difficulty split; forms the prefix of the dataset configuration name.
+    :returns:
+        A new subclass of `ARCBase` with `QWORD`, `RWORD` and `DATASET_NAME` set.
+    """
+
+    class ARC(ARCBase):
+        # Codes missing from PROMPT_WORDS fall back to the English words.
+        QWORD, RWORD = PROMPT_WORDS.get(lang, ("Question", "Answer"))
+        DATASET_NAME = f"{split}_{lang}"
+
+    return ARC
+
+
+def construct_all_tasks():
+    """Return a dict mapping `arcx_<split>_<lang>` task names to task classes."""
+    return {
+        f"arcx_{split}_{lang.lower()}": construct_task(lang, split)
+        for lang in LANGS
+        for split in ["easy", "challenge"]
+    }
+
+
+class ARCBase(MultipleChoiceTask):
+    """Common implementation shared by every ARC language/split task.
+
+    `DATASET_NAME`, `QWORD` and `RWORD` are placeholders here; the subclasses
+    built by `construct_task` supply the real values.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "openGPT-X/arcx"
+    DATASET_NAME = None
+    NUM_FEW_SHOT = 25
+    QWORD, RWORD = None, None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        # Processed once and cached on the instance; later calls reuse the list.
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
+        """Turn a raw dataset row into the multiple-choice document format.
+
+        :param doc:
+            Row providing "id", "question", "choices" (with a "text" entry)
+            and "answerKey".
+        :returns:
+            A dict with "id", "query", "choices" and "gold", where "gold" is
+            the zero-based position of the answer letter within A-E.
+        """
+        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
+        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
+        num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
+        # Keep the normalised key in a local so the input row is not modified.
+        answer_key = num_to_letter.get(doc["answerKey"], doc["answerKey"])
+        out_doc = {
+            "id": doc["id"],
+            "query": self.QWORD + ": " + doc["question"] + f"\n{self.RWORD}:",
+            "choices": doc["choices"]["text"],
+            "gold": ["A", "B", "C", "D", "E"].index(answer_key),
+        }
+        return out_doc
+
+    def doc_to_text(self, doc):
+        return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
diff --git a/lm_eval/tasks/opengptx/gsm8kx.py b/lm_eval/tasks/opengptx/gsm8kx.py
@@ -0,0 +1,229 @@
+"""
+"Training Verifiers to Solve Math Word Problems"
+https://arxiv.org/abs/2110.14168
+
+State-of-the-art language models can match human performance on many tasks, but
+they still struggle to robustly perform multi-step mathematical reasoning. To
+diagnose the failures of current models and support research, we introduce GSM8K,
+a dataset of 8.5K high quality linguistically diverse grade school math word problems.
+We find that even the largest transformer models fail to achieve high test performance,
+despite the conceptual simplicity of this problem distribution.
+
+NOTE: See the official implementation of the task:
+    https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
+for how to make use of the dataset's calculator annotations in your language
+model's sample/generation function.
+
+This module loads the `openGPT-X/gsm8kx` dataset and defines one task per
+language code in `LANGS`.
+
+Homepage: https://github.com/openai/grade-school-math
+"""
+import re
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
+
+
+_CITATION = """
+@misc{cobbe2021training,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+"""
+
+# Language codes; each is used as the dataset configuration name and,
+# lower-cased, as the suffix of the task name.
+LANGS = [
+    "BG",
+    "DA",
+    "DE",
+    "ET",
+    "FI",
+    "FR",
+    "EL",
+    "IT",
+    "LV",
+    "LT",
+    "NL",
+    "PL",
+    "PT-PT",
+    "RO",
+    "SV",
+    "SK",
+    "SL",
+    "ES",
+    "CS",
+    "HU",
+]
+
+# Maps a language code to its (question word, answer word) pair, which
+# `GradeSchoolMath8K.doc_to_text` uses to label the prompt.
+# NOTE(review): "Vastaa" (FI) looks like a verb form; the noun would be
+# "Vastaus". Left unchanged because editing it changes the prompt -- confirm.
+PROMPT_WORDS = {
+    "BG": ("Въпрос", "Отговор"),
+    "DA": ("Spørgsmål", "Svar"),
+    "DE": ("Frage", "Antwort"),
+    "ET": ("Küsimus", "Vastus"),
+    "FI": ("Kysymys", "Vastaa"),
+    "FR": ("Question", "Réponse"),
+    "EL": ("Ερώτηση", "Απάντηση"),
+    "IT": ("Domanda", "Risposta"),
+    "LV": ("Jautājums", "Atbilde"),
+    "LT": ("Klausimas", "Atsakymas"),
+    "NL": ("Vraag", "Antwoord"),
+    "PL": ("Pytanie", "Odpowiedź"),
+    "PT-PT": ("Questão", "Resposta"),
+    "RO": ("Întrebare", "Răspuns"),
+    "SV": ("Fråga", "Svar"),
+    "SK": ("Otázka", "Odpoveď"),
+    "SL": ("Vprašanje", "Odgovor"),
+    "ES": ("Pregunta", "Respuesta"),
+    "CS": ("Otázka", "Odpověď"),
+    "HU": ("Kérdés", "Válasz"),
+}
+
+
+def construct_all_tasks():
+    """Return a dict mapping `gsm8kx_<lang>` task names to task classes."""
+    return {f"gsm8kx_{lang.lower()}": construct_task(lang) for lang in LANGS}
+
+
+def construct_task(lang):
+    """Create the GSM8K task class for one language.
+
+    :param lang: str
+        Language code such as "DE"; used as the dataset configuration name
+        and to select the prompt words.
+    :returns:
+        A new subclass of `GradeSchoolMath8K` with `DATASET_NAME`, `QWORD`
+        and `RWORD` set.
+    """
+
+    class task(GradeSchoolMath8K):
+        DATASET_NAME = lang
+        # Codes missing from PROMPT_WORDS fall back to the English words.
+        QWORD, RWORD = PROMPT_WORDS.get(lang, ("Question", "Answer"))
+
+    return task
+
+
+# Captures the number following the "#### " marker (optional sign, digits,
+# "." and ",").
+ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
+# Returned by `_extract_answer` when no "#### <number>" marker is present.
+INVALID_ANS = "[invalid]"
+
+
+class GradeSchoolMath8K(Task):
+    """Common implementation shared by every GSM8K language task.
+
+    `QWORD` and `RWORD` are placeholders here; the subclasses built by
+    `construct_task` supply the real values.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "openGPT-X/gsm8kx"
+    QWORD, RWORD = None, None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        return self.dataset["train"]
+
+    def validation_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        return self.QWORD + ": " + doc["question"] + f"\n{self.RWORD}:"
+
+    def doc_to_target(self, doc):
+        return " " + doc["answer"]
+
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        # NOTE: The paper implements "verifiers" that assign a score to multiple
+        # solutions and output the highest ranked solution.
+        # NOTE(review): ":" is one of the stop strings, so output presumably ends
+        # at the first colon -- confirm this is intended.
+        completion = rf.greedy_until(
+            ctx, {"until": [":", f"{self.QWORD}:", f"{self.QWORD}"]}
+        )
+        return completion
+
+    def _extract_answer(self, completion):
+        """Return the number after "#### " with commas removed, or INVALID_ANS."""
+        match = ANS_RE.search(completion)
+        if match:
+            match_str = match.group(1).strip()
+            match_str = match_str.replace(",", "")
+            return match_str
+        else:
+            return INVALID_ANS
+
+    def _is_correct(self, completion, answer):
+        """Compare the answer extracted from `completion` with the gold answer.
+
+        :raises AssertionError:
+            If `answer` contains no "#### <number>" marker.
+        """
+        gold = self._extract_answer(answer)
+        # Raised explicitly rather than via `assert`, which `python -O` strips;
+        # without the check a missing gold answer would equal an unparseable
+        # completion (both INVALID_ANS) and be scored as correct.
+        if gold == INVALID_ANS:
+            raise AssertionError("No ground truth answer found in the document.")
+        return self._extract_answer(completion) == gold
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        completion = results[0]
+        answer = doc["answer"]
+        return {"acc": self._is_correct(completion, answer)}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {"acc": mean}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {"acc": True}