Integrated various German tasks #97

Closed
wants to merge 4 commits into from
8 changes: 8 additions & 0 deletions lm_eval/tasks/opengptx/all_tasks_registry.py
@@ -1,15 +1,19 @@
# OpenGPT-X tasks
from . import arc_de
from . import german_europarl_ppl
from . import german_ler_ppl
from . import germanquad
from . import germeval2017
from . import germeval2018
from . import gnad10
from . import hellaswag_de
from . import hendrycks_test_de
from . import mlqa
from . import mlsum
from . import oscar_ppl
from . import pawsx
from . import stereoset
from . import truthfulqa_de
from . import wino_x
from . import xcsr
from . import xlwic
@@ -20,19 +24,23 @@

TASK_REGISTRY_TMP = {
# OpenGPT-X tasks
"arc_challenge_de": arc_de.ARCChallengeDE,
"german_europarl_ppl": german_europarl_ppl.GermanEuroparlPerplexity,
"german_ler_ppl": german_ler_ppl.GermanLERPerplexity,
"germanquad": germanquad.GermanQuAD,
"germeval2017": germeval2017.GermEval2017,
"germeval2018_coarse": germeval2018.GermEval2018,
"germeval2018_fine": germeval2018.GermEval2018_fine,
"gnad10": gnad10.GNAD10,
"hellaswag_de": hellaswag_de.HellaSwagDE,
**hendrycks_test_de.create_all_tasks(),
**mlqa.construct_tasks(),
**mlsum.construct_tasks(),
"oscar_ppl_de": oscar_ppl.OscarPerplexityGerman,
**pawsx.construct_tasks(),
**stereoset.construct_tasks(),
**xcsr.construct_tasks(),
"truthful_qa_de": truthfulqa_de.TruthfulQADEMultipleChoice,
"wino_de": wino_x.WinograndeXDe,
"xlwic_de": xlwic.WordsInContextDe,
"xlwic_it": xlwic.WordsInContextIt,
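The registry above maps a task name either to a single task class (e.g. `arc_challenge_de`) or, via the `construct_tasks()`/`create_all_tasks()` helpers, to a whole dictionary of per-language tasks. Below is a minimal sketch of how a name could be resolved against such a mapping; `resolve_task` and `SKETCH_REGISTRY` are hypothetical names for illustration, not part of this PR or the harness.

```python
# Minimal sketch (not part of this PR): looking up and instantiating a task
# from a name-to-class mapping shaped like TASK_REGISTRY_TMP above.
# `resolve_task` is a hypothetical helper; the harness has its own lookup code.
from lm_eval.tasks.opengptx import arc_de, hellaswag_de

SKETCH_REGISTRY = {
    "arc_challenge_de": arc_de.ARCChallengeDE,
    "hellaswag_de": hellaswag_de.HellaSwagDE,
}


def resolve_task(name):
    """Return an instantiated task for `name`, or raise KeyError if unknown."""
    return SKETCH_REGISTRY[name]()


# task = resolve_task("arc_challenge_de")   # instantiating loads the HF dataset
# print(next(iter(task.validation_docs()))["query"])
```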
69 changes: 69 additions & 0 deletions lm_eval/tasks/opengptx/arc_de.py
@@ -0,0 +1,69 @@
"""
Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
https://arxiv.org/pdf/1803.05457.pdf
The ARC dataset consists of 7,787 science exam questions drawn from a variety
of sources, including science questions provided under license by a research
partner affiliated with AI2. These are text-only, English language exam questions
that span several grade levels as indicated in the files. Each question has a
multiple choice structure (typically 4 answer options). The questions are sorted
into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and
a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.
Homepage: https://allenai.org/data/arc
NOTE: This German version is lifted without change from
https://github.com/bjoernpl/lm-evaluation-harness-de/blob/mmlu_de/lm_eval/tasks/arc_de.py.
"""
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@article{Clark2018ThinkYH,
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
journal={ArXiv},
year={2018},
volume={abs/1803.05457}
}
"""


class ARCChallengeDE(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "bjoernp/arc_challenge_de"
DATASET_NAME = None

def has_training_docs(self):
return False

def has_validation_docs(self):
return True

def has_test_docs(self):
return True

def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])

def test_docs(self):
return map(self._process_doc, self.dataset["test"])

def _process_doc(self, doc):
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
out_doc = {
"id": doc["id"],
"query": "Frage: " + doc["question_de"] + "\nAntwort:",
"choices": doc["choices_de"]["text"],
"gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
}
return out_doc

def doc_to_text(self, doc):
return doc["query"]

def should_decontaminate(self):
return True

def doc_to_decontamination_query(self, doc):
return doc["query"]
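To make the answer-key normalization in `_process_doc` concrete, here is a self-contained sketch of the same transformation applied to a made-up record; the field values are invented for illustration, only the field names mirror the dataset schema.

```python
# Standalone illustration of ARCChallengeDE._process_doc above; the record
# below is invented, only its field names mirror the dataset schema.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

doc = {
    "id": "example-001",
    "question_de": "Welches Gas nehmen Pflanzen bei der Photosynthese auf?",
    "choices_de": {"text": ["Sauerstoff", "Kohlendioxid", "Stickstoff", "Wasserstoff"]},
    "answerKey": "2",  # numeric answer key, as in some dataset records
}

# Map numeric keys back to letters, then build the query/choices/gold triple.
answer_key = num_to_letter.get(doc["answerKey"], doc["answerKey"])  # -> "B"
out_doc = {
    "id": doc["id"],
    "query": "Frage: " + doc["question_de"] + "\nAntwort:",
    "choices": doc["choices_de"]["text"],
    "gold": ["A", "B", "C", "D", "E"].index(answer_key),  # -> 1
}
print(out_doc["choices"][out_doc["gold"]])  # Kohlendioxid
```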
79 changes: 79 additions & 0 deletions lm_eval/tasks/opengptx/hellaswag_de.py
@@ -0,0 +1,79 @@
"""
HellaSwag: Can a Machine Really Finish Your Sentence?
https://arxiv.org/pdf/1905.07830.pdf

Hellaswag is a commonsense inference challenge dataset. Though its questions are
trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is
achieved via Adversarial Filtering (AF), a data collection paradigm wherein a
series of discriminators iteratively select an adversarial set of machine-generated
wrong answers. AF proves to be surprisingly robust. The key insight is to scale up
the length and complexity of the dataset examples towards a critical 'Goldilocks'
zone wherein generated text is ridiculous to humans, yet often misclassified by
state-of-the-art models.

Homepage: https://rowanzellers.com/hellaswag/
NOTE: This German version is lifted without change from
https://github.com/bjoernpl/lm-evaluation-harness-de/blob/mmlu_de/lm_eval/tasks/hellaswag_de.py.
"""
import re
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
"""


class HellaSwagDE(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "bjoernp/hellaswag_de"
DATASET_NAME = None

def has_training_docs(self):
return True

def has_validation_docs(self):
return True

def has_test_docs(self):
return False

def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs

def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])

def _process_doc(self, doc):
ctx = doc["ctx_de"]
out_doc = {
"query": self.preprocess(doc["activity_label_de"] + ": " + ctx),
"choices": [self.preprocess(ending) for ending in doc["endings_de"]],
"gold": int(doc["label"]),
}
return out_doc

@classmethod
def preprocess(cls, text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace("  ", " ")
return text

def doc_to_text(self, doc):
return doc["query"]

def should_decontaminate(self):
return True

def doc_to_decontamination_query(self, doc):
return doc["query"]
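A quick standalone check of the bracket cleanup in `preprocess` above, run on an invented string with WikiHow-style `[title]`/`[step]` markers:

```python
import re


# Same cleanup steps as HellaSwagDE.preprocess above, applied to an
# invented example string with WikiHow-style "[title]"/"[step]" artifacts.
def preprocess(text):
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
    return text


sample = "Wie man Brot backt [title] Mehl und Wasser mischen [step] Den Teig kneten"
print(preprocess(sample))
# -> "Wie man Brot backt. Mehl und Wasser mischen Den Teig kneten"
```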