From 3e9da2ba64a8cd8fef1099551c27caac39d6c95e Mon Sep 17 00:00:00 2001 From: Jasper Schulze Buschhoff Date: Wed, 15 Nov 2023 17:07:37 +0100 Subject: [PATCH 1/2] implemented belebele in EU5 --- lm_eval/tasks/opengptx/all_tasks_registry.py | 2 + lm_eval/tasks/opengptx/belebele.py | 312 +++++++++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 lm_eval/tasks/opengptx/belebele.py diff --git a/lm_eval/tasks/opengptx/all_tasks_registry.py b/lm_eval/tasks/opengptx/all_tasks_registry.py index b05b47e5ec..d1f6c7532e 100644 --- a/lm_eval/tasks/opengptx/all_tasks_registry.py +++ b/lm_eval/tasks/opengptx/all_tasks_registry.py @@ -1,4 +1,5 @@ # OpenGPT-X tasks +from . import belebele from . import german_europarl_ppl from . import german_ler_ppl from . import germanquad @@ -20,6 +21,7 @@ TASK_REGISTRY_TMP = { # OpenGPT-X tasks + **belebele.construct_tasks(), "german_europarl_ppl": german_europarl_ppl.GermanEuroparlPerplexity, "german_ler_ppl": german_ler_ppl.GermanLERPerplexity, "germanquad": germanquad.GermanQuAD, diff --git a/lm_eval/tasks/opengptx/belebele.py b/lm_eval/tasks/opengptx/belebele.py new file mode 100644 index 0000000000..17dd979e07 --- /dev/null +++ b/lm_eval/tasks/opengptx/belebele.py @@ -0,0 +1,312 @@ +# TODO: Remove all TODO comments once the implementation is complete. +""" +The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants +https://arxiv.org/abs/2308.16884 + +'We present Belebele, a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. +Significantly expanding the language coverage of natural language understanding (NLU) benchmarks, +this dataset enables the evaluation of text models in high-, medium-, and low-resource languages. +Each question is based on a short passage from the Flores-200 dataset and has four multiple-choice answers. +The questions were carefully curated to discriminate between models with different levels of general language comprehension. +The English dataset on its own proves difficult enough to challenge state-of-the-art language models. +Being fully parallel, this dataset enables direct comparison of model performance across all languages. +We use this dataset to evaluate the capabilities of multilingual masked language models (MLMs) and large language models (LLMs).' + +https://github.com/facebookresearch/belebele +""" +from lm_eval.base import MultipleChoiceTask + + +_CITATION = """@article{bandarkar2023belebele, + title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants}, + author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa}, + year={2023}, + journal={arXiv preprint arXiv:2308.16884} +} +""" + +_PROMPT_PATTERN = { + "Passage": "Passage", + "Question": "Question", + "Answer": "Answer", + "Choices": "Choices" +} + +class belebeleBase(MultipleChoiceTask): + VERSION = 0 + # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` + # dataset as denoted in HuggingFace `datasets`. + DATASET_PATH = "facebook/belebele" + + def __init__(self, lang: str = None): + self.DATASET_NAME = self.lang_code = lang + super().__init__() + + def has_training_docs(self): + return False + + def has_validation_docs(self): + return False + + def has_test_docs(self): + return True + + def test_docs(self): + if self.has_test_docs(): + # TODO: Return the test document generator from `self.dataset`. + # In most case you can leave this as is unless the dataset split is + # named differently than the default `"test"`. + return map(self._process_doc, + self.dataset["train"].filter( + lambda x: x["dialect"]==self.lang_code)) + + def _process_doc(self, doc): + def format_example(doc, keys): + """ + Passage: + Question: + Choices: + A. + B. + C. + D. + Answer: + """ + + #TODO: decie whether choices should be included in prommpt (on bloom-1b5-clp: worse performance when included, 0.38 vs 0.27) + """ + _PROMPT_PATTERN["Choices"] + "\n" +\ + "".join([f"{keys[i-1]}: {doc[f'mc_answer{i}']}\n" for i in range(1,5)]) +\ + """ + prompt = _PROMPT_PATTERN["Passage"] + ": " + doc["flores_passage"] + "\n" +\ + _PROMPT_PATTERN["Question"] + ": " + doc["question"] + "\n" +\ + _PROMPT_PATTERN["Answer"] + ":" + return prompt + keys = ["A", "B", "C", "D"] + return { + "query": format_example(doc, keys), # The query prompt. + "choices": [doc[f"mc_answer{i}"] for i in range(1,5)], # The list of choices. + "gold": int(doc["correct_answer_num"])-1, # The integer used to index into the correct element of `"choices"`. + } + + def doc_to_text(self, doc): + return doc["query"] + +def create_translation_task(language, version=0): + class belebele(belebeleBase): + VERSION = version + + def __init__(self): + super().__init__(language) + + return belebele + +def construct_tasks(): + return {f"belebele_{lang}":create_translation_task(lang) for lang in _LANGUAGES} + +_LANGUAGES = [ + # "ace_Arab", + # "ace_Latn", + # "acm_Arab", + # "acq_Arab", + # "aeb_Arab", + # "afr_Latn", + # "ajp_Arab", + # "aka_Latn", + # "als_Latn", + # "amh_Ethi", + # "apc_Arab", + # "arb_Arab", + # "arb_Latn", + # "ars_Arab", + # "ary_Arab", + # "arz_Arab", + # "asm_Beng", + # "ast_Latn", + # "awa_Deva", + # "ayr_Latn", + # "azb_Arab", + # "azj_Latn", + # "bak_Cyrl", + # "bam_Latn", + # "ban_Latn", + # "bel_Cyrl", + # "bem_Latn", + # "ben_Beng", + # "bho_Deva", + # "bjn_Arab", + # "bjn_Latn", + # "bod_Tibt", + # "bos_Latn", + # "bug_Latn", + # "bul_Cyrl", + # "cat_Latn", + # "ceb_Latn", + # "ces_Latn", + # "cjk_Latn", + # "ckb_Arab", + # "crh_Latn", + # "cym_Latn", + # "dan_Latn", + "deu_Latn", + # "dik_Latn", + # "dyu_Latn", + # "dzo_Tibt", + # "ell_Grek", + "eng_Latn", + # "epo_Latn", + # "est_Latn", + # "eus_Latn", + # "ewe_Latn", + # "fao_Latn", + # "fij_Latn", + # "fin_Latn", + # "fon_Latn", + "fra_Latn", + # "fur_Latn", + # "fuv_Latn", + # "gaz_Latn", + # "gla_Latn", + # "gle_Latn", + # "glg_Latn", + # "grn_Latn", + # "guj_Gujr", + # "hat_Latn", + # "hau_Latn", + # "heb_Hebr", + # "hin_Deva", + # "hne_Deva", + # "hrv_Latn", + # "hun_Latn", + # "hye_Armn", + # "ibo_Latn", + # "ilo_Latn", + # "ind_Latn", + # "isl_Latn", + "ita_Latn", + # "jav_Latn", + # "jpn_Jpan", + # "kab_Latn", + # "kac_Latn", + # "kam_Latn", + # "kan_Knda", + # "kas_Arab", + # "kas_Deva", + # "kat_Geor", + # "kaz_Cyrl", + # "kbp_Latn", + # "kea_Latn", + # "khk_Cyrl", + # "khm_Khmr", + # "kik_Latn", + # "kin_Latn", + # "kir_Cyrl", + # "kmb_Latn", + # "kmr_Latn", + # "knc_Arab", + # "knc_Latn", + # "kon_Latn", + # "kor_Hang", + # "lao_Laoo", + # "lij_Latn", + # "lim_Latn", + # "lin_Latn", + # "lit_Latn", + # "lmo_Latn", + # "ltg_Latn", + # "ltz_Latn", + # "lua_Latn", + # "lug_Latn", + # "luo_Latn", + # "lus_Latn", + # "lvs_Latn", + # "mag_Deva", + # "mai_Deva", + # "mal_Mlym", + # "mar_Deva", + # "min_Arab", + # "min_Latn", + # "mkd_Cyrl", + # "mlt_Latn", + # "mni_Beng", + # "mos_Latn", + # "mri_Latn", + # "mya_Mymr", + # "nld_Latn", + # "nno_Latn", + # "nob_Latn", + # "npi_Deva", + # "nso_Latn", + # "nus_Latn", + # "nya_Latn", + # "oci_Latn", + # "ory_Orya", + # "pag_Latn", + # "pan_Guru", + # "pap_Latn", + # "pbt_Arab", + # "pes_Arab", + # "plt_Latn", + # "pol_Latn", + # "por_Latn", + # "prs_Arab", + # "quy_Latn", + # "ron_Latn", + # "run_Latn", + # "rus_Cyrl", + # "sag_Latn", + # "san_Deva", + # "sat_Olck", + # "scn_Latn", + # "shn_Mymr", + # "sin_Sinh", + # "slk_Latn", + # "slv_Latn", + # "smo_Latn", + # "sna_Latn", + # "snd_Arab", + # "som_Latn", + # "sot_Latn", + "spa_Latn", + # "srd_Latn", + # "srp_Cyrl", + # "ssw_Latn", + # "sun_Latn", + # "swe_Latn", + # "swh_Latn", + # "szl_Latn", + # "tam_Taml", + # "taq_Latn", + # "taq_Tfng", + # "tat_Cyrl", + # "tel_Telu", + # "tgk_Cyrl", + # "tgl_Latn", + # "tha_Thai", + # "tir_Ethi", + # "tpi_Latn", + # "tsn_Latn", + # "tso_Latn", + # "tuk_Latn", + # "tum_Latn" + # "tur_Latn", + # "twi_Latn", + # "tzm_Tfng", + # "uig_Arab", + # "ukr_Cyrl", + # "umb_Latn", + # "urd_Arab", + # "uzn_Latn", + # "vec_Latn", + # "vie_Latn", + # "war_Latn", + # "wol_Latn", + # "xho_Latn", + # "ydd_Hebr", + # "yor_Latn", + # "yue_Hant", + # "zho_Hans", + # "zho_Hant", + # "zsm_Latn", + # "zul_Latn", +] From 5085e957c4713d7ecf6bd7a8556ec533740a609e Mon Sep 17 00:00:00 2001 From: KlaudiaTH Date: Fri, 1 Dec 2023 11:54:15 +0100 Subject: [PATCH 2/2] Re-implemented belebele. --- lm_eval/tasks/opengptx/belebele.py | 129 ++++++++++++++--------------- 1 file changed, 61 insertions(+), 68 deletions(-) diff --git a/lm_eval/tasks/opengptx/belebele.py b/lm_eval/tasks/opengptx/belebele.py index 17dd979e07..e6901fefd7 100644 --- a/lm_eval/tasks/opengptx/belebele.py +++ b/lm_eval/tasks/opengptx/belebele.py @@ -1,4 +1,3 @@ -# TODO: Remove all TODO comments once the implementation is complete. """ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants https://arxiv.org/abs/2308.16884 @@ -25,21 +24,39 @@ } """ -_PROMPT_PATTERN = { - "Passage": "Passage", - "Question": "Question", - "Answer": "Answer", - "Choices": "Choices" -} - -class belebeleBase(MultipleChoiceTask): +class BelebeleBase(MultipleChoiceTask): VERSION = 0 - # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` - # dataset as denoted in HuggingFace `datasets`. DATASET_PATH = "facebook/belebele" + PROMPT_KEYWORDS = { + "eng_Latn":["Passage", "Question", "Answer"], + "deu_Latn":["Passage", "Frage", "Antwort"], + "fra_Latn":["Passage", "Question", "Réponse"], + "ita_Latn":["Passaggio", "Domanda", "Risposta"], + "spa_Latn":["Pasaje", "Pregunta", "Respuesta"], + "bul_Cyrl":["Пасаж", "Въпрос", "Отговор"], + "ces_Latn":["Pasáž", "Otázka", "Odpověď"], + "dan_Latn":["Passage", "Spørgsmål", "Svar"], + "ell_Grek":["Απόσπασμα", "Ερώτηση", "Απάντηση"], + "est_Latn":["Passage", "Question", "Answer"], + "fin_Latn":["Kohta", "Kysymys", "Vastaus"], + "gle_Latn":["Sliocht", "Ceist", "Freagra"], + "hrv_Latn":["Odlomak","Pitanje","Odgovor"], + "hun_Latn":["Passzus", "Kérdés", "Válasz"], + #"lij_Latn":["Fragments", "Jautājums", "Atbilde"], + "lit_Latn":["Ištrauka", "Klausimas", "Atsakymas"], + "mlt_Latn":["Silta", "Mistoqsija", "Tweġiba"], + "nld_Latn":["Passage", "Vraag", "Antwoord"], + "pol_Latn":["Fragment", "Pytanie", "Odpowiedź"], + "por_Latn":["Passagem", "Pergunta", "Resposta"], + "ron_Latn":["Pasaj", "Întrebare", "Răspuns"], + "slk_Latn":["Pasáž", "Otázka", "Odpoveď"], + "slv_Latn":["Odlomek", "Vprašanje", "Odgovor"], + "swe_Latn":["Passage", "Fråga", "Svar"], + } + def __init__(self, lang: str = None): - self.DATASET_NAME = self.lang_code = lang + self.lang_code = lang super().__init__() def has_training_docs(self): @@ -52,57 +69,33 @@ def has_test_docs(self): return True def test_docs(self): - if self.has_test_docs(): - # TODO: Return the test document generator from `self.dataset`. - # In most case you can leave this as is unless the dataset split is - # named differently than the default `"test"`. - return map(self._process_doc, - self.dataset["train"].filter( - lambda x: x["dialect"]==self.lang_code)) + return map(self._process_doc, self.dataset[self.lang_code]) def _process_doc(self, doc): - def format_example(doc, keys): - """ - Passage: - Question: - Choices: - A. - B. - C. - D. - Answer: - """ + keywords = self.PROMPT_KEYWORDS[self.lang_code] + + out_doc = { + "query": f"{keywords[0]}: {doc['flores_passage']}\n{keywords[1]}: {doc['question']}\n{keywords[2]}:", + "choices": [doc[f"mc_answer{i}"] for i in [1, 2, 3, 4]], + "gold": int(doc["correct_answer_num"])-1, + } - #TODO: decie whether choices should be included in prommpt (on bloom-1b5-clp: worse performance when included, 0.38 vs 0.27) - """ - _PROMPT_PATTERN["Choices"] + "\n" +\ - "".join([f"{keys[i-1]}: {doc[f'mc_answer{i}']}\n" for i in range(1,5)]) +\ - """ - prompt = _PROMPT_PATTERN["Passage"] + ": " + doc["flores_passage"] + "\n" +\ - _PROMPT_PATTERN["Question"] + ": " + doc["question"] + "\n" +\ - _PROMPT_PATTERN["Answer"] + ":" - return prompt - keys = ["A", "B", "C", "D"] - return { - "query": format_example(doc, keys), # The query prompt. - "choices": [doc[f"mc_answer{i}"] for i in range(1,5)], # The list of choices. - "gold": int(doc["correct_answer_num"])-1, # The integer used to index into the correct element of `"choices"`. - } + return out_doc def doc_to_text(self, doc): return doc["query"] -def create_translation_task(language, version=0): - class belebele(belebeleBase): +def create_task(language, version=0): + class Belebele(BelebeleBase): VERSION = version def __init__(self): super().__init__(language) - return belebele + return Belebele def construct_tasks(): - return {f"belebele_{lang}":create_translation_task(lang) for lang in _LANGUAGES} + return {f"belebele_{lang}": create_task(lang) for lang in _LANGUAGES} _LANGUAGES = [ # "ace_Arab", @@ -139,35 +132,35 @@ def construct_tasks(): # "bod_Tibt", # "bos_Latn", # "bug_Latn", - # "bul_Cyrl", + "bul_Cyrl", # "cat_Latn", # "ceb_Latn", - # "ces_Latn", + "ces_Latn", # "cjk_Latn", # "ckb_Arab", # "crh_Latn", # "cym_Latn", - # "dan_Latn", + "dan_Latn", "deu_Latn", # "dik_Latn", # "dyu_Latn", # "dzo_Tibt", - # "ell_Grek", + "ell_Grek", "eng_Latn", # "epo_Latn", - # "est_Latn", + "est_Latn", # "eus_Latn", # "ewe_Latn", # "fao_Latn", # "fij_Latn", - # "fin_Latn", + "fin_Latn", # "fon_Latn", "fra_Latn", # "fur_Latn", # "fuv_Latn", # "gaz_Latn", # "gla_Latn", - # "gle_Latn", + "gle_Latn", # "glg_Latn", # "grn_Latn", # "guj_Gujr", @@ -176,8 +169,8 @@ def construct_tasks(): # "heb_Hebr", # "hin_Deva", # "hne_Deva", - # "hrv_Latn", - # "hun_Latn", + "hrv_Latn", + "hun_Latn", # "hye_Armn", # "ibo_Latn", # "ilo_Latn", @@ -211,7 +204,7 @@ def construct_tasks(): # "lij_Latn", # "lim_Latn", # "lin_Latn", - # "lit_Latn", + "lit_Latn", # "lmo_Latn", # "ltg_Latn", # "ltz_Latn", @@ -227,12 +220,12 @@ def construct_tasks(): # "min_Arab", # "min_Latn", # "mkd_Cyrl", - # "mlt_Latn", + "mlt_Latn", # "mni_Beng", # "mos_Latn", # "mri_Latn", # "mya_Mymr", - # "nld_Latn", + "nld_Latn", # "nno_Latn", # "nob_Latn", # "npi_Deva", @@ -247,11 +240,11 @@ def construct_tasks(): # "pbt_Arab", # "pes_Arab", # "plt_Latn", - # "pol_Latn", - # "por_Latn", + "pol_Latn", + "por_Latn", # "prs_Arab", # "quy_Latn", - # "ron_Latn", + "ron_Latn", # "run_Latn", # "rus_Cyrl", # "sag_Latn", @@ -260,8 +253,8 @@ def construct_tasks(): # "scn_Latn", # "shn_Mymr", # "sin_Sinh", - # "slk_Latn", - # "slv_Latn", + "slk_Latn", + "slv_Latn", # "smo_Latn", # "sna_Latn", # "snd_Arab", @@ -272,7 +265,7 @@ def construct_tasks(): # "srp_Cyrl", # "ssw_Latn", # "sun_Latn", - # "swe_Latn", + "swe_Latn", # "swh_Latn", # "szl_Latn", # "tam_Taml", @@ -309,4 +302,4 @@ def construct_tasks(): # "zho_Hant", # "zsm_Latn", # "zul_Latn", -] +] \ No newline at end of file