From a38e716bfd8a3849db771b8bf4d67c48593a14a8 Mon Sep 17 00:00:00 2001
From: Railey Montalan
Date: Tue, 27 Feb 2024 16:30:45 +0800
Subject: [PATCH 1/4] Implement dataloader for SeaEval

---
 seacrowd/sea_datasets/seaeval/__init__.py |   0
 seacrowd/sea_datasets/seaeval/seaeval.py  | 272 ++++++++++++++++++++++
 2 files changed, 272 insertions(+)
 create mode 100644 seacrowd/sea_datasets/seaeval/__init__.py
 create mode 100644 seacrowd/sea_datasets/seaeval/seaeval.py

diff --git a/seacrowd/sea_datasets/seaeval/__init__.py b/seacrowd/sea_datasets/seaeval/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/seaeval/seaeval.py b/seacrowd/sea_datasets/seaeval/seaeval.py
new file mode 100644
index 000000000..8d517b002
--- /dev/null
+++ b/seacrowd/sea_datasets/seaeval/seaeval.py
@@ -0,0 +1,272 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{SeaEval2023,
+  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
+  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
+  journal={arXiv preprint arXiv:2309.04766},
+  year={2023},
+  url={https://github.com/SeaEval/SeaEval}
+}
+"""
+
+_DATASETNAME = "seaeval"
+
+_DESCRIPTION = """\
+SeaEval is a benchmark toolkit for evaluating multilingual LLMs. The benchmark contains 28 datasets,
+covering 7 languages. It contains 2 datasets for cross-lingual consistency, each containing parallel
+questions for the 7 represented languages. It also contains 4 datasets for cultural reasoning
+(multiple-choice Q&A) that are in English but focused on regions including Singapore and the Philippines.
+
+This dataloader provides examples for Indonesian, Vietnamese, Malay, and Filipino.
+""" + +_HOMEPAGE = "https://github.com/SeaEval/SeaEval" + +_LANGUAGES = {"ind": "Indonesian", "vie": "Vietnamese", "zlm": "Malay", "fil": "Filipino"} +_LANGUAGES_EXCHANGED = dict((v, k) for k, v in _LANGUAGES.items()) + +_LICENSE = Licenses.CC_BY_NC_4_0.value + +_LOCAL = False + +_URLS = { + "cross_mmlu": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/cross_mmlu.json", + "cross_logiqa": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/cross_logiqa.json", + "sg_eval": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/sg_eval.json", + "ph_eval": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/ph_eval.json", +} + +_SUBSETS = list(_URLS.keys()) + +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.QUESTION_ANSWERING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class SeaEvalDataset(datasets.GeneratorBasedBuilder): + """ + SeaEval is a benchmark for evaluating multilingual LLMs from https://github.com/SeaEval/SeaEval. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} {subset} source schema", + schema="source", + subset_id=f"{subset}", + ) + for subset in _SUBSETS + ] + + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_qa", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} {subset} SEACrowd schema", + schema="seacrowd_qa", + subset_id=f"{subset}", + ) + for subset in _SUBSETS + ] + + BUILDER_CONFIGS += [ + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_qa", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_qa", + subset_id="all", + ) + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source" and self.config.subset_id not in ["cross_logiqa", "ph_eval"]: + features = datasets.Features( + { + "id": datasets.Value("string"), + "language": datasets.Value("string"), + "question": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + } + ) + elif self.config.schema == "source" and self.config.subset_id == "cross_logiqa": + features = datasets.Features( + { + "id": datasets.Value("string"), + "language": datasets.Value("string"), + "question": datasets.Value("string"), + "context": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + } + ) + elif self.config.schema == "source" and self.config.subset_id == "ph_eval": + features = datasets.Features( + { + "id": datasets.Value("string"), + "language": datasets.Value("string"), + "question": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + "category": datasets.Value("string"), + } + ) + elif self.config.schema == "seacrowd_qa": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": datasets.Sequence(datasets.Value("string")), + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + "meta": { + "language": datasets.Value("string"), + }, + } + ) + + 
+        else:
+            raise ValueError(f"Unexpected schema received! {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """
+        Returns SplitGenerators.
+        """
+
+        data = {key: dl_manager.download_and_extract(value) for key, value in _URLS.items()}
+
+        paths = {}
+        if self.config.subset_id == "all":
+            paths = data
+        else:
+            paths[self.config.subset_id] = data[self.config.subset_id]
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "paths": paths,
+                    "split": "train",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
+        """
+        Yields examples as (key, example) tuples.
+        """
+        examples = []
+
+        for key, path in paths.items():
+            if "cross" in key:
+                data = pd.read_json(path).rename(columns=_LANGUAGES_EXCHANGED)
+                data = pd.melt(data, id_vars=["id"], value_vars=_LANGUAGES.keys(), var_name="language")
+                data_flattened = pd.json_normalize(data["value"])
+                data_merged = pd.merge(data, data_flattened, left_index=True, right_index=True).drop(columns=["value"])
+                examples.append(data_merged)
+            elif "eval" in key:
+                data = pd.read_json(path)
+                data["language"] = "eng"
+                examples.append(data)
+
+        if len(examples) > 1:
+            examples = pd.concat(examples).to_records()
+        else:
+            examples = examples[0].to_records()
+
+        idx = 0
+        if self.config.schema == "source" and self.config.subset_id not in ["cross_logiqa", "ph_eval"]:
+            for row in examples:
+                x = {
+                    "id": row["id"],
+                    "language": row["language"],
+                    "question": row["question"],
+                    "choices": row["choices"],
+                    "answer": row["answer"],
+                }
+                yield idx, x
+                idx += 1
+        elif self.config.schema == "source" and self.config.subset_id == "cross_logiqa":
+            for row in examples:
+                x = {
+                    "id": row["id"],
+                    "language": row["language"],
+                    "question": row["question"],
+                    "context": row["context"],
+                    "choices": row["choices"],
+                    "answer": row["answer"],
+                }
+                yield idx, x
+                idx += 1
+        elif self.config.schema == "source" and self.config.subset_id == "ph_eval":
+            for row in examples:
+                x = {
+                    "id": row["id"],
+                    "language": row["language"],
+                    "question": row["question"],
+                    "choices": row["choices"],
+                    "answer": row["answer"],
+                    "category": row["category"],
+                }
+                yield idx, x
+                idx += 1
+        elif self.config.schema == "seacrowd_qa":
+            for row in examples:
+                x = {
+                    "id": idx,
+                    "question_id": row["id"],
+                    "document_id": row["id"],
+                    "question": row["question"],
+                    "type": "multiple_choice",
+                    "choices": row["choices"],
+                    "context": row["context"] if "context" in row else None,
+                    "answer": [row["answer"]],
+                    "meta": {
+                        "language": row["language"],
+                    },
+                }
+                yield idx, x
+                idx += 1
+        else:
+            raise ValueError(f"Invalid config: {self.config.name}")
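
Reviewer note: a minimal sketch for smoke-testing the configs this patch defines, assuming a local checkout laid out as in the diffstat above. The script path and config name are taken from this patch; newer `datasets` releases may additionally require `trust_remote_code=True`.

    import datasets

    # Sketch only: the script path assumes a local checkout of this PR.
    source = datasets.load_dataset(
        "seacrowd/sea_datasets/seaeval/seaeval.py",
        name="seaeval_cross_mmlu_source",  # one of the configs defined in this patch
        split="train",                     # PATCH 1 exposes a single TRAIN split
    )
    print(source[0]["language"], source[0]["question"], source[0]["answer"])
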
From 0390851534947f20129013f8a1adce07146c6e56 Mon Sep 17 00:00:00 2001
From: Railey Montalan
Date: Wed, 6 Mar 2024 19:43:28 +0800
Subject: [PATCH 2/4] Split into language subsets

---
 seacrowd/sea_datasets/seaeval/seaeval.py | 90 ++++++++----------------
 1 file changed, 28 insertions(+), 62 deletions(-)

diff --git a/seacrowd/sea_datasets/seaeval/seaeval.py b/seacrowd/sea_datasets/seaeval/seaeval.py
index 8d517b002..d6e63f7da 100644
--- a/seacrowd/sea_datasets/seaeval/seaeval.py
+++ b/seacrowd/sea_datasets/seaeval/seaeval.py
@@ -19,6 +19,7 @@
 import datasets
 import pandas as pd
 
+from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
 from seacrowd.utils.constants import Licenses, Tasks
 
@@ -46,7 +47,6 @@
 _HOMEPAGE = "https://github.com/SeaEval/SeaEval"
 
 _LANGUAGES = {"ind": "Indonesian", "vie": "Vietnamese", "zlm": "Malay", "fil": "Filipino"}
-_LANGUAGES_EXCHANGED = dict((v, k) for k, v in _LANGUAGES.items())
 
 _LICENSE = Licenses.CC_BY_NC_4_0.value
 
@@ -59,8 +59,6 @@
     "ph_eval": "https://huggingface.co/datasets/SeaEval/SeaEval_datasets/raw/main/ph_eval.json",
 }
 
-_SUBSETS = list(_URLS.keys())
-
 _SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.QUESTION_ANSWERING]
 
 _SOURCE_VERSION = "1.0.0"
@@ -76,36 +74,31 @@ class SeaEvalDataset(datasets.GeneratorBasedBuilder):
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
+    LANGUAGES_EXCHANGED = dict((v, k) for k, v in _LANGUAGES.items())
+    SUBSETS_CROSS_MMLU = ['cross_mmlu_' + lang for lang in _LANGUAGES.keys()]
+    SUBSETS_CROSS_LOGIQA = ['cross_logiqa_' + lang for lang in _LANGUAGES.keys()]
+    SUBSETS = SUBSETS_CROSS_MMLU + SUBSETS_CROSS_LOGIQA + ['sg_eval_eng', 'ph_eval_eng']
+
     BUILDER_CONFIGS = [
         SEACrowdConfig(
             name=f"{_DATASETNAME}_{subset}_source",
             version=datasets.Version(_SOURCE_VERSION),
-            description=f"{_DATASETNAME} {subset} source schema",
+            description=f"{_DATASETNAME}_{subset} source schema",
             schema="source",
-            subset_id=f"{subset}",
+            subset_id=f"{_DATASETNAME}_{subset}",
         )
-        for subset in _SUBSETS
+        for subset in SUBSETS
     ]
 
     BUILDER_CONFIGS += [
         SEACrowdConfig(
             name=f"{_DATASETNAME}_{subset}_seacrowd_qa",
             version=datasets.Version(_SOURCE_VERSION),
-            description=f"{_DATASETNAME} {subset} SEACrowd schema",
+            description=f"{_DATASETNAME}_{subset} SEACrowd schema",
             schema="seacrowd_qa",
-            subset_id=f"{subset}",
-        )
-        for subset in _SUBSETS
-    ]
-
-    BUILDER_CONFIGS += [
-        SEACrowdConfig(
-            name=f"{_DATASETNAME}_seacrowd_qa",
-            version=datasets.Version(_SEACROWD_VERSION),
-            description=f"{_DATASETNAME} SEACrowd schema",
-            schema="seacrowd_qa",
-            subset_id="all",
+            subset_id=f"{_DATASETNAME}_{subset}",
         )
+        for subset in SUBSETS
     ]
 
     def _info(self) -> datasets.DatasetInfo:
@@ -113,7 +106,6 @@ def _info(self) -> datasets.DatasetInfo:
             features = datasets.Features(
                 {
                     "id": datasets.Value("string"),
-                    "language": datasets.Value("string"),
                     "question": datasets.Value("string"),
                     "choices": datasets.Sequence(datasets.Value("string")),
                     "answer": datasets.Value("string"),
@@ -123,7 +115,6 @@ def _info(self) -> datasets.DatasetInfo:
             features = datasets.Features(
                 {
                     "id": datasets.Value("string"),
-                    "language": datasets.Value("string"),
                     "question": datasets.Value("string"),
                     "context": datasets.Value("string"),
                     "choices": datasets.Sequence(datasets.Value("string")),
@@ -134,7 +125,6 @@ def _info(self) -> datasets.DatasetInfo:
             features = datasets.Features(
                 {
                     "id": datasets.Value("string"),
-                    "language": datasets.Value("string"),
                    "question": datasets.Value("string"),
                     "choices": datasets.Sequence(datasets.Value("string")),
                     "answer": datasets.Value("string"),
@@ -142,22 +132,7 @@ def _info(self) -> datasets.DatasetInfo:
                 }
             )
         elif self.config.schema == "seacrowd_qa":
-            features = datasets.Features(
-                {
-                    "id": datasets.Value("string"),
-                    "question_id": datasets.Value("string"),
-                    "document_id": datasets.Value("string"),
-                    "question": datasets.Value("string"),
-                    "type": datasets.Value("string"),
-                    "choices": datasets.Sequence(datasets.Value("string")),
-                    "context": datasets.Value("string"),
-                    "answer": datasets.Sequence(datasets.Value("string")),
-                    "meta": {
-                        "language": datasets.Value("string"),
-                    },
-                }
-            )
-
+            features = schemas.qa_features
         else:
             raise ValueError(f"Unexpected schema received! {self.config.schema}")
 
@@ -177,10 +152,9 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase
         data = {key: dl_manager.download_and_extract(value) for key, value in _URLS.items()}
 
         paths = {}
-        if self.config.subset_id == "all":
-            paths = data
-        else:
-            paths[self.config.subset_id] = data[self.config.subset_id]
+        file = self.config.subset_id.split("_")
+        file = "_".join(file[1:3])
+        paths[self.config.subset_id] = data[file]
 
         return [
             datasets.SplitGenerator(
@@ -196,31 +170,27 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
         """
         Yields examples as (key, example) tuples.
         """
-        examples = []
+
+        language = self.config.subset_id.split("_")[3]
+        examples = None
 
         for key, path in paths.items():
             if "cross" in key:
-                data = pd.read_json(path).rename(columns=_LANGUAGES_EXCHANGED)
+                data = pd.read_json(path).rename(columns=self.LANGUAGES_EXCHANGED)
                 data = pd.melt(data, id_vars=["id"], value_vars=_LANGUAGES.keys(), var_name="language")
                 data_flattened = pd.json_normalize(data["value"])
-                data_merged = pd.merge(data, data_flattened, left_index=True, right_index=True).drop(columns=["value"])
-                examples.append(data_merged)
+                data_merged = pd.merge(data, data_flattened, left_index=True, right_index=True)
+                data_filtered = data_merged[data_merged['language']==language].drop(columns=["value", "language"])
+                examples = data_filtered.to_records()
             elif "eval" in key:
                 data = pd.read_json(path)
-                data["language"] = "eng"
-                examples.append(data)
-
-        if len(examples) > 1:
-            examples = pd.concat(examples).to_records()
-        else:
-            examples = examples[0].to_records()
+                examples = data.to_records()
 
         idx = 0
         if self.config.schema == "source" and self.config.subset_id not in ["cross_logiqa", "ph_eval"]:
             for row in examples:
                 x = {
                     "id": row["id"],
-                    "language": row["language"],
                     "question": row["question"],
                     "choices": row["choices"],
                     "answer": row["answer"],
@@ -231,9 +201,8 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
             for row in examples:
                 x = {
                     "id": row["id"],
-                    "language": row["language"],
                     "question": row["question"],
-                    "context": row["context"],
+                    "context": row["context"] if "context" in row else None,
                     "choices": row["choices"],
                     "answer": row["answer"],
                 }
@@ -243,11 +212,10 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
             for row in examples:
                 x = {
                     "id": row["id"],
-                    "language": row["language"],
                     "question": row["question"],
                     "choices": row["choices"],
                     "answer": row["answer"],
-                    "category": row["category"],
+                    "category": row["category"] if "category" in row else None,
                 }
                 yield idx, x
                 idx += 1
@@ -262,11 +230,9 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
                     "choices": row["choices"],
                     "context": row["context"] if "context" in row else None,
                     "answer": [row["answer"]],
-                    "meta": {
-                        "language": row["language"],
-                    },
+                    "meta": {},
                 }
                 yield idx, x
                 idx += 1
         else:
-            raise ValueError(f"Invalid config: {self.config.name}")
+            raise ValueError(f"Invalid schema: {self.config.schema}")
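
Reviewer note: the per-language filtering this patch introduces (rename → melt → json_normalize → merge → filter) is easiest to follow on a toy frame. The sketch below mirrors the cross_mmlu layout with invented data; the two-language mapping and the sample rows are made up for illustration.

    import pandas as pd

    # Invented two-language mapping and rows, mirroring cross_mmlu's layout:
    # one row per question id, one dict-valued column per language.
    languages = {"ind": "Indonesian", "vie": "Vietnamese"}
    exchanged = {v: k for k, v in languages.items()}

    data = pd.DataFrame(
        {
            "id": ["q1"],
            "Indonesian": [{"question": "Ibu kota Indonesia?", "choices": ["Jakarta", "Bandung"], "answer": "Jakarta"}],
            "Vietnamese": [{"question": "Thủ đô của Việt Nam?", "choices": ["Hà Nội", "Huế"], "answer": "Hà Nội"}],
        }
    ).rename(columns=exchanged)

    # Long format: one row per (id, language), dicts still packed in "value".
    melted = pd.melt(data, id_vars=["id"], value_vars=list(languages), var_name="language")
    # Unpack the dicts into question/choices/answer columns, then keep one language.
    flat = pd.json_normalize(melted["value"])
    merged = pd.merge(melted, flat, left_index=True, right_index=True)
    filtered = merged[merged["language"] == "ind"].drop(columns=["value", "language"])
    print(filtered.to_records())  # one flat record per kept question
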
From 0be35a74289317f423bfa79828e7c18dcbdab323 Mon Sep 17 00:00:00 2001
From: Railey Montalan
Date: Wed, 6 Mar 2024 19:44:24 +0800
Subject: [PATCH 3/4] Split into language subsets

---
 seacrowd/sea_datasets/seaeval/seaeval.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/seacrowd/sea_datasets/seaeval/seaeval.py b/seacrowd/sea_datasets/seaeval/seaeval.py
index d6e63f7da..af81bae71 100644
--- a/seacrowd/sea_datasets/seaeval/seaeval.py
+++ b/seacrowd/sea_datasets/seaeval/seaeval.py
@@ -75,9 +75,9 @@ class SeaEvalDataset(datasets.GeneratorBasedBuilder):
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
     LANGUAGES_EXCHANGED = dict((v, k) for k, v in _LANGUAGES.items())
-    SUBSETS_CROSS_MMLU = ['cross_mmlu_' + lang for lang in _LANGUAGES.keys()]
-    SUBSETS_CROSS_LOGIQA = ['cross_logiqa_' + lang for lang in _LANGUAGES.keys()]
-    SUBSETS = SUBSETS_CROSS_MMLU + SUBSETS_CROSS_LOGIQA + ['sg_eval_eng', 'ph_eval_eng']
+    SUBSETS_CROSS_MMLU = ["cross_mmlu_" + lang for lang in _LANGUAGES.keys()]
+    SUBSETS_CROSS_LOGIQA = ["cross_logiqa_" + lang for lang in _LANGUAGES.keys()]
+    SUBSETS = SUBSETS_CROSS_MMLU + SUBSETS_CROSS_LOGIQA + ["sg_eval_eng", "ph_eval_eng"]
 
     BUILDER_CONFIGS = [
         SEACrowdConfig(
@@ -180,7 +180,7 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
                 data = pd.melt(data, id_vars=["id"], value_vars=_LANGUAGES.keys(), var_name="language")
                 data_flattened = pd.json_normalize(data["value"])
                 data_merged = pd.merge(data, data_flattened, left_index=True, right_index=True)
-                data_filtered = data_merged[data_merged['language']==language].drop(columns=["value", "language"])
+                data_filtered = data_merged[data_merged["language"] == language].drop(columns=["value", "language"])
                 examples = data_filtered.to_records()
             elif "eval" in key:
                 data = pd.read_json(path)
@@ -202,7 +202,7 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
                 x = {
                     "id": row["id"],
                     "question": row["question"],
-                    "context": row["context"] if "context" in row else None,
+                    "context": row["context"] if "context" in row else None,
                     "choices": row["choices"],
                     "answer": row["answer"],
                 }
@@ -215,7 +215,7 @@ def _generate_examples(self, paths: Path, split: str) -> Tuple[int, Dict]:
                     "question": row["question"],
                     "choices": row["choices"],
                     "answer": row["answer"],
-                    "category": row["category"] if "category" in row else None,
+                    "category": row["category"] if "category" in row else None,
                 }
                 yield idx, x
                 idx += 1
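
Reviewer note: a quick sanity check of the `subset_id` convention the previous patches rely on. `_split_generators` takes tokens 1–2 as the file key and `_generate_examples` takes token 3 as the language, which works both for the two-token file names (sg_eval, ph_eval) and for the cross_* subsets. The ids below are ones `BUILDER_CONFIGS` actually generates.

    # Sanity check for the subset_id parsing in _split_generators / _generate_examples.
    for subset_id in ["seaeval_cross_logiqa_vie", "seaeval_sg_eval_eng"]:
        parts = subset_id.split("_")
        file_key = "_".join(parts[1:3])  # -> "cross_logiqa", "sg_eval"
        language = parts[3]              # -> "vie", "eng"
        print(subset_id, "->", file_key, language)
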
From 781b52c91c507934796cc568d50a402cf7f715fb Mon Sep 17 00:00:00 2001
From: Railey Montalan
Date: Tue, 12 Mar 2024 13:13:54 +0800
Subject: [PATCH 4/4] Change default split to TEST

---
 seacrowd/sea_datasets/seaeval/seaeval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/seacrowd/sea_datasets/seaeval/seaeval.py b/seacrowd/sea_datasets/seaeval/seaeval.py
index af81bae71..db13204d9 100644
--- a/seacrowd/sea_datasets/seaeval/seaeval.py
+++ b/seacrowd/sea_datasets/seaeval/seaeval.py
@@ -158,10 +158,10 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase
 
         return [
             datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
+                name=datasets.Split.TEST,
                 gen_kwargs={
                     "paths": paths,
-                    "split": "train",
+                    "split": "test",
                 },
             ),
         ]
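
Reviewer note: with PATCH 4 applied, consumers must request the TEST split. A final end-to-end sketch, with the config name taken from the SEACrowd schema configs above; the script path assumes a local checkout, and newer `datasets` releases may also require `trust_remote_code=True`.

    import datasets

    qa = datasets.load_dataset(
        "seacrowd/sea_datasets/seaeval/seaeval.py",
        name="seaeval_cross_mmlu_ind_seacrowd_qa",
        split="test",  # TRAIN no longer exists after this patch
    )
    row = qa[0]
    print(row["question"], row["choices"], row["answer"])
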