diff --git a/seacrowd/sea_datasets/bactrian_x/__init__.py b/seacrowd/sea_datasets/bactrian_x/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/bactrian_x/bactrian_x.py b/seacrowd/sea_datasets/bactrian_x/bactrian_x.py
new file mode 100644
index 000000000..da99f28f3
--- /dev/null
+++ b/seacrowd/sea_datasets/bactrian_x/bactrian_x.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+from typing import Dict, Iterator, List, Tuple
+
+import datasets
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES
+
+_CITATION = """\
+@misc{li2023bactrianx,
+      title={Bactrian-X : A Multilingual Replicable Instruction-Following Model with Low-Rank Adaptation},
+      author={Haonan Li and Fajri Koto and Minghao Wu and Alham Fikri Aji and Timothy Baldwin},
+      year={2023},
+      eprint={2305.15011},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+"""
+
+_DATASETNAME = "bactrian_x"
+
+_DESCRIPTION = """\
+The Bactrian-X dataset is a collection of 3.4M instruction-response pairs in 52
+languages, that are obtained by translating 67K English instructions (alpaca-52k
++ dolly-15k) into 51 languages using Google Translate API. The translated
+instructions are then fed to ChatGPT (gpt-3.5-turbo) to obtain its natural
+responses, resulting in 3.4M instruction-response pairs in 52 languages (52
+languages x 67k instances = 3.4M instances). Human evaluations were conducted to
+evaluate response quality for several languages, with those of interest to
+SEACrowd being Burmese and Tagalog.
+"""
+
+_HOMEPAGE = "https://github.com/mbzuai-nlp/Bactrian-X"
+
+_LANGUAGES = ["mya", "tgl", "ind", "khm", "tha", "vie"]
+
+_LICENSE = Licenses.CC_BY_NC_4_0.value
+
+_LOCAL = False
+
+_BASE_URL = "https://huggingface.co/datasets/MBZUAI/Bactrian-X/resolve/main/data/{subset}.json.gz?download=true"
+_SUBSETS = ["my", "tl", "id", "km", "th", "vi"]
+
+_SUPPORTED_TASKS = [Tasks.INSTRUCTION_TUNING]
+_SEACROWD_SCHEMA = f"seacrowd_{TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()}"  # t2t
+
+_SOURCE_VERSION = "1.0.1"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class BactrianXDataset(datasets.GeneratorBasedBuilder):
+    """Loader for the Bactrian-X instruction-response pairs of six SEA languages.
+
+    Registers a "source" and a SEACrowd config for every language code in
+    ``_SUBSETS``; every config provides a single train split.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = []
+    for subset in _SUBSETS:
+        BUILDER_CONFIGS += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{subset}_source",
+                version=SOURCE_VERSION,
+                description=f"{_DATASETNAME} {subset} source schema",
+                schema="source",
+                subset_id=subset,
+            ),
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{subset}_{_SEACROWD_SCHEMA}",
+                version=SEACROWD_VERSION,
+                description=f"{_DATASETNAME} {subset} SEACrowd schema",
+                schema=_SEACROWD_SCHEMA,
+                subset_id=subset,
+            ),
+        ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Return the dataset metadata with the features of the configured schema.
+
+        Raises:
+            ValueError: If the configured schema is not one this loader defines.
+        """
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "instruction": datasets.Value("string"),
+                    "input": datasets.Value("string"),
+                    "id": datasets.Value("string"),
+                    "output": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == _SEACROWD_SCHEMA:
+            features = SCHEMA_TO_FEATURES[
+                TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]]
+            ]  # text2text_features
+        else:
+            # Fail with a clear message instead of an UnboundLocalError below.
+            raise ValueError(f"Unsupported schema: {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Download the JSON file of the configured subset and expose it as the train split."""
+        # Take the language code from the config itself; parsing it out of the
+        # config name only works while _DATASETNAME has exactly one underscore.
+        data_url = _BASE_URL.format(subset=self.config.subset_id)
+        data_path = Path(dl_manager.download_and_extract(data_url))
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "data_path": data_path,
+                },
+            )
+        ]
+
+    def _generate_examples(self, data_path: Path) -> Iterator[Tuple[int, Dict]]:
+        """Yield (key, example) tuples in the configured schema.
+
+        Args:
+            data_path: Path of the downloaded and extracted JSON file.
+        """
+        with open(data_path, "r", encoding="utf-8") as file:
+            data = json.load(file)
+
+        if self.config.schema == "source":
+            for idx, example in enumerate(data):
+                yield idx, {
+                    "instruction": example["instruction"],
+                    "input": example["input"],
+                    "id": example["id"],
+                    "output": example["output"],
+                }
+        elif self.config.schema == _SEACROWD_SCHEMA:
+            for idx, example in enumerate(data):
+                yield idx, {
+                    "id": example["id"],
+                    "text_1": f"Instruction: {example['instruction']}\nInput: {example['input']}",
+                    "text_2": example["output"],
+                    "text_1_name": "instruction + input",
+                    "text_2_name": "output",
+                }
diff --git a/seacrowd/sea_datasets/maxm/__init__.py b/seacrowd/sea_datasets/maxm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/maxm/maxm.py b/seacrowd/sea_datasets/maxm/maxm.py
new file mode 100644
index 000000000..b649e6bba
--- /dev/null
+++ b/seacrowd/sea_datasets/maxm/maxm.py
@@ -0,0 +1,210 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+from typing import Dict, Iterator, List, Tuple
+
+import datasets
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES
+
+_CITATION = """\
+@inproceedings{changpinyo-etal-2023-maxm,
+    title = "{M}a{XM}: Towards Multilingual Visual Question Answering",
+    author = "Changpinyo, Soravit and
+      Xue, Linting and
+      Yarom, Michal and
+      Thapliyal, Ashish and
+      Szpektor, Idan and
+      Amelot, Julien and
+      Chen, Xi and
+      Soricut, Radu",
+    editor = "Bouamor, Houda and
+      Pino, Juan and
+      Bali, Kalika",
+    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
+    month = dec,
+    year = "2023",
+    address = "Singapore",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2023.findings-emnlp.176",
+    doi = "10.18653/v1/2023.findings-emnlp.176",
+    pages = "2667--2682",
+    abstract = "Visual Question Answering (VQA) has been primarily studied
+    through the lens of the English language. Yet, tackling VQA in other
+    languages in the same manner would require a considerable amount of
+    resources. In this paper, we propose scalable solutions to multilingual
+    visual question answering (mVQA), on both data and modeling fronts. We first
+    propose a translation-based framework to mVQA data generation that requires
+    much less human annotation efforts than the conventional approach of
+    directly collection questions and answers. Then, we apply our framework to
+    the multilingual captions in the Crossmodal-3600 dataset and develop an
+    efficient annotation protocol to create MaXM, a test-only VQA benchmark in 7
+    diverse languages. Finally, we develop a simple, lightweight, and effective
+    approach as well as benchmark state-of-the-art English and multilingual VQA
+    models. We hope that our benchmark encourages further research on mVQA.",
+}
+"""
+
+_DATASETNAME = "maxm"
+
+_DESCRIPTION = """\
+MaXM, a test-only VQA benchmark in 7 diverse languages, including Thai. The
+dataset is generated by first applying a translation-based framework to mVQA and
+then applying framework to the multilingual captions in the Crossmodal-3600
+dataset.
+"""
+
+_HOMEPAGE = "https://github.com/google-research-datasets/maxm"
+
+_LANGUAGES = ["tha"]
+
+_LICENSE = f"""{Licenses.OTHERS.value} | \
+The dataset may be freely used for any purpose, although acknowledgement of Google LLC ("Google") as the data source would be appreciated.
+The dataset is provided "AS IS" without any warranty, express or implied.
+Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset."""
+
+_LOCAL = False
+
+_URL = "https://storage.googleapis.com/maxm/maxm_v1_release.zip"
+_SUBSETS = ["regular", "yesno"]
+
+_SUPPORTED_TASKS = [Tasks.VISUAL_QUESTION_ANSWERING]
+_SEACROWD_SCHEMA = f"seacrowd_{TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()}"  # imqa
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class MaXMDataset(datasets.GeneratorBasedBuilder):
+    """A test-only VQA benchmark in 7 diverse languages, including Thai."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = []
+    for subset in _SUBSETS:
+        BUILDER_CONFIGS += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{subset}_source",
+                version=SOURCE_VERSION,
+                description=f"{_DATASETNAME} {subset} source schema",
+                schema="source",
+                subset_id=subset,
+            ),
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{subset}_{_SEACROWD_SCHEMA}",
+                version=SEACROWD_VERSION,
+                description=f"{_DATASETNAME} {subset} SEACrowd schema",
+                schema=_SEACROWD_SCHEMA,
+                subset_id=subset,
+            ),
+        ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_regular_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Return the dataset metadata; raise ValueError for a schema this loader lacks."""
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "image_id": datasets.Value("string"),
+                    "image_url": datasets.Value("string"),
+                    "question_id": datasets.Value("string"),
+                    "question": datasets.Value("string"),
+                    "answers": datasets.Sequence(datasets.Value("string")),
+                    "processed_answers": datasets.Sequence(datasets.Value("string")),
+                    "is_collection": datasets.Value("bool"),
+                    "method": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == _SEACROWD_SCHEMA:
+            # Work on a copy: writing "meta" into the object stored in
+            # SCHEMA_TO_FEATURES would change that shared module-level entry.
+            features = datasets.Features(
+                SCHEMA_TO_FEATURES[TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]]]
+            )  # imqa_features
+            features["meta"] = {
+                "processed_answers": datasets.Sequence(datasets.Value("string")),
+                "is_collection": datasets.Value("bool"),
+                "method": datasets.Value("string"),
+            }
+        else:
+            raise ValueError(f"Unsupported schema: {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Download the release archive and expose the Thai file of the subset as the test split."""
+        data_path = Path(dl_manager.download_and_extract(_URL), "maxm_v1_release")
+        infix = "yesno_" if self.config.subset_id == "yesno" else ""
+        file_path = data_path / f"maxm_v1_{infix}th.json"
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": file_path,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Iterator[Tuple[int, Dict]]:
+        """Yield one (key, example) tuple per question-answer pair found in ``filepath``."""
+        with open(filepath, "r", encoding="utf-8") as file:
+            annotations = json.load(file)["annotations"]
+
+        key = 0  # running key, shared by all annotation entries
+        for example in annotations:
+            for pair_idx, qa_pair in enumerate(example["qa_pairs"]):
+                # NOTE(review): image_url is indexed by the pair position, which
+                # assumes it holds one URL per QA pair - confirm against the data.
+                image_url = example["image_url"][pair_idx]
+                if self.config.schema == "source":
+                    yield key, {
+                        "image_id": example["image_id"],
+                        "image_url": image_url,
+                        "question_id": qa_pair["question_id"],
+                        "question": qa_pair["question"],
+                        "answers": qa_pair["answers"],
+                        "processed_answers": qa_pair["processed_answers"],
+                        "is_collection": qa_pair["is_collection"],
+                        "method": qa_pair["method"],
+                    }
+                elif self.config.schema == _SEACROWD_SCHEMA:
+                    yield key, {
+                        "id": str(key),
+                        "question_id": qa_pair["question_id"],
+                        "document_id": example["image_id"],
+                        "questions": [qa_pair["question"]],
+                        # "type", "choices" and "context" are not populated here.
+                        "answer": qa_pair["answers"],
+                        "image_paths": [image_url],
+                        "meta": {
+                            "processed_answers": qa_pair["processed_answers"],
+                            "is_collection": qa_pair["is_collection"],
+                            "method": qa_pair["method"],
+                        },
+                    }
+                key += 1