# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Provenance: recovered from a patch that adds
# seacrowd/sea_datasets/maxm/maxm.py together with an empty
# seacrowd/sea_datasets/maxm/__init__.py.
"""SEACrowd dataloader for the ``th`` files of the MaXM VQA benchmark.

Two subsets ("regular" and "yesno") are exposed, each in the raw "source"
schema and in the SEACrowd schema derived from the visual question answering
task. Only a TEST split is produced.
"""

import json
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, Licenses, Tasks

_CITATION = """\
@inproceedings{changpinyo-etal-2023-maxm,
    title = "{M}a{XM}: Towards Multilingual Visual Question Answering",
    author = "Changpinyo, Soravit and
        Xue, Linting and
        Yarom, Michal and
        Thapliyal, Ashish and
        Szpektor, Idan and
        Amelot, Julien and
        Chen, Xi and
        Soricut, Radu",
    editor = "Bouamor, Houda and
        Pino, Juan and
        Bali, Kalika",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-emnlp.176",
    doi = "10.18653/v1/2023.findings-emnlp.176",
    pages = "2667--2682",
    abstract = "Visual Question Answering (VQA) has been primarily studied
        through the lens of the English language. Yet, tackling VQA in other
        languages in the same manner would require a considerable amount of
        resources. In this paper, we propose scalable solutions to multilingual
        visual question answering (mVQA), on both data and modeling fronts. We first
        propose a translation-based framework to mVQA data generation that requires
        much less human annotation efforts than the conventional approach of
        directly collection questions and answers. Then, we apply our framework to
        the multilingual captions in the Crossmodal-3600 dataset and develop an
        efficient annotation protocol to create MaXM, a test-only VQA benchmark in 7
        diverse languages. Finally, we develop a simple, lightweight, and effective
        approach as well as benchmark state-of-the-art English and multilingual VQA
        models. We hope that our benchmark encourages further research on mVQA.",
}
"""

_DATASETNAME = "maxm"

_DESCRIPTION = """\
MaXM, a test-only VQA benchmark in 7 diverse languages, including Thai. The
dataset is generated by first applying a translation-based framework to mVQA and
then applying framework to the multilingual captions in the Crossmodal-3600
dataset.
"""

_HOMEPAGE = "https://github.com/google-research-datasets/maxm"

_LANGUAGES = ["tha"]

_LICENSE = f"""{Licenses.OTHERS.value} | \
The dataset may be freely used for any purpose, although acknowledgement of Google LLC ("Google") as the data source would be appreciated.
The dataset is provided "AS IS" without any warranty, express or implied.
Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset."""

_LOCAL = False

_URL = "https://storage.googleapis.com/maxm/maxm_v1_release.zip"
_SUBSETS = ["regular", "yesno"]

_SUPPORTED_TASKS = [Tasks.VISUAL_QUESTION_ANSWERING]
_SEACROWD_SCHEMA = f"seacrowd_{TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()}"  # imqa

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class MaXMDataset(datasets.GeneratorBasedBuilder):
    """A test-only VQA benchmark in 7 diverse languages, including Thai.

    One builder config is registered per (subset, schema) pair, named
    ``maxm_<subset>_source`` and ``maxm_<subset>_<_SEACROWD_SCHEMA>``.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # A plain loop (not a comprehension) is required here: comprehension
    # bodies inside a class body cannot see class-level names such as
    # SOURCE_VERSION / SEACROWD_VERSION.
    BUILDER_CONFIGS = []
    for subset in _SUBSETS:
        BUILDER_CONFIGS += [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{subset}_source",
                version=SOURCE_VERSION,
                description=f"{_DATASETNAME} {subset} source schema",
                schema="source",
                subset_id=subset,
            ),
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{subset}_{_SEACROWD_SCHEMA}",
                version=SEACROWD_VERSION,
                description=f"{_DATASETNAME} {subset} SEACrowd schema",
                schema=_SEACROWD_SCHEMA,
                subset_id=subset,
            ),
        ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_regular_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the ``DatasetInfo`` for the selected schema.

        Returns:
            A ``datasets.DatasetInfo`` carrying the description, features,
            homepage, license and citation defined at module level.

        Raises:
            ValueError: If ``self.config.schema`` is neither ``"source"`` nor
                the SEACrowd schema name.
        """
        schema = self.config.schema
        if schema == "source":
            features = datasets.Features(
                {
                    "image_id": datasets.Value("string"),
                    "image_url": datasets.Value("string"),
                    "question_id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answers": datasets.Sequence(datasets.Value("string")),
                    "processed_answers": datasets.Sequence(datasets.Value("string")),
                    "is_collection": datasets.Value("bool"),
                    "method": datasets.Value("string"),
                }
            )
        elif schema == _SEACROWD_SCHEMA:
            shared_features = SCHEMA_TO_FEATURES[TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]]]  # imqa_features
            # Build a fresh Features object rather than assigning "meta" on
            # the looked-up one: that object lives in a module-level table,
            # so mutating it would leak this dataset's "meta" layout into
            # every other user of the same schema in the process.
            features = datasets.Features(
                {
                    **shared_features,
                    "meta": {
                        "processed_answers": datasets.Sequence(datasets.Value("string")),
                        "is_collection": datasets.Value("bool"),
                        "method": datasets.Value("string"),
                    },
                }
            )
        else:
            # Previously this path fell through and failed with an opaque
            # UnboundLocalError on `features`.
            raise ValueError(f"Unsupported schema for {_DATASETNAME}: {schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the release archive and return the single TEST split.

        Args:
            dl_manager: Download manager used to fetch and extract ``_URL``.

        Returns:
            A one-element list whose generator receives the path of the
            ``th`` JSON file for the configured subset.
        """
        data_path = Path(dl_manager.download_and_extract(_URL), "maxm_v1_release")
        # The "yesno" subset file carries an extra "yesno_" infix; the
        # "regular" subset has none.
        subset_infix = "yesno_" if self.config.subset_id == "yesno" else ""
        file_path = data_path / f"maxm_v1_{subset_infix}th.json"

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": file_path,
                },
            ),
        ]

    @staticmethod
    def _source_record(example: Dict, qa_pair: Dict, image_url) -> Dict:
        """Return one row in the "source" schema for a question/answer pair."""
        return {
            "image_id": example["image_id"],
            "image_url": image_url,
            "question_id": qa_pair["question_id"],
            "question": qa_pair["question"],
            "answers": qa_pair["answers"],
            "processed_answers": qa_pair["processed_answers"],
            "is_collection": qa_pair["is_collection"],
            "method": qa_pair["method"],
        }

    @staticmethod
    def _seacrowd_record(key: int, example: Dict, qa_pair: Dict, image_url) -> Dict:
        """Return one row in the SEACrowd schema for a question/answer pair."""
        return {
            "id": str(key),
            "question_id": qa_pair["question_id"],
            "document_id": example["image_id"],
            "questions": [qa_pair["question"]],
            # "type", "choices" and "context" were left commented out by the
            # original author and are intentionally not populated here.
            "answer": qa_pair["answers"],
            "image_paths": [image_url],
            "meta": {
                "processed_answers": qa_pair["processed_answers"],
                "is_collection": qa_pair["is_collection"],
                "method": qa_pair["method"],
            },
        }

    def _generate_examples(self, filepath: Path) -> Iterator[Tuple[int, Dict]]:
        """Yield examples as ``(key, example)`` tuples.

        Every entry of ``example["qa_pairs"]`` under the top-level
        ``"annotations"`` list becomes one row; ``key`` is a running counter
        across all images.

        Args:
            filepath: Path of the JSON annotation file to read.

        Yields:
            ``(key, row)`` where ``row`` follows the configured schema. A
            schema other than "source" or the SEACrowd one yields nothing.
        """
        with open(filepath, "r", encoding="utf-8") as file:
            annotations = json.load(file)["annotations"]

        schema = self.config.schema
        if schema not in ("source", _SEACROWD_SCHEMA):
            return

        key = 0
        for example in annotations:
            for qa_index, qa_pair in enumerate(example["qa_pairs"]):
                # NOTE(review): this indexes image_url by the position of the
                # QA pair, which assumes image_url is a list parallel to
                # qa_pairs. If the release JSON stores a single URL string
                # this would select one character -- confirm against the data.
                image_url = example["image_url"][qa_index]
                if schema == "source":
                    row = self._source_record(example, qa_pair, image_url)
                else:
                    row = self._seacrowd_record(key, example, qa_pair, image_url)
                yield key, row
                key += 1