forked from SEACrowd/seacrowd-datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add okapi_m_arc dataloader * Update okapi_m_arc.py minor comment update * Update okapi_m_arc.py Update config description name * Update okapi_m_arc.py add typehints and apply formatter suggestions --------- Co-authored-by: Samuel Cahyawijaya <[email protected]>
- Loading branch information
1 parent
94bb4a1
commit 330b787
Showing
2 changed files
with
181 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
import json | ||
import os | ||
from pathlib import Path | ||
from typing import Dict, Generator, List, Tuple | ||
|
||
import datasets | ||
|
||
from seacrowd.utils import schemas | ||
from seacrowd.utils.configs import SEACrowdConfig | ||
from seacrowd.utils.constants import Licenses, Tasks | ||
|
||
_CITATION = """\ | ||
@article{dac2023okapi, | ||
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback}, | ||
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu}, | ||
journal={arXiv e-prints}, | ||
pages={arXiv--2307}, | ||
year={2023} | ||
} | ||
@article{Clark2018ThinkYH, | ||
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge}, | ||
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord}, | ||
journal={ArXiv}, | ||
year={2018}, | ||
volume={abs/1803.05457} | ||
} | ||
""" | ||
|
||
_DATASETNAME = "okapi_m_arc" | ||
|
||
_DESCRIPTION = """\ | ||
mARC is a Multilingual translation of AI2's Arc Challenge from the paper "Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback" (Lai et al., 2023). | ||
The original ARC dataset is a multiple-choice question answering dataset of 7,787 genuine grade-school level science questions assembled to encourage research in advanced question-answering. | ||
The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. | ||
We also include a corpus of over 14 million science sentences relevant to the task and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community. | ||
""" | ||
|
||
|
||
_HOMEPAGE = "https://huggingface.co/datasets/jon-tow/okapi_arc_challenge" | ||
_LICENSE = Licenses.CC_BY_NC_4_0.value | ||
_LOCAL = False | ||
_LANGUAGES = ["ind", "vie"] | ||
|
||
_LANG_MAP = {"ind": "id", "vie": "vi"} | ||
_URLS = { | ||
"base_url": "https://huggingface.co/datasets/jon-tow/okapi_arc_challenge/resolve/main", | ||
} | ||
_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] | ||
_SOURCE_VERSION = "1.0.0" | ||
_SEACROWD_VERSION = "1.0.0" | ||
|
||
|
||
class MultilingualArc(datasets.GeneratorBasedBuilder): | ||
"""mARC is a Multilingual translation of AI2's Arc Challenge which is a multiple-choice question answering dataset | ||
of 7,787 genuine grade-school level science questions assembled to encourage research in advanced question-answering""" | ||
|
||
BUILDER_CONFIGS = [ | ||
SEACrowdConfig( | ||
name="okapi_m_arc_vie_source", | ||
version=datasets.Version(_SOURCE_VERSION), | ||
description="Vietnamese mARC source schema", | ||
schema="source", | ||
subset_id="okapi_m_arc_vie_source", | ||
), | ||
SEACrowdConfig( | ||
name="okapi_m_arc_ind_source", | ||
version=datasets.Version(_SOURCE_VERSION), | ||
description="Indonesian mARC source schema", | ||
schema="source", | ||
subset_id="okapi_m_arc_ind_source", | ||
), | ||
SEACrowdConfig( | ||
name="okapi_m_arc_vie_seacrowd_qa", | ||
version=datasets.Version(_SEACROWD_VERSION), | ||
description="Vietnamese mARC SEACrowd question answering schema", | ||
schema="seacrowd_qa", | ||
subset_id="okapi_m_arc_vie_seacrowd_qa", | ||
), | ||
SEACrowdConfig( | ||
name="okapi_m_arc_ind_seacrowd_qa", | ||
version=datasets.Version(_SEACROWD_VERSION), | ||
description="Indonesian mARC SEACrowd question answering schema", | ||
schema="seacrowd_qa", | ||
subset_id="okapi_m_arc_ind_seacrowd_qa", | ||
), | ||
] | ||
|
||
DEFAULT_CONFIG_NAME = "okapi_m_arc_ind_seacrowd_qa" | ||
|
||
def _info(self) -> datasets.DatasetInfo: | ||
if self.config.schema == "source": | ||
features = datasets.Features( | ||
{ | ||
"id": datasets.Value("string"), | ||
"question": datasets.Value("string"), | ||
"choices": datasets.features.Sequence( | ||
{ | ||
"text": datasets.Value("string"), | ||
"label": datasets.Value("string"), | ||
} | ||
), | ||
"answerKey": datasets.Value("string"), | ||
} | ||
) | ||
else: | ||
features = schemas.qa_features | ||
|
||
return datasets.DatasetInfo( | ||
description=_DESCRIPTION, | ||
features=features, | ||
homepage=_HOMEPAGE, | ||
license=_LICENSE, | ||
citation=_CITATION, | ||
) | ||
|
||
def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: | ||
"""Returns SplitGenerators.""" | ||
lang = self.config.subset_id[: -(len(self.config.schema) + 1)].split("_")[-1] | ||
train_path = Path(dl_manager.download_and_extract(f"{_URLS['base_url']}/data/{_LANG_MAP[lang]}_train.json")) | ||
valid_path = Path(dl_manager.download_and_extract(f"{_URLS['base_url']}/data/{_LANG_MAP[lang]}_validation.json")) | ||
test_path = Path(dl_manager.download_and_extract(f"{_URLS['base_url']}/data/{_LANG_MAP[lang]}_test.json")) | ||
|
||
return [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TRAIN, | ||
gen_kwargs={"filepath": train_path}, | ||
), | ||
datasets.SplitGenerator( | ||
name=datasets.Split.VALIDATION, | ||
gen_kwargs={"filepath": valid_path}, | ||
), | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TEST, | ||
gen_kwargs={"filepath": test_path}, | ||
), | ||
] | ||
|
||
def _generate_examples(self, filepath) -> Generator[Tuple[int, Dict], None, None]: | ||
"""Yields examples.""" | ||
with open(filepath, encoding="utf-8") as f: | ||
data = json.load(f) | ||
|
||
for i, d in enumerate(data): | ||
text_choices = [] | ||
label_choices = [] | ||
if "option_a" in d: | ||
text_choices.append(d["option_a"]) | ||
label_choices.append("A") | ||
if "option_b" in d: | ||
text_choices.append(d["option_b"]) | ||
label_choices.append("B") | ||
if "option_c" in d: | ||
text_choices.append(d["option_c"]) | ||
label_choices.append("C") | ||
if "option_d" in d: | ||
text_choices.append(d["option_d"]) | ||
label_choices.append("D") | ||
if "option_e" in d: | ||
text_choices.append(d["option_e"]) | ||
label_choices.append("E") | ||
|
||
if self.config.schema == "source": | ||
yield i, { | ||
"id": d["id"], | ||
"answerKey": d["answer"], | ||
"question": d["instruction"], | ||
"choices": {"text": text_choices, "label": label_choices}, | ||
} | ||
else: | ||
yield i, { | ||
"id": i, | ||
"question_id": d["id"], | ||
"document_id": d["id"], | ||
"question": d["instruction"], | ||
"type": "multiple_choice", | ||
"choices": [f"{label}. {text}" for label, text in zip(label_choices, text_choices)], | ||
"context": None, | ||
"answer": [f'{d["answer"]}. {text_choices[ord(d["answer"])-65]}'], | ||
"meta": {} | ||
} |