From e8fa072635ebdc5589af4dccb751b7620a3e0564 Mon Sep 17 00:00:00 2001 From: Muhammad Dehan Al Kautsar <68471412+dehanalkautsar@users.noreply.github.com> Date: Fri, 19 Jan 2024 02:23:05 +0700 Subject: [PATCH] Closes #53 | Implement dataloader for IndoCamRest (#257) * feat: add dataloader indocamrest for source * refactor: indocamrest by pre-commit * remove __name__:__main__ on indocamrest * fix the license constant in IndoCamrest --- .../sea_datasets/indocamrest/indocamrest.py | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 seacrowd/sea_datasets/indocamrest/indocamrest.py diff --git a/seacrowd/sea_datasets/indocamrest/indocamrest.py b/seacrowd/sea_datasets/indocamrest/indocamrest.py new file mode 100644 index 000000000..b2ed78b7d --- /dev/null +++ b/seacrowd/sea_datasets/indocamrest/indocamrest.py @@ -0,0 +1,163 @@ +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@article{kautsar2023indotod, + author={Kautsar, Muhammad Dehan Al and Nurdini, Rahmah Khoirussyifa' and Cahyawijaya, Samuel and Winata, Genta Indra and Purwarianti, Ayu}, + title={IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems}, + journal={arXiv preprint arXiv:2311.00958}, + year={2023}, +} +""" + +_LANGUAGES = ["ind"] +_LOCAL = False + +_DATASETNAME = "indocamrest" + +_DESCRIPTION = """\ +IndoCamRest is a synthetic task-oriented dialogue system dataset that translated from Cambridge Restaurant 676 (CamRest) dataset (Wen et al., 2016) into the new Indonesian parallel dataset using the translation pipeline method including the delexicalization, translation, and delexicalization. +The dataset consists of 676 dialogues in the restaurant reservation domain, with a user and an agent talking to each other to search the restaurant near the user. +It also consists of slots and dialogue acts from the user and the agent. +""" + +_HOMEPAGE = "https://github.com/dehanalkautsar/IndoToD/tree/main/IndoCamRest" + +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_URLS = { + _DATASETNAME: "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoCamRest/IndoCamRest676.json", +} + +_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class IndoCamRest(datasets.GeneratorBasedBuilder): + """IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="indocamrest_source", + version=SOURCE_VERSION, + description="IndoToD: IndoCamRest source schema", + schema="source", + subset_id="indocamrest", + ), + SEACrowdConfig( + name="indocamrest_seacrowd_tod", + version=SEACROWD_VERSION, + description="IndoToD: IndoCamRest SEACrowd End-to-end Task Oriented Dialogue schema", + schema="seacrowd_tod", + subset_id="indocamrest", + ), + ] + + DEFAULT_CONFIG_NAME = "indocamrest_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "index": datasets.Value("string"), + "dialogue_id": datasets.Value("int32"), + "finished": datasets.Value("string"), + "goal": {"constraints": [[datasets.Value("string")]], "request-slots": [datasets.Value("string")], "text": datasets.Value("string")}, + "dial": [ + { + "turn": datasets.Value("int32"), + "usr": { + "transcript": datasets.Value("string"), + "delex_transcript": datasets.Value("string"), + "slu": [{"act": datasets.Value("string"), "slots": [[datasets.Value("string")]]}], + }, + "sys": {"sent": datasets.Value("string"), "delex_sent": datasets.Value("string"), "DA": [datasets.Value("string")]}, + } + ], + } + ) + elif self.config.schema == "seacrowd_tod": + features = schemas.tod_features + else: + raise NotImplementedError(f"Schema {self.config.schema} has not been implemented") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath, "r+") as fw: + data = json.loads(fw.read()) + + if self.config.schema == "source": + for idx, example in enumerate(data): + example["index"] = str(idx) + yield str(idx), example + + elif self.config.schema == "seacrowd_tod": + for idx, tod_dialogue in enumerate(data): + example = {} + example["dialogue_idx"] = idx + + dialogue = [] + for i in range(len(tod_dialogue["dial"]) + 1): + dial = {} + dial["turn_idx"] = i + + # system_utterance properties + if i == 0: + # case if turn_idx == 0 + dial["system_utterance"] = "" + dial["system_acts"] = [] + else: + dial["system_utterance"] = tod_dialogue["dial"][i - 1]["sys"]["sent"] + # some system_acts is either to string or list of strings, + # converting all to list of strings + dial["system_acts"] = [[act] if isinstance(act, str) else act for act in tod_dialogue["dial"][i - 1]["sys"]["DA"]] + + # user_utterance properties + dial["turn_label"] = [] + dial["belief_state"] = [] + if i == len(tod_dialogue["dial"]): + # case if turn_idx > len(dialogue) --> add dummy user_utterance + dial["user_utterance"] = "" + else: + dial["user_utterance"] = tod_dialogue["dial"][i]["usr"]["transcript"] + for j in range(len(tod_dialogue["dial"][i]["usr"]["slu"])): + dial["belief_state"].append({"slots": tod_dialogue["dial"][i]["usr"]["slu"][j]["slots"], "act": tod_dialogue["dial"][i]["usr"]["slu"][j]["act"]}) + + # append to dialogue + dialogue.append(dial) + example["dialogue"] = dialogue + yield str(idx), example