Skip to content

Commit

Permalink
Closes SEACrowd#53 | Implement dataloader for IndoCamRest (SEACrowd#257)
Browse files Browse the repository at this point in the history
* feat: add dataloader indocamrest for source

* refactor: indocamrest by pre-commit

* remove __name__:__main__ on indocamrest

* fix the license constant in IndoCamrest
  • Loading branch information
dehanalkautsar authored and Railey Montalan committed Feb 28, 2024
1 parent 0846ae6 commit e8fa072
Showing 1 changed file with 163 additions and 0 deletions.
163 changes: 163 additions & 0 deletions seacrowd/sea_datasets/indocamrest/indocamrest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks, Licenses

_CITATION = """\
@article{kautsar2023indotod,
author={Kautsar, Muhammad Dehan Al and Nurdini, Rahmah Khoirussyifa' and Cahyawijaya, Samuel and Winata, Genta Indra and Purwarianti, Ayu},
title={IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems},
journal={arXiv preprint arXiv:2311.00958},
year={2023},
}
"""

_LANGUAGES = ["ind"]
_LOCAL = False

_DATASETNAME = "indocamrest"

_DESCRIPTION = """\
IndoCamRest is a synthetic task-oriented dialogue system dataset that translated from Cambridge Restaurant 676 (CamRest) dataset (Wen et al., 2016) into the new Indonesian parallel dataset using the translation pipeline method including the delexicalization, translation, and delexicalization.
The dataset consists of 676 dialogues in the restaurant reservation domain, with a user and an agent talking to each other to search the restaurant near the user.
It also consists of slots and dialogue acts from the user and the agent.
"""

_HOMEPAGE = "https://github.com/dehanalkautsar/IndoToD/tree/main/IndoCamRest"

_LICENSE = Licenses.CC_BY_SA_4_0.value

_URLS = {
_DATASETNAME: "https://raw.githubusercontent.com/dehanalkautsar/IndoToD/main/IndoCamRest/IndoCamRest676.json",
}

_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class IndoCamRest(datasets.GeneratorBasedBuilder):
"""IndoToD: A Multi-Domain Indonesian Benchmark For End-to-End Task-Oriented Dialogue Systems"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

BUILDER_CONFIGS = [
SEACrowdConfig(
name="indocamrest_source",
version=SOURCE_VERSION,
description="IndoToD: IndoCamRest source schema",
schema="source",
subset_id="indocamrest",
),
SEACrowdConfig(
name="indocamrest_seacrowd_tod",
version=SEACROWD_VERSION,
description="IndoToD: IndoCamRest SEACrowd End-to-end Task Oriented Dialogue schema",
schema="seacrowd_tod",
subset_id="indocamrest",
),
]

DEFAULT_CONFIG_NAME = "indocamrest_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
features = datasets.Features(
{
"index": datasets.Value("string"),
"dialogue_id": datasets.Value("int32"),
"finished": datasets.Value("string"),
"goal": {"constraints": [[datasets.Value("string")]], "request-slots": [datasets.Value("string")], "text": datasets.Value("string")},
"dial": [
{
"turn": datasets.Value("int32"),
"usr": {
"transcript": datasets.Value("string"),
"delex_transcript": datasets.Value("string"),
"slu": [{"act": datasets.Value("string"), "slots": [[datasets.Value("string")]]}],
},
"sys": {"sent": datasets.Value("string"), "delex_sent": datasets.Value("string"), "DA": [datasets.Value("string")]},
}
],
}
)
elif self.config.schema == "seacrowd_tod":
features = schemas.tod_features
else:
raise NotImplementedError(f"Schema {self.config.schema} has not been implemented")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
urls = _URLS[_DATASETNAME]
data_dir = dl_manager.download_and_extract(urls)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": data_dir,
"split": "train",
},
),
]

def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
with open(filepath, "r+") as fw:
data = json.loads(fw.read())

if self.config.schema == "source":
for idx, example in enumerate(data):
example["index"] = str(idx)
yield str(idx), example

elif self.config.schema == "seacrowd_tod":
for idx, tod_dialogue in enumerate(data):
example = {}
example["dialogue_idx"] = idx

dialogue = []
for i in range(len(tod_dialogue["dial"]) + 1):
dial = {}
dial["turn_idx"] = i

# system_utterance properties
if i == 0:
# case if turn_idx == 0
dial["system_utterance"] = ""
dial["system_acts"] = []
else:
dial["system_utterance"] = tod_dialogue["dial"][i - 1]["sys"]["sent"]
# some system_acts is either to string or list of strings,
# converting all to list of strings
dial["system_acts"] = [[act] if isinstance(act, str) else act for act in tod_dialogue["dial"][i - 1]["sys"]["DA"]]

# user_utterance properties
dial["turn_label"] = []
dial["belief_state"] = []
if i == len(tod_dialogue["dial"]):
# case if turn_idx > len(dialogue) --> add dummy user_utterance
dial["user_utterance"] = ""
else:
dial["user_utterance"] = tod_dialogue["dial"][i]["usr"]["transcript"]
for j in range(len(tod_dialogue["dial"][i]["usr"]["slu"])):
dial["belief_state"].append({"slots": tod_dialogue["dial"][i]["usr"]["slu"][j]["slots"], "act": tod_dialogue["dial"][i]["usr"]["slu"][j]["act"]})

# append to dialogue
dialogue.append(dial)
example["dialogue"] = dialogue
yield str(idx), example

0 comments on commit e8fa072

Please sign in to comment.