From 0b8f3d5caddeca43e5167099ec981399e9be4b06 Mon Sep 17 00:00:00 2001 From: Muhammad Dehan Al Kautsar <68471412+dehanalkautsar@users.noreply.github.com> Date: Tue, 5 Mar 2024 00:51:04 +0700 Subject: [PATCH] Closes #83 | Implement Dataloader for GlobalWoZ (#261) * refactor by pre-commit * reformatted by pre-commit * refactor code for globalwoz --- seacrowd/sea_datasets/globalwoz/__init__.py | 0 seacrowd/sea_datasets/globalwoz/globalwoz.py | 226 +++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 seacrowd/sea_datasets/globalwoz/__init__.py create mode 100644 seacrowd/sea_datasets/globalwoz/globalwoz.py diff --git a/seacrowd/sea_datasets/globalwoz/__init__.py b/seacrowd/sea_datasets/globalwoz/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/globalwoz/globalwoz.py b/seacrowd/sea_datasets/globalwoz/globalwoz.py new file mode 100644 index 000000000..d612eff04 --- /dev/null +++ b/seacrowd/sea_datasets/globalwoz/globalwoz.py @@ -0,0 +1,226 @@ +import os +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import itertools + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses + +_CITATION = """\ +@inproceedings{ding-etal-2022-globalwoz, + title = "{G}lobal{W}o{Z}: Globalizing {M}ulti{W}o{Z} to Develop Multilingual Task-Oriented Dialogue Systems", + author = "Ding, Bosheng and + Hu, Junjie and + Bing, Lidong and + Aljunied, Mahani and + Joty, Shafiq and + Si, Luo and + Miao, Chunyan", + editor = "Muresan, Smaranda and + Nakov, Preslav and + Villavicencio, Aline", + booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = may, + year = "2022", +} +""" + +_DATASETNAME = "globalwoz" + +_DESCRIPTION = """\ +This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022. The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th), and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries. +""" + + +_HOMEPAGE = "https://github.com/bosheng2020/globalwoz" + + +_LANGUAGES = ["ind", "tha", "vie"] + +_LICENSE = Licenses.UNKNOWN.value + +_LOCAL = True + +_URLS = {} + +_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE] + +_SOURCE_VERSION = "2.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +def seacrowd_config_constructor(dial_type, lang, schema, version): + if dial_type not in ["EandF", "FandE", "FandF"]: + raise ValueError(f"Invalid dialogue type {dial_type}") + + if lang == "": + raise ValueError(f"Invalid lang {lang}") + + if schema not in ["source", "seacrowd_tod"]: + raise ValueError(f"Invalid schema: {schema}") + + return SEACrowdConfig( + name="globalwoz_{dial_type}_{lang}_{schema}".format(dial_type=dial_type, lang=lang, schema=schema), + version=datasets.Version(version), + description="GlobalWoZ schema for {schema}: {dial_type}_{lang}".format(schema=schema, dial_type=dial_type, lang=lang), + schema=schema, + subset_id="globalwoz_{dial_type}_{lang}".format(dial_type=dial_type, lang=lang), + ) + + +class GlobalWoZ(datasets.GeneratorBasedBuilder): + """This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022. + The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th), + and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + seacrowd_config_constructor(tod_format, lang, schema, _SOURCE_VERSION if schema == "source" else _SEACROWD_VERSION) for tod_format, lang, schema in itertools.product(("EandF", "FandE", "FandF"), ("id", "th", "vi"), ("source", "seacrowd_tod")) + ] + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "id": datasets.Value("string"), + "goal": { + "attraction": datasets.Value("string"), + "hospital": datasets.Value("string"), + "hotel": datasets.Value("string"), + "police": datasets.Value("string"), + "restaurant": datasets.Value("string"), + "taxi": datasets.Value("string"), + "train": datasets.Value("string"), + }, + "log": [ + { + "dialog_act": datasets.Value("string"), + "metadata": datasets.Value("string"), + "span_info": [[datasets.Value("string")]], + "text": datasets.Value("string"), + } + ], + } + ) + + elif self.config.schema == "seacrowd_tod": + features = schemas.tod_features + else: + raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + _split_generators = [] + + type_and_lang = {"dial_type": self.config.subset_id.split("_")[1].replace("and", "&"), "lang": self.config.subset_id.split("_")[2]} # globalwoz_{dial_type}_{lang} + + if self.config.data_dir is None: + raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + else: + data_dir = self.config.data_dir + + if not os.path.exists(os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json")): + raise FileNotFoundError() + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + # "filepath": data_dir + f"_{type_and_lang['dial_type']}_{type_and_lang['lang']}.json", + "filepath": os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # For local datasets you will have access to self.config.data_dir and self.config.data_files + with open(filepath, "r+", encoding="utf8") as fw: + data = json.load(fw) + + if self.config.schema == "source": + for idx, tod_dialogue in enumerate(data.values()): + example = {} + example["id"] = str(idx) + example["goal"] = {} + + for goal_key in ["attraction", "hospital", "hotel", "police", "restaurant", "taxi", "train"]: + example["goal"][goal_key] = json.dumps(tod_dialogue["goal"][goal_key]) + example["log"] = [] + + for dial_log in tod_dialogue["log"]: + dial = {} + dial["dialog_act"] = json.dumps(dial_log["dialog_act"]) + dial["metadata"] = json.dumps(dial_log["metadata"]) + for i in range(len(dial_log["span_info"])): + for j in range(len(dial_log["span_info"][i])): + dial_log["span_info"][i][j] = str(dial_log["span_info"][i][j]) # casting to str + dial["span_info"] = [[str(span)] if isinstance(span, str) else span for span in dial_log["span_info"]] + dial["text"] = dial_log["text"] + + example["log"].append(dial) + + yield example["id"], example + + elif self.config.schema == "seacrowd_tod": + for idx, tod_dialogue in enumerate(data.values()): + example = {} + example["dialogue_idx"] = idx + + dialogue = [] + # NOTE: the dialogue always started with `user` as first utterance + for turn, i in enumerate(range(0, len(tod_dialogue["log"]) + 2, 2)): + dial = {} + dial["turn_idx"] = turn + + # system_utterance properties + dial["system_utterance"] = "" + dial["system_acts"] = [] + if turn != 0: + dial["system_utterance"] = tod_dialogue["log"][i - 1]["text"] + if i < len(tod_dialogue["log"]): + # NOTE: "system_acts will be populated with the `dialog_act` from the user utterance in the original dataset, as our schema dictates + # that `system_acts` should represent the system's intended actions based on the user's utterance." + for acts in tod_dialogue["log"][i]["dialog_act"].values(): + for act in acts: + dial["system_acts"].append([act[0]]) + + # user_utterance properties + dial["turn_label"] = [] # left as an empty array + dial["belief_state"] = [] + if i == len(tod_dialogue["log"]): + # case if turn_idx > len(dialogue) --> add dummy user_utterance + dial["user_utterance"] = "" + else: + dial["user_utterance"] = tod_dialogue["log"][i]["text"] + # NOTE: "the belief_state will be populated with the `span_info` from the user utterance in the original dataset, as our schema dictates + # that `belief_state` should represent the system's belief state based on the user's utterance." + for span in tod_dialogue["log"][i]["span_info"]: + if span[0].split("-")[1] == "request": # Request action + dial["belief_state"].append({"slots": [["slot", span[1]]], "act": "request"}) + else: + dial["belief_state"].append({"slots": [[span[1], span[2]]], "act": span[0].split("-")[1]}) + + # append to dialogue + dialogue.append(dial) + + example["dialogue"] = dialogue + + yield example["dialogue_idx"], example