Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #83 | Implement Dataloader for GlobalWoZ #261

Merged
merged 6 commits into from
Mar 4, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 244 additions & 0 deletions seacrowd/sea_datasets/globalwoz/globalwoz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
import os
import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks, Licenses

_CITATION = """\
@inproceedings{ding-etal-2022-globalwoz,
title = "{G}lobal{W}o{Z}: Globalizing {M}ulti{W}o{Z} to Develop Multilingual Task-Oriented Dialogue Systems",
author = "Ding, Bosheng and
Hu, Junjie and
Bing, Lidong and
Aljunied, Mahani and
Joty, Shafiq and
Si, Luo and
Miao, Chunyan",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = may,
year = "2022",
}
"""

_DATASETNAME = "[globalwoz]"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

keep this as string w/o any non word char


_DESCRIPTION = """\
This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022. The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th), and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries.
"""


_HOMEPAGE = "https://github.com/bosheng2020/globalwoz"


_LANGUAGES = ["ind", "tha", "vie"]

_LICENSE = Licenses.UNKNOWN.value

_LOCAL = True

_URLS = {}

_SUPPORTED_TASKS = [Tasks.E2E_TASK_ORIENTED_DIALOGUE]

_SOURCE_VERSION = "2.0.0"

_SEACROWD_VERSION = "1.0.0"


def seacrowd_config_constructor(dial_type, lang, schema, version):
if dial_type not in ["EandF", "FandE", "FandF"]:
raise ValueError(f"Invalid dialogue type {dial_type}")

if lang == "":
raise ValueError(f"Invalid lang {lang}")

if schema not in ["source", "seacrowd_tod"]:
raise ValueError(f"Invalid schema: {schema}")

return SEACrowdConfig(
name="globalwoz_{dial_type}_{lang}_{schema}".format(dial_type=dial_type, lang=lang, schema=schema),
version=datasets.Version(version),
description="GlobalWoZ schema for {schema}: {dial_type}_{lang}".format(schema=schema, dial_type=dial_type, lang=lang),
schema=schema,
subset_id="globalwoz_{dial_type}_{lang}".format(dial_type=dial_type, lang=lang),
)


class GlobalWoZ(datasets.GeneratorBasedBuilder):
"""This is the data of the paper “GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems” accepted by ACL 2022.
The dataset contains several sub-datasets in 20 languages and 3 schemes (F&E, E&F, F&F), including Indonesian (id), Thai (th),
and Vietnamese (vi) language. The method is based on translating dialogue templates and filling them with local entities in the target language countries.
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

BUILDER_CONFIGS = [
seacrowd_config_constructor("EandF", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("EandF", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("EandF", "vi", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "vi", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "vi", "seacrowd_tod", _SEACROWD_VERSION),
]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mind replacing this by using itertools.product and iterating over it rather than having these expressions?

smth like this:

Suggested change
BUILDER_CONFIGS = [
seacrowd_config_constructor("EandF", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandE", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "id", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "th", "source", _SOURCE_VERSION),
seacrowd_config_constructor("FandF", "vi", "source", _SOURCE_VERSION),
seacrowd_config_constructor("EandF", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("EandF", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("EandF", "vi", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandE", "vi", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "id", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "th", "seacrowd_tod", _SEACROWD_VERSION),
seacrowd_config_constructor("FandF", "vi", "seacrowd_tod", _SEACROWD_VERSION),
]
BUILDER_CONFIGS = [
seacrowd_config_constructor(tod_format, lang, schema, _SOURCE_VERSION if schema == "source" else _SEACROWD_VERSION) for tod_format, lang, schema in itertools.product(("EandE", "EandF", "FandF"), ("id", "th", "vi"), ("source", "seacrowd_tod")]


DEFAULT_CONFIG_NAME = "globalwoz_EandF_id_source"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since DEFAULT_CONFIG_NAME is actually optional, do you think it's still make sense to assign a default config to it?


def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
features = datasets.Features(
{
"id": datasets.Value("string"),
"goal": {
"attraction": datasets.Value("string"),
"hospital": datasets.Value("string"),
"hotel": datasets.Value("string"),
"police": datasets.Value("string"),
"restaurant": datasets.Value("string"),
"taxi": datasets.Value("string"),
"train": datasets.Value("string"),
},
"log": [
{
"dialog_act": datasets.Value("string"),
"metadata": datasets.Value("string"),
"span_info": [[datasets.Value("string")]],
"text": datasets.Value("string"),
}
],
}
)

elif self.config.schema == "seacrowd_tod":
features = schemas.tod_features
else:
raise NotImplementedError()

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
_split_generators = []

type_and_lang = {"dial_type": self.config.subset_id.split("_")[1].replace("and", "&"), "lang": self.config.subset_id.split("_")[2]} # globalwoz_{dial_type}_{lang}

if self.config.data_dir is None:
raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
else:
data_dir = self.config.data_dir

if not os.path.exists(os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json")):
raise FileNotFoundError()

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
# "filepath": data_dir + f"_{type_and_lang['dial_type']}_{type_and_lang['lang']}.json",
"filepath": os.path.join(data_dir, f"{type_and_lang['dial_type']}_{type_and_lang['lang']}.json"),
"split": "train",
},
),
]

def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
# For local datasets you will have access to self.config.data_dir and self.config.data_files
with open(filepath, "r+", encoding="utf8") as fw:
data = json.load(fw)

if self.config.schema == "source":
for idx, tod_dialogue in enumerate(data.values()):
example = {}
example["id"] = str(idx)
example["goal"] = {}

for goal_key in ["attraction", "hospital", "hotel", "police", "restaurant", "taxi", "train"]:
example["goal"][goal_key] = json.dumps(tod_dialogue["goal"][goal_key])
example["log"] = []

for dial_log in tod_dialogue["log"]:
dial = {}
dial["dialog_act"] = json.dumps(dial_log["dialog_act"])
dial["metadata"] = json.dumps(dial_log["metadata"])
for i in range(len(dial_log["span_info"])):
for j in range(len(dial_log["span_info"][i])):
dial_log["span_info"][i][j] = str(dial_log["span_info"][i][j]) # casting to str
dial["span_info"] = [[str(span)] if isinstance(span, str) else span for span in dial_log["span_info"]]
dial["text"] = dial_log["text"]

example["log"].append(dial)

yield example["id"], example

elif self.config.schema == "seacrowd_tod":
for idx, tod_dialogue in enumerate(data.values()):
example = {}
example["dialogue_idx"] = idx

dialogue = []
# NOTE: the dialogue always started with `user` as first utterance
for turn, i in enumerate(range(0, len(tod_dialogue["log"]) + 2, 2)):
dial = {}
dial["turn_idx"] = turn

# system_utterance properties
dial["system_utterance"] = ""
dial["system_acts"] = []
if turn != 0:
dial["system_utterance"] = tod_dialogue["log"][i - 1]["text"]
if i < len(tod_dialogue["log"]):
# NOTE: "system_acts will be populated with the `dialog_act` from the user utterance in the original dataset, as our schema dictates
# that `system_acts` should represent the system's intended actions based on the user's utterance."
for acts in tod_dialogue["log"][i]["dialog_act"].values():
for act in acts:
dial["system_acts"].append([act[0]])

# user_utterance properties
dial["turn_label"] = [] # left as an empty array
dial["belief_state"] = []
if i == len(tod_dialogue["log"]):
# case if turn_idx > len(dialogue) --> add dummy user_utterance
dial["user_utterance"] = ""
else:
dial["user_utterance"] = tod_dialogue["log"][i]["text"]
# NOTE: "the belief_state will be populated with the `span_info` from the user utterance in the original dataset, as our schema dictates
# that `belief_state` should represent the system's belief state based on the user's utterance."
for span in tod_dialogue["log"][i]["span_info"]:
if span[0].split("-")[1] == "request": # Request action
dial["belief_state"].append({"slots": [["slot", span[1]]], "act": "request"})
else:
dial["belief_state"].append({"slots": [[span[1], span[2]]], "act": span[0].split("-")[1]})

# append to dialogue
dialogue.append(dial)

example["dialogue"] = dialogue

yield example["dialogue_idx"], example