Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing Schema: End-to-end Task-oriented Dialogue System (WoZ) #237

Merged
merged 14 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
29 changes: 29 additions & 0 deletions seacrowd/sea_datasets/phoatis_intent_cls/intent_label.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
UNK
abbreviation
aircraft
aircraft#flight
aircraft#flight#flight_no
airfare
airfare#flight
airfare#flight_time
airline
airline#flight
airline#flight_no
airport
capacity
city
city#flight_time
distance
flight
flight#flight_no
flight#flight_time
flight_no
flight_no#flight_time
flight_time
ground_fare
ground_fare#ground_service
ground_service
meal
quantity
restriction
day_name
202 changes: 202 additions & 0 deletions seacrowd/sea_datasets/phoatis_intent_cls/phoatis_intent_cls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

# BibTeX entry of the paper that introduced PhoATIS.
_CITATION = """\
@article{dao2021intent,
title={Intent Detection and Slot Filling for Vietnamese},
author={Mai Hoang Dao and Thinh Hung Truong and Dat Quoc Nguyen},
year={2021},
eprint={2104.02021},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Key under which this dataset's download URLs are stored in _URLS.
_DATASETNAME = "phoatis"

_DESCRIPTION = """\
This is first public intent detection and slot filling dataset for Vietnamese. The data contains 5871 English utterances from ATIS that are manually translated by professional translators into Vietnamese.
"""

_HOMEPAGE = "https://github.com/VinAIResearch/JointIDSF/"

# Use the enum's value; the previous quoted form stored the literal text
# "Licenses.UNKNOWN.value" as the dataset licence.
_LICENSE = Licenses.UNKNOWN.value

# Download locations, shaped as
#   {dataset name: {level: {"<level>_<split>": [seq.in URL, seq.out URL, label URL]}}}
# for both tokenization levels ("syllable", "word") and the three splits
# ("train", "dev", "test"). The order of the three files in each list matters:
# index 0 is the utterances, index 1 the slot tags, index 2 the intent labels.
_URLS = {
    _DATASETNAME: {
        level: {
            f"{level}_{split}": [
                f"https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/{level}-level/{split}/{file_name}"
                for file_name in ("seq.in", "seq.out", "label")
            ]
            for split in ("train", "dev", "test")
        }
        for level in ("syllable", "word")
    }
}

# Tasks this dataloader declares support for: intent classification only.
_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION]

# Version string for the source schema; the builder class wraps it in
# datasets.Version as SOURCE_VERSION.
_SOURCE_VERSION = "1.0.0"

# Version string for the SEACrowd schema; the builder class wraps it in
# datasets.Version as SEACROWD_VERSION.
_SEACROWD_VERSION = "1.0.0"


def config_constructor(schema: str, version: str, phoatis_subset: str = "syllable") -> SEACrowdConfig:
    """Build the SEACrowdConfig for one (subset, schema) combination.

    Args:
        schema: Schema name; it is stored on the config and embedded in the
            config name (e.g. "source" or "seacrowd_text").
        version: Version passed through to the config unchanged.
        phoatis_subset: Tokenization level of the data, either "syllable" or
            "word". It becomes the config's ``subset_id``.

    Returns:
        A config named ``phoatis_intent_cls_{phoatis_subset}_{schema}``.

    Raises:
        ValueError: If ``phoatis_subset`` is neither "syllable" nor "word".
    """
    # Explicit check instead of `assert`, which is removed under `python -O`
    # and would let an invalid subset through silently.
    if phoatis_subset not in ("syllable", "word"):
        raise ValueError(f"phoatis_subset must be 'syllable' or 'word', got {phoatis_subset!r}")

    return SEACrowdConfig(
        name=f"phoatis_intent_cls_{phoatis_subset}_{schema}",
        version=version,
        description=f"PhoATIS Intent Classification: {phoatis_subset} {schema} schema",
        schema=schema,
        subset_id=phoatis_subset,
    )


class PhoATIS(datasets.GeneratorBasedBuilder):
    """Intent-classification dataloader for PhoATIS.

    PhoATIS is the first public intent detection and slot filling dataset for
    Vietnamese: 5871 English ATIS utterances manually translated into
    Vietnamese by professional translators. Each config selects a tokenization
    level through ``subset_id`` ("syllable" or "word") and a schema ("source"
    or "seacrowd_text").
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # One explicit config per (schema, subset) pair. The SEACrowd-schema
    # configs carry the SEACrowd version rather than the source version.
    BUILDER_CONFIGS = [config_constructor("source", _SOURCE_VERSION, subset) for subset in ("syllable", "word")]
    BUILDER_CONFIGS.extend([config_constructor("seacrowd_text", _SEACROWD_VERSION, subset) for subset in ("syllable", "word")])

    BUILDER_CONFIGS.extend(
        [  # Default configs: names without a subset resolve to the syllable-level data.
            SEACrowdConfig(
                name="phoatis_intent_cls_source",
                version=SOURCE_VERSION,
                description="PhoATIS Intent Classification source schema (Syllable version)",
                schema="source",
                subset_id="syllable",
            ),
            SEACrowdConfig(
                name="phoatis_intent_cls_seacrowd_text",
                version=SEACROWD_VERSION,
                description="PhoATIS Intent Classification SEACrowd schema (Syllable version)",
                schema="seacrowd_text",
                subset_id="syllable",
            ),
        ]
    )

    DEFAULT_CONFIG_NAME = "phoatis_intent_cls_source"

    @staticmethod
    def _read_lines(path) -> List[str]:
        """Return the lines of a UTF-8 text file, dropping trailing blank lines.

        A file that ends with a newline would otherwise contribute one empty
        final entry, which shows up as a phantom example or an empty label.
        """
        # Plain read mode: the file is never written, so write access must not be required.
        with open(path, "r", encoding="utf8") as handle:
            lines = handle.read().split("\n")
        while lines and not lines[-1].strip():
            lines.pop()
        return lines

    def _load_intent_labels(self) -> List[str]:
        """Return the intent label names listed in ``intent_label.txt``.

        The file is looked up next to this module first, then at its
        repository-relative location under the current working directory.

        Raises:
            FileNotFoundError: If the label file is in neither location.
        """
        candidates = (
            Path(__file__).resolve().parent / "intent_label.txt",
            # Built from components so the path is valid on every OS.
            Path("seacrowd") / "sea_datasets" / "phoatis_intent_cls" / "intent_label.txt",
        )
        for candidate in candidates:
            if candidate.is_file():
                return [label for label in self._read_lines(candidate) if label]
        searched = ", ".join(str(candidate) for candidate in candidates)
        raise FileNotFoundError(f"intent_label.txt not found; looked in: {searched}")

    def _info(self) -> datasets.DatasetInfo:
        """Describe the features of the selected schema.

        Raises:
            ValueError: If the config's schema is not "source" or "seacrowd_text".
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "intent_label": datasets.Value("string"),
                    "slot_label": datasets.Sequence(datasets.Value("string")),
                }
            )
        elif self.config.schema == "seacrowd_text":
            features = schemas.text_features(self._load_intent_labels())
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the files of the configured subset and declare its splits.

        Each split receives, as ``filepath``, the downloaded paths for the
        three URLs listed under ``"<subset>_<split>"`` in ``_URLS``.
        """
        subset = self.config.subset_id
        urls = _URLS[_DATASETNAME][subset]
        data_dir = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_dir[f"{subset}_train"],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": data_dir[f"{subset}_test"],
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": data_dir[f"{subset}_dev"],
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath: List[str], split: str) -> Iterator[Tuple[str, Dict]]:
        """Yield ``(id, example)`` pairs for one split.

        Args:
            filepath: Three paths, in order: the utterance file, the slot-tag
                file and the intent-label file. The files are line-aligned.
            split: Split name; accepted for the gen_kwargs contract but not used.

        Raises:
            ValueError: If the files needed by the selected schema do not have
                the same number of lines.
        """
        utterances = self._read_lines(filepath[0])
        intents = self._read_lines(filepath[2])
        if len(utterances) != len(intents):
            raise ValueError(f"Misaligned PhoATIS files: {len(utterances)} utterances but {len(intents)} intent labels")

        if self.config.schema == "source":
            slot_lines = self._read_lines(filepath[1])
            if len(slot_lines) != len(utterances):
                raise ValueError(f"Misaligned PhoATIS files: {len(utterances)} utterances but {len(slot_lines)} slot lines")
            for idx, (text, intent, slots) in enumerate(zip(utterances, intents, slot_lines)):
                key = str(idx)
                yield key, {
                    "id": key,
                    "text": text,
                    "intent_label": intent,
                    # One whitespace-separated slot tag per token of the utterance.
                    "slot_label": slots.split(),
                }

        elif self.config.schema == "seacrowd_text":
            for idx, (text, intent) in enumerate(zip(utterances, intents)):
                key = str(idx)
                yield key, {"id": key, "text": text, "label": intent}
142 changes: 142 additions & 0 deletions seacrowd/sea_datasets/phoatis_intent_cls/slot_label.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
PAD
UNK
O
B-aircraft_code
B-airline_code
B-airline_name
I-airline_name
B-airport_code
B-airport_name
I-airport_name
B-arrive_date.date_relative
I-arrive_date.date_relative
B-arrive_date.day_name
I-arrive_date.day_name
B-arrive_date.day_number
I-arrive_date.day_number
B-arrive_date.month_name
I-arrive_date.month_name
B-arrive_date.today_relative
B-arrive_time.end_time
I-arrive_time.end_time
B-arrive_time.period_mod
I-arrive_time.period_mod
B-arrive_time.period_of_day
I-arrive_time.period_of_day
B-arrive_time.start_time
I-arrive_time.start_time
B-arrive_time.time
I-arrive_time.time
B-arrive_time.time_relative
I-arrive_time.time_relative
B-city_name
I-city_name
B-class_type
I-class_type
B-connect
I-connect
B-cost_relative
I-cost_relative
B-day_name
I-day_name
B-day_number
I-day_number
B-days_code
B-depart_date.date_relative
I-depart_date.date_relative
B-depart_date.day_name
I-depart_date.day_name
B-depart_date.day_number
I-depart_date.day_number
B-depart_date.month_name
I-depart_date.month_name
B-depart_date.today_relative
I-depart_date.today_relative
B-depart_date.year
I-depart_date.year
B-depart_time.end_time
I-depart_time.end_time
B-depart_time.period_mod
I-depart_time.period_mod
B-depart_time.period_of_day
I-depart_time.period_of_day
B-depart_time.start_time
I-depart_time.start_time
B-depart_time.time
I-depart_time.time
B-depart_time.time_relative
I-depart_time.time_relative
B-economy
I-economy
B-fare_amount
I-fare_amount
B-fare_basis_code
B-flight_days
I-flight_days
B-flight_mod
I-flight_mod
B-flight_number
B-flight_stop
I-flight_stop
B-flight_time
I-flight_time
B-fromloc.airport_code
B-fromloc.airport_name
I-fromloc.airport_name
B-fromloc.city_name
I-fromloc.city_name
B-fromloc.state_code
B-fromloc.state_name
I-fromloc.state_name
B-meal
I-meal
B-meal_code
I-meal_code
B-meal_description
I-meal_description
B-mod
I-mod
B-month_name
B-or
B-period_of_day
I-period_of_day
B-restriction_code
I-restriction_code
B-return_date.date_relative
I-return_date.date_relative
B-return_date.day_name
I-return_date.day_name
B-return_date.day_number
I-return_date.day_number
B-return_date.month_name
I-return_date.month_name
B-return_date.today_relative
I-return_date.today_relative
B-return_time.period_mod
B-return_time.period_of_day
I-return_time.period_of_day
B-round_trip
I-round_trip
B-state_code
B-state_name
B-stoploc.airport_name
B-stoploc.city_name
I-stoploc.city_name
B-stoploc.state_code
B-time
I-time
B-time_relative
B-today_relative
I-today_relative
B-toloc.airport_code
B-toloc.airport_name
I-toloc.airport_name
B-toloc.city_name
I-toloc.city_name
B-toloc.country_name
I-toloc.country_name
B-toloc.state_code
B-toloc.state_name
I-toloc.state_name
B-transport_type
I-transport_type
Empty file.
Loading