SEACrowd · SamuelCahyawijaya · Jan 3, 2024 · Dec 21, 2023 · Dec 26, 2023 · Dec 26, 2023
diff --git a/seacrowd/sea_datasets/phoatis_intent_cls/__init__.py b/seacrowd/sea_datasets/phoatis_intent_cls/__init__.py
diff --git a/seacrowd/sea_datasets/phoatis_intent_cls/intent_label.txt b/seacrowd/sea_datasets/phoatis_intent_cls/intent_label.txt
@@ -0,0 +1,29 @@
+UNK
+abbreviation
+aircraft
+aircraft#flight
+aircraft#flight#flight_no
+airfare
+airfare#flight
+airfare#flight_time
+airline
+airline#flight
+airline#flight_no
+airport
+capacity
+city
+city#flight_time
+distance
+flight
+flight#flight_no
+flight#flight_time
+flight_no
+flight_no#flight_time
+flight_time
+ground_fare
+ground_fare#ground_service
+ground_service
+meal
+quantity
+restriction
+day_name
diff --git a/seacrowd/sea_datasets/phoatis_intent_cls/phoatis_intent_cls.py b/seacrowd/sea_datasets/phoatis_intent_cls/phoatis_intent_cls.py
@@ -0,0 +1,202 @@
+from pathlib import Path
+from typing import Dict, Iterator, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+# TODO: Add BibTeX citation
+_CITATION = """\
+@article{dao2021intent,
+      title={Intent Detection and Slot Filling for Vietnamese},
+      author={Mai Hoang Dao and Thinh Hung Truong and Dat Quoc Nguyen},
+      year={2021},
+      eprint={2104.02021},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+"""
+
+_DATASETNAME = "phoatis"
+
+_DESCRIPTION = """\
+This is first public intent detection and slot filling dataset for Vietnamese. The data contains 5871 English utterances from ATIS that are manually translated by professional translators into Vietnamese.
+"""
+
+_HOMEPAGE = "https://github.com/VinAIResearch/JointIDSF/"
+
+_LICENSE = "Licenses.UNKNOWN.value"
+
+_URLS = {  # download URLs: dataset name -> subset -> "<subset>_<split>" -> [seq.in, seq.out, label]
+    _DATASETNAME: {
+        "syllable": {  # syllable-level files
+            "syllable_train": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/train/label",
+            ],
+            "syllable_dev": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/dev/label",
+            ],
+            "syllable_test": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/syllable-level/test/label",
+            ],
+        },
+        "word": {  # word-level files
+            "word_train": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/train/label",
+            ],
+            "word_dev": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/dev/label",
+            ],
+            "word_test": [
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/seq.in",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/seq.out",
+                "https://raw.githubusercontent.com/VinAIResearch/JointIDSF/main/PhoATIS/word-level/test/label",
+            ],
+        },
+    }
+}
+
+_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION]  # task tag(s) declared for this loader
+
+_SOURCE_VERSION = "1.0.0"  # version string used for the "source" schema configs
+
+_SEACROWD_VERSION = "1.0.0"  # version string used for the SEACrowd schema configs
+
+
+def config_constructor(schema: str, version: str, phoatis_subset: str = "syllable") -> SEACrowdConfig:
+    assert phoatis_subset == "syllable" or phoatis_subset == "word"
+
+    return SEACrowdConfig(
+        name="phoatis_intent_cls_{phoatis_subset}_{schema}".format(phoatis_subset=phoatis_subset.lower(), schema=schema),
+        version=version,
+        description="PhoATIS Intent Classification: {subset} {schema} schema".format(subset=phoatis_subset, schema=schema),
+        schema=schema,
+        subset_id=phoatis_subset,
+    )
+
+
+class PhoATIS(datasets.GeneratorBasedBuilder):
+    """This is first public intent detection and slot filling dataset for Vietnamese. The data contains 5871 English utterances from ATIS that are manually translated by professional translators into Vietnamese."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [config_constructor("source", _SOURCE_VERSION, schema) for schema in ["syllable", "word"]]
+    BUILDER_CONFIGS.extend([config_constructor("seacrowd_text", _SOURCE_VERSION, schema) for schema in ["syllable", "word"]])
+
+    BUILDER_CONFIGS.extend(
+        [  # Default config
+            SEACrowdConfig(
+                name="phoatis_intent_cls_source",
+                version=SOURCE_VERSION,
+                description="PhoATIS Intent Classification source schema (Syllable version)",
+                schema="source",
+                subset_id="syllable",
+            ),
+            SEACrowdConfig(
+                name="phoatis_intent_cls_seacrowd_text",
+                version=SEACROWD_VERSION,
+                description="PhoATIS Intent Classification SEACrowd schema (Syllable version)",
+                schema="seacrowd_text",
+                subset_id="syllable",
+            ),
+        ]
+    )
+
+    DEFAULT_CONFIG_NAME = "phoatis_intent_cls_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                    "intent_label": datasets.Value("string"),
+                    "slot_label": datasets.Sequence(datasets.Value("string")),
+                }
+            )
+
+        elif self.config.schema == "seacrowd_text":
+            with open(".\seacrowd\sea_datasets\phoatis_intent_cls\intent_label.txt", "r+", encoding="utf8") as fw:
+                intent_label = fw.read()
+                intent_label = intent_label.split("\n")
+            features = schemas.text_features(intent_label)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        schema = self.config.subset_id
+        urls = _URLS[_DATASETNAME][schema]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir[f"{schema}_train"],
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data_dir[f"{schema}_test"],
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data_dir[f"{schema}_dev"],
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        with open(filepath[0], "r+", encoding="utf8") as fw:
+            data_input = fw.read()
+            data_input = data_input.split("\n")
+        with open(filepath[1], "r+", encoding="utf8") as fw:
+            data_slot = fw.read()
+            data_slot = data_slot.split("\n")
+        with open(filepath[2], "r+", encoding="utf8") as fw:
+            data_intent = fw.read()
+            data_intent = data_intent.split("\n")
+
+        if self.config.schema == "source":
+            for idx, text in enumerate(data_input):
+                example = {}
+                example["id"] = str(idx)
+                example["text"] = text
+                example["intent_label"] = data_intent[idx]
+                data_slot[idx] = data_slot[idx].split()
+                example["slot_label"] = data_slot[idx]
+                yield example["id"], example
+
+        elif self.config.schema == "seacrowd_text":
+            for idx, text in enumerate(data_input):
+                example = {}
+                example["id"] = str(idx)
+                example["text"] = text
+                example["label"] = data_intent[idx]
+                yield example["id"], example
diff --git a/seacrowd/sea_datasets/phoatis_intent_cls/slot_label.txt b/seacrowd/sea_datasets/phoatis_intent_cls/slot_label.txt
@@ -0,0 +1,142 @@
+PAD
+UNK
+O
+B-aircraft_code
+B-airline_code
+B-airline_name
+I-airline_name
+B-airport_code
+B-airport_name
+I-airport_name
+B-arrive_date.date_relative
+I-arrive_date.date_relative
+B-arrive_date.day_name
+I-arrive_date.day_name
+B-arrive_date.day_number
+I-arrive_date.day_number
+B-arrive_date.month_name
+I-arrive_date.month_name
+B-arrive_date.today_relative
+B-arrive_time.end_time
+I-arrive_time.end_time
+B-arrive_time.period_mod
+I-arrive_time.period_mod
+B-arrive_time.period_of_day
+I-arrive_time.period_of_day
+B-arrive_time.start_time
+I-arrive_time.start_time
+B-arrive_time.time
+I-arrive_time.time
+B-arrive_time.time_relative
+I-arrive_time.time_relative
+B-city_name
+I-city_name
+B-class_type
+I-class_type
+B-connect
+I-connect
+B-cost_relative
+I-cost_relative
+B-day_name
+I-day_name
+B-day_number
+I-day_number
+B-days_code
+B-depart_date.date_relative
+I-depart_date.date_relative
+B-depart_date.day_name
+I-depart_date.day_name
+B-depart_date.day_number
+I-depart_date.day_number
+B-depart_date.month_name
+I-depart_date.month_name
+B-depart_date.today_relative
+I-depart_date.today_relative
+B-depart_date.year
+I-depart_date.year
+B-depart_time.end_time
+I-depart_time.end_time
+B-depart_time.period_mod
+I-depart_time.period_mod
+B-depart_time.period_of_day
+I-depart_time.period_of_day
+B-depart_time.start_time
+I-depart_time.start_time
+B-depart_time.time
+I-depart_time.time
+B-depart_time.time_relative
+I-depart_time.time_relative
+B-economy
+I-economy
+B-fare_amount
+I-fare_amount
+B-fare_basis_code
+B-flight_days
+I-flight_days
+B-flight_mod
+I-flight_mod
+B-flight_number
+B-flight_stop
+I-flight_stop
+B-flight_time
+I-flight_time
+B-fromloc.airport_code
+B-fromloc.airport_name
+I-fromloc.airport_name
+B-fromloc.city_name
+I-fromloc.city_name
+B-fromloc.state_code
+B-fromloc.state_name
+I-fromloc.state_name
+B-meal
+I-meal
+B-meal_code
+I-meal_code
+B-meal_description
+I-meal_description
+B-mod
+I-mod
+B-month_name
+B-or
+B-period_of_day
+I-period_of_day
+B-restriction_code
+I-restriction_code
+B-return_date.date_relative
+I-return_date.date_relative
+B-return_date.day_name
+I-return_date.day_name
+B-return_date.day_number
+I-return_date.day_number
+B-return_date.month_name
+I-return_date.month_name
+B-return_date.today_relative
+I-return_date.today_relative
+B-return_time.period_mod
+B-return_time.period_of_day
+I-return_time.period_of_day
+B-round_trip
+I-round_trip
+B-state_code
+B-state_name
+B-stoploc.airport_name
+B-stoploc.city_name
+I-stoploc.city_name
+B-stoploc.state_code
+B-time
+I-time
+B-time_relative
+B-today_relative
+I-today_relative
+B-toloc.airport_code
+B-toloc.airport_name
+I-toloc.airport_name
+B-toloc.city_name
+I-toloc.city_name
+B-toloc.country_name
+I-toloc.country_name
+B-toloc.state_code
+B-toloc.state_name
+I-toloc.state_name
+B-transport_type
+I-transport_type
diff --git a/seacrowd/sea_datasets/phoatis_slot_filling/__init__.py b/seacrowd/sea_datasets/phoatis_slot_filling/__init__.py