Closes #115 | Create dataset loader for PhoMT dataset (#489)
* add dataloader for PhoMT dataset

* Update seacrowd/sea_datasets/phomt/phomt.py

Co-authored-by: Elyanah Aco <[email protected]>

* Update text_1/text_2 names for PhoMT dataset

* Update phomt.py to replace en & vi with eng & vie

---------

Co-authored-by: Elyanah Aco <[email protected]>
yana-xuyan and elyanah-aco authored Mar 26, 2024
1 parent d89f945 commit 9fc8863
Showing 2 changed files with 139 additions and 0 deletions.
Empty file.
139 changes: 139 additions & 0 deletions seacrowd/sea_datasets/phomt/phomt.py
@@ -0,0 +1,139 @@
import os
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{PhoMT,
title = {{PhoMT: A High-Quality and Large-Scale Benchmark Dataset for Vietnamese-English Machine Translation}},
author = {Long Doan and Linh The Nguyen and Nguyen Luong Tran and Thai Hoang and Dat Quoc Nguyen},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
year = {2021},
pages = {4495--4503}
}
"""

_DATASETNAME = "phomt"


_DESCRIPTION = """\
PhoMT is a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is 2.9M
pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15. The corpus was used to conduct
the first large-scale Vietnamese-English machine translation study.
"""

_LANGUAGES = ["vie", "eng"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
_LOCAL = True

_HOMEPAGE = "https://github.com/VinAIResearch/PhoMT"

_LICENSE = Licenses.MIT.value

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"

# The released PhoMT files use two-letter suffixes (train.en / train.vi), so
# map the ISO 639-3 codes used by this loader to those suffixes.
MAP_LANG = {"eng": "en", "vie": "vi"}


def seacrowd_config_constructor(src_lang, tgt_lang, schema, version):
if src_lang == "" or tgt_lang == "":
raise ValueError(f"Invalid src_lang {src_lang} or tgt_lang {tgt_lang}")

if schema not in ["source", "seacrowd_t2t"]:
raise ValueError(f"Invalid schema: {schema}")

return SEACrowdConfig(
name="phomt_{src}_{tgt}_{schema}".format(src=src_lang, tgt=tgt_lang, schema=schema),
version=datasets.Version(version),
description="phomt schema for {schema} from {src} to {tgt}".format(schema=schema, src=src_lang, tgt=tgt_lang),
schema=schema,
subset_id="phomt_{src}_{tgt}".format(src=src_lang, tgt=tgt_lang),
)
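# As an illustration, seacrowd_config_constructor("eng", "vie", "source", "1.0.0")
# yields a config named "phomt_eng_vie_source" with subset_id "phomt_eng_vie".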


class PhoMT(datasets.GeneratorBasedBuilder):
"""
PhoMT is a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is
2.9M pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15.
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

BUILDER_CONFIGS = [
seacrowd_config_constructor("eng", "vie", "source", _SOURCE_VERSION),
seacrowd_config_constructor("eng", "vie", "seacrowd_t2t", _SEACROWD_VERSION),
]
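    # Only the eng -> vie direction is registered here, although the
    # constructor above could also build a vie -> eng configuration.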

DEFAULT_CONFIG_NAME = "phomt_eng_vie_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema in ("source", "seacrowd_t2t"):
features = schemas.text2text_features
else:
raise ValueError(f"Invalid config schema: {self.config.schema}")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
if self.config.data_dir is None:
raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.")
else:
data_dir = self.config.data_dir
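        # The "{lang}" placeholder in each filepath below is intentionally left
        # unfilled; _generate_examples formats it with the two-letter suffix of
        # each language, so every split reads aligned lines from both files.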

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "train", "train.{lang}")},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "dev", "dev.{lang}")},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filepath": os.path.join(data_dir, "detokenization", "test", "test.{lang}")},
),
]

    def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
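        # Config names follow the pattern "phomt_<src>_<tgt>_<schema>", e.g.
        # "phomt_eng_vie_source", so parts 1 and 2 are the ISO 639-3 codes.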
config_names_split = self.config.name.split("_")
src_lang = config_names_split[1]
tgt_lang = config_names_split[2]

src_path = filepath.format(lang=MAP_LANG[src_lang])
tgt_path = filepath.format(lang=MAP_LANG[tgt_lang])

with open(src_path, "r", encoding="utf8") as f:
src_lines = f.readlines()
with open(tgt_path, "r", encoding="utf8") as f:
tgt_lines = f.readlines()
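        # Note: both sides are read fully into memory; PhoMT's train split has
        # about 3M lines per side, so this assumes a machine with ample RAM.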

if self.config.schema in ("source", "seacrowd_t2t"):
for idx, (src_line, tgt_line) in enumerate(zip(src_lines, tgt_lines)):
ex = {
"id": str(idx),
"text_1": src_line.strip(),
"text_2": tgt_line.strip(),
"text_1_name": src_lang,
"text_2_name": tgt_lang,
}
yield idx, ex

else:
raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
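
Since _LOCAL is True, the loader expects a manually downloaded copy of PhoMT rather than downloading anything itself. A minimal usage sketch, assuming a SeaCrowd checkout and a hypothetical local path to the extracted PhoMT release (it must contain the detokenization/train, dev, and test folders):

import datasets

phomt = datasets.load_dataset(
    "seacrowd/sea_datasets/phomt/phomt.py",  # loader script inside a SeaCrowd checkout
    name="phomt_eng_vie_source",             # the default config defined above
    data_dir="/path/to/PhoMT",               # hypothetical path; required because _LOCAL = True
    trust_remote_code=True,                  # may be required by some `datasets` versions for script loaders
)
print(phomt["train"][0])                     # {'id': '0', 'text_1': ..., 'text_2': ..., ...}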
