Closes SEACrowd#314 | Add dataloader for Indonesia chinese mt robust eval (SEACrowd#388)

* add dataloader for indonesian_madurese_bible_translation

* update dataloader for indonesia_chinese_mtrobusteval

* Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py

* Update indonesia_chinese_mtrobusteval.py

* update code based on the reviewer comments

* add __init__.py

* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py

* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py

---------

Co-authored-by: Jennifer Santoso <[email protected]>
TysonYu and jensan-1 authored Mar 9, 2024
1 parent 8482c95 commit ca8e109
Showing 2 changed files with 151 additions and 0 deletions.
seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py (empty file)
seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py
@@ -0,0 +1,151 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{,
author = {supryzhu},
title = {Indonesia-Chinese-MTRobustEval},
journal = {None},
volume = {None},
year = {2023},
url = {https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval},
doi = {None},
biburl = {None},
bibsource = {None}
}
"""


_DATASETNAME = "indonesia_chinese_mtrobusteval"

_DESCRIPTION = """\
The dataset is curated to evaluate the robustness of Neural Machine Translation (NMT) against naturally occurring noise
(typos, slang, code switching, etc.). The data was crawled from Twitter, then pre-processed to obtain noisy sentences.
The resulting one thousand noisy Indonesian sentences were manually translated into Chinese as a benchmark for evaluating NMT robustness.
"""

_HOMEPAGE = "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval"

_LANGUAGES = ["ind", "cmn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)


_LICENSE = Licenses.MIT.value

_LOCAL = False

_URLS = {
_DATASETNAME: "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval/raw/main/data/Indonesia-Chinese.xlsx",
}

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class IndonesiaChineseMtRobustEval(datasets.GeneratorBasedBuilder):
    """A thousand noisy Indonesian sentences, manually translated into Chinese as a benchmark for evaluating the robustness of NMT."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description="indonesia_chinese_mtrobusteval source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_t2t",
            version=SEACROWD_VERSION,
            description="indonesia_chinese_mtrobusteval SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

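    # Two schemas are exposed: the raw "source" schema (id/src/tgt) and the
    # shared SEACrowd text2text schema; _info picks the feature set accordingly.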
    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "src": datasets.Value("string"),
                    "tgt": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

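    # Download the Excel source, flatten it into a JSONL file next to the
    # download, and expose everything as a single "train" split.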
    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download(urls)
        df = pd.read_excel(file_path)
        src = df["Indonesia"].tolist()
        tgt = df["Chinese"].tolist()
        results = []
        for i, item in enumerate(src):
            results.append({"id": str(i), "src": item, "tgt": tgt[i]})
        self._write_jsonl(file_path + ".jsonl", results)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": file_path + ".jsonl",
                    "split": "train",
                },
            )
        ]

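    # Both branches stream the intermediate JSONL written in _split_generators;
    # they differ only in the output schema (raw src/tgt vs. SEACrowd text2text).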
    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        if self.config.schema == "source":
            i = 0
            with jsonlines.open(filepath) as f:
                for each_data in f.iter():
                    ex = {
                        "id": each_data["id"],
                        "src": each_data["src"],
                        "tgt": each_data["tgt"],
                    }
                    yield i, ex
                    i += 1

        elif self.config.schema == "seacrowd_t2t":
            i = 0
            with jsonlines.open(filepath) as f:
                for each_data in f.iter():
                    ex = {
                        "id": each_data["id"],
                        "text_1": each_data["src"],
                        "text_2": each_data["tgt"],
                        "text_1_name": "ind",
                        "text_2_name": "cmn",
                    }
                    yield i, ex
                    i += 1

    def _write_jsonl(self, filepath, values):
        with jsonlines.open(filepath, "w") as writer:
            for line in values:
                writer.write(line)

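As a quick local sanity check, the new loader can be exercised with datasets.load_dataset. The snippet below is a sketch rather than part of the commit: it assumes the repository root as the working directory and a datasets release that still supports loading dataset scripts by path, and it reuses the config names defined in BUILDER_CONFIGS above.

from datasets import load_dataset

# Hypothetical smoke test (not part of this commit).
SCRIPT = "seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py"

# Source schema: {"id", "src", "tgt"}
source = load_dataset(SCRIPT, name="indonesia_chinese_mtrobusteval_source", split="train")

# SEACrowd text2text schema: {"id", "text_1", "text_2", "text_1_name", "text_2_name"}
t2t = load_dataset(SCRIPT, name="indonesia_chinese_mtrobusteval_seacrowd_t2t", split="train")

print(source[0]["src"], "->", source[0]["tgt"])
print(t2t[0]["text_1_name"], "->", t2t[0]["text_2_name"])  # ind -> cmn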