Closes #392 | Create dataset loader for Generated Review ENTH (#435)

* [Feat] Add Generated Review ENTH dataloader * [Chore] Tidy up: run formatter, remove args, use f-string * [Fix] Change citation to follow scb-mt-en-th-2020
SEACrowd · Apr 19, 2024 · a19097e · a19097e
1 parent 1959846
commit a19097e
Show file tree

Hide file tree

Showing 2 changed files with 153 additions and 0 deletions.
diff --git a/seacrowd/sea_datasets/generated_review_enth/__init__.py b/seacrowd/sea_datasets/generated_review_enth/__init__.py
diff --git a/seacrowd/sea_datasets/generated_review_enth/generated_review_enth.py b/seacrowd/sea_datasets/generated_review_enth/generated_review_enth.py
@@ -0,0 +1,153 @@
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import jsonlines
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{Lowphansirikul2021,
+    author={Lowphansirikul, Lalita
+            and Polpanumas, Charin
+            and Rutherford, Attapol T.
+            and Nutanong, Sarana},
+    title={A large English--Thai parallel corpus from the web and machine-generated text},
+    journal={Language Resources and Evaluation},
+    year={2021},
+    month={Mar},
+    day={30},
+    issn={1574-0218},
+    doi={10.1007/s10579-021-09536-6},
+    url={https://doi.org/10.1007/s10579-021-09536-6}
+"""
+
+_DATASETNAME = "generated_review_enth"
+
+_DESCRIPTION = """\
+Generated Reviews ENTH is created as part of scb-mt-en-th-2020 for machine translation task. This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are English product reviews generated by CTRL, translated by Google Translate API and annotated as accepted or rejected (correct) based on fluency and adequacy of the translation by human annotators. This allows it to be used for English-to-Thai translation quality esitmation (binary label), machine translation, and sentiment analysis. For SEACrowd, we use data with correct = 1.
+"""
+
+_HOMEPAGE = "https://github.com/vistec-ai/generated_reviews_enth"
+
+_LANGUAGES = ["tha", "eng"]
+
+_LICENSE = Licenses.CC_BY_SA_4_0.value
+
+_LOCAL = False
+
+_URLS = {_DATASETNAME: "https://github.com/vistec-AI/generated_reviews_enth/raw/main/data.zip"}
+
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class GeneratedReviewENTHDataset(datasets.GeneratorBasedBuilder):
+    """This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are English product reviews generated by CTRL, translated by Google Translate API and annotated as accepted or rejected (correct) based on fluency and adequacy of the translation by human annotators. This allows it to be used for English-to-Thai translation quality esitmation (binary label), machine translation, and sentiment analysis."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description="Generated Review EN-TH source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_t2t",
+            version=SEACROWD_VERSION,
+            description="Generated Review EN-TH SEACrowd schema",
+            schema="seacrowd_t2t",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "en_segment": datasets.Value("string"),
+                    "th_segment": datasets.Value("string"),
+                    "review_star": datasets.Value("int32"),
+                    "correct": datasets.Value("bool"),
+                }
+            )
+
+        elif self.config.schema == "seacrowd_t2t":
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+        data_dir = os.path.join(data_dir, "data")
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "train.jsonl"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "test.jsonl"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "valid.jsonl"),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        if self.config.schema == "source":
+            with jsonlines.open(filepath) as f:
+                i = -1
+                for example in f.iter():
+                    i += 1
+                    yield str(i), {
+                        "en_segment": example["en_segment"],
+                        "th_segment": example["th_segment"],
+                        "review_star": example["review_star"],
+                        "correct": example["correct"],
+                    }
+
+        elif self.config.schema == "seacrowd_t2t":
+            with jsonlines.open(filepath) as f:
+                i = -1
+                for example in f.iter():
+                    if example["correct"]:  # SEACrowd only use correct data
+                        i += 1
+                        yield str(i), {
+                            "id": str(i),
+                            "text_1": example["en_segment"],
+                            "text_2": example["th_segment"],
+                            "text_1_name": "eng",
+                            "text_2_name": "tha",
+                        }