Add tatoeba dataset loader

Closes SEACrowd#5
ljvmiranda921 · Nov 5, 2023 · 026d00a · 026d00a
1 parent e210a53
commit 026d00a
Show file tree

Hide file tree

Showing 2 changed files with 131 additions and 0 deletions.
diff --git a/seacrowd/sea_datasets/tatoeba/__init__.py b/seacrowd/sea_datasets/tatoeba/__init__.py
diff --git a/seacrowd/sea_datasets/tatoeba/tatoeba.py b/seacrowd/sea_datasets/tatoeba/tatoeba.py
@@ -0,0 +1,131 @@
+from pathlib import Path
+from typing import Dict, Iterator, List, Tuple
+
+import datasets
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{tatoeba,
+    title     = {Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond},
+    author    = {Mikel, Artetxe and Holger, Schwenk,},
+    journal   = {arXiv:1812.10464v2},
+    year      = {2018}
+}
+"""
+
+_LOCAL = False
+_LANGUAGES = ["ind", "vie", "tgl", "jav", "tha"]
+_DATASETNAME = "tatoeba"
+_DESCRIPTION = """\
+This dataset is a subset of the Tatoeba corpus containing language pairs for Indonesian, Vietnamese, Tagalog, Javanese, and Thai.
+The original dataset description can be found below:
+
+This data is extracted from the Tatoeba corpus, dated Saturday 2018/11/17.
+For each languages, we have selected 1000 English sentences and their translations, if available. Please check
+this paper for a description of the languages, their families and scripts as well as baseline results.
+Please note that the English sentences are not identical for all language pairs. This means that the results are
+not directly comparable across languages. In particular, the sentences tend to have less variety for several
+low-resource languages, e.g. "Tom needed water", "Tom needs water", "Tom is getting water", ...
+"""
+
+_HOMEPAGE = "https://github.com/facebookresearch/LASER/blob/main/data/tatoeba/v1/README.md"
+_LICENSE = Licenses.APACHE_2_0.value
+_URL = "https://github.com/facebookresearch/LASER/raw/main/data/tatoeba/v1/"
+
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class TatoebaDatset(datasets.GeneratorBasedBuilder):
+    """Tatoeba subset for Indonesian, Vietnamese, Tagalog, Javanese, and Thai."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = "t2t"
+
+    dataset_names = sorted([f"tatoeba.{lang}" for lang in _LANGUAGES])
+    BUILDER_CONFIGS = []
+    for name in dataset_names:
+        source_config = SEACrowdConfig(
+            name=f"{name}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=name,
+        )
+        BUILDER_CONFIGS.append(source_config)
+        seacrowd_config = SEACrowdConfig(
+            name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=name,
+        )
+        BUILDER_CONFIGS.append(seacrowd_config)
+
+    # Choose first language as default
+    DEFAULT_CONFIG_NAME = f"{dataset_names[0]}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "source_sentence": datasets.Value("string"),
+                    "target_sentence": datasets.Value("string"),
+                    "source_lang": datasets.Value("string"),
+                    "target_lang": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text2text_features
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        """Return SplitGenerators."""
+        lang_source = self.config.name.split(".")[1]
+        lang = lang_source.split("_")[0]
+        tatoeba_source_data = dl_manager.download_and_extract(_URL + f"tatoeba.{lang}-eng.{lang}")
+        tatoeba_eng_data = dl_manager.download_and_extract(_URL + f"tatoeba.{lang}-eng.eng")
+        return [datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": (tatoeba_source_data, tatoeba_eng_data), "split": "dev"})]
+
+    def _generate_examples(self, filepath: Tuple[Path, Path], split: str) -> Tuple[int, Dict]:
+        """Yield examples as (key, example) tuples"""
+        source_file = filepath[0]
+        target_file = filepath[1]
+        source_sentences = []
+        target_sentences = []
+        with open(source_file, encoding="utf-8") as f1:
+            for row in f1:
+                source_sentences.append(row)
+        with open(target_file, encoding="utf-8") as f2:
+            for row in f2:
+                target_sentences.append(row)
+        for idx in range(len(source_sentences)):
+            if self.config.schema == "source":
+                example = {
+                    "source_sentence": source_sentences[idx],
+                    "target_sentence": target_sentences[idx],
+                    "source_lang": source_file.split(".")[-1],
+                    "target_lang": "eng",
+                }
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+                example = {
+                    "id": str(idx),
+                    "text_1": source_sentences[idx],
+                    "text_2": target_sentences[idx],
+                    "text_1_name": source_file.split(".")[-1],
+                    "text_2_name": "eng",
+                }
+            yield idx, example