Merge branch 'master' into tree_schema
MJonibek authored Jan 9, 2024
2 parents 3789f26 + cdc64ab commit 811b5c6
Showing 13 changed files with 930 additions and 5 deletions.
172 changes: 172 additions & 0 deletions seacrowd/sea_datasets/bloom_speech/bloom_speech.py
@@ -0,0 +1,172 @@
"""
SEA Crowd Data Loader for Bloom Speech.
"""
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks

_CITATION = r"""
@inproceedings{leong-etal-2022-bloom,
title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks",
author = "Leong, Colin and
Nemecek, Joshua and
Mansdorfer, Jacob and
Filighera, Anna and
Owodunni, Abraham and
Whitenack, Daniel",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.590",
doi = "10.18653/v1/2022.emnlp-main.590",
pages = "8608--8621",
}
"""

logger = datasets.logging.get_logger(__name__)

# this config is created for SEACrowd Dataloader
_LANG_CONFIG = {"bjn": "Banjar", "bzi": "Bisu", "ceb": "Cebuano", "ind": "Indonesian", "jra": "Jarai", "kqr": "Kimaragang", "mya": "Burmese", "tgl": "Tagalog"}

_LOCAL = False
_LANGUAGES = list(_LANG_CONFIG.keys())


_DATASETNAME = "bloom_speech"
_DESCRIPTION = r"""
This version of the Bloom Library data is developed specifically for the automatic speech recognition and speech-to-text tasks.
It includes data from 56 languages across 18 language families; eight of these languages are spoken in Southeast Asia.
Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-speech and authenticate with `huggingface-cli login`.
"""

_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-speech"
_LICENSE = Licenses.CC.value

_URL = "https://huggingface.co/datasets/sil-ai/bloom-speech"
_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:])

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION]
_SOURCE_VERSION = "0.0.1"
_SEACROWD_VERSION = "1.0.0"

CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS]


def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]:
"""
The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided
languages or a default language, and returns the list.
input:
languages (list, default None): The `languages` parameter is a list that specifies the languages for which the
configurations need to be constructed. If no languages are provided (value=None), the first value in language config
will be used.
output:
a list of `SEACrowdConfig` objects based on instantiated init variables
"""

# set output var
config_list = []

# construct zipped arg for config instantiation
TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK))

# implement source schema
version, config_name_prefix = _SOURCE_VERSION, "source"
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}",
schema=f"{config_name_prefix}",
subset_id=_LANG,
)
for _LANG in languages
]

# implement SEACrowd schema
version, config_name_prefix = _SEACROWD_VERSION, "seacrowd"
for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS:
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
schema=f"{config_name_prefix}_{config_name_suffix}",
subset_id=_LANG,
)
for _LANG in languages
]
return config_list


class BloomSpeechDataset(datasets.GeneratorBasedBuilder):
"""Bloom Speech dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-speech"""

    # construct all source and SEACrowd schema configs for every supported language
BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES)

def _info(self) -> datasets.DatasetInfo:
_config_schema_name = self.config.schema
logger.info(f"Received schema name: {self.config.schema}")
# source schema
if _config_schema_name == "source":
features = datasets.Features(
{
"file": datasets.Value("string"),
"audio": datasets.Audio(sampling_rate=16_000),
"text": datasets.Value("string"),
"book": datasets.Value("string"),
"instance": datasets.Value("string"),
"license": datasets.Value("string"),
"credits": datasets.Value("string"),
"original_lang_tag": datasets.Value("string"),
}
)

# speech-text schema
elif _config_schema_name == "seacrowd_sptext":
features = schemas.speech_text_features

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id)

return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0]

def _generate_examples(self, hf_dset) -> Tuple[int, Dict]:
_config_schema_name = self.config.schema

_idx = 0
for datapoints in hf_dset:
            # since the source provides no example id, we construct _idx manually for both schemas
if _config_schema_name == "source":
yield _idx, {colname: datapoints[colname] for colname in self.info.features}

elif _config_schema_name == "seacrowd_sptext":
yield _idx, {"id": _idx, "path": datapoints["file"], "audio": datapoints["audio"], "text": datapoints["text"], "speaker_id": None, "metadata": {"speaker_age": None, "speaker_gender": None}}

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

_idx += 1
167 changes: 167 additions & 0 deletions seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py
@@ -0,0 +1,167 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = r"""
@inproceedings{chaudhary-etal-2019-low,
title = "Low-Resource Corpus Filtering Using Multilingual Sentence Embeddings",
author = "Chaudhary, Vishrav and
Tang, Yuqing and
Guzm{\'a}n, Francisco and
Schwenk, Holger and
Koehn, Philipp",
editor = "Bojar, Ond{\v{r}}ej and
Chatterjee, Rajen and
Federmann, Christian and
Fishel, Mark and
Graham, Yvette and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Martins, Andr{\'e} and
Monz, Christof and
Negri, Matteo and
N{\'e}v{\'e}ol, Aur{\'e}lie and
Neves, Mariana and
Post, Matt and
Turchi, Marco and
Verspoor, Karin",
booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-5435",
doi = "10.18653/v1/W19-5435",
pages = "261--266",
}
"""

_LOCAL = False
_LANGUAGES = ["ind", "jav", "sun", "tha", "vie", "zlm", "lao", "khm", "mya", "ceb"]
_DATASETNAME = "cc_aligned_sent"
_DESCRIPTION = """\
This dataset contains the sentence pairs extracted from CC-Aligned document
pairs using similarity scores of LASER embeddings (minimum similarity 1.04,
sorted based on decreasing similarity score). It misses some languages not
covered by LASER.
"""

_HOMEPAGE = "https://www2.statmt.org/cc-aligned/"
_LICENSE = Licenses.UNKNOWN.value
_URL = "https://data.statmt.org/cc-aligned/sentence-aligned/"

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_SUBSETS = ["id_ID", "jv_ID", "su_ID", "th_TH", "vi_VN", "ms_MY", "lo_LA", "km_KH", "my_MM", "cx_PH"]
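# Note: _SUBSETS lists the CC-Aligned locale codes positionally aligned with the
# SEACrowd language codes in _LANGUAGES above (e.g. "id_ID" for "ind", "cx_PH" for "ceb").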


class CCAlignedSentencesDataset(datasets.GeneratorBasedBuilder):
"""CC Aligned Sentences dataset by Chaudhary et al., (2019)"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

SEACROWD_SCHEMA_NAME = "t2t"

# Add configurations for loading a dataset per language.
dataset_names = sorted([f"{_DATASETNAME}_{subset}" for subset in _SUBSETS])
BUILDER_CONFIGS = []
for name in dataset_names:
source_config = SEACrowdConfig(
name=f"{name}_source",
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="source",
subset_id=name,
)
BUILDER_CONFIGS.append(source_config)
seacrowd_config = SEACrowdConfig(
name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=name,
)
BUILDER_CONFIGS.append(seacrowd_config)

# Choose first language as default
first_subset = sorted(_SUBSETS)[0]
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{first_subset}_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
features = datasets.Features(
{
"id": datasets.Value("string"),
"Source_Sentence": datasets.Value("string"),
"Target_Sentence": datasets.Value("string"),
"LASER_similarity": datasets.Value("float64"),
}
)

if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
features = schemas.text_to_text.features

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
"""Return SplitGenerators."""
# Define some functions for parsing config and URL names
def _split_at_n(text: str, n: int) -> Tuple[str, str]:
"""Split text on the n-th instance"""
return ("_".join(text.split("_")[:n]), "_".join(text.split("_")[n:]))

# Get URL. For cx_PH, the source and target languages are reversed
_, subset = _split_at_n(_split_at_n(self.config.name, 5)[0], 3)
(source_lang, target_lang) = (subset, "en_XX") if subset == "cx_PH" else ("en_XX", subset)
url = _URL + f"{source_lang}-{target_lang}.tsv.xz"
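        # e.g. for the id_ID subset, url resolves to
        # https://data.statmt.org/cc-aligned/sentence-aligned/en_XX-id_ID.tsv.xz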
filepath = dl_manager.download_and_extract(url)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": filepath,
"source_lang": source_lang,
"target_lang": target_lang,
},
)
]

def _generate_examples(self, filepath: Path, source_lang: str, target_lang: str) -> Tuple[int, Dict]:
"""Yield examples as (key, example) tuples"""
with open(filepath, encoding="utf-8") as file:
for idx, row in enumerate(file):
text_1, text_2, score = row.strip().split("\t")
if self.config.schema == "source":
example = {
"id": idx,
"Source_Sentence": text_1,
"Target_Sentence": text_2,
"LASER_similarity": float(score),
}
if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
example = {
"id": idx,
"text_1": text_1,
"text_2": text_2,
"text_1_name": source_lang,
"text_2_name": target_lang,
}
yield idx, example
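
# A minimal usage sketch (an assumption for illustration, not part of this loader:
# run from a local SEACrowd checkout with this script on disk):
#
#   from datasets import load_dataset
#   pairs = load_dataset(
#       "seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py",
#       "cc_aligned_sent_id_ID_seacrowd_t2t",
#   )
#   print(pairs["train"][0])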