Merge branch 'master' into tree_schema
MJonibek authored Jan 9, 2024
2 parents 3789f26 + cdc64ab commit 811b5c6
Showing 13 changed files with 930 additions and 5 deletions.
172 changes: 172 additions & 0 deletions seacrowd/sea_datasets/bloom_speech/bloom_speech.py
@@ -0,0 +1,172 @@
"""
SEA Crowd Data Loader for Bloom Speech.
"""
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks

_CITATION = r"""
@inproceedings{leong-etal-2022-bloom,
title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks",
author = "Leong, Colin and
Nemecek, Joshua and
Mansdorfer, Jacob and
Filighera, Anna and
Owodunni, Abraham and
Whitenack, Daniel",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.590",
doi = "10.18653/v1/2022.emnlp-main.590",
pages = "8608--8621",
}
"""

logger = datasets.logging.get_logger(__name__)

# this config is created for SEACrowd Dataloader
_LANG_CONFIG = {"bjn": "Banjar", "bzi": "Bisu", "ceb": "Cebuano", "ind": "Indonesian", "jra": "Jarai", "kqr": "Kimaragang", "mya": "Burmese", "tgl": "Tagalog"}

_LOCAL = False
_LANGUAGES = list(_LANG_CONFIG.keys())


_DATASETNAME = "bloom_speech"
_DESCRIPTION = r"""
This version of the Bloom Library data is developed specifically for the automatic speech recognition and speech-to-text tasks.
It includes data from 56 languages across 18 language families; eight of these languages are spoken in Southeast Asia.
Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-speech and authenticate with `huggingface-cli login`.
"""

_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-speech"
_LICENSE = Licenses.CC.value

_URL = "https://huggingface.co/datasets/sil-ai/bloom-speech"
_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:])

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION]
_SOURCE_VERSION = "0.0.1"
_SEACROWD_VERSION = "1.0.0"

CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS]


def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]:
"""
The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided
languages or a default language, and returns the list.
input:
languages (list, default None): The `languages` parameter is a list that specifies the languages for which the
configurations need to be constructed. If no languages are provided (value=None), the first value in language config
will be used.
output:
a list of `SEACrowdConfig` objects based on instantiated init variables
"""

# set output var
config_list = []

# construct zipped arg for config instantiation
TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK))

# implement source schema
version, config_name_prefix = _SOURCE_VERSION, "source"
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}",
schema=f"{config_name_prefix}",
subset_id=_LANG,
)
for _LANG in languages
]

# implement SEACrowd schema
version, config_name_prefix = _SEACROWD_VERSION, "seacrowd"
for task_obj, config_name_suffix in TASKS_AND_CONFIG_SUFFIX_PAIRS:
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
schema=f"{config_name_prefix}_{config_name_suffix}",
subset_id=_LANG,
)
for _LANG in languages
]
return config_list


class BloomSpeechDataset(datasets.GeneratorBasedBuilder):
"""Bloom Speech dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-speech"""

    # construct all source and SEACrowd schema configs for every supported language
BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES)

def _info(self) -> datasets.DatasetInfo:
_config_schema_name = self.config.schema
logger.info(f"Received schema name: {self.config.schema}")
# source schema
if _config_schema_name == "source":
features = datasets.Features(
{
"file": datasets.Value("string"),
"audio": datasets.Audio(sampling_rate=16_000),
"text": datasets.Value("string"),
"book": datasets.Value("string"),
"instance": datasets.Value("string"),
"license": datasets.Value("string"),
"credits": datasets.Value("string"),
"original_lang_tag": datasets.Value("string"),
}
)

# speech-text schema
elif _config_schema_name == "seacrowd_sptext":
features = schemas.speech_text_features

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id)

return [datasets.SplitGenerator(name=datasets.Split(dset_key), gen_kwargs={"hf_dset": dset}) for dset_key, dset in hf_dset_dict.items() if dset.num_rows > 0]

def _generate_examples(self, hf_dset) -> Tuple[int, Dict]:
_config_schema_name = self.config.schema

_idx = 0
for datapoints in hf_dset:
            # since the source provides no example id, we construct _idx manually for both schemas
if _config_schema_name == "source":
yield _idx, {colname: datapoints[colname] for colname in self.info.features}

elif _config_schema_name == "seacrowd_sptext":
yield _idx, {"id": _idx, "path": datapoints["file"], "audio": datapoints["audio"], "text": datapoints["text"], "speaker_id": None, "metadata": {"speaker_age": None, "speaker_gender": None}}

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

_idx += 1
167 changes: 167 additions & 0 deletions seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py
@@ -0,0 +1,167 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = r"""
@inproceedings{chaudhary-etal-2019-low,
title = "Low-Resource Corpus Filtering Using Multilingual Sentence Embeddings",
author = "Chaudhary, Vishrav and
Tang, Yuqing and
Guzm{\'a}n, Francisco and
Schwenk, Holger and
Koehn, Philipp",
editor = "Bojar, Ond{\v{r}}ej and
Chatterjee, Rajen and
Federmann, Christian and
Fishel, Mark and
Graham, Yvette and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Martins, Andr{\'e} and
Monz, Christof and
Negri, Matteo and
N{\'e}v{\'e}ol, Aur{\'e}lie and
Neves, Mariana and
Post, Matt and
Turchi, Marco and
Verspoor, Karin",
booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-5435",
doi = "10.18653/v1/W19-5435",
pages = "261--266",
}
"""

_LOCAL = False
_LANGUAGES = ["ind", "jav", "sun", "tha", "vie", "zlm", "lao", "khm", "mya", "ceb"]
_DATASETNAME = "cc_aligned_sent"
_DESCRIPTION = """\
This dataset contains the sentence pairs extracted from CC-Aligned document
pairs using similarity scores of LASER embeddings (minimum similarity 1.04,
sorted based on decreasing similarity score). It misses some languages not
covered by LASER.
"""

_HOMEPAGE = "https://www2.statmt.org/cc-aligned/"
_LICENSE = Licenses.UNKNOWN.value
_URL = "https://data.statmt.org/cc-aligned/sentence-aligned/"

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_SUBSETS = ["id_ID", "jv_ID", "su_ID", "th_TH", "vi_VN", "ms_MY", "lo_LA", "km_KH", "my_MM", "cx_PH"]
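# Note: _SUBSETS lists the CC-Aligned locale codes positionally aligned with the
# SEACrowd language codes in _LANGUAGES above (e.g. "id_ID" for "ind", "cx_PH" for "ceb").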


class CCAlignedSentencesDataset(datasets.GeneratorBasedBuilder):
"""CC Aligned Sentences dataset by Chaudhary et al., (2019)"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

SEACROWD_SCHEMA_NAME = "t2t"

# Add configurations for loading a dataset per language.
dataset_names = sorted([f"{_DATASETNAME}_{subset}" for subset in _SUBSETS])
BUILDER_CONFIGS = []
for name in dataset_names:
source_config = SEACrowdConfig(
name=f"{name}_source",
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="source",
subset_id=name,
)
BUILDER_CONFIGS.append(source_config)
seacrowd_config = SEACrowdConfig(
name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=name,
)
BUILDER_CONFIGS.append(seacrowd_config)

# Choose first language as default
first_subset = sorted(_SUBSETS)[0]
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{first_subset}_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
features = datasets.Features(
{
"id": datasets.Value("string"),
"Source_Sentence": datasets.Value("string"),
"Target_Sentence": datasets.Value("string"),
"LASER_similarity": datasets.Value("float64"),
}
)

if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
features = schemas.text_to_text.features

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
"""Return SplitGenerators."""
# Define some functions for parsing config and URL names
def _split_at_n(text: str, n: int) -> Tuple[str, str]:
"""Split text on the n-th instance"""
return ("_".join(text.split("_")[:n]), "_".join(text.split("_")[n:]))

# Get URL. For cx_PH, the source and target languages are reversed
_, subset = _split_at_n(_split_at_n(self.config.name, 5)[0], 3)
(source_lang, target_lang) = (subset, "en_XX") if subset == "cx_PH" else ("en_XX", subset)
url = _URL + f"{source_lang}-{target_lang}.tsv.xz"
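        # e.g. for the id_ID subset, url resolves to
        # https://data.statmt.org/cc-aligned/sentence-aligned/en_XX-id_ID.tsv.xz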
filepath = dl_manager.download_and_extract(url)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": filepath,
"source_lang": source_lang,
"target_lang": target_lang,
},
)
]

def _generate_examples(self, filepath: Path, source_lang: str, target_lang: str) -> Tuple[int, Dict]:
"""Yield examples as (key, example) tuples"""
with open(filepath, encoding="utf-8") as file:
for idx, row in enumerate(file):
text_1, text_2, score = row.strip().split("\t")
if self.config.schema == "source":
example = {
"id": idx,
"Source_Sentence": text_1,
"Target_Sentence": text_2,
"LASER_similarity": float(score),
}
if self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
example = {
"id": idx,
"text_1": text_1,
"text_2": text_2,
"text_1_name": source_lang,
"text_2_name": target_lang,
}
yield idx, example
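
# A minimal usage sketch (an assumption for illustration, not part of this loader:
# run from a local SEACrowd checkout with this script on disk):
#
#   from datasets import load_dataset
#   pairs = load_dataset(
#       "seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py",
#       "cc_aligned_sent_id_ID_seacrowd_t2t",
#   )
#   print(pairs["train"][0])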