-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into tree_schema
- Loading branch information
Showing
13 changed files
with
930 additions
and
5 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
""" | ||
SEA Crowd Data Loader for Bloom Speech. | ||
""" | ||
from typing import Dict, List, Tuple | ||
|
||
import datasets | ||
from datasets.download.download_manager import DownloadManager | ||
|
||
from seacrowd.utils import schemas | ||
from seacrowd.utils.configs import SEACrowdConfig | ||
from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks | ||
|
||
_CITATION = r""" | ||
@inproceedings{leong-etal-2022-bloom, | ||
title = "Bloom Library: Multimodal Datasets in 300+ Languages for a Variety of Downstream Tasks", | ||
author = "Leong, Colin and | ||
Nemecek, Joshua and | ||
Mansdorfer, Jacob and | ||
Filighera, Anna and | ||
Owodunni, Abraham and | ||
Whitenack, Daniel", | ||
editor = "Goldberg, Yoav and | ||
Kozareva, Zornitsa and | ||
Zhang, Yue", | ||
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", | ||
month = dec, | ||
year = "2022", | ||
address = "Abu Dhabi, United Arab Emirates", | ||
publisher = "Association for Computational Linguistics", | ||
url = "https://aclanthology.org/2022.emnlp-main.590", | ||
doi = "10.18653/v1/2022.emnlp-main.590", | ||
pages = "8608--8621", | ||
} | ||
""" | ||
|
||
# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)

# Language codes exposed by this SEACrowd dataloader, mapped to display names.
_LANG_CONFIG = {
    "bjn": "Banjar",
    "bzi": "Bisu",
    "ceb": "Cebuano",
    "ind": "Indonesian",
    "jra": "Jarai",
    "kqr": "Kimaragang",
    "mya": "Burmese",
    "tgl": "Tagalog",
}

_LOCAL = False
# Iterating a dict yields its keys, in insertion order.
_LANGUAGES = list(_LANG_CONFIG)

_DATASETNAME = "bloom_speech"
_DESCRIPTION = r"""
This version of the Bloom Library data is developed specifically for the automatic speech recognition and speech-to-text tasks.
It includes data from 56 languages across 18 language families. 8 languages are spoken in Southeast Asia.
Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/sil-ai/bloom-speech and use huggingface-cli login for authentication.
"""

_HOMEPAGE = "https://huggingface.co/datasets/sil-ai/bloom-speech"
_LICENSE = Licenses.CC.value

_URL = "https://huggingface.co/datasets/sil-ai/bloom-speech"
# Keep only the last two path segments of _URL ("sil-ai/bloom-speech").
_HF_REMOTE_REF = "/".join(_URL.rsplit("/", 2)[1:])

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION]
_SOURCE_VERSION = "0.0.1"
_SEACROWD_VERSION = "1.0.0"

# Lower-cased schema name of every supported task, in the order of _SUPPORTED_TASKS.
CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(supported_task).lower() for supported_task in _SUPPORTED_TASKS]
|
||
|
||
def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]:
    """
    Construct the list of `SEACrowdConfig` objects for the requested languages.

    For every language one "source" config is created; afterwards one "seacrowd"
    config is created per (supported task, language) pair.
    input:
        languages (list, default None): language codes for which the configurations
        are constructed. If None, only the first code in `_LANGUAGES` is used.
    output:
        a list of `SEACrowdConfig` objects: all source configs first, followed by
        the seacrowd configs grouped by task.
    """
    # Fall back to the first configured language, as the contract above states;
    # iterating over None would otherwise raise a TypeError.
    if languages is None:
        languages = _LANGUAGES[:1]

    # source schema: one config per language
    config_list = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{_LANG}_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} source schema for language code {_LANG}",
            schema="source",
            subset_id=_LANG,
        )
        for _LANG in languages
    ]

    # SEACrowd schema: one config per supported task and language
    for task_obj, config_name_suffix in zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK):
        config_list += [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{_LANG}_seacrowd_{config_name_suffix}",
                version=datasets.Version(_SEACROWD_VERSION),
                description=f"{_DATASETNAME} seacrowd schema for {task_obj.name} and language code {_LANG}",
                schema=f"seacrowd_{config_name_suffix}",
                subset_id=_LANG,
            )
            for _LANG in languages
        ]
    return config_list
|
||
|
||
class BloomSpeechDataset(datasets.GeneratorBasedBuilder):
    """Bloom Speech dataset, subsetted from https://huggingface.co/datasets/sil-ai/bloom-speech"""

    # One source config plus one SEACrowd config per task, for every language in _LANGUAGES.
    BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES)

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata with the features of the configured schema.

        Raises:
            ValueError: if the config schema is neither "source" nor "seacrowd_sptext".
        """
        schema_name = self.config.schema
        logger.info(f"Received schema name: {self.config.schema}")

        if schema_name == "seacrowd_sptext":
            # speech-text schema
            features = schemas.speech_text_features
        elif schema_name == "source":
            # source schema
            source_columns = {
                "file": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "text": datasets.Value("string"),
                "book": datasets.Value("string"),
                "instance": datasets.Value("string"),
                "license": datasets.Value("string"),
                "credits": datasets.Value("string"),
                "original_lang_tag": datasets.Value("string"),
            }
            features = datasets.Features(source_columns)
        else:
            raise ValueError(f"Received unexpected config schema of {schema_name}!")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Load the language subset from the remote dataset and emit one generator per non-empty split."""
        hf_dset_dict = datasets.load_dataset(_HF_REMOTE_REF, self.config.subset_id)

        split_generators = []
        for split_name, split_dset in hf_dset_dict.items():
            # splits without any rows are skipped
            if split_dset.num_rows > 0:
                split_generators.append(datasets.SplitGenerator(name=datasets.Split(split_name), gen_kwargs={"hf_dset": split_dset}))
        return split_generators

    def _generate_examples(self, hf_dset) -> Tuple[int, Dict]:
        """Yield (index, example) pairs from `hf_dset`, shaped according to the configured schema.

        Raises:
            ValueError: if the config schema is neither "source" nor "seacrowd_sptext".
        """
        schema_name = self.config.schema

        # the rows carry no identifier of their own, so the running position is used as the id for both schemas
        for row_idx, row in enumerate(hf_dset):
            if schema_name == "source":
                example = {colname: row[colname] for colname in self.info.features}
            elif schema_name == "seacrowd_sptext":
                example = {
                    "id": row_idx,
                    "path": row["file"],
                    "audio": row["audio"],
                    "text": row["text"],
                    "speaker_id": None,
                    "metadata": {"speaker_age": None, "speaker_gender": None},
                }
            else:
                raise ValueError(f"Received unexpected config schema of {schema_name}!")

            yield row_idx, example
Empty file.
167 changes: 167 additions & 0 deletions
167
seacrowd/sea_datasets/cc_aligned_sent/cc_aligned_sent.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
from pathlib import Path | ||
from typing import Dict, List, Tuple | ||
|
||
import datasets | ||
from datasets.download.download_manager import DownloadManager | ||
|
||
from seacrowd.utils import schemas | ||
from seacrowd.utils.configs import SEACrowdConfig | ||
from seacrowd.utils.constants import Licenses, Tasks | ||
|
||
_CITATION = r""" | ||
@inproceedings{chaudhary-etal-2019-low, | ||
title = "Low-Resource Corpus Filtering Using Multilingual Sentence Embeddings", | ||
author = "Chaudhary, Vishrav and | ||
Tang, Yuqing and | ||
Guzm{\'a}n, Francisco and | ||
Schwenk, Holger and | ||
Koehn, Philipp", | ||
editor = "Bojar, Ond{\v{r}}ej and | ||
Chatterjee, Rajen and | ||
Federmann, Christian and | ||
Fishel, Mark and | ||
Graham, Yvette and | ||
Haddow, Barry and | ||
Huck, Matthias and | ||
Yepes, Antonio Jimeno and | ||
Koehn, Philipp and | ||
Martins, Andr{\'e} and | ||
Monz, Christof and | ||
Negri, Matteo and | ||
N{\'e}v{\'e}ol, Aur{\'e}lie and | ||
Neves, Mariana and | ||
Post, Matt and | ||
Turchi, Marco and | ||
Verspoor, Karin", | ||
booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)", | ||
month = aug, | ||
year = "2019", | ||
address = "Florence, Italy", | ||
publisher = "Association for Computational Linguistics", | ||
url = "https://aclanthology.org/W19-5435", | ||
doi = "10.18653/v1/W19-5435", | ||
pages = "261--266", | ||
} | ||
""" | ||
|
||
_LOCAL = False
# Language codes covered by this dataloader.
_LANGUAGES = [
    "ind",
    "jav",
    "sun",
    "tha",
    "vie",
    "zlm",
    "lao",
    "khm",
    "mya",
    "ceb",
]
_DATASETNAME = "cc_aligned_sent"
_DESCRIPTION = """\
This dataset contains the sentence pairs extracted from CC-Aligned document
pairs using similarity scores of LASER embeddings (minimum similarity 1.04,
sorted based on decreasing similarity score). It misses some languages not
covered by LASER.
"""

_HOMEPAGE = "https://www2.statmt.org/cc-aligned/"
_LICENSE = Licenses.UNKNOWN.value
# Base URL; the per-subset file name is appended when downloading.
_URL = "https://data.statmt.org/cc-aligned/sentence-aligned/"

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

# Locale-style subset names; each one is used in a config name and in the download file name.
_SUBSETS = [
    "id_ID",
    "jv_ID",
    "su_ID",
    "th_TH",
    "vi_VN",
    "ms_MY",
    "lo_LA",
    "km_KH",
    "my_MM",
    "cx_PH",
]
|
||
|
||
class CCAlignedSentencesDataset(datasets.GeneratorBasedBuilder):
    """CC Aligned Sentences dataset by Chaudhary et al., (2019)"""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "t2t"

    # Add configurations for loading a dataset per language:
    # one source config and one SEACrowd config for every subset.
    dataset_names = sorted([f"{_DATASETNAME}_{subset}" for subset in _SUBSETS])
    BUILDER_CONFIGS = []
    for name in dataset_names:
        source_config = SEACrowdConfig(
            name=f"{name}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=name,
        )
        BUILDER_CONFIGS.append(source_config)
        seacrowd_config = SEACrowdConfig(
            name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=name,
        )
        BUILDER_CONFIGS.append(seacrowd_config)

    # Choose first language as default
    first_subset = sorted(_SUBSETS)[0]
    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{first_subset}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata with the features of the configured schema.

        Raises:
            ValueError: if the config schema is neither "source" nor the SEACrowd schema.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "Source_Sentence": datasets.Value("string"),
                    "Target_Sentence": datasets.Value("string"),
                    "LASER_similarity": datasets.Value("float64"),
                }
            )
        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            features = schemas.text_to_text.features
        else:
            # Fail explicitly instead of hitting an unbound `features` below.
            raise ValueError(f"Received unexpected config schema of {self.config.schema}!")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Return SplitGenerators.

        Downloads and extracts the TSV file of the configured subset and exposes
        it as a single train split.
        """
        # Define some functions for parsing config and URL names
        def _split_at_n(text: str, n: int) -> Tuple[str, str]:
            """Split text on the n-th instance"""
            parts = text.split("_")
            return ("_".join(parts[:n]), "_".join(parts[n:]))

        # The first five "_"-separated parts of the config name are
        # "<dataset name>_<subset>"; the subset is what follows the third part.
        name_without_schema, _ = _split_at_n(self.config.name, 5)
        _, subset = _split_at_n(name_without_schema, 3)

        # Get URL. For cx_PH, the source and target languages are reversed
        (source_lang, target_lang) = (subset, "en_XX") if subset == "cx_PH" else ("en_XX", subset)
        url = _URL + f"{source_lang}-{target_lang}.tsv.xz"
        filepath = dl_manager.download_and_extract(url)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": filepath,
                    "source_lang": source_lang,
                    "target_lang": target_lang,
                },
            )
        ]

    def _generate_examples(self, filepath: Path, source_lang: str, target_lang: str) -> Tuple[int, Dict]:
        """Yield examples as (key, example) tuples.

        Each line of `filepath` is expected to hold three tab-separated fields:
        first text, second text and similarity score.

        Raises:
            ValueError: if the config schema is neither "source" nor the SEACrowd schema.
        """
        schema = self.config.schema
        seacrowd_schema = f"seacrowd_{self.SEACROWD_SCHEMA_NAME}"
        # Validate once up front; otherwise `example` would be unbound at the yield.
        if schema not in ("source", seacrowd_schema):
            raise ValueError(f"Received unexpected config schema of {schema}!")

        with open(filepath, encoding="utf-8") as file:
            for idx, row in enumerate(file):
                text_1, text_2, score = row.strip().split("\t")
                if schema == "source":
                    example = {
                        "id": idx,
                        "Source_Sentence": text_1,
                        "Target_Sentence": text_2,
                        "LASER_similarity": float(score),
                    }
                else:
                    example = {
                        "id": idx,
                        "text_1": text_1,
                        "text_2": text_2,
                        "text_1_name": source_lang,
                        "text_2_name": target_lang,
                    }
                yield idx, example
Empty file.
Oops, something went wrong.