forked from SEACrowd/seacrowd-datahub
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request SEACrowd#57 from sabilmakbar/sabil/fleurs
Closes SEACrowd#31 | Add Fleurs loader
Showing 3 changed files with 334 additions and 0 deletions.
Empty file.
@@ -0,0 +1,292 @@
"""
SEA Crowd Data Loader for Fleurs.
"""

import json
from itertools import product
from typing import Dict, List, Tuple

import datasets
from datasets import load_dataset
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import TASK_TO_SCHEMA, Licenses, Tasks

_CITATION = """ | ||
@inproceedings{conneau22_interspeech, | ||
author={Alexis Conneau and Ankur Bapna and Yu Zhang and Min Ma and Patrick {von Platen} and Anton Lozhkov and Colin Cherry | ||
and Ye Jia and Clara Rivera and Mihir Kale and Daan {van Esch} and Vera Axelrod and Simran Khanuja and Jonathan Clark | ||
and Orhan Firat and Michael Auli and Sebastian Ruder and Jason Riesa and Melvin Johnson}, | ||
title={{XTREME-S: Evaluating Cross-lingual Speech Representations}}, | ||
year=2022, | ||
booktitle={Proc. Interspeech 2022}, | ||
pages={3248--3252}, | ||
doi={10.21437/Interspeech.2022-10007} | ||
} | ||
""" | ||
|
||
logger = datasets.logging.get_logger(__name__)


with open(DownloadManager().download_and_extract("seacrowd/sea_datasets/fleurs/lang_config.json"), "r") as f:
    _LANG_CONFIG = json.load(f)

_LOCAL = False
_LANGUAGES = list(_LANG_CONFIG.keys())

# since this fleurs source already subsets SEA langs, the lang group id names are hard-coded
_LANG_GROUP_ID = ["south_east_asian_sea"]

_DATASETNAME = "fleurs"
_DESCRIPTION = """\
The Fleurs dataset is part of the XTREME-S benchmark for evaluating universal cross-lingual speech representations in many languages.
Fleurs is used for two tasks: automatic speech recognition and speech classification.
Fleurs covers 10 languages native to Southeast Asia and 3 other major languages
(Mandarin Chinese, Portuguese, and Tamil) that are mostly spoken in a few Southeast Asian countries.
"""

_HOMEPAGE = "https://huggingface.co/datasets/google/xtreme_s"
_LICENSE = Licenses.CC_BY_4_0.value

# the URL won't be downloaded directly since this loader wraps the HF `load_dataset` method on the HF URL provided
_URL = "https://huggingface.co/datasets/google/xtreme_s"

# construct the remote HF reference from the last 2 components of the "/"-split URL (expected: "google/xtreme_s")
_HF_REMOTE_REF = "/".join(_URL.split("/")[-2:])

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION, Tasks.SPEECH_LANGUAGE_IDENTIFICATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

CONFIG_SUFFIXES_FOR_TASK = [TASK_TO_SCHEMA.get(task).lower() for task in _SUPPORTED_TASKS]


def conform_init_config():
    """Assertion function for the instantiated configs"""
    if len(_LANGUAGES) == 0:
        raise AssertionError("No languages detected from the language config!")
    if len(CONFIG_SUFFIXES_FOR_TASK) != len(_SUPPORTED_TASKS):
        raise AssertionError("Config suffixes don't match `_SUPPORTED_TASKS` in terms of `len`!")
    if len(CONFIG_SUFFIXES_FOR_TASK) == 0:
        raise AssertionError("Config suffixes and `_SUPPORTED_TASKS` have `len` of 0!")


conform_init_config()


def construct_configs_on_langs(languages: list = None) -> List[SEACrowdConfig]:
    """
    The function `construct_configs_on_langs` constructs a list of SEACrowdConfig objects based on the provided
    languages or a default language, and returns the list.
    input:
        languages (list): The `languages` parameter is a list that specifies the languages for which the
            configurations need to be constructed. If no languages are provided (value=None), the first value in the language config
            will be used.
    output:
        a list of `SEACrowdConfig` objects based on the instantiated init variables
    """
    # set output var
    config_list = []

    # flags indicating whether each task is lang-subsetted, aligned with `_SUPPORTED_TASKS`
    IS_TASK_LANG_SUBSETTED = [True, False]

    TASKS_AND_CONFIG_SUFFIX_PAIRS = list(zip(_SUPPORTED_TASKS, CONFIG_SUFFIXES_FOR_TASK, IS_TASK_LANG_SUBSETTED))

    # implement source schema
    version, config_name_prefix = _SOURCE_VERSION, "source"
    config_list += [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}",
            version=datasets.Version(version),
            description=f"{_DATASETNAME} {config_name_prefix} schema for language code {_LANG}",
            schema=f"{config_name_prefix}",
            subset_id=_LANG,
        )
        for _LANG in languages
    ]

    # implement SEACrowd schema
    version, config_name_prefix = _SEACROWD_VERSION, "seacrowd"
    for (task_obj, config_name_suffix, is_lang_subsetted) in TASKS_AND_CONFIG_SUFFIX_PAIRS:
        if is_lang_subsetted:
            # construct configs based on their lang, since the task & config need to be defined per lang
            # for this dataloader, Tasks.SPEECH_RECOGNITION will enter this condition
            config_list += [
                SEACrowdConfig(
                    name=f"{_DATASETNAME}_{_LANG}_{config_name_prefix}_{config_name_suffix}",
                    version=datasets.Version(version),
                    description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
                    schema=f"{config_name_prefix}_{config_name_suffix}",
                    subset_id=_LANG,
                )
                for _LANG in languages
            ]

        else:
            # else, it's defined for all languages
            # for this dataloader, Tasks.SPEECH_LANGUAGE_IDENTIFICATION will enter this condition
            # however, no "source" schema will be defined here (the source will follow the `fleurs_{lang_code}_source` config name)
            config_list.append(
                SEACrowdConfig(
                    name=f"{_DATASETNAME}_{config_name_prefix}_{config_name_suffix}",
                    version=datasets.Version(version),
                    description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name}",
                    schema=f"{config_name_prefix}_{config_name_suffix}",
                    subset_id="all",
                )
            )

    return config_list


class FleursDataset(datasets.GeneratorBasedBuilder):
    """Fleurs dataset from https://huggingface.co/datasets/google/xtreme_s"""

    # get all schema w/o lang arg + get all schema w/ lang arg
    BUILDER_CONFIGS = construct_configs_on_langs(_LANGUAGES)

    def _info(self) -> datasets.DatasetInfo:
        _config_schema_name = self.config.schema
        logger.info(f"Received schema name: {self.config.schema}")

        # source schema
        if _config_schema_name == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("int32"),
                    "num_samples": datasets.Value("int32"),
                    "path": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "transcription": datasets.Value("string"),
                    "raw_transcription": datasets.Value("string"),
                    "gender": datasets.ClassLabel(names=["male", "female", "other"]),
                    "lang_id": datasets.ClassLabel(names=_LANGUAGES),
                    "language": datasets.Value("string"),
                    "lang_group_id": datasets.ClassLabel(names=_LANG_GROUP_ID),
                }
            )

        # asr transcription schema for seacrowd
        elif _config_schema_name == f"seacrowd_{CONFIG_SUFFIXES_FOR_TASK[0]}":
            features = schemas.speech_text_features

        # speech lang classification schema for seacrowd
        elif _config_schema_name == f"seacrowd_{CONFIG_SUFFIXES_FOR_TASK[1]}":
            features = schemas.speech_features(label_names=_LANGUAGES)

        else:
            raise ValueError(f"Unexpected schema received! {_config_schema_name}")

        return datasets.DatasetInfo(description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION)

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        # the dl_manager arg is unused since this data loader wraps the HF `load_dataset` on the given _URL
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"split_name": split_name._name})
            for split_name in (
                datasets.Split.TRAIN,
                datasets.Split.VALIDATION,
                datasets.Split.TEST)
        ]

    def _load_hf_data_from_remote(self, split_name: str) -> datasets.DatasetDict:

        if self.config.subset_id == "all":
            raise ValueError("Unexpected subset_id value of `all` received in eager-load of SEACrowd fleurs loader!")
        else:
            _config_name_args = "fleurs." + _LANG_CONFIG[self.config.subset_id]["fleurs_lang_code"] + "_" + _LANG_CONFIG[self.config.subset_id]["fleurs_country_code"]

        logger.info(f"Loading dataset from remote HF {_HF_REMOTE_REF} with seacrowd lang args of {self.config.subset_id} and hf-source config args of {_config_name_args}")
        _hf_dataset_source = load_dataset(_HF_REMOTE_REF, _config_name_args, split=split_name)

        return _hf_dataset_source

    def _lazy_load_hf_data_from_remote(self, split_name: str) -> datasets.DatasetDict:

        if self.config.subset_id != "all":
            raise ValueError(f"Unexpected subset_id value of {self.config.subset_id} received in lazy-load of SEACrowd fleurs loader!")
        else:
            _config_name_args = [(f"fleurs.{fleurs_lang_info['fleurs_lang_code']}_{fleurs_lang_info['fleurs_country_code']}", lang) for lang, fleurs_lang_info in _LANG_CONFIG.items()]

        for _config, lang_name in _config_name_args:
            logger.info(f"Loading dataset from remote HF {_HF_REMOTE_REF} with seacrowd lang args of {self.config.subset_id} and hf-source config args of {_config}")
            yield load_dataset(_HF_REMOTE_REF, _config, split=split_name), lang_name

    def _generate_examples(self, split_name: str) -> Tuple[int, Dict]:

        _config_schema_name = self.config.schema

        # for the source schema and the ASR transcription schema (the data is loaded eagerly, since it's split by lang)
        if _config_schema_name in ("source", f"seacrowd_{CONFIG_SUFFIXES_FOR_TASK[0]}"):
            loaded_data = self._load_hf_data_from_remote(split_name)

            # iterate over datapoints and arrange the hf source dataset schema to match the config args:
            for id_, _data in enumerate(loaded_data):
                if _config_schema_name == "source":

                    # re-map "lang_id" and "lang_group_id"
                    _data["lang_id"] = _LANGUAGES.index(self.config.subset_id)
                    _data["lang_group_id"] = 0

                    yield id_, {colname: _data[colname] for colname in self.info.features}

                # 2 notes on the seacrowd schema for ASR:
                # 1. since the source data provides no speaker id nor speaker info, these are filled with default values:
                #    ("" for any string-typed data, and -1 for the int-typed age data)
                # 2. the "id" is re-created in sequential order on the loaded data because its original id
                #    doesn't pass the unit test of the seacrowd schema

                elif "seacrowd" in _config_schema_name:
                    yield id_, {
                        "id": id_,
                        "path": _data["path"],
                        "audio": _data["audio"],
                        "text": _data["transcription"],
                        "speaker_id": "",
                        "metadata": {
                            "speaker_age": -1,
                            "speaker_gender": _data["gender"],
                        },
                    }

                else:
                    raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

                # add id_ so it will be globally unique
                id_ += 1

        # for the speech lang classification schema (the data is loaded lazily per lang)
        elif _config_schema_name == f"seacrowd_{CONFIG_SUFFIXES_FOR_TASK[1]}":
            loaded_data = self._lazy_load_hf_data_from_remote(split_name)
            id_ = 0
            while True:
                _loaded_data, lang_info = next(loaded_data, (None, None))
                if _loaded_data is None:
                    break
                # iterate over datapoints and arrange the hf source dataset schema to match the config args:
                for _data in _loaded_data:
                    yield id_, {
                        "id": id_,
                        "path": _data["path"],
                        "audio": _data["audio"],
                        "labels": _LANGUAGES.index(lang_info),
                        "speaker_id": "",
                        "metadata": {
                            "speaker_age": -1,
                            "speaker_gender": _data["gender"],
                        },
                    }

                    # add id_ so it will be globally unique
                    id_ += 1

        else:
            raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")
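For quick reference, a minimal usage sketch for the loader above, before the accompanying lang_config.json below. This is a sketch only: the loader script path and the `sptext`/`speech` schema suffixes produced by TASK_TO_SCHEMA are assumptions, not confirmed by this diff.

# Minimal usage sketch. Assumptions (not confirmed by this diff): the loader
# script path below, and that TASK_TO_SCHEMA yields the suffixes "sptext"
# (speech-text) and "speech" (speech classification).
from datasets import load_dataset

# per-language source / ASR configs, named as built by construct_configs_on_langs,
# e.g. Indonesian ("ind"):
ind_source = load_dataset("seacrowd/sea_datasets/fleurs/fleurs.py", name="fleurs_ind_source", split="train")
ind_asr = load_dataset("seacrowd/sea_datasets/fleurs/fleurs.py", name="fleurs_ind_seacrowd_sptext", split="train")

# language-agnostic speech language identification config (subset_id="all"):
all_lid = load_dataset("seacrowd/sea_datasets/fleurs/fleurs.py", name="fleurs_seacrowd_speech", split="train")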
@@ -0,0 +1,42 @@
{
    "mya": {
        "fleurs_lang_code": "my",
        "fleurs_country_code": "mm"
    },
    "ceb": {
        "fleurs_lang_code": "ceb",
        "fleurs_country_code": "ph"
    },
    "fil": {
        "fleurs_lang_code": "fil",
        "fleurs_country_code": "ph"
    },
    "ind": {
        "fleurs_lang_code": "id",
        "fleurs_country_code": "id"
    },
    "jav": {
        "fleurs_lang_code": "jv",
        "fleurs_country_code": "id"
    },
    "khm": {
        "fleurs_lang_code": "km",
        "fleurs_country_code": "kh"
    },
    "lao": {
        "fleurs_lang_code": "lo",
        "fleurs_country_code": "la"
    },
    "zlm": {
        "fleurs_lang_code": "ms",
        "fleurs_country_code": "my"
    },
    "tha": {
        "fleurs_lang_code": "th",
        "fleurs_country_code": "th"
    },
    "vie": {
        "fleurs_lang_code": "vi",
        "fleurs_country_code": "vn"
    }
}
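As a small sketch of how each entry above is turned into the upstream FLEURS config name inside `_load_hf_data_from_remote`, the "ind" entry resolves to `fleurs.id_id`:

# Mirrors _load_hf_data_from_remote: ISO 639-3 key -> "fleurs.<lang_code>_<country_code>"
import json

with open("seacrowd/sea_datasets/fleurs/lang_config.json", "r") as f:
    lang_config = json.load(f)

subset_id = "ind"
hf_config_name = "fleurs." + lang_config[subset_id]["fleurs_lang_code"] + "_" + lang_config[subset_id]["fleurs_country_code"]
print(hf_config_name)  # fleurs.id_id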