# File: seacrowd/sea_datasets/glotstorybook/glotstorybook.py
# (the accompanying seacrowd/sea_datasets/glotstorybook/__init__.py is an empty file)
"""SEACrowd dataloader for the GlotStoryBook dataset."""
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets
import pandas as pd

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES

# NOTE: backslashes are doubled so the BibTeX accent commands (\c, \") reach the
# citation text intact; unescaped, `\"` collapses to `"` and `\c` is an invalid
# escape sequence.
_CITATION = """\
@inproceedings{kargaran2023glotlid,
  title = {{GlotLID}: Language Identification for Low-Resource Languages},
  author = {Kargaran, Amir Hossein and
    Imani, Ayyoob and
    Yvon, Fran{\\c{c}}ois
    and Sch{\\"u}tze, Hinrich},
  booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing},
  year = {2023},
  url = {https://openreview.net/forum?id=dl4e3EBz5j}
}
"""

_DATASETNAME = "glotstorybook"
_DESCRIPTION = """\
The GlotStoryBook dataset is a compilation of children's storybooks from the Global
Storybooks project, encompassing 174 languages organized for machine translation tasks. It
features rows containing the text segment (text number), the language code, and the file
name, which corresponds to the specific book and story segment. This structure allows for
the comparison of texts across different languages by matching file names and text numbers
between rows.
"""

_HOMEPAGE = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook"
_LICENSE = f"""{Licenses.OTHERS.value} | \
We do not own any of the text from which these data has been extracted. All the files are
collected from the repository located at https://github.com/global-asp/. The source
repository for each text and file is stored in the dataset. Each file in the dataset is
associated with one license from the CC family. The licenses include 'CC BY', 'CC BY-NC',
'CC BY-NC-SA', 'CC-BY', 'CC-BY-NC', and 'Public Domain'. We also license the code, actual
packaging and the metadata of these data under the cc0-1.0.
"""
_URLS = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook/resolve/main/GlotStoryBook.csv"

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
# ISO 639-3 codes kept by this loader; rows in any other language are dropped.
_SUPPORTED_LANGS = ["khg", "khm", "mya", "tet", "tha", "vie"]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class GlotStoryBookDataset(datasets.GeneratorBasedBuilder):
    """Compilation of storybooks from the Global Storybooks project"""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # Lower-cased schema name looked up for the first (and only) supported task.
    SEACROWD_SCHEMA_NAME = TASK_TO_SCHEMA[_SUPPORTED_TASKS[0]].lower()

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata with the features of the active schema.

        Raises:
            ValueError: If the config schema is neither "source" nor the
                SEACrowd schema declared in ``BUILDER_CONFIGS``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "text_number": datasets.Value("int64"),
                    "license": datasets.Value("string"),
                    "text_by": datasets.Value("string"),
                    "translation_by": datasets.Value("string"),
                    "language": datasets.Value("string"),
                    "file_name": datasets.Value("string"),
                    "source": datasets.Value("string"),
                    "iso639-3": datasets.Value("string"),
                    "script": datasets.Value("string"),
                }
            )
        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            features = SCHEMA_TO_FEATURES[self.SEACROWD_SCHEMA_NAME.upper()]
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the CSV at ``_URLS`` and expose it as a single train split."""
        data_path = Path(dl_manager.download_and_extract(_URLS))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_path,
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Iterator[Tuple[int, Dict]]:
        """Yields examples as (key, example) tuples.

        Args:
            filepath: Path of the downloaded CSV file.
            split: Split name passed through ``gen_kwargs``; not used here.
        """
        df = pd.read_csv(filepath)
        # Keep only the rows whose language is in _SUPPORTED_LANGS.
        df = df[df["ISO639-3"].isin(_SUPPORTED_LANGS)]

        if self.config.schema == "source":
            # One example per CSV row; the key is the row's index in the CSV.
            for i, row in df.iterrows():
                yield i, {
                    "text": row["Text"],
                    "text_number": row["Text Number"],
                    "license": row["License"],
                    "text_by": row["Text By"],
                    "translation_by": row["Translation By"],
                    "language": row["Language"],
                    "file_name": row["File Name"],
                    "source": row["Source"],
                    "iso639-3": row["ISO639-3"],
                    "script": row["Script"],
                }
        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            # One example per (language, source, file): sort so that segments are
            # in "Text Number" order, then join each group's segments with spaces.
            df = df.sort_values(by=["ISO639-3", "Source", "File Name", "Text Number"])
            df = df.groupby(["ISO639-3", "Source", "File Name"]).agg({"Text": " ".join}).reset_index()
            for i, row in df.iterrows():
                yield i, {
                    "id": str(i),
                    "text": row["Text"],
                }