From a7d4b9f1efdba4bff6fb5710ef846931260d4de4 Mon Sep 17 00:00:00 2001
From: joan <68073738+joanitolopo@users.noreply.github.com>
Date: Sat, 24 Feb 2024 21:24:43 +0700
Subject: [PATCH] Closes #10 | Create beaye_lexicon dataset loader (#320)

* Create beaye_lexicon dataset loader

* add implementation of eng-day word pairs
---
 .../sea_datasets/beaye_lexicon/__init__.py    |   0
 .../beaye_lexicon/beaye_lexicon.py            | 116 ++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 seacrowd/sea_datasets/beaye_lexicon/__init__.py
 create mode 100644 seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py

diff --git a/seacrowd/sea_datasets/beaye_lexicon/__init__.py b/seacrowd/sea_datasets/beaye_lexicon/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py b/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py
new file mode 100644
index 000000000..01695249f
--- /dev/null
+++ b/seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py
@@ -0,0 +1,116 @@
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses
+
+_CITATION = """\
+@misc{beayelexicon2024,
+  author = {Lopo, Joanito Agili and Moeljadi, David and Cahyawijaya, Samuel and Aji, Alham Fikri and Sommerlot,
+  Carly J. and Jacob, June},
+  title = {Penyusunan Korpus Paralel Bahasa Indonesia–Bahasa Melayu Ambon, Melayu Kupang, Beaye, dan Uab Meto},
+  year = {2024},
+  howpublished = {Online},
+  url = {https://github.com/joanitolopo/makalah-kongresxii},
+  note = {Manuscript in preparation},
+}
+"""
+
+_DATASETNAME = "beaye_lexicon"
+_DESCRIPTION = """The Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and
+Beaye words. Developed through a collaborative effort involving two native Beaye speakers and evaluated by linguistic
+experts, this lexicon comprises 984 Beaye vocabulary entries.
+The creation of the Beaye Lexicon marks the inaugural effort in documenting the previously unrecorded Beaye
+language."""
+
+_HOMEPAGE = "https://github.com/joanitolopo/bhinneka-korpus/tree/main/lexicon"
+_LICENSE = Licenses.APACHE_2_0.value
+_URLS = "https://raw.githubusercontent.com/joanitolopo/bhinneka-korpus/main/lexicon"
+_SUPPORTED_TASKS = []
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+_LOCAL = False
+
+_LANGUAGES = ["ind", "day", "eng"]
+
+
+class BeayeLexicon(datasets.GeneratorBasedBuilder):
+    """Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and Beaye words."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = (
+        [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{lang}_source",
+                version=datasets.Version(_SOURCE_VERSION),
+                description=f"beaye lexicon with source schema for {lang} language",
+                schema="source",
+                subset_id="beaye_lexicon",
+            )
+            for lang in _LANGUAGES if lang != "eng"
+        ]
+        + [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_ext_{lang}_source",
+                version=datasets.Version(_SOURCE_VERSION),
+                description=f"beaye lexicon with source schema for the extended Beaye definitions ({lang})",
+                schema="source",
+                subset_id="beaye_lexicon",
+            )
+            for lang in _LANGUAGES if lang != "ind"
+        ]
+    )
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_ind_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        schema = self.config.schema
+        if schema == "source":
+            features = datasets.Features({"id": datasets.Value("string"), "word": datasets.Value("string")})
+        else:
+            raise NotImplementedError()
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        # "ext" configs pair Beaye with English (english.xlsx); all others pair it with Indonesian (lexicon.xlsx).
+        if "ext" in self.config.name.split("_"):
+            data_dir = Path(dl_manager.download(_URLS + "/english.xlsx"))
+        else:
+            data_dir = Path(dl_manager.download(_URLS + "/lexicon.xlsx"))
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir,
+                    "split": "train",
+                },
+            )
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        df = pd.read_excel(filepath, engine="openpyxl")
+        # Config names look like "beaye_lexicon_ind_source" or "beaye_lexicon_ext_eng_source",
+        # so the language code sits at index 3 for "ext" configs and at index 2 otherwise.
+        if "ext" in self.config.name.split("_"):
+            lang = self.config.name.split("_")[3]
+        else:
+            lang = self.config.name.split("_")[2]
+
+        text = df[lang]
+
+        if self.config.schema == "source":
+            for idx, word in enumerate(text.values):
+                row = {"id": str(idx), "word": word}
+                yield idx, row
+        else:
+            raise ValueError(f"Invalid config: {self.config.name}")
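
Reviewer note: a minimal usage sketch, not part of the patch, assuming a local checkout of the
SeaCrowd repository with the seacrowd package and openpyxl installed. The script path and config
names are taken from the diff above, and the record shape follows the source schema in _info().

    from datasets import load_dataset

    # Indonesian words from lexicon.xlsx (the default config, beaye_lexicon_ind_source)
    ind = load_dataset(
        "seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py",
        name="beaye_lexicon_ind_source",
        split="train",
    )

    # English words from english.xlsx (an "ext" config)
    eng = load_dataset(
        "seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py",
        name="beaye_lexicon_ext_eng_source",
        split="train",
    )

    print(ind[0])  # e.g. {"id": "0", "word": "..."}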