forked from SEACrowd/seacrowd-datahub
Commit
* Create beaye_lexicon dataset loader
* Add implementation of eng-day word pairs
1 parent ad10716 · commit a7d4b9f
Showing 2 changed files with 116 additions and 0 deletions.
Empty file.
@@ -0,0 +1,116 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses

_CITATION = """\
@misc{beayelexicon2024,
    author = {Lopo, Joanito Agili and Moeljadi, David and Cahyawijaya, Samuel and Aji, Alham Fikri and Sommerlot,
    Carly J. and Jacob, June},
    title = {Penyusunan Korpus Paralel Bahasa Indonesia–Bahasa Melayu Ambon, Melayu Kupang, Beaye, dan Uab Meto},
    year = {2024},
    howpublished = {Online},
    url = {https://github.com/joanitolopo/makalah-kongresxii},
    note = {Manuscript in preparation},
}
"""

_DATASETNAME = "beaye_lexicon"
_DESCRIPTION = """The Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and
Beaye words. Developed through a collaborative effort involving two native Beaye speakers and evaluated by linguistic
experts, this lexicon comprises 984 Beaye vocabularies. The creation of the Beaye Lexicon marks the inaugural effort in
documenting the previously unrecorded Beaye language."""

_HOMEPAGE = "https://github.com/joanitolopo/bhinneka-korpus/tree/main/lexicon"
_LICENSE = Licenses.APACHE_2_0.value
_URLS = "https://raw.githubusercontent.com/joanitolopo/bhinneka-korpus/main/lexicon"
_SUPPORTED_TASKS = []
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"
_LOCAL = False

_LANGUAGES = ["ind", "day", "eng"]


class BeayeLexicon(datasets.GeneratorBasedBuilder):
    """Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and Beaye words."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # One source config per language in the base lexicon (Indonesian and Beaye),
    # plus "ext" configs for the extended English-Beaye word pairs.
    BUILDER_CONFIGS = (
        [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}_source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"beaye lexicon with source schema for {lang} language",
                schema="source",
                subset_id="beaye_lexicon",
            )
            for lang in _LANGUAGES if lang != "eng"
        ]
        + [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_ext_{lang}_source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"beaye lexicon with source schema for the extended definition of the Beaye language ({lang})",
                schema="source",
                subset_id="beaye_lexicon",
            )
            for lang in _LANGUAGES if lang != "ind"
        ]
    )

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_ind_source"

    def _info(self) -> datasets.DatasetInfo:
        schema = self.config.schema
        if schema == "source":
            features = datasets.Features({"id": datasets.Value("string"), "word": datasets.Value("string")})
        else:
            raise NotImplementedError()

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        # "ext" configs use the extended English-Beaye spreadsheet; the base
        # configs use the Indonesian-Beaye lexicon spreadsheet.
        if "ext" in self.config.name.split("_"):
            data_dir = Path(dl_manager.download(_URLS + "/english.xlsx"))
        else:
            data_dir = Path(dl_manager.download(_URLS + "/lexicon.xlsx"))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_dir,
                    "split": "train",
                },
            )
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        dfs = pd.read_excel(filepath, engine="openpyxl")
        # The language code sits at a different position in "ext" config names
        # (beaye_lexicon_ext_<lang>_source vs. beaye_lexicon_<lang>_source).
        if "ext" in self.config.name.split("_"):
            lang = self.config.name.split("_")[3]
        else:
            lang = self.config.name.split("_")[2]

        text = dfs[lang]

        if self.config.schema == "source":
            for idx, word in enumerate(text.values):
                row = {"id": str(idx), "word": word}
                yield idx, row
        else:
            raise ValueError(f"Invalid config: {self.config.name}")
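
For reference, a minimal usage sketch (not part of the commit). It assumes the loader above is saved locally as beaye_lexicon.py and that the seacrowd package is installed so its imports resolve. Each source config yields a single language column; since both base configs read the same spreadsheet, Indonesian-Beaye (and likewise English-Beaye) pairs can be recovered by aligning rows.

import datasets

# Indonesian and Beaye word lists from the base lexicon (lexicon.xlsx).
# Depending on the datasets version, trust_remote_code=True may be required for loading scripts.
ind = datasets.load_dataset("beaye_lexicon.py", name="beaye_lexicon_ind_source", split="train")
day = datasets.load_dataset("beaye_lexicon.py", name="beaye_lexicon_day_source", split="train")

# English and Beaye word lists from the extended lexicon (english.xlsx).
eng = datasets.load_dataset("beaye_lexicon.py", name="beaye_lexicon_ext_eng_source", split="train")

# Each example is {"id": "...", "word": "..."}; rows from the same spreadsheet align by index.
ind_day_pairs = list(zip(ind["word"], day["word"]))
print(ind_day_pairs[:5])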