Skip to content

Commit

Permalink
Closes SEACrowd#10 | Create beaye_lexicon dataset loader (SEACrowd#320)
Browse files Browse the repository at this point in the history
* Create beaye_lexicon dataset loader

* add implementation of eng-day word pairs
  • Loading branch information
joanitolopo authored and Railey Montalan committed Feb 27, 2024
1 parent ad10716 commit a7d4b9f
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 0 deletions.
Empty file.
116 changes: 116 additions & 0 deletions seacrowd/sea_datasets/beaye_lexicon/beaye_lexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses

# BibTeX entry for the work this dataset comes from; passed to DatasetInfo as `citation`.
_CITATION = """\
@misc{beayelexicon2024,
author = {Lopo, Joanito Agili and Moeljadi, David and Cahyawijaya, Samuel and Aji, Alham Fikri and Sommerlot,
Carly J. and Jacob, June},
title = {Penyusunan Korpus Paralel Bahasa Indonesia–Bahasa Melayu Ambon, Melayu Kupang, Beaye, dan Uab Meto},
year = {2024},
howpublished = {Online},
url = {https://github.com/joanitolopo/makalah-kongresxii},
note = {Manuscript in preparation},
}
"""

# Prefix of every builder config name (f"{_DATASETNAME}_{lang}_source" and
# f"{_DATASETNAME}_ext_{lang}_source").
_DATASETNAME = "beaye_lexicon"
# Human-readable summary; passed to DatasetInfo as `description`.
_DESCRIPTION = """The Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and
Beaye words. Developed through a collaborative effort involving two native Beaye speakers and evaluated by linguistic
experts, this lexicon comprises 984 Beaye vocabularies. The creation of the Beaye Lexicon marks the inaugural effort in
documenting the previously unrecorded Beaye language."""

# Project page; passed to DatasetInfo as `homepage`.
_HOMEPAGE = "https://github.com/joanitolopo/bhinneka-korpus/tree/main/lexicon"
# License value taken from the SEACrowd `Licenses` constants; passed to DatasetInfo as `license`.
_LICENSE = Licenses.APACHE_2_0.value
# Base URL only (a single string, despite the plural name): the loader appends
# "/lexicon.xlsx" or "/english.xlsx" to it before downloading.
_URLS = "https://raw.githubusercontent.com/joanitolopo/bhinneka-korpus/main/lexicon"
# Empty list; not read anywhere in this module.
# NOTE(review): presumably consumed by SEACrowd tooling -- confirm.
_SUPPORTED_TASKS = []
# Version string used for SOURCE_VERSION and for every builder config.
_SOURCE_VERSION = "1.0.0"
# Version string used for SEACROWD_VERSION.
_SEACROWD_VERSION = "1.0.0"
# Not read anywhere in this module.
# NOTE(review): presumably flags that the data is downloaded rather than supplied locally -- confirm.
_LOCAL = False

# Language codes that drive config generation and double as spreadsheet column
# names: the plain configs skip "eng" and the "ext" configs skip "ind".
# NOTE(review): "day" appears to be the code used for Beaye -- confirm.
_LANGUAGES = ["ind", "day", "eng"]

class BeayeLexicon(datasets.GeneratorBasedBuilder):
    """Beaye Lexicon is a lexicon resource encompassing translations between Indonesian, English, and Beaye words.

    Two families of source-schema configs are exposed:

    * ``{_DATASETNAME}_{lang}_source`` for every language except ``eng``;
      these read one column of ``lexicon.xlsx``.
    * ``{_DATASETNAME}_ext_{lang}_source`` for every language except ``ind``;
      these read one column of ``english.xlsx``.

    In both cases the language code embedded in the config name is also the
    name of the spreadsheet column that is emitted.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = (
        [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}_source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"beaye lexicon with source schema for {lang} language",
                schema="source",
                subset_id="beaye_lexicon",
            )
            for lang in _LANGUAGES
            if lang != "eng"
        ]
        + [
            SEACrowdConfig(
                name=f"{_DATASETNAME}_ext_{lang}_source",
                version=datasets.Version(_SOURCE_VERSION),
                # Plain string: the original was an f-string without any
                # placeholder and misspelled "definition".
                description="beaye lexicon with source schema for extensive definition of beaye language",
                schema="source",
                subset_id="beaye_lexicon",
            )
            for lang in _LANGUAGES
            if lang != "ind"
        ]
    )

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_ind_source"

    def _uses_extended_file(self) -> bool:
        """Return True when the active config name contains an ``ext`` component."""
        return "ext" in self.config.name.split("_")

    def _language_column(self) -> str:
        """Return the language code embedded in the active config name.

        The config name is split on underscores; the code is the fourth
        component for ``ext`` configs and the third one otherwise.
        """
        parts = self.config.name.split("_")
        return parts[3] if "ext" in parts else parts[2]

    def _info(self) -> datasets.DatasetInfo:
        """Describe the dataset: every example is an ``id`` / ``word`` pair of strings.

        Raises:
            NotImplementedError: if the config schema is not ``"source"``.
        """
        if self.config.schema != "source":
            raise NotImplementedError()

        features = datasets.Features({"id": datasets.Value("string"), "word": datasets.Value("string")})

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the workbook for the active config and return a single train split.

        ``ext`` configs use ``english.xlsx``; all other configs use ``lexicon.xlsx``.
        """
        workbook = "english.xlsx" if self._uses_extended_file() else "lexicon.xlsx"
        filepath = Path(dl_manager.download(_URLS + "/" + workbook))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": filepath,
                    "split": "train",
                },
            )
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield ``(key, example)`` tuples, one per row of the selected column.

        Args:
            filepath: path of the downloaded ``.xlsx`` workbook.
            split: split name passed through ``gen_kwargs``; not used here.

        Raises:
            ValueError: if the config schema is not ``"source"``.
        """
        # Reject unsupported schemas before paying for the workbook parse.
        if self.config.schema != "source":
            raise ValueError(f"Invalid config: {self.config.name}")

        sheet = pd.read_excel(filepath, engine="openpyxl")
        # The language code from the config name doubles as the column header.
        words = sheet[self._language_column()]

        for idx, word in enumerate(words.values):
            yield idx, {"id": str(idx), "word": word}

0 comments on commit a7d4b9f

Please sign in to comment.