Closes #193 | Create dataset loader for MALINDO Morph (#332)
* Implement dataloader for MALINDO morph

* Specify file encoding and remove newlines when loading data

* Add blank __init__.py

* Fix typos in docstring

* Fix typos

* Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py

Co-authored-by: Jennifer Santoso <[email protected]>

* Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py

Co-authored-by: Jennifer Santoso <[email protected]>

* Update seacrowd/sea_datasets/malindo_morph/malindo_morph.py

---------

Co-authored-by: Jennifer Santoso <[email protected]>
danjohnvelasco and jensan-1 authored Mar 7, 2024
1 parent cd64e18 commit 1b5f2bd
Showing 2 changed files with 124 additions and 0 deletions.
seacrowd/sea_datasets/malindo_morph/__init__.py
Empty file.
124 changes: 124 additions & 0 deletions seacrowd/sea_datasets/malindo_morph/malindo_morph.py
@@ -0,0 +1,124 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses

_CITATION = """\
@InProceedings{NOMOTO18.8,
author = {Hiroki Nomoto and Hannah Choi and David Moeljadi and Francis Bond},
title = {MALINDO Morph: Morphological dictionary and analyser for Malay/Indonesian},
booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
year = {2018},
month = {may},
date = {7-12},
location = {Miyazaki, Japan},
editor = {Kiyoaki Shirai},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {979-10-95546-24-5},
language = {english}
}
"""


_DATASETNAME = "malindo_morph"

_DESCRIPTION = """\
MALINDO Morph is a morphological dictionary for the Malay (bahasa Melayu) and Indonesian (bahasa Indonesia) languages.
It contains over 200,000 lines, each giving the analysis of one (case-sensitive) token.
Each line is made up of ten tab-separated items: id, root, surface form (bentuk jadian), prefix, suffix, circumfix, reduplication, source, stem, lemma.
"""

_HOMEPAGE = "https://github.com/matbahasa/MALINDO_Morph"

_LANGUAGES = ["zlm", "ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_LICENSE = Licenses.CC_BY_4_0.value

_LOCAL = False

_URLS = {
_DATASETNAME: "https://raw.githubusercontent.com/matbahasa/MALINDO_Morph/master/malindo_dic_2023.tsv",
}

_SUPPORTED_TASKS = []

_SOURCE_VERSION = "2023.0.0"

_SEACROWD_VERSION = "1.0.0"


class MalindoMorph(datasets.GeneratorBasedBuilder):
"""MALINDO Morph is a morphological dictionary for Malay (bahasa Melayu) and Indonesian (bahasa Indonesia) language. It provides morphological information (root, prefix, suffix, circumfix, reduplication) for over 200,000 surface forms."""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

BUILDER_CONFIGS = [
SEACrowdConfig(
name=f"{_DATASETNAME}_source",
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="source",
subset_id=f"{_DATASETNAME}",
),
]

DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
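            # One feature per TSV column, in file order; all values are kept as raw strings.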
features = datasets.Features(
{
"id": datasets.Value("string"),
"root": datasets.Value("string"),
"bentuk_jadian": datasets.Value("string"),
"prefix": datasets.Value("string"),
"suffix": datasets.Value("string"),
"circumfix": datasets.Value("string"),
"reduplication": datasets.Value("string"),
"source": datasets.Value("string"),
"stem": datasets.Value("string"),
"lemma": datasets.Value("string"),
}
)

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
urls = _URLS[_DATASETNAME]
file = dl_manager.download_and_extract(urls)
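        # download_and_extract returns the local path to the single TSV file;
        # the dictionary has no official splits, so everything goes into "train".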

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": file,
"split": "train",
},
)
]

def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
rows = []
with open(filepath, encoding="utf8") as file:
for line in file:
row = line.split("\t")
row[-1] = row[-1].split("\n")[0] # remove newlines from lemma feature
rows.append(row)

if self.config.schema == "source":
for key, row in enumerate(rows):
example = {"id": row[0], "root": row[1], "bentuk_jadian": row[2], "prefix": row[3], "suffix": row[4], "circumfix": row[5], "reduplication": row[6], "source": row[7], "stem": row[8], "lemma": row[9]}
yield key, example
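
A minimal sketch of exercising the new loader from a local checkout, assuming the standard Hugging Face datasets API (trust_remote_code is only required on recent datasets releases for script-based loaders):

import datasets

dset = datasets.load_dataset(
    "seacrowd/sea_datasets/malindo_morph/malindo_morph.py",
    name="malindo_morph_source",  # DEFAULT_CONFIG_NAME, so this could be omitted
    trust_remote_code=True,
)
print(dset["train"][0])  # one analysed token: id, root, bentuk_jadian, affixes, ...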
