Closes SEACrowd#313 | Add dataloader for Saltik (SEACrowd#387)
* add dataloader for indonesian_madurese_bible_translation

* add dataloader for saltik

* Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py

* update based on the reviewer comment

* update based on the reviewer comment

* Remove the modified constants.py from PR

---------

Co-authored-by: Holy Lovenia <[email protected]>
TysonYu and holylovenia authored Apr 15, 2024
1 parent 94d34c6 commit 79e6641
Showing 2 changed files with 133 additions and 0 deletions.
Empty file.
133 changes: 133 additions & 0 deletions seacrowd/sea_datasets/saltik/saltik.py
@@ -0,0 +1,133 @@
import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{audah2023comparison,
    author = {Audah, Hanif Arkan and Yuliawati, Arlisa and Alfina, Ika},
    title = {A Comparison Between SymSpell and a Combination of Damerau-Levenshtein Distance With the Trie Data Structure},
    booktitle = {2023 10th International Conference on Advanced Informatics: Concept, Theory and Application (ICAICTA)},
    year = {2023},
    url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10390399&casa_token=HtJUCIGGlWYAAAAA:q8ll1RWmpHtSAq2Qp5uQAE1NJETx7tUYFZIvTO1IWoaYy4eqFETSsm9p6C7tJwLZBGq5y8zc3A&tag=1},
    biburl = {https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references},
    bibsource = {https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references}
}
"""

_DATASETNAME = "saltik"
_DESCRIPTION = """\
Saltik is a dataset for benchmarking the accuracy of non-word error correction methods on Indonesian words.
It consists of 58,532 non-word errors generated from 3,000 of the most popular Indonesian words.
"""
_HOMEPAGE = "https://github.com/ir-nlp-csui/saltik"
_LANGUAGES = ["ind"]
_LICENSE = Licenses.AGPL_3_0.value
_LOCAL = False
_URLS = {
    _DATASETNAME: "https://raw.githubusercontent.com/ir-nlp-csui/saltik/main/saltik.json",
}
_SUPPORTED_TASKS = [Tasks.NON_WORD_ERROR_SPELLING_CORRECTION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class Saltik(datasets.GeneratorBasedBuilder):
    """Saltik: 58,532 non-word errors generated from 3,000 of the most popular Indonesian words."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            # Each example pairs a correct word with the list of
            # non-word errors (typos) generated from it.
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "word": datasets.Value("string"),
                    "errors": [
                        {
                            "typo": datasets.Value("string"),
                            "error_type": datasets.Value("string"),
                        }
                    ],
                }
            )
        else:
            raise NotImplementedError()

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download(urls)
        # The raw file is a single JSON object keyed by word; flatten it into
        # one record per word and rewrite it as JSON Lines for generation.
        data = self._read_jsonl(file_path)
        processed_data = [{"id": idx, "word": word, "errors": errors} for idx, (word, errors) in enumerate(data.items())]
        self._write_jsonl(file_path + ".jsonl", processed_data)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": file_path + ".jsonl",
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        if self.config.schema == "source":
            with jsonlines.open(filepath) as f:
                for i, each_data in enumerate(f.iter()):
                    ex = {
                        "id": each_data["id"],
                        "word": each_data["word"],
                        "errors": each_data["errors"],
                    }
                    yield i, ex

    def _read_jsonl(self, filepath: Path):
        """Load the downloaded file; despite the name, it is a single JSON object keyed by word."""
        with open(filepath) as user_file:
            parsed_json = json.load(user_file)
        return parsed_json

    def _write_jsonl(self, filepath, values):
        """Write the processed records to a JSON Lines file, one record per line."""
        with jsonlines.open(filepath, "w") as writer:
            for line in values:
                writer.write(line)
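
For reference, the raw saltik.json that _read_jsonl loads is one JSON object keyed by word, which _split_generators flattens into one JSONL record per word before generation. A minimal sketch of that flattening on invented data (the word, typos, and error types below are illustrative placeholders, not taken from the dataset):

import jsonlines

# Invented stand-in for the downloaded saltik.json (shape only; values made up).
raw = {
    "makan": [
        {"typo": "mkaan", "error_type": "transposition"},
        {"typo": "makann", "error_type": "insertion"},
    ],
}

# The same flattening _split_generators performs before writing the JSONL file.
records = [{"id": idx, "word": word, "errors": errors} for idx, (word, errors) in enumerate(raw.items())]

with jsonlines.open("saltik_example.jsonl", "w") as writer:
    for record in records:
        writer.write(record)
# saltik_example.jsonl now holds one line: {"id": 0, "word": "makan", "errors": [...]}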

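Once merged, the config can be exercised through the standard datasets script loader. A hedged usage sketch, assuming a local checkout of the SEACrowd repo as the working directory (the script path and the trust_remote_code flag are assumptions; the latter is only required by newer datasets releases):

import datasets

# Path assumes the repo root is the working directory; adjust to your layout.
saltik = datasets.load_dataset(
    "seacrowd/sea_datasets/saltik/saltik.py",
    name="saltik_source",
    trust_remote_code=True,
)

print(saltik["train"][0])
# -> {"id": "0", "word": ..., "errors": [{"typo": ..., "error_type": ...}, ...]}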