Closes SEACrowd#313 | Add dataloader for Saltik (SEACrowd#387)
* add dataloader for indonesian_madurese_bible_translation

* add dataloader for saltik

* Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py

* update based on the reviewer comment

* update based on the reviewer comment

* Remove the modified constants.py from PR

---------

Co-authored-by: Holy Lovenia <[email protected]>
TysonYu and holylovenia authored Apr 15, 2024
1 parent 94d34c6 commit 79e6641
Showing 2 changed files with 133 additions and 0 deletions.
Empty file.
133 changes: 133 additions & 0 deletions seacrowd/sea_datasets/saltik/saltik.py
@@ -0,0 +1,133 @@
import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{audah2023comparison,
    author = {Audah, Hanif Arkan and Yuliawati, Arlisa and Alfina, Ika},
    title = {A Comparison Between SymSpell and a Combination of Damerau-Levenshtein Distance With the Trie Data Structure},
    booktitle = {2023 10th International Conference on Advanced Informatics: Concept, Theory and Application (ICAICTA)},
    year = {2023},
    url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10390399&casa_token=HtJUCIGGlWYAAAAA:q8ll1RWmpHtSAq2Qp5uQAE1NJETx7tUYFZIvTO1IWoaYy4eqFETSsm9p6C7tJwLZBGq5y8zc3A&tag=1},
    biburl = {https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references},
    bibsource = {https://github.com/ir-nlp-csui/saltik?tab=readme-ov-file#references}
}
"""

_DATASETNAME = "saltik"
_DESCRIPTION = """\
Saltik is a dataset for benchmarking the accuracy of non-word error correction methods on Indonesian words.
It consists of 58,532 non-word errors generated from 3,000 of the most popular Indonesian words.
"""
_HOMEPAGE = "https://github.com/ir-nlp-csui/saltik"
_LANGUAGES = ["ind"]
_LICENSE = Licenses.AGPL_3_0.value
_LOCAL = False
_URLS = {
    _DATASETNAME: "https://raw.githubusercontent.com/ir-nlp-csui/saltik/main/saltik.json",
}
_SUPPORTED_TASKS = [Tasks.NON_WORD_ERROR_SPELLING_CORRECTION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class Saltik(datasets.GeneratorBasedBuilder):
    """Saltik: 58,532 non-word errors generated from 3,000 of the most popular Indonesian words."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            # Each example pairs a correct word with the list of
            # non-word errors (typos) generated from it.
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "word": datasets.Value("string"),
                    "errors": [
                        {
                            "typo": datasets.Value("string"),
                            "error_type": datasets.Value("string"),
                        }
                    ],
                }
            )
        else:
            raise NotImplementedError()

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download(urls)
        # The raw file is a single JSON object keyed by word; flatten it into
        # one record per word and rewrite it as JSON Lines for generation.
        data = self._read_jsonl(file_path)
        processed_data = [{"id": idx, "word": word, "errors": errors} for idx, (word, errors) in enumerate(data.items())]
        self._write_jsonl(file_path + ".jsonl", processed_data)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": file_path + ".jsonl",
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        if self.config.schema == "source":
            with jsonlines.open(filepath) as f:
                for i, each_data in enumerate(f.iter()):
                    ex = {
                        "id": each_data["id"],
                        "word": each_data["word"],
                        "errors": each_data["errors"],
                    }
                    yield i, ex

    def _read_jsonl(self, filepath: Path):
        """Load the downloaded file; despite the name, it is a single JSON object keyed by word."""
        with open(filepath) as user_file:
            parsed_json = json.load(user_file)
        return parsed_json

    def _write_jsonl(self, filepath, values):
        """Write the processed records to a JSON Lines file, one record per line."""
        with jsonlines.open(filepath, "w") as writer:
            for line in values:
                writer.write(line)
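
For reference, the raw saltik.json that _read_jsonl loads is one JSON object keyed by word, which _split_generators flattens into one JSONL record per word before generation. A minimal sketch of that flattening on invented data (the word, typos, and error types below are illustrative placeholders, not taken from the dataset):

import jsonlines

# Invented stand-in for the downloaded saltik.json (shape only; values made up).
raw = {
    "makan": [
        {"typo": "mkaan", "error_type": "transposition"},
        {"typo": "makann", "error_type": "insertion"},
    ],
}

# The same flattening _split_generators performs before writing the JSONL file.
records = [{"id": idx, "word": word, "errors": errors} for idx, (word, errors) in enumerate(raw.items())]

with jsonlines.open("saltik_example.jsonl", "w") as writer:
    for record in records:
        writer.write(record)
# saltik_example.jsonl now holds one line: {"id": 0, "word": "makan", "errors": [...]}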

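Once merged, the config can be exercised through the standard datasets script loader. A hedged usage sketch, assuming a local checkout of the SEACrowd repo as the working directory (the script path and the trust_remote_code flag are assumptions; the latter is only required by newer datasets releases):

import datasets

# Path assumes the repo root is the working directory; adjust to your layout.
saltik = datasets.load_dataset(
    "seacrowd/sea_datasets/saltik/saltik.py",
    name="saltik_source",
    trust_remote_code=True,
)

print(saltik["train"][0])
# -> {"id": "0", "word": ..., "errors": [{"typo": ..., "error_type": ...}, ...]}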