Closes SEACrowd#264 | Create dataset loader for mySentence SEACrowd#264…

… (SEACrowd#291) * add mysentences dataloader * align the config name to subset_id * update mysentence config * Update mysentence.py * remove comment line * Update mysentence.py * Update mysentence config * Update mysentence.py * Update seacrowd/sea_datasets/mysentence/mysentence.py Fix the subset_id case-checking for data download
raileymontalan · Feb 27, 2024 · b0d16b1 · b0d16b1
1 parent 2f44a2b
commit b0d16b1
Show file tree

Hide file tree

Showing 2 changed files with 170 additions and 0 deletions.
diff --git a/seacrowd/sea_datasets/mysentence/__init__.py b/seacrowd/sea_datasets/mysentence/__init__.py
diff --git a/seacrowd/sea_datasets/mysentence/mysentence.py b/seacrowd/sea_datasets/mysentence/mysentence.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{Aung_Kyaw Thu_Hlaing_2023, place={Nonthaburi, Thailand}, title={mySentence: Sentence Segmentation for Myanmar Language
+using Neural Machine Translation Approach}, volume={9}, url={https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87},
+number={October}, 
+abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
+Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
+ically experimented with twelve neural sequence
+labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
+while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
+journal={Journal of Intelligent Informatics
+and Smart Technology}, author={Aung, Thura and Kyaw Thu , Ye and Hlaing , Zar Zar}, year={2023}, month={Nov.}, pages={e001} };
+
+@InProceedings{10.1007/978-3-031-36886-8_24,
+author="Thu, Ye Kyaw
+and Aung, Thura
+and Supnithi, Thepchai",
+editor="Nguyen, Ngoc Thanh
+and Le-Minh, Hoa
+and Huynh, Cong-Phap
+and Nguyen, Quang-Vu",
+title="Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language",
+booktitle="The 12th Conference on Information Technology and Its Applications",
+year="2023",
+publisher="Springer Nature Switzerland",
+address="Cham",
+pages="285--296",
+abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
+Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
+ically experimented with twelve neural sequence
+labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
+while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
+isbn="978-3-031-36886-8"
+}
+
+"""
+
+_DATASETNAME = "mysentence"
+_DESCRIPTION = """\
+mySentence is a corpus with a total size of around 55K for Myanmar sentence segmentation. In formal Burmese (Myanmar language), sentences are grammatically structured
+and typically end with the "။" pote-ma symbol. However, informal language, more commonly used in daily conversations due to its natural flow, does not always follow predefined
+rules for ending sentences, making it challenging for machines to identify sentence boundaries. In this corpus, each token of the sentences and paragraphs is tagged from start to finish.
+"""
+
+_HOMEPAGE = "https://github.com/ye-kyaw-thu/mySentence"
+_LANGUAGES = ["mya"]
+_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
+_LOCAL = False
+_URLS = {
+    "sent": {
+        "train": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/train.tagged",
+        "valid": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/valid.tagged",
+        "test": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent/sent_tagged/test.tagged",
+    },
+    "sent+para": {
+        "train": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/train.tagged",
+        "valid": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/valid.tagged",
+        "test": "https://raw.githubusercontent.com/ye-kyaw-thu/mySentence/main/ver1.0/data/data-sent+para/sent+para_tagged/test.tagged",
+    },
+}
+
+_SUPPORTED_TASKS = [Tasks.POS_TAGGING]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class MysentenceDataset(datasets.GeneratorBasedBuilder):
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=_DESCRIPTION,
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_seq_label",
+            version=SEACROWD_VERSION,
+            description="sentences SEACrowd schema",
+            schema="seacrowd_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_and_paragraphs_source",
+            version=SOURCE_VERSION,
+            description="sentences para source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}_and_paragraphs",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_and_paragraphs_seacrowd_seq_label",
+            version=SEACROWD_VERSION,
+            description="sentence para SEACrowd schema",
+            schema="seacrowd_seq_label",
+            subset_id=f"{_DATASETNAME}_and_paragraphs",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "tokens": datasets.Sequence(datasets.Value("string")),
+                    "labels": datasets.Sequence(datasets.Value("string")),
+                }
+            )
+        else:
+            features = schemas.seq_label_features(["B", "O", "N", "E"])
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,  # B (Begin), O (Other), N (Next), and E (End)
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        if self.config.subset_id == f"{_DATASETNAME}":
+            DATA_URL_ = _URLS["sent"]
+        elif self.config.subset_id == f"{_DATASETNAME}_and_paragraphs":
+            DATA_URL_ = _URLS["sent+para"]
+        else:
+            raise ValueError(f"No related dataset id for {self.config.subset_id}")
+
+        data_dir = dl_manager.download_and_extract(DATA_URL_)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_dir["train"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_dir["test"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data_dir["valid"],
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+
+        with open(filepath, "r") as filein:
+            examples = [line.strip("\n").split(" ") for line in filein.readlines()]
+            for eid, exam in enumerate(examples):
+                tokens = []
+                pos = []
+                for tok_chunk in exam:
+                    tok_ = tok_chunk.split("/")
+                    tokens.append(tok_[0])
+                    pos.append(tok_[1])
+                yield eid, {"id": str(eid), "tokens": tokens, "labels": pos}