Skip to content

Commit

Permalink
Resolve conflict and reflect changes in
Browse files Browse the repository at this point in the history
  • Loading branch information
sabilmakbar committed Nov 20, 2023
2 parents 38d6785 + bccbdaf commit 4202870
Show file tree
Hide file tree
Showing 6 changed files with 473 additions and 0 deletions.
Empty file.
142 changes: 142 additions & 0 deletions seacrowd/sea_datasets/ijelid/ijelid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """
@article{hidayatullah2023corpus,
title={Corpus creation and language identification for code-mixed Indonesian-Javanese-English Tweets},
author={Hidayatullah, Ahmad Fathan and Apong, Rosyzie Anna and Lai, Daphne TC and Qazi, Atika},
journal={PeerJ Computer Science},
volume={9},
pages={e1312},
year={2023},
publisher={PeerJ Inc.}
}
"""

# Data is fetched from public raw-GitHub URLs, not from a local copy.
_LOCAL = False
# ISO 639-3 codes for the languages covered: Indonesian, Javanese, English.
_LANGUAGES = ["ind", "jav", "eng"]
_DATASETNAME = "ijelid"
_DESCRIPTION = """\
This is a code-mixed Indonesian-Javanese-English dataset for token-level
language identification. We named this dataset as IJELID
(Indonesian-Javanese-English Language Identification). This dataset contains
tweets that have been tokenized with the corresponding token and its language
label. There are seven language labels in the dataset, namely: ID (Indonesian)JV
(Javanese), EN (English), MIX_ID_EN (mixed Indonesian-English), MIX_ID_JV (mixed
Indonesian-Javanese), MIX_JV_EN (mixed Javanese-English), OTH (Other).
"""

_HOMEPAGE = "https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"
# Distributed under CC BY-NC-SA 4.0.
_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
# One TSV per split; note the upstream file for the "dev" split is named val.tsv.
_URLS = {
    "train": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/train.tsv",
    "dev": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/val.tsv",
    "test": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/test.tsv",
}

_SUPPORTED_TASKS = [Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION]
# Versions of the upstream data and of this SEACrowd dataloader, respectively.
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class IJELIDDataset(datasets.GeneratorBasedBuilder):
    """IJELID dataset from https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data

    Token-level language identification over code-mixed
    Indonesian-Javanese-English tweets. The data is CoNLL-style TSV:
    one "token<TAB>label" pair per line, with a blank line separating tweets.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "seq_label"
    # The seven token-level labels that appear in the TSV files.
    LABEL_CLASSES = ["ID", "JV", "EN", "MIX_ID_EN", "MIX_ID_JV", "MIX_JV_EN", "OTH"]

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=_DATASETNAME,
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return dataset metadata.

        The source data has no native schema of its own, so for
        consistency both the source and the SEACrowd configs share the
        same seq_label feature layout.
        """
        features = schemas.seq_label_features(self.LABEL_CLASSES)

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the three TSV files and map them onto train/dev/test splits."""
        data_files = {split: Path(dl_manager.download_and_extract(url)) for split, url in _URLS.items()}

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_files["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": data_files["dev"], "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": data_files["test"], "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield (key, example) tuples from one CoNLL-style TSV file.

        Args:
            filepath: Path to the downloaded train/val/test TSV file.
            split: Split name; unused here, kept for the datasets API.
        """
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []
            labels = []
            for line in f:
                # A whitespace-only line ends the current tweet. Testing
                # `not line.strip()` (rather than `line == "\n"`) also copes
                # with CRLF line endings, where a blank line reads as "\r\n"
                # and the old check fell through to a crashing tab-split.
                if not line.strip():
                    if tokens:
                        yield guid, {
                            "id": str(guid),
                            "tokens": tokens,
                            "labels": labels,
                        }
                        guid += 1
                        tokens = []
                        labels = []
                else:
                    # Each data line is "token<TAB>label"; any other field
                    # count is malformed input and should fail loudly here.
                    token, label = line.split("\t")
                    tokens.append(token)
                    labels.append(label.rstrip())

            # Flush the last example when the file lacks a trailing blank line.
            if tokens:
                yield guid, {
                    "id": str(guid),
                    "tokens": tokens,
                    "labels": labels,
                }
Empty file.
110 changes: 110 additions & 0 deletions seacrowd/sea_datasets/sea_wiki/lang_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"ace": {
"name": "Acehnese",
"source_subset": "ace"
},
"ban": {
"name": "Balinese",
"source_subset": "ban"
},
"bcl": {
"name": "Central Bicolano",
"source_subset": "bcl"
},
"bjn": {
"name": "Banjarese",
"source_subset": "bjn"
},
"bug": {
"name": "Buginese",
"source_subset": "bug"
},
"cbk": {
"name": "Chavacano",
"source_subset": "cbk-zam"
},
"gor": {
"name": "Gorontalo",
"source_subset": "gor"
},
"ilo": {
"name": "Ilokano",
"source_subset": "ilo"
},
"ind": {
"name": "Indonesian",
"source_subset": "id"
},
"jav": {
"name": "Javanese",
"source_subset": "jv"
},
"khm": {
"name": "Khmer",
"source_subset": "km"
},
"lao": {
"name": "Lao",
"source_subset": "lo"
},
"mad": {
"name": "Madurese",
"source_subset": "mad"
},
"map_bms": {
"name": "Banyumasan (Dialect of Javanese)",
"source_subset": "map-bms"
},
"min": {
"name": "Minangkabau",
"source_subset": "min"
},
"mnw": {
"name": "Mon",
"source_subset": "min"
},
"mya": {
"name": "Burmese",
"source_subset": "my"
},
"nia": {
"name": "Nias",
"source_subset": "nia"
},
"pag": {
"name": "Pangasinan",
"source_subset": "pag"
},
"pam": {
"name": "Kapampangan",
"source_subset": "pam"
},
"shn": {
"name": "Shan",
"source_subset": "shn"
},
"sun": {
"name": "Sundanese",
"source_subset": "su"
},
"tet": {
"name": "Tetum",
"source_subset": "tet"
},
"tgl": {
"name": "Tagalog",
"source_subset": "tl"
},
"tha": {
"name": "Thai",
"source_subset": "th"
},
"vie": {
"name": "Vietnamese",
"source_subset": "vi"
},
"war": {
"name": "Waray",
"source_subset": "war"
}
}
Loading

0 comments on commit 4202870

Please sign in to comment.