From 85ef91d4793292b4e61a589d9e6ea8f07f48a9ac Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 13 Nov 2023 23:23:13 +0800
Subject: [PATCH 1/3] Add dataset loader for ijelid

---
 seacrowd/sea_datasets/ijelid/__init__.py |   0
 seacrowd/sea_datasets/ijelid/ijelid.py   | 142 +++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 seacrowd/sea_datasets/ijelid/__init__.py
 create mode 100644 seacrowd/sea_datasets/ijelid/ijelid.py

diff --git a/seacrowd/sea_datasets/ijelid/__init__.py b/seacrowd/sea_datasets/ijelid/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/ijelid/ijelid.py b/seacrowd/sea_datasets/ijelid/ijelid.py
new file mode 100644
index 000000000..8e81bf46c
--- /dev/null
+++ b/seacrowd/sea_datasets/ijelid/ijelid.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@article{hidayatullah2023corpus,
+  title={Corpus creation and language identification for code-mixed Indonesian-Javanese-English Tweets},
+  author={Hidayatullah, Ahmad Fathan and Apong, Rosyzie Anna and Lai, Daphne TC and Qazi, Atika},
+  journal={PeerJ Computer Science},
+  volume={9},
+  pages={e1312},
+  year={2023},
+  publisher={PeerJ Inc.}
+}
+"""
+
+_LOCAL = False
+_LANGUAGES = ["ind", "jav", "eng"]
+_DATASETNAME = "ijelid"
+_DESCRIPTION = """\
+This is a code-mixed Indonesian-Javanese-English dataset for token-level
+language identification, named IJELID (Indonesian-Javanese-English Language
+Identification). The dataset contains tokenized tweets, where each token is
+paired with its language label. There are seven language labels in the
+dataset, namely: ID (Indonesian), JV (Javanese), EN (English), MIX_ID_EN
+(mixed Indonesian-English), MIX_ID_JV (mixed Indonesian-Javanese), MIX_JV_EN
+(mixed Javanese-English), and OTH (Other).
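+Each split is a TSV file with one token and its language label per line,
+separated by a tab (an illustrative line: "aku\tID"); blank lines delimit
+tweets.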
+"""
+
+_HOMEPAGE = "https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"
+_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
+_URLS = {
+    "train": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/train.tsv",
+    "dev": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/val.tsv",
+    "test": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/test.tsv",
+}
+
+_SUPPORTED_TASKS = [Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class IJELIDDataset(datasets.GeneratorBasedBuilder):
+    """IJELID dataset from https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = "seq_label"
+    LABEL_CLASSES = ["ID", "JV", "EN", "MIX_ID_EN", "MIX_ID_JV", "MIX_JV_EN", "OTH"]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        # There is no specific schema for the source, so for consistency
+        # we use the same schema as SEACrowd.
+        features = schemas.seq_label_features(self.LABEL_CLASSES)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        data_files = {
+            "train": Path(dl_manager.download_and_extract(_URLS["train"])),
+            "dev": Path(dl_manager.download_and_extract(_URLS["dev"])),
+            "test": Path(dl_manager.download_and_extract(_URLS["test"])),
+        }
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_files["train"], "split": "train"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_files["dev"], "split": "dev"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_files["test"], "split": "test"},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        """Yield examples as (key, example) tuples"""
+        with open(filepath, encoding="utf-8") as f:
+            guid = 0
+            tokens = []
+            labels = []
+            for line in f:
+                if line == "" or line == "\n":
+                    if tokens:
+                        yield guid, {
+                            "id": str(guid),
+                            "tokens": tokens,
+                            "labels": labels,
+                        }
+                        guid += 1
+                        tokens = []
+                        labels = []
+                else:
+                    # IJELID TSV files are tab-separated: token<TAB>label
+                    token, label = line.split("\t")
+                    tokens.append(token)
+                    labels.append(label.rstrip())
+
+            # Last example
+            if tokens:
+                yield guid, {
+                    "id": str(guid),
+                    "tokens": tokens,
+                    "labels": labels,
+                }

From b472a1c53531f71f8091de41226c94e0ef3d002e Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Tue, 14 Nov 2023 10:55:32 +0800
Subject: [PATCH 2/3] Add task for token-level language identification

---
 seacrowd/utils/constants.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py
index 388ac1c62..27182cb88 100644
--- a/seacrowd/utils/constants.py
+++ b/seacrowd/utils/constants.py
@@ -53,6 +53,7 @@ class Tasks(Enum):
     KEYWORD_TAGGING = "KT"
     NAMED_ENTITY_RECOGNITION = "NER"
     SENTENCE_ORDERING = "SO"
+    TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"
 
     # Pair Text Classification
     QUESTION_ANSWERING = "QA"
@@ -182,6 +183,7 @@ class Licenses(Enum):
     Tasks.POS_TAGGING: "SEQ_LABEL",
     Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
     Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
+    Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
     Tasks.QUESTION_ANSWERING: "QA",
     Tasks.TEXTUAL_ENTAILMENT: "PAIRS",
     Tasks.SEMANTIC_SIMILARITY: "PAIRS_SCORE",

From bccbdaf424a24440c3b15004369597988dc8ebb0 Mon Sep 17 00:00:00 2001
From: Salsabil Maulana Akbar
Date: Mon, 20 Nov 2023 09:59:23 +0700
Subject: [PATCH 3/3] Closes #28 | Add SEA Wiki loader (#38)

* Add SEA Wiki loader
* Update dset script and json to conform with unit-tests and configs defined
* Add exception on _info for unexpected schema name received
* Fix values in SEACrowd Schema post-review
---
 seacrowd/sea_datasets/sea_wiki/__init__.py    |   0
 .../sea_datasets/sea_wiki/lang_config.json    | 110 +++++++++
 seacrowd/sea_datasets/sea_wiki/sea_wiki.py    | 219 ++++++++++++++++++
 3 files changed, 329 insertions(+)
 create mode 100644 seacrowd/sea_datasets/sea_wiki/__init__.py
 create mode 100644 seacrowd/sea_datasets/sea_wiki/lang_config.json
 create mode 100644 seacrowd/sea_datasets/sea_wiki/sea_wiki.py

diff --git a/seacrowd/sea_datasets/sea_wiki/__init__.py b/seacrowd/sea_datasets/sea_wiki/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/sea_wiki/lang_config.json b/seacrowd/sea_datasets/sea_wiki/lang_config.json
new file mode 100644
index 000000000..b35740bc0
--- /dev/null
+++ b/seacrowd/sea_datasets/sea_wiki/lang_config.json
@@ -0,0 +1,110 @@
+{
+    "ace": {
+        "name": "Acehnese",
+        "source_subset": "ace"
+    },
+    "ban": {
+        "name": "Balinese",
+        "source_subset": "ban"
+    },
+    "bcl": {
+        "name": "Central Bicolano",
+        "source_subset": "bcl"
+    },
+    "bjn": {
+        "name": "Banjarese",
+        "source_subset": "bjn"
+    },
+    "bug": {
+        "name": "Buginese",
+        "source_subset": "bug"
+    },
+    "cbk": {
+        "name": "Chavacano",
+        "source_subset": "cbk-zam"
+    },
+    "gor": {
+        "name": "Gorontalo",
+        "source_subset": "gor"
+    },
+    "ilo": {
+        "name": "Ilokano",
+        "source_subset": "ilo"
+    },
+    "ind": {
+        "name": "Indonesian",
+        "source_subset": "id"
+    },
+    "jav": {
+        "name": "Javanese",
+        "source_subset": "jv"
+    },
+    "khm": {
+        "name": "Khmer",
+        "source_subset": "km"
+    },
+    "lao": {
+        "name": "Lao",
+        "source_subset": "lo"
+    },
+    "mad": {
+        "name": "Madurese",
+        "source_subset": "mad"
+    },
+    "map_bms": {
+        "name": "Banyumasan (Dialect of Javanese)",
+        "source_subset": "map-bms"
+    },
+    "min": {
+        "name": "Minangkabau",
+        "source_subset": "min"
+    },
+    "mnw": {
+        "name": "Mon",
+        "source_subset": "mnw"
+    },
+    "mya": {
+        "name": "Burmese",
+        "source_subset": "my"
+    },
+    "nia": {
+        "name": "Nias",
+        "source_subset": "nia"
+    },
+    "pag": {
+        "name": "Pangasinan",
+        "source_subset": "pag"
+    },
+    "pam": {
+        "name": "Kapampangan",
+        "source_subset": "pam"
+    },
+    "shn": {
+        "name": "Shan",
+        "source_subset": "shn"
+    },
+    "sun": {
+        "name": "Sundanese",
+        "source_subset": "su"
+    },
+    "tet": {
+        "name": "Tetum",
+        "source_subset": "tet"
+    },
+    "tgl": {
+        "name": "Tagalog",
+        "source_subset": "tl"
+    },
+    "tha": {
+        "name": "Thai",
+        "source_subset": "th"
+    },
+    "vie": {
+        "name": "Vietnamese",
+        "source_subset": "vi"
+    },
+    "war": {
+        "name": "Waray",
+        "source_subset": "war"
+    }
+}
diff --git a/seacrowd/sea_datasets/sea_wiki/sea_wiki.py b/seacrowd/sea_datasets/sea_wiki/sea_wiki.py
new file mode 100644
index 000000000..f763173e2
--- /dev/null
+++ b/seacrowd/sea_datasets/sea_wiki/sea_wiki.py
@@ -0,0 +1,219 @@
+"""
+SEA Crowd Data Loader for SEA Wiki.
+"""
+
+import json
+from itertools import product
+from typing import Dict, List, Tuple
+
+import datasets
+from datasets import load_dataset
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@ONLINE{wikidump,
+    author = "Wikimedia Foundation",
+    title = "Wikimedia Downloads",
+    url = "https://dumps.wikimedia.org"}
+@ONLINE{wikipedia-hf,
+    title = "Huggingface Wikipedia Dataset",
+    url = "https://huggingface.co/datasets/wikipedia"}
+@ONLINE{sea-wikipedia-hf,
+    title = "Huggingface SEA Wikipedia Dataset",
+    url = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"}
+"""
+
+logger = datasets.logging.get_logger(__name__)
+
+
+with open(DownloadManager().download_and_extract("seacrowd/sea_datasets/sea_wiki/lang_config.json"), "r") as f:
+    _LANG_CONFIG = json.load(f)
+
+_LOCAL = False
+_LANGUAGES = list(_LANG_CONFIG.keys())
+
+_DATASETNAME = "sea_wiki"
+_DESCRIPTION = """\
+    SEA Lang & Local Langs Wikipedia Archives, dumped from Wikipedia HF and processed with boilerplate removal.
+    This dataset consists of the URL of the referred Wikipedia article, its title, and its text data (article contents).
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"
+_LICENSE = Licenses.CC_BY_SA_4_0.value
+
+# this URL is not downloaded directly; the loader calls the `load_dataset` method on the HF dataset it points to
+_URL = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"
+
+_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING, Tasks.SUMMARIZATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+CONFIG_SUFFIXES_FOR_TASK = ["ssp", "t2t"]
+
+
+def conform_init_config():
+    """Assertion Function for Instantiated Configs"""
+    if len(_LANGUAGES) == 0:
+        raise AssertionError("No languages detected from config!")
+    if len(CONFIG_SUFFIXES_FOR_TASK) != len(_SUPPORTED_TASKS):
+        raise AssertionError("Config suffixes don't match `_SUPPORTED_TASKS` in length!")
+    if len(CONFIG_SUFFIXES_FOR_TASK) == 0:
+        raise AssertionError("Config suffixes and `_SUPPORTED_TASKS` have a `len` of 0!")
+
+
+conform_init_config()
+
+# construct zipped args for config instantiation
+SCHEMA_PREFIX_AND_VERSION_PAIRS = list(zip(("source", "seacrowd"), (_SOURCE_VERSION, _SEACROWD_VERSION)))
+CONFIG_NAME_AND_TASKS_PAIRS = list(zip(CONFIG_SUFFIXES_FOR_TASK, _SUPPORTED_TASKS))
+
+
+def construct_configs(languages: list = None) -> List[SEACrowdConfig]:
+    """
+    The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided
+    languages or a default language, and returns the list.
+
+    input:
+        languages (list, default None): The `languages` parameter is a list that specifies the languages for which
+        the configurations need to be constructed. If no languages are provided (value=None), the first value in
+        the language config will be used.
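+        For illustration, `construct_configs(["tha", "vie"])` yields config names such as
+        `sea_wiki_source_tha_ssp` and `sea_wiki_seacrowd_vie_t2t`.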
+    output:
+        a list of `SEACrowdConfig` objects based on instantiated init variables
+    """
+    # set output var
+    config_list = []
+
+    # set default task for default config w/o task arg name (set to Tasks.SUMMARIZATION)
+    _DEFAULT_TASK_IDX = [idx for idx, val in enumerate(_SUPPORTED_TASKS) if val == Tasks.SUMMARIZATION]
+
+    # assert that `_DEFAULT_TASK_IDX` has a len of 1
+    if len(_DEFAULT_TASK_IDX) != 1:
+        raise AssertionError("Unexpected number of `_DEFAULT_TASK` items!")
+
+    _DEFAULT_CONFIG_SUFFIX, _DEFAULT_TASK = list(CONFIG_NAME_AND_TASKS_PAIRS)[_DEFAULT_TASK_IDX[0]]
+
+    # check the `languages` variable and create configs accordingly
+    if languages is None:
+        # default to the first entry of `_LANGUAGES` if no lang arg is received
+        _languages = _LANGUAGES[0]
+
+        config_list += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{config_name_prefix}",
+                version=datasets.Version(version),
+                description=f"{_DATASETNAME} {config_name_prefix} schema for default task arg ({_DEFAULT_TASK})",
+                schema=f"{config_name_prefix}_{_DEFAULT_CONFIG_SUFFIX}",
+                subset_id=_languages,
+            )
+            for (config_name_prefix, version) in SCHEMA_PREFIX_AND_VERSION_PAIRS
+        ]
+        config_list += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{config_name_prefix}_{config_name_suffix}",
+                version=datasets.Version(version),
+                description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name}",
+                schema=f"{config_name_prefix}_{config_name_suffix}",
+                subset_id=_languages,
+            )
+            for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
+        ]
+
+    # else, construct configs per language
+    else:
+        for _LANG in languages:
+            config_list += [
+                SEACrowdConfig(
+                    name=f"{_DATASETNAME}_{config_name_prefix}_{_LANG}_{config_name_suffix}",
+                    version=datasets.Version(version),
+                    description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
+                    schema=f"{config_name_prefix}_{config_name_suffix}",
+                    subset_id=_LANG,
+                )
+                for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
+            ]
+
+    return config_list
+
+
+class SEAWikiDataset(datasets.GeneratorBasedBuilder):
+    """SEA Wiki dataset from https://huggingface.co/datasets/sabilmakbar/sea_wiki"""
+
+    # get all schemas w/o lang arg + get all schemas w/ lang arg
+    BUILDER_CONFIGS = construct_configs() + construct_configs(_LANGUAGES)
+
+    def _info(self) -> datasets.DatasetInfo:
+        _config_schema_name = self.config.schema
+        logger.info(f"Received schema name: {self.config.schema}")
+        # self-supervised pretraining schema
+        if CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
+            if "source" in _config_schema_name:
+                features = datasets.Features({"url": datasets.Value("string"), "text": datasets.Value("string")})
+
+            elif "seacrowd" in _config_schema_name:
+                features = schemas.ssp_features
+
+            else:
+                raise ValueError(f"Unexpected schema received! {_config_schema_name}")
+
+        # summarization schema
+        elif CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
+            if "source" in _config_schema_name:
+                features = datasets.Features({"url": datasets.Value("string"), "title": datasets.Value("string"), "text": datasets.Value("string")})
+
+            elif "seacrowd" in _config_schema_name:
+                features = schemas.text2text_features
+
+            else:
+                raise ValueError(f"Unexpected schema received! {_config_schema_name}")
+
+        else:
+            raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        # the dl_manager arg is a placeholder, since this data loader wraps the HF `load_dataset`
+        # on the given _URL directly via `_load_hf_data_from_remote`
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN)]
+
+    def _load_hf_data_from_remote(self):
+        # construct the remote HF reference from the last two components of _URL split on "/"
+        _remote_hf_reference = "/".join(_URL.split("/")[-2:])
+        _lang_args = _LANG_CONFIG[self.config.subset_id]["source_subset"]
+        _split = "train"
+
+        logger.info(f"Loading dataset from remote HF {_remote_hf_reference} with seacrowd lang args of {self.config.subset_id} and source lang args of {_lang_args} and split args of {_split}")
+        _hf_dataset_source = load_dataset(_remote_hf_reference, lang=_lang_args, split=_split)
+
+        return _hf_dataset_source
+
+    def _generate_examples(self) -> Tuple[int, Dict]:
+        _config_schema_name = self.config.schema
+        loaded_data = self._load_hf_data_from_remote()
+
+        # iterate over datapoints and arrange the source HF dataset schema to match the config args:
+        for id_, _data in enumerate(loaded_data):
+            if "source" in _config_schema_name:
+                yield id_, {colname: _data[colname] for colname in self.info.features}
+
+            # for ssp schema
+            elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
+                yield id_, {"id": id_, "text": _data["text"]}
+
+            # for summarization schema
+            elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
+                yield id_, {"id": id_, "text_1": _data["text"], "text_2": _data["title"], "text_1_name": "document", "text_2_name": "title"}
+
+            else:
+                raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")
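
Reviewer note (not part of the patches): below is a minimal usage sketch of the
two loaders added here, assuming they are invoked through the standard
`datasets.load_dataset` entry point with the script paths introduced above. The
config names come from these patches; the local paths, the chosen language
("tha"), and the printed fields are illustrative only.

# Minimal sketch, assuming a checkout of the repo and network access to
# GitHub (IJELID TSV files) and the HF Hub (sabilmakbar/sea_wiki).
from datasets import load_dataset

# IJELID in the SEACrowd seq_label schema: per-token language labels.
ijelid = load_dataset(
    "seacrowd/sea_datasets/ijelid/ijelid.py",
    name="ijelid_seacrowd_seq_label",
)
print(ijelid["train"][0]["tokens"][:5], ijelid["train"][0]["labels"][:5])

# SEA Wiki for self-supervised pretraining, restricted to Thai ("tha").
sea_wiki = load_dataset(
    "seacrowd/sea_datasets/sea_wiki/sea_wiki.py",
    name="sea_wiki_seacrowd_tha_ssp",
)
print(sea_wiki["train"][0]["text"][:100])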