From 85ef91d4793292b4e61a589d9e6ea8f07f48a9ac Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 13 Nov 2023 23:23:13 +0800
Subject: [PATCH 1/3] Add dataset loader for ijelid

---
 seacrowd/sea_datasets/ijelid/__init__.py |   0
 seacrowd/sea_datasets/ijelid/ijelid.py   | 142 +++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 seacrowd/sea_datasets/ijelid/__init__.py
 create mode 100644 seacrowd/sea_datasets/ijelid/ijelid.py

diff --git a/seacrowd/sea_datasets/ijelid/__init__.py b/seacrowd/sea_datasets/ijelid/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/ijelid/ijelid.py b/seacrowd/sea_datasets/ijelid/ijelid.py
new file mode 100644
index 000000000..8e81bf46c
--- /dev/null
+++ b/seacrowd/sea_datasets/ijelid/ijelid.py
@@ -0,0 +1,142 @@
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@article{hidayatullah2023corpus,
+  title={Corpus creation and language identification for code-mixed Indonesian-Javanese-English Tweets},
+  author={Hidayatullah, Ahmad Fathan and Apong, Rosyzie Anna and Lai, Daphne TC and Qazi, Atika},
+  journal={PeerJ Computer Science},
+  volume={9},
+  pages={e1312},
+  year={2023},
+  publisher={PeerJ Inc.}
+}
+"""
+
+_LOCAL = False
+_LANGUAGES = ["ind", "jav", "eng"]
+_DATASETNAME = "ijelid"
+_DESCRIPTION = """\
+This is a code-mixed Indonesian-Javanese-English dataset for token-level
+language identification, named IJELID (Indonesian-Javanese-English Language
+Identification). The dataset contains tokenized tweets, where each token is
+paired with its language label. There are seven language labels in the
+dataset, namely: ID (Indonesian), JV (Javanese), EN (English), MIX_ID_EN
+(mixed Indonesian-English), MIX_ID_JV (mixed Indonesian-Javanese), MIX_JV_EN
+(mixed Javanese-English), and OTH (Other).
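+Each split is a TSV file with one token and its language label per line,
+separated by a tab (an illustrative line: "aku\tID"); blank lines delimit
+tweets.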
+"""
+
+_HOMEPAGE = "https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"
+_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
+_URLS = {
+    "train": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/train.tsv",
+    "dev": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/val.tsv",
+    "test": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/test.tsv",
+}
+
+_SUPPORTED_TASKS = [Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class IJELIDDataset(datasets.GeneratorBasedBuilder):
+    """IJELID dataset from https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = "seq_label"
+    LABEL_CLASSES = ["ID", "JV", "EN", "MIX_ID_EN", "MIX_ID_JV", "MIX_JV_EN", "OTH"]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        # There is no specific schema for the source, so for consistency
+        # we use the same schema as SEACrowd.
+        features = schemas.seq_label_features(self.LABEL_CLASSES)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        data_files = {
+            "train": Path(dl_manager.download_and_extract(_URLS["train"])),
+            "dev": Path(dl_manager.download_and_extract(_URLS["dev"])),
+            "test": Path(dl_manager.download_and_extract(_URLS["test"])),
+        }
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_files["train"], "split": "train"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_files["dev"], "split": "dev"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_files["test"], "split": "test"},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        """Yield examples as (key, example) tuples"""
+        with open(filepath, encoding="utf-8") as f:
+            guid = 0
+            tokens = []
+            labels = []
+            for line in f:
+                if line == "" or line == "\n":
+                    if tokens:
+                        yield guid, {
+                            "id": str(guid),
+                            "tokens": tokens,
+                            "labels": labels,
+                        }
+                        guid += 1
+                        tokens = []
+                        labels = []
+                else:
+                    # IJELID TSV files are tab-separated: token<TAB>label
+                    token, label = line.split("\t")
+                    tokens.append(token)
+                    labels.append(label.rstrip())
+
+            # Last example
+            if tokens:
+                yield guid, {
+                    "id": str(guid),
+                    "tokens": tokens,
+                    "labels": labels,
+                }

From b472a1c53531f71f8091de41226c94e0ef3d002e Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Tue, 14 Nov 2023 10:55:32 +0800
Subject: [PATCH 2/3] Add task for token-level language identification

---
 seacrowd/utils/constants.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py
index 388ac1c62..27182cb88 100644
--- a/seacrowd/utils/constants.py
+++ b/seacrowd/utils/constants.py
@@ -53,6 +53,7 @@ class Tasks(Enum):
     KEYWORD_TAGGING = "KT"
     NAMED_ENTITY_RECOGNITION = "NER"
     SENTENCE_ORDERING = "SO"
+    TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"
 
     # Pair Text Classification
     QUESTION_ANSWERING = "QA"
@@ -182,6 +183,7 @@ class Licenses(Enum):
     Tasks.POS_TAGGING: "SEQ_LABEL",
     Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
     Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
+    Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
     Tasks.QUESTION_ANSWERING: "QA",
     Tasks.TEXTUAL_ENTAILMENT: "PAIRS",
     Tasks.SEMANTIC_SIMILARITY: "PAIRS_SCORE",

From bccbdaf424a24440c3b15004369597988dc8ebb0 Mon Sep 17 00:00:00 2001
From: Salsabil Maulana Akbar
Date: Mon, 20 Nov 2023 09:59:23 +0700
Subject: [PATCH 3/3] Closes #28 | Add SEA Wiki loader (#38)

* Add SEA Wiki loader
* Update dset script and json to conform with unit-tests and configs defined
* Add exception on _info for unexpected schema name received
* Fix values in SEACrowd Schema post-review
---
 seacrowd/sea_datasets/sea_wiki/__init__.py    |   0
 .../sea_datasets/sea_wiki/lang_config.json    | 110 +++++++++
 seacrowd/sea_datasets/sea_wiki/sea_wiki.py    | 219 ++++++++++++++++++
 3 files changed, 329 insertions(+)
 create mode 100644 seacrowd/sea_datasets/sea_wiki/__init__.py
 create mode 100644 seacrowd/sea_datasets/sea_wiki/lang_config.json
 create mode 100644 seacrowd/sea_datasets/sea_wiki/sea_wiki.py

diff --git a/seacrowd/sea_datasets/sea_wiki/__init__.py b/seacrowd/sea_datasets/sea_wiki/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/sea_wiki/lang_config.json b/seacrowd/sea_datasets/sea_wiki/lang_config.json
new file mode 100644
index 000000000..b35740bc0
--- /dev/null
+++ b/seacrowd/sea_datasets/sea_wiki/lang_config.json
@@ -0,0 +1,110 @@
+{
+    "ace": {
+        "name": "Acehnese",
+        "source_subset": "ace"
+    },
+    "ban": {
+        "name": "Balinese",
+        "source_subset": "ban"
+    },
+    "bcl": {
+        "name": "Central Bicolano",
+        "source_subset": "bcl"
+    },
+    "bjn": {
+        "name": "Banjarese",
+        "source_subset": "bjn"
+    },
+    "bug": {
+        "name": "Buginese",
+        "source_subset": "bug"
+    },
+    "cbk": {
+        "name": "Chavacano",
+        "source_subset": "cbk-zam"
+    },
+    "gor": {
+        "name": "Gorontalo",
+        "source_subset": "gor"
+    },
+    "ilo": {
+        "name": "Ilokano",
+        "source_subset": "ilo"
+    },
+    "ind": {
+        "name": "Indonesian",
+        "source_subset": "id"
+    },
+    "jav": {
+        "name": "Javanese",
+        "source_subset": "jv"
+    },
+    "khm": {
+        "name": "Khmer",
+        "source_subset": "km"
+    },
+    "lao": {
+        "name": "Lao",
+        "source_subset": "lo"
+    },
+    "mad": {
+        "name": "Madurese",
+        "source_subset": "mad"
+    },
+    "map_bms": {
+        "name": "Banyumasan (Dialect of Javanese)",
+        "source_subset": "map-bms"
+    },
+    "min": {
+        "name": "Minangkabau",
+        "source_subset": "min"
+    },
+    "mnw": {
+        "name": "Mon",
+        "source_subset": "mnw"
+    },
+    "mya": {
+        "name": "Burmese",
+        "source_subset": "my"
+    },
+    "nia": {
+        "name": "Nias",
+        "source_subset": "nia"
+    },
+    "pag": {
+        "name": "Pangasinan",
+        "source_subset": "pag"
+    },
+    "pam": {
+        "name": "Kapampangan",
+        "source_subset": "pam"
+    },
+    "shn": {
+        "name": "Shan",
+        "source_subset": "shn"
+    },
+    "sun": {
+        "name": "Sundanese",
+        "source_subset": "su"
+    },
+    "tet": {
+        "name": "Tetum",
+        "source_subset": "tet"
+    },
+    "tgl": {
+        "name": "Tagalog",
+        "source_subset": "tl"
+    },
+    "tha": {
+        "name": "Thai",
+        "source_subset": "th"
+    },
+    "vie": {
+        "name": "Vietnamese",
+        "source_subset": "vi"
+    },
+    "war": {
+        "name": "Waray",
+        "source_subset": "war"
+    }
+}
diff --git a/seacrowd/sea_datasets/sea_wiki/sea_wiki.py b/seacrowd/sea_datasets/sea_wiki/sea_wiki.py
new file mode 100644
index 000000000..f763173e2
--- /dev/null
+++ b/seacrowd/sea_datasets/sea_wiki/sea_wiki.py
@@ -0,0 +1,219 @@
+"""
+SEA Crowd Data Loader for SEA Wiki.
+"""
+
+import json
+from itertools import product
+from typing import Dict, List, Tuple
+
+import datasets
+from datasets import load_dataset
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@ONLINE{wikidump,
+    author = "Wikimedia Foundation",
+    title = "Wikimedia Downloads",
+    url = "https://dumps.wikimedia.org"}
+@ONLINE{wikipedia-hf,
+    title = "Huggingface Wikipedia Dataset",
+    url = "https://huggingface.co/datasets/wikipedia"}
+@ONLINE{sea-wikipedia-hf,
+    title = "Huggingface SEA Wikipedia Dataset",
+    url = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"}
+"""
+
+logger = datasets.logging.get_logger(__name__)
+
+
+with open(DownloadManager().download_and_extract("seacrowd/sea_datasets/sea_wiki/lang_config.json"), "r") as f:
+    _LANG_CONFIG = json.load(f)
+
+_LOCAL = False
+_LANGUAGES = list(_LANG_CONFIG.keys())
+
+_DATASETNAME = "sea_wiki"
+_DESCRIPTION = """\
+    SEA Lang & Local Langs Wikipedia Archives, dumped from Wikipedia HF and processed with boilerplate removal.
+    This dataset consists of the URL of the referred Wikipedia article, its title, and its text data (article contents).
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"
+_LICENSE = Licenses.CC_BY_SA_4_0.value
+
+# this URL is not downloaded directly; the loader calls the `load_dataset` method on the HF dataset it points to
+_URL = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"
+
+_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING, Tasks.SUMMARIZATION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+CONFIG_SUFFIXES_FOR_TASK = ["ssp", "t2t"]
+
+
+def conform_init_config():
+    """Assertion Function for Instantiated Configs"""
+    if len(_LANGUAGES) == 0:
+        raise AssertionError("No languages detected from config!")
+    if len(CONFIG_SUFFIXES_FOR_TASK) != len(_SUPPORTED_TASKS):
+        raise AssertionError("Config suffixes don't match `_SUPPORTED_TASKS` in length!")
+    if len(CONFIG_SUFFIXES_FOR_TASK) == 0:
+        raise AssertionError("Config suffixes and `_SUPPORTED_TASKS` have a `len` of 0!")
+
+
+conform_init_config()
+
+# construct zipped args for config instantiation
+SCHEMA_PREFIX_AND_VERSION_PAIRS = list(zip(("source", "seacrowd"), (_SOURCE_VERSION, _SEACROWD_VERSION)))
+CONFIG_NAME_AND_TASKS_PAIRS = list(zip(CONFIG_SUFFIXES_FOR_TASK, _SUPPORTED_TASKS))
+
+
+def construct_configs(languages: list = None) -> List[SEACrowdConfig]:
+    """
+    The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided
+    languages or a default language, and returns the list.
+
+    input:
+        languages (list, default None): The `languages` parameter is a list that specifies the languages for which
+        the configurations need to be constructed. If no languages are provided (value=None), the first value in
+        the language config will be used.
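+        For illustration, `construct_configs(["tha", "vie"])` yields config names such as
+        `sea_wiki_source_tha_ssp` and `sea_wiki_seacrowd_vie_t2t`.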
+    output:
+        a list of `SEACrowdConfig` objects based on instantiated init variables
+    """
+    # set output var
+    config_list = []
+
+    # set default task for default config w/o task arg name (set to Tasks.SUMMARIZATION)
+    _DEFAULT_TASK_IDX = [idx for idx, val in enumerate(_SUPPORTED_TASKS) if val == Tasks.SUMMARIZATION]
+
+    # assert that `_DEFAULT_TASK_IDX` has a len of 1
+    if len(_DEFAULT_TASK_IDX) != 1:
+        raise AssertionError("Unexpected number of `_DEFAULT_TASK` items!")
+
+    _DEFAULT_CONFIG_SUFFIX, _DEFAULT_TASK = list(CONFIG_NAME_AND_TASKS_PAIRS)[_DEFAULT_TASK_IDX[0]]
+
+    # check the `languages` variable and create configs accordingly
+    if languages is None:
+        # default to the first entry of `_LANGUAGES` if no lang arg is received
+        _languages = _LANGUAGES[0]
+
+        config_list += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{config_name_prefix}",
+                version=datasets.Version(version),
+                description=f"{_DATASETNAME} {config_name_prefix} schema for default task arg ({_DEFAULT_TASK})",
+                schema=f"{config_name_prefix}_{_DEFAULT_CONFIG_SUFFIX}",
+                subset_id=_languages,
+            )
+            for (config_name_prefix, version) in SCHEMA_PREFIX_AND_VERSION_PAIRS
+        ]
+        config_list += [
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}_{config_name_prefix}_{config_name_suffix}",
+                version=datasets.Version(version),
+                description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name}",
+                schema=f"{config_name_prefix}_{config_name_suffix}",
+                subset_id=_languages,
+            )
+            for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
+        ]
+
+    # else, construct configs per language
+    else:
+        for _LANG in languages:
+            config_list += [
+                SEACrowdConfig(
+                    name=f"{_DATASETNAME}_{config_name_prefix}_{_LANG}_{config_name_suffix}",
+                    version=datasets.Version(version),
+                    description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
+                    schema=f"{config_name_prefix}_{config_name_suffix}",
+                    subset_id=_LANG,
+                )
+                for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
+            ]
+
+    return config_list
+
+
+class SEAWikiDataset(datasets.GeneratorBasedBuilder):
+    """SEA Wiki dataset from https://huggingface.co/datasets/sabilmakbar/sea_wiki"""
+
+    # get all schemas w/o lang arg + get all schemas w/ lang arg
+    BUILDER_CONFIGS = construct_configs() + construct_configs(_LANGUAGES)
+
+    def _info(self) -> datasets.DatasetInfo:
+        _config_schema_name = self.config.schema
+        logger.info(f"Received schema name: {self.config.schema}")
+        # self-supervised pretraining schema
+        if CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
+            if "source" in _config_schema_name:
+                features = datasets.Features({"url": datasets.Value("string"), "text": datasets.Value("string")})
+
+            elif "seacrowd" in _config_schema_name:
+                features = schemas.ssp_features
+
+            else:
+                raise ValueError(f"Unexpected schema received! {_config_schema_name}")
+
+        # summarization schema
+        elif CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
+            if "source" in _config_schema_name:
+                features = datasets.Features({"url": datasets.Value("string"), "title": datasets.Value("string"), "text": datasets.Value("string")})
+
+            elif "seacrowd" in _config_schema_name:
+                features = schemas.text2text_features
+
+            else:
+                raise ValueError(f"Unexpected schema received! {_config_schema_name}")
+
+        else:
+            raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        # the dl_manager arg is a placeholder, since this data loader wraps the HF `load_dataset`
+        # on the given _URL directly via `_load_hf_data_from_remote`
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN)]
+
+    def _load_hf_data_from_remote(self):
+        # construct the remote HF reference from the last two components of _URL split on "/"
+        _remote_hf_reference = "/".join(_URL.split("/")[-2:])
+        _lang_args = _LANG_CONFIG[self.config.subset_id]["source_subset"]
+        _split = "train"
+
+        logger.info(f"Loading dataset from remote HF {_remote_hf_reference} with seacrowd lang args of {self.config.subset_id} and source lang args of {_lang_args} and split args of {_split}")
+        _hf_dataset_source = load_dataset(_remote_hf_reference, lang=_lang_args, split=_split)
+
+        return _hf_dataset_source
+
+    def _generate_examples(self) -> Tuple[int, Dict]:
+        _config_schema_name = self.config.schema
+        loaded_data = self._load_hf_data_from_remote()
+
+        # iterate over datapoints and arrange the source HF dataset schema to match the config args:
+        for id_, _data in enumerate(loaded_data):
+            if "source" in _config_schema_name:
+                yield id_, {colname: _data[colname] for colname in self.info.features}
+
+            # for ssp schema
+            elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
+                yield id_, {"id": id_, "text": _data["text"]}
+
+            # for summarization schema
+            elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
+                yield id_, {"id": id_, "text_1": _data["text"], "text_2": _data["title"], "text_1_name": "document", "text_2_name": "title"}
+
+            else:
+                raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")
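
Reviewer note (not part of the patches): below is a minimal usage sketch of the
two loaders added here, assuming they are invoked through the standard
`datasets.load_dataset` entry point with the script paths introduced above. The
config names come from these patches; the local paths, the chosen language
("tha"), and the printed fields are illustrative only.

# Minimal sketch, assuming a checkout of the repo and network access to
# GitHub (IJELID TSV files) and the HF Hub (sabilmakbar/sea_wiki).
from datasets import load_dataset

# IJELID in the SEACrowd seq_label schema: per-token language labels.
ijelid = load_dataset(
    "seacrowd/sea_datasets/ijelid/ijelid.py",
    name="ijelid_seacrowd_seq_label",
)
print(ijelid["train"][0]["tokens"][:5], ijelid["train"][0]["labels"][:5])

# SEA Wiki for self-supervised pretraining, restricted to Thai ("tha").
sea_wiki = load_dataset(
    "seacrowd/sea_datasets/sea_wiki/sea_wiki.py",
    name="sea_wiki_seacrowd_tha_ssp",
)
print(sea_wiki["train"][0]["text"][:100])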