Skip to content

Commit

Permalink
Closes #28 | Add SEA Wiki loader (#38)
Browse files Browse the repository at this point in the history
* Add SEA Wiki loader

* Update dset script and json to conform with unit-tests and configs defined

* Add exception on _info for unexpected schema name received

* Fix values in SEACrowd Schema post-review
  • Loading branch information
sabilmakbar authored Nov 20, 2023
1 parent 68e34f7 commit bccbdaf
Show file tree
Hide file tree
Showing 3 changed files with 329 additions and 0 deletions.
Empty file.
110 changes: 110 additions & 0 deletions seacrowd/sea_datasets/sea_wiki/lang_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"ace": {
"name": "Acehnese",
"source_subset": "ace"
},
"ban": {
"name": "Balinese",
"source_subset": "ban"
},
"bcl": {
"name": "Central Bicolano",
"source_subset": "bcl"
},
"bjn": {
"name": "Banjarese",
"source_subset": "bjn"
},
"bug": {
"name": "Buginese",
"source_subset": "bug"
},
"cbk": {
"name": "Chavacano",
"source_subset": "cbk-zam"
},
"gor": {
"name": "Gorontalo",
"source_subset": "gor"
},
"ilo": {
"name": "Ilokano",
"source_subset": "ilo"
},
"ind": {
"name": "Indonesian",
"source_subset": "id"
},
"jav": {
"name": "Javanese",
"source_subset": "jv"
},
"khm": {
"name": "Khmer",
"source_subset": "km"
},
"lao": {
"name": "Lao",
"source_subset": "lo"
},
"mad": {
"name": "Madurese",
"source_subset": "mad"
},
"map_bms": {
"name": "Banyumasan (Dialect of Javanese)",
"source_subset": "map-bms"
},
"min": {
"name": "Minangkabau",
"source_subset": "min"
},
"mnw": {
"name": "Mon",
"source_subset": "min"
},
"mya": {
"name": "Burmese",
"source_subset": "my"
},
"nia": {
"name": "Nias",
"source_subset": "nia"
},
"pag": {
"name": "Pangasinan",
"source_subset": "pag"
},
"pam": {
"name": "Kapampangan",
"source_subset": "pam"
},
"shn": {
"name": "Shan",
"source_subset": "shn"
},
"sun": {
"name": "Sundanese",
"source_subset": "su"
},
"tet": {
"name": "Tetum",
"source_subset": "tet"
},
"tgl": {
"name": "Tagalog",
"source_subset": "tl"
},
"tha": {
"name": "Thai",
"source_subset": "th"
},
"vie": {
"name": "Vietnamese",
"source_subset": "vi"
},
"war": {
"name": "Waray",
"source_subset": "war"
}
}
219 changes: 219 additions & 0 deletions seacrowd/sea_datasets/sea_wiki/sea_wiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
"""
SEA Crowd Data Loader for SEA Wiki.
"""

import json
from itertools import product
from typing import Dict, List, Tuple

import datasets
from datasets import load_dataset
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """
@ONLINE{wikidump,
author = "Wikimedia Foundation",
title = "Wikimedia Downloads",
url = "https://dumps.wikimedia.org"}
@ONLINE{wikipedia-hf,
title = "Huggingface Wikipedia Dataset",
url = "https://huggingface.co/datasets/wikipedia"}
@ONLINE{wikipedia-hf,
title = "Huggingface SEA Wikipedia Dataset",
url = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"}
"""

logger = datasets.logging.get_logger(__name__)


with open(DownloadManager().download_and_extract("seacrowd/sea_datasets/sea_wiki/lang_config.json"), "r") as f:
_LANG_CONFIG = json.load(f)

_LOCAL = False
_LANGUAGES = list(_LANG_CONFIG.keys())

_DATASETNAME = "sea_wiki"
_DESCRIPTION = """\
SEA Lang & Local Langs Wikipedia Archives, dumped from WIkipedia HF and processed by boilerplate removal.
This dataset consists of URL of referred Wikipedia Article, its Title, and its Text Data (Article Contents).
"""

_HOMEPAGE = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"
_LICENSE = Licenses.CC_BY_SA_4_0.value

# url won't be used since it will implement load_dataset method on HF URL provided
_URL = "https://huggingface.co/datasets/sabilmakbar/sea_wiki"

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING, Tasks.SUMMARIZATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

CONFIG_SUFFIXES_FOR_TASK = ["ssp", "t2t"]


def conform_init_config():
"""Assertion Function for Instantiated Configs"""
if len(_LANGUAGES) == 0:
raise AssertionError("No Languages detected from config!")
if len(CONFIG_SUFFIXES_FOR_TASK) != len(_SUPPORTED_TASKS):
raise AssertionError("Config prefixes doesn't matched in terms of `len` with `_SUPPORTED_TASKS`!")
if len(CONFIG_SUFFIXES_FOR_TASK) == 0:
raise AssertionError("Config prefixes and `_SUPPORTED_TASKS` have `len` of 0!")


conform_init_config()

# construct zipped arg for config instantiation
SCHEMA_PREFIX_AND_VERSION_PAIRS = list(zip(("source", "seacrowd"), (_SOURCE_VERSION, _SEACROWD_VERSION)))
CONFIG_NAME_AND_TASKS_PAIRS = list(zip(CONFIG_SUFFIXES_FOR_TASK, _SUPPORTED_TASKS))


def construct_configs(languages: list = None) -> List[SEACrowdConfig]:
"""
The function `construct_configs` constructs a list of SEACrowdConfig objects based on the provided
languages or a default language, and returns the list.
input:
languages (list, default None): The `languages` parameter is a list that specifies the languages for which the
configurations need to be constructed. If no languages are provided (value=None), the first value in language config
will be used.
output:
a list of `SEACrowdConfig` objects based on instantiated init variables
"""
# set output var
config_list = []

# set default task for default config w/o task arg name (set to Tasks.SUMMARIZATION)
_DEFAULT_TASK_IDX = [idx for idx, val in enumerate(_SUPPORTED_TASKS) if val == Tasks.SUMMARIZATION]

# assert `_DEFAULT_TASK_IDX` to have len of 1
if len(_DEFAULT_TASK_IDX) != 1:
raise AssertionError("Unexpected `_DEFAULT_TASK` #item!")

_DEFAULT_CONFIG_SUFFIX, _DEFAULT_TASK = list(CONFIG_NAME_AND_TASKS_PAIRS)[_DEFAULT_TASK_IDX[0]]

# check `languages` variable and create config accordingly
if languages is None:
# set languages arg as list of first entry in `_LANGUAGES` if no lang arg received
_languages = _LANGUAGES[0]

config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{config_name_prefix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for default task arg ({_DEFAULT_TASK})",
schema=f"{config_name_prefix}_{_DEFAULT_CONFIG_SUFFIX}",
subset_id=_languages,
)
for (config_name_prefix, version) in SCHEMA_PREFIX_AND_VERSION_PAIRS
]
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{config_name_prefix}_{config_name_suffix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name}",
schema=f"{config_name_prefix}_{config_name_suffix}",
subset_id=_languages,
)
for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
]

# else, construct configs based on its lang
else:
for _LANG in languages:
config_list += [
SEACrowdConfig(
name=f"{_DATASETNAME}_{config_name_prefix}_{_LANG}_{config_name_suffix}",
version=datasets.Version(version),
description=f"{_DATASETNAME} {config_name_prefix} schema for {task_obj.name} and language code {_LANG}",
schema=f"{config_name_prefix}_{config_name_suffix}",
subset_id=_LANG,
)
for (config_name_prefix, version), (config_name_suffix, task_obj) in product(SCHEMA_PREFIX_AND_VERSION_PAIRS, CONFIG_NAME_AND_TASKS_PAIRS)
]

return config_list


class SEAWikiDataset(datasets.GeneratorBasedBuilder):
"""SEA Wiki dataset from https://huggingface.co/datasets/sabilmakbar/sea_wiki"""

# get all schema w/o lang arg + get all schema w/ lang arg
BUILDER_CONFIGS = construct_configs() + construct_configs(_LANGUAGES)

def _info(self) -> datasets.DatasetInfo:
_config_schema_name = self.config.schema
logger.info(f"Received schema name: {self.config.schema}")
# self supervised training schema
if CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
if "source" in _config_schema_name:
features = datasets.Features({"url": datasets.Value("string"), "text": datasets.Value("string")})

elif "seacrowd" in _config_schema_name:
features = schemas.ssp_features

else:
raise ValueError(f"Unexpected schema received! {_config_schema_name}")

# summarization schema
elif CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
if "source" in _config_schema_name:
features = datasets.Features({"url": datasets.Value("string"), "title": datasets.Value("string"), "text": datasets.Value("string")})

elif "seacrowd" in _config_schema_name:
features = schemas.text2text_features

else:
raise ValueError(f"Unexpected schema received! {_config_schema_name}")

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
# args of dl_manager is a placeholder since this data loader will wrap the hf `load_dataset` from given _URL
# directly using `_load_hf_data_from_remote`
return [datasets.SplitGenerator(name=datasets.Split.TRAIN)]

def _load_hf_data_from_remote(self):
# construct remote_hf_reference by the last 2 of string-spliited of "/"
_remote_hf_reference = "/".join(_URL.split("/")[-2:])
_lang_args = _LANG_CONFIG[self.config.subset_id]["source_subset"]
_split = "train"

logger.info(f"Loading dataset from remote HF {_remote_hf_reference} with seacrowd lang args of {self.config.subset_id} and source lang args of {_lang_args} and split args of {_split}")
_hf_dataset_source = load_dataset(_remote_hf_reference, lang=_lang_args, split=_split)

return _hf_dataset_source

def _generate_examples(self) -> Tuple[int, Dict]:

_config_schema_name = self.config.schema
loaded_data = self._load_hf_data_from_remote()

# iterate over datapoints and arrange hf dataset schema in source to match w/ config args:
for id_, _data in enumerate(loaded_data):
if "source" in _config_schema_name:
yield id_, {colname: _data[colname] for colname in self.info.features}

# for ssp schema
elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[0] in _config_schema_name:
yield id_, {"id": id_, "text": _data["text"]}

# for summary schema
elif "seacrowd" in _config_schema_name and CONFIG_SUFFIXES_FOR_TASK[1] in _config_schema_name:
yield id_, {"id": id_, "text_1": _data["text"], "text_2": _data["title"], "text_1_name": "document", "text_2_name": "title"}

else:
raise ValueError(f"Received unexpected config schema of {_config_schema_name}!")

0 comments on commit bccbdaf

Please sign in to comment.