Skip to content

Commit

Permalink
Add config helper (WIP) and missed constants in existing dataloaders
Browse files Browse the repository at this point in the history
  • Loading branch information
holylovenia committed Apr 4, 2024
1 parent 19b7793 commit b23bec1
Show file tree
Hide file tree
Showing 20 changed files with 1,080 additions and 9 deletions.
1,038 changes: 1,038 additions & 0 deletions seacrowd/config_helper.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/belebele/belebele.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@

_DEFAULT_LANG = "zsm"

_LOCAL = False

def config_constructor(belebele_subset: str, schema: str, version: str) -> SEACrowdConfig:
lang = _LANGUAGES[_SOURCE_NAMES.index(belebele_subset)]
return SEACrowdConfig(
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
"mkn": "kupang-malay",
}

_LOCAL = False


class BhinnekaKorpusDataset(datasets.GeneratorBasedBuilder):
"""A Collection of Multilingual Parallel Datasets for 5 Indonesian Local Languages."""
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class BurmeseRomanizeDataset(datasets.GeneratorBasedBuilder):
"""Romanization of names in Burmese script"""
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/coco_35l/coco_35l.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@

_LANGUAGES = {"fil": "fil", "ind": "id", "tha": "th", "vie": "vi"}

_LOCAL = False

class Coco35LDataset(datasets.GeneratorBasedBuilder):
"""
COCO-35L is a machine-generated image caption dataset, constructed by translating COCO Captions (Chen et al., 2015) to the other 34 languages using Google’s machine translation API.
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class DengueFilipinoDataset(datasets.GeneratorBasedBuilder):
"""Dengue Dataset Low-Resource Multi-label Text Classification Dataset in Filipino"""
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/id_msvd/id_msvd.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class IdMsvdDataset(datasets.GeneratorBasedBuilder):
"""MSVD dataset with Indonesian translation."""
Expand Down
2 changes: 1 addition & 1 deletion seacrowd/sea_datasets/indommlu/indommlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ class IndoMMLUDataset(datasets.GeneratorBasedBuilder):
name=f"{_DATASETNAME}_{lang}_seacrowd_qa",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} {lang} SEACrowd schema",
- schema=f"seacrowd_{lang}_qa",
+ schema=f"seacrowd_qa",
subset_id=_DATASETNAME,
)
BUILDER_CONFIGS.append(lang_config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
}
"""

_LANGUAGES = ["ind"]

_DATASETNAME = "indonesian_news_dataset"

_DESCRIPTION = """An imbalanced dataset to classify Indonesian News articles.
Expand All @@ -44,6 +46,8 @@

_TAGS = ["bola", "news", "bisnis", "tekno", "otomotif"]

_LOCAL = False


class IndonesianNewsDataset(datasets.GeneratorBasedBuilder):
"""The dataset contains 5 Indonesian News articles with imbalanced classes"""
Expand Down
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/mc4_indo/mc4_indo.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
# "full": {"train": 1, "validation": 1}
# }

_LOCAL = False

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SOURCE_VERSION = "1.0.0"
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/memolon/memolon.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,16 @@
}

_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LANGUAGES = ["ceb", "tgl", "ind", "sun", "jav", "zsm", "vie", "tha", "mya"]

_LANGUAGE_MAP = {"ceb": "Cebuano", "tgl": "Tagalog", "ind": "Indonesian", "sun": "Sundanese", "jav": "Javanese", "zsm": "Malay", "vie": "Vietnamese", "tha": "Thai", "mya": "Burmese"}

_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION]

_LOCAL = False


def seacrowd_config_constructor(lang: str, schema: str, version: str) -> SEACrowdConfig:
if lang not in _LANGUAGE_MAP:
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/miracl/miracl.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,11 @@

_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


def load_topic(fn):

qid2topic = {}
with open(fn, encoding="utf-8") as f:
for line in f:
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/mlqa/mlqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class MLQADataset(datasets.GeneratorBasedBuilder):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder):
version=datasets.Version(_SOURCE_VERSION),
description=f"{_DATASETNAME} source schema for {subset} subset",
schema="source",
- subset_id=subset,
+ subset_id=f"{_DATASETNAME}_{subset}",
)
for subset in SUBSETS
] + [
Expand All @@ -76,7 +76,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder):
version=datasets.Version(_SEACROWD_VERSION),
description=f"{_DATASETNAME} SEACrowd schema for {subset} subset",
schema="seacrowd_text",
- subset_id=subset,
+ subset_id=f"{_DATASETNAME}_{subset}",
)
for subset in SUBSETS
]
Expand Down
12 changes: 6 additions & 6 deletions seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,42 +74,42 @@ class MyParaphraseDataset(datasets.GeneratorBasedBuilder):
name=f"{_DATASETNAME}_source", # source
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
- schema="paraphrase_source",
+ schema="source",
subset_id=f"{_DATASETNAME}_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
- schema=f"seacrowd_paraphrase_{SEACROWD_SCHEMA_NAME}",
+ schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_non_paraphrase_source", # source
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
- schema="non_paraphrase_source",
+ schema="source",
subset_id=f"{_DATASETNAME}_non_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_non_paraphrase_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
- schema=f"seacrowd_non_paraphrase_{SEACROWD_SCHEMA_NAME}",
+ schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_non_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_all_source", # source
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
- schema="all_source",
+ schema="source",
subset_id=f"{_DATASETNAME}_all",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_all_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
- schema=f"seacrowd_all_{SEACROWD_SCHEMA_NAME}",
+ schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_all",
),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@

_SUPPORTED_TASKS = [Tasks.FACT_CHECKING]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class PhilippineFakeNewsDataset(datasets.GeneratorBasedBuilder):
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/sap_wat/sap_wat.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@

_SUBSET = ["id", "ms", "th", "vi"]

_LOCAL = False

class SapWatDataset(datasets.GeneratorBasedBuilder):
"""SAP WAT is a software documentation dataset for machine translation. The current language scope is English to Hindi,
Indonesian, Japanese, Korean, Malay, Thai, Vietnamese, Simplified Chinese and Traditional Chinese. Here, we only consider
Expand Down
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/tydiqa/tydiqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
_LANGUAGES = ["ind", "tha"]
_LOCAL = False
_SOURCE_VERSION = "1.0.0"
_SOURCE_VERSION_P = "1.0.0"
_SOURCE_VERSION_S = "1.1.0"
_SEACROWD_VERSION = "1.0.0"
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@

_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


def construct_label_classes():
IOB_tag = ["I", "O", "B"]
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/xm3600/xm3600.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@

_LANGUAGES = ["fil", "id", "th", "vi"]

_LOCAL = False


class XM3600Dataset(datasets.GeneratorBasedBuilder):
"""
Expand Down

0 comments on commit b23bec1

Please sign in to comment.