Add config helper (WIP) and missed constants in existing dataloaders #605

Merged · 30 commits · Jun 19, 2024
The file changes shown below are from 2 of the 30 commits.

Commits (30)
b23bec1
Add config helper (WIP) and missed constants in existing dataloaders
holylovenia Apr 4, 2024
a3911c2
Clarify method names
holylovenia Apr 4, 2024
5858e56
Fix some error-triggering parts
holylovenia Apr 12, 2024
ad8bb9b
Modify package setup
holylovenia Apr 12, 2024
0ecc98d
Fix bug
holylovenia Apr 17, 2024
3f8c571
Add the latest version
holylovenia Apr 21, 2024
648bfcc
Merge branch 'master' of https://github.com/SEACrowd/seacrowd-datahub…
holylovenia Apr 30, 2024
a5541fa
Remove task for source-only dataset
holylovenia Apr 30, 2024
6468361
Change train to test split
holylovenia Apr 30, 2024
a667cc3
Include all languages in the data
holylovenia Apr 30, 2024
1e7026d
Change train to test
holylovenia Apr 30, 2024
1467309
Change train to test
holylovenia Apr 30, 2024
700c177
Remove numbering from options and answer
holylovenia Apr 30, 2024
7d0ab71
Add the newest version
holylovenia Apr 30, 2024
f8e1212
remove main entry of module for dataloaders (#662)
sabilmakbar May 2, 2024
9bd7bf6
Fix tgl --> fil for AYA dataset
holylovenia May 8, 2024
c1ec65d
Change train -> test and add eng as the MT lang pair
holylovenia May 10, 2024
ef89194
Change incorrect name
holylovenia May 11, 2024
970afbc
Change incorrect name
holylovenia May 11, 2024
636ebfa
Change subset id to '*_{lang}_eng_*' or '*_eng_{lang}_*'
holylovenia May 12, 2024
4902542
Merge branch 'master' of https://github.com/SEACrowd/seacrowd-datahub…
holylovenia May 12, 2024
b3ad89c
Fix paracotta_id's download issue
holylovenia May 12, 2024
83901a6
Merge branch 'master' of https://github.com/SEACrowd/seacrowd-datahub…
holylovenia May 13, 2024
09051b2
Normalize subset names and enable eng_{lang} pairings
holylovenia May 13, 2024
dba6e62
Merge branch 'master' of https://github.com/SEACrowd/seacrowd-datahub…
holylovenia Jun 19, 2024
0be9162
Fix load_* methods
holylovenia Jun 19, 2024
0e590b6
Fix available_* methods
holylovenia Jun 19, 2024
2e7509b
Change _SEACROWD_VERSION to reflect the date of last update
holylovenia Jun 19, 2024
4ac5ffa
Add SEACrowd benchmark config list
holylovenia Jun 19, 2024
9db6d22
Update seacrowd to 0.1.0
holylovenia Jun 19, 2024
Files changed
1,038 changes: 1,038 additions & 0 deletions seacrowd/config_helper.py

Large diffs are not rendered by default.
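The 1,038-line config_helper.py itself is not rendered in this view. Going only by the commit messages above ("Fix load_* methods", "Fix available_* methods", "Add SEACrowd benchmark config list"), a minimal sketch of the kind of interface such a helper exposes might look like the following Python; every name and signature here is an illustrative assumption, not the contents of the actual file.

# Hypothetical sketch only -- the real seacrowd/config_helper.py is not shown
# in this diff. Function names are guesses based on the "available_*" and
# "load_*" commit messages.
from typing import List

import datasets  # SEACrowd dataloaders are HuggingFace `datasets` builder scripts


def available_dataset_names() -> List[str]:
    # A real helper would discover these by scanning seacrowd/sea_datasets/
    # for modules that define _DATASETNAME; this list is illustrative.
    return ["belebele", "mlqa", "tydiqa"]


def load_dataset(dataset_name: str, config_name: str) -> datasets.DatasetDict:
    # Assumed repository layout: seacrowd/sea_datasets/<name>/<name>.py
    script = f"seacrowd/sea_datasets/{dataset_name}/{dataset_name}.py"
    return datasets.load_dataset(script, name=config_name)

The sketch only illustrates the name-to-script mapping; the real helper presumably also enumerates the per-dataset SEACrowdConfig entries and the new benchmark config list.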

2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/belebele/belebele.py
@@ -82,6 +82,8 @@

_DEFAULT_LANG = "zsm"

_LOCAL = False

def config_constructor(belebele_subset: str, schema: str, version: str) -> SEACrowdConfig:
lang = _LANGUAGES[_SOURCE_NAMES.index(belebele_subset)]
return SEACrowdConfig(
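Most of the dataloader edits in this PR add the same missed module-level constants, chiefly _LOCAL = False (the data can be downloaded rather than having to be supplied locally) and, where absent, _LANGUAGES and _SEACROWD_VERSION. For reference, a typical SEACrowd dataloader header declares constants along these lines; the values below are placeholders, not taken from any particular file.

# Illustrative module header for a SEACrowd dataloader; the names mirror the
# constants touched in this PR, the values are placeholders.
_DATASETNAME = "example_dataset"   # also used as the prefix of the config names
_LANGUAGES = ["ind", "tha"]        # language codes covered by the dataset
_LOCAL = False                     # True only if users must provide the data locally
_SOURCE_VERSION = "1.0.0"          # version of the upstream release
_SEACROWD_VERSION = "1.0.0"        # version of the SEACrowd schema mapping
_SUPPORTED_TASKS = []              # e.g. [Tasks.QUESTION_ANSWERING]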
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py
@@ -44,6 +44,8 @@
"mkn": "kupang-malay",
}

_LOCAL = False


class BhinnekaKorpusDataset(datasets.GeneratorBasedBuilder):
"""A Collection of Multilingual Parallel Datasets for 5 Indonesian Local Languages."""
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py
@@ -40,6 +40,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class BurmeseRomanizeDataset(datasets.GeneratorBasedBuilder):
"""Romanization of names in Burmese script"""
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/coco_35l/coco_35l.py
@@ -61,6 +61,8 @@

_LANGUAGES = {"fil": "fil", "ind": "id", "tha": "th", "vie": "vi"}

_LOCAL = False

class Coco35LDataset(datasets.GeneratorBasedBuilder):
"""
COCO-35L is a machine-generated image caption dataset, constructed by translating COCO Captions (Chen et al., 2015) to the other 34 languages using Google’s machine translation API.
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py
@@ -38,6 +38,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class DengueFilipinoDataset(datasets.GeneratorBasedBuilder):
"""Dengue Dataset Low-Resource Multi-label Text Classification Dataset in Filipino"""
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/id_msvd/id_msvd.py
@@ -36,6 +36,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class IdMsvdDataset(datasets.GeneratorBasedBuilder):
"""MSVD dataset with Indonesian translation."""
2 changes: 1 addition & 1 deletion seacrowd/sea_datasets/indommlu/indommlu.py
@@ -199,7 +199,7 @@ class IndoMMLUDataset(datasets.GeneratorBasedBuilder):
name=f"{_DATASETNAME}_{lang}_seacrowd_qa",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} {lang} SEACrowd schema",
schema=f"seacrowd_{lang}_qa",
schema=f"seacrowd_qa",
subset_id=_DATASETNAME,
)
BUILDER_CONFIGS.append(lang_config)
seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py
@@ -19,6 +19,8 @@
}
"""

_LANGUAGES = ["ind"]

_DATASETNAME = "indonesian_news_dataset"

_DESCRIPTION = """An imbalanced dataset to classify Indonesian News articles.
@@ -44,6 +46,8 @@

_TAGS = ["bola", "news", "bisnis", "tekno", "otomotif"]

_LOCAL = False


class IndonesianNewsDataset(datasets.GeneratorBasedBuilder):
"""The dataset contains 5 Indonesian News articles with imbalanced classes"""
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/mc4_indo/mc4_indo.py
@@ -52,6 +52,7 @@
# "full": {"train": 1, "validation": 1}
# }

_LOCAL = False

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SOURCE_VERSION = "1.0.0"
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/memolon/memolon.py
@@ -57,13 +57,16 @@
}

_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LANGUAGES = ["ceb", "tgl", "ind", "sun", "jav", "zsm", "vie", "tha", "mya"]

_LANGUAGE_MAP = {"ceb": "Cebuano", "tgl": "Tagalog", "ind": "Indonesian", "sun": "Sundanese", "jav": "Javanese", "zsm": "Malay", "vie": "Vietnamese", "tha": "Thai", "mya": "Burmese"}

_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION]

_LOCAL = False


def seacrowd_config_constructor(lang: str, schema: str, version: str) -> SEACrowdConfig:
if lang not in _LANGUAGE_MAP:
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/miracl/miracl.py
@@ -96,8 +96,11 @@

_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


def load_topic(fn):

qid2topic = {}
with open(fn, encoding="utf-8") as f:
for line in f:
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/mlqa/mlqa.py
@@ -39,6 +39,8 @@
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


class MLQADataset(datasets.GeneratorBasedBuilder):
"""
seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py
@@ -67,7 +67,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder):
version=datasets.Version(_SOURCE_VERSION),
description=f"{_DATASETNAME} source schema for {subset} subset",
schema="source",
subset_id=subset,
subset_id=f"{_DATASETNAME}_{subset}",
)
for subset in SUBSETS
] + [
@@ -76,7 +76,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder):
version=datasets.Version(_SEACROWD_VERSION),
description=f"{_DATASETNAME} SEACrowd schema for {subset} subset",
schema="seacrowd_text",
subset_id=subset,
subset_id=f"{_DATASETNAME}_{subset}",
)
for subset in SUBSETS
]
12 changes: 6 additions & 6 deletions seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py
@@ -74,42 +74,42 @@ class MyParaphraseDataset(datasets.GeneratorBasedBuilder):
name=f"{_DATASETNAME}_source", # source
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="paraphrase_source",
schema="source",
subset_id=f"{_DATASETNAME}_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_paraphrase_{SEACROWD_SCHEMA_NAME}",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_non_paraphrase_source", # source
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema="non_paraphrase_source",
schema="source",
subset_id=f"{_DATASETNAME}_non_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_non_paraphrase_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_non_paraphrase_{SEACROWD_SCHEMA_NAME}",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_non_paraphrase",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_all_source", # source
version=SOURCE_VERSION,
description=f"{_DATASETNAME} source schema",
schema="all_source",
schema="source",
subset_id=f"{_DATASETNAME}_all",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_all_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema=f"seacrowd_all_{SEACROWD_SCHEMA_NAME}",
schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
subset_id=f"{_DATASETNAME}_all",
),
]
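Like the indommlu and MTOP fixes above, this change keeps schema limited to the fixed identifiers ("source" / f"seacrowd_{SEACROWD_SCHEMA_NAME}") and moves the subset distinction into subset_id, presumably so the config helper can enumerate and parse configs uniformly. A hedged sketch of the resulting lookup, with illustrative names only:

# Hedged sketch, not the actual my_paraphrase code: once schema no longer
# encodes the subset, the builder recovers it from subset_id instead.
_DATASETNAME = "my_paraphrase"

def subset_from_subset_id(subset_id: str) -> str:
    # e.g. "my_paraphrase_non_paraphrase" -> "non_paraphrase"
    prefix = _DATASETNAME + "_"
    return subset_id[len(prefix):] if subset_id.startswith(prefix) else subset_id

print(subset_from_subset_id("my_paraphrase_all"))  # -> "all"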
seacrowd/sea_datasets/philippine_fake_news/philippine_fake_news.py
@@ -51,6 +51,7 @@

_SUPPORTED_TASKS = [Tasks.FACT_CHECKING]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class PhilippineFakeNewsDataset(datasets.GeneratorBasedBuilder):
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/sap_wat/sap_wat.py
@@ -74,6 +74,8 @@

_SUBSET = ["id", "ms", "th", "vi"]

_LOCAL = False

class SapWatDataset(datasets.GeneratorBasedBuilder):
"""SAP WAT is a software documentation dataset for machine translation. The current language scope is English to Hindi,
Indonesian, Japanese, Korean, Malay, Thai, Vietnamese, Simplified Chinese and Traditional Chinese. Here, we only consider
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/tydiqa/tydiqa.py
@@ -81,6 +81,7 @@
_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
_LANGUAGES = ["ind", "tha"]
_LOCAL = False
_SOURCE_VERSION = "1.0.0"
_SOURCE_VERSION_P = "1.0.0"
_SOURCE_VERSION_S = "1.1.0"
_SEACROWD_VERSION = "1.0.0"
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py
@@ -54,6 +54,8 @@

_SEACROWD_VERSION = "1.0.0"

_LOCAL = False


def construct_label_classes():
IOB_tag = ["I", "O", "B"]
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/xm3600/xm3600.py
@@ -58,6 +58,8 @@

_LANGUAGES = ["fil", "id", "th", "vi"]

_LOCAL = False


class XM3600Dataset(datasets.GeneratorBasedBuilder):
"""