Skip to content

Commit

Permalink
Rectify SEACrowd Internal Vars (SEACrowd#386)
Browse files Browse the repository at this point in the history
* Add missing __init__.py

* add init

* Fix bug in PhoATIS dataloader (default config name)

* Add language (`_LANGUAGES`) variables in dataloaders

* Add the dataset-usage acknowledgement from the source HF repo into the description
  • Loading branch information
sabilmakbar authored Jan 29, 2024
1 parent d0c1105 commit 0267efe
Show file tree
Hide file tree
Showing 24 changed files with 44 additions and 8 deletions.
Empty file.
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/burapha_th/burapha_th.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
_LICENSE = Licenses.UNKNOWN.value

_LOCAL = False
_LANGUAGES = ["tha"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_URLS = {
"character": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-test.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-train.zip"},
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
are required at least 10 words, without any information on subcategories and actions.
"""

_LOCAL=False
_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_HOMEPAGE = "https://github.com/share424/Indonesian-Text-to-Image-synthesis-with-Sentence-BERT-and-FastGAN"
_LICENSE = Licenses.UNKNOWN.value
_URLS = {
Expand Down
4 changes: 4 additions & 0 deletions seacrowd/sea_datasets/culturax/culturax.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,12 @@
mC4 and OSCAR corpora, emphasizing non-English languages to support multilingual model
training. For data cleaning validation, CulturaX employs a SentencePiece tokenizer and
KenLM language models, utilizing recent Wikipedia dumps for perplexity scoring.
Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/uonlp/CulturaX and use huggingface-cli login for authentication.
"""

_LOCAL=False
_LANGUAGES = ["ind", "jav", "khm", "lao", "tgl", "min", "mya", "sun", "tha", "vie", "zlm", "ceb", "war", "cbk", "bcl"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_HOMEPAGE = "https://huggingface.co/datasets/uonlp/CulturaX"
_LICENSE = f"""{Licenses.OTHERS.value} | \
The licence terms for CulturaX strictly follows those of mC4 and OSCAR. \
Expand Down
7 changes: 5 additions & 2 deletions seacrowd/sea_datasets/glotstorybook/glotstorybook.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,13 @@
'CC BY-NC-SA', 'CC-BY', 'CC-BY-NC', and 'Public Domain'. We also license the code, actual
packaging and the metadata of these data under the cc0-1.0.
"""

_LOCAL=False
_LANGUAGES = ["khg", "khm", "mya", "tet", "tha", "vie"]

_URLS = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook/resolve/main/GlotStoryBook.csv"

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SUPPORTED_LANGS = ["khg", "khm", "mya", "tet", "tha", "vie"]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

Expand Down Expand Up @@ -117,7 +120,7 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase
def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
df = pd.read_csv(filepath)
df = df[df["ISO639-3"].isin(_SUPPORTED_LANGS)]
df = df[df["ISO639-3"].isin(_LANGUAGES)]

if self.config.schema == "source":
for i, row in df.iterrows():
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/massive/massive.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
_HOMEPAGE = "https://github.com/alexa/massive"
_LICENSE = Licenses.CC_BY_4_0.value
_LOCAL = False
_LANGUAGES = ["ind", "jav", "khm", "zlm", "mya", "tha", "tgl", "vie"]

_URLS = {
_DATASETNAME: "https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.1.tar.gz",
}
Expand Down
Empty file.
Empty file.
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/mlqa/mlqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

_HOMEPAGE = "https://github.com/facebookresearch/MLQA"
_LICENSE = Licenses.CC_BY_SA_3_0.value
_LANGUAGES = ["vie"]
_URL = "https://dl.fbaipublicfiles.com/MLQA/"
_DEV_TEST_URL = "MLQA_V1.zip"
_TRANSLATE_TEST_URL = "mlqa-translate-test.tar.gz"
Expand Down
Empty file.
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/oscar_2201/oscar_2201.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@
_LICENSE = Licenses.CC0_1_0.value
_BASE_URL = "https://huggingface.co/datasets/oscar-corpus/OSCAR-2201/resolve/main/compressed/{lang}_meta/"

_LOCAL = False
_LANGUAGES = ["war", "ceb", "min", "vie", "ilo", "tgl", "lao", "khm", "mya", "jav", "ind", "tha", "sun", "zlm"]

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SOURCE_VERSION = "2022.1.0"
_SEACROWD_VERSION = "1.0.0"
Expand Down
5 changes: 4 additions & 1 deletion seacrowd/sea_datasets/phoatis/phoatis.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
}
}

_LOCAL = False
_LANGUAGES = ["vie"]

_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.SLOT_FILLING]

_SOURCE_VERSION = "1.0.0"
Expand Down Expand Up @@ -136,7 +139,7 @@ class PhoATIS(datasets.GeneratorBasedBuilder):
]
)

DEFAULT_CONFIG_NAME = "phoatis_intent_cls_syllable_source"
DEFAULT_CONFIG_NAME = "phoatis_source"

def _info(self) -> datasets.DatasetInfo:

Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@

_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda"

_LOCAL = False
_LANGUAGES = ["fil"]

_LICENSE = Licenses.CC_BY_4_0.value

_ROOT_URL = "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/"
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/udhr/udhr.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
"zlm": "Malay", # default mly_latn
}

_LOCAL=False
_LANGUAGES=["ace", "ban", "bcl", "blt", "bug", "ceb", "cfm", "cnh", "ctd", "duu", "hil", "hlt", "hni", "hnj", "ilo", "ind", "jav", "khm", "kkh", "lao", "lus", "mad", "min", "mnw", "mya", "pam", "shn", "sun", "tdt", "tet", "tgl", "tha", "vie", "war", "zlm"]

def seacrowd_config_constructor(src_lang, schema, version):
if src_lang == "":
raise ValueError(f"Invalid src_lang {src_lang}")
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/vitext2sql/vitext2sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
},
}

_LOCAL = False
_LANGUAGES = ["vie"]
_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SEACROWD_VERSION = "1.0.0"
Expand Down
8 changes: 4 additions & 4 deletions seacrowd/sea_datasets/xm3600/xm3600.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

_SEACROWD_VERSION = "1.0.0"

_LANGS = ["fil", "id", "th", "vi"]
_LANGUAGES = ["fil", "id", "th", "vi"]


class XM3600Dataset(datasets.GeneratorBasedBuilder):
Expand All @@ -79,7 +79,7 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder):
schema="source",
subset_id=f"{_DATASETNAME}_{lang}",
)
for lang in _LANGS
for lang in _LANGUAGES
] + [
SEACrowdConfig(
name=f"{_DATASETNAME}_{lang}_seacrowd_imtext",
Expand All @@ -88,10 +88,10 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder):
schema="seacrowd_imtext",
subset_id=f"{_DATASETNAME}_{lang}",
)
for lang in _LANGS
for lang in _LANGUAGES
]

DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGS)[0]}_source"
DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGUAGES)[0]}_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/xquad/xquad.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

_LICENSE = Licenses.CC_BY_SA_4_0.value

_LOCAL = False
_LANGUAGES = ["tha", "vie"]

_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]

_SOURCE_VERSION = "1.0.0"
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@

_HOMEPAGE = "https://github.com/GKLMIP/Yunshan-Cup-2020"

_LOCAL = False
_LANGUAGES = ["lao"]

_LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value

_URLS = {
Expand Down
4 changes: 3 additions & 1 deletion seacrowd/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Tasks(Enum):
NAMED_ENTITY_RECOGNITION = "NER"
POS_TAGGING = "POS"
SENTENCE_ORDERING = "SO"
SLOT_FILLING = "SF"
SPAN_BASED_ABSA = "SPAN_ABSA"
TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"

Expand Down Expand Up @@ -220,14 +221,15 @@ class Licenses(Enum):
Tasks.DEPENDENCY_PARSING: "KB",
Tasks.CONSTITUENCY_PARSING: "TREE",
Tasks.E2E_TASK_ORIENTED_DIALOGUE: "TOD",
Tasks.DIALOGUE_SYSTEM: "T2T",
Tasks.WORD_SENSE_DISAMBIGUATION: "T2T",
Tasks.WORD_ANALOGY: "T2T",
Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL",
Tasks.DIALOGUE_SYSTEM: "T2T",
Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
Tasks.NAMED_ENTITY_RECOGNITION: "SEQ_LABEL",
Tasks.POS_TAGGING: "SEQ_LABEL",
Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
Tasks.SLOT_FILLING: "SEQ_LABEL",
Tasks.SPAN_BASED_ABSA: "SEQ_LABEL",
Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
Tasks.COMMONSENSE_REASONING: "QA",
Expand Down

0 comments on commit 0267efe

Please sign in to comment.