diff --git a/seacrowd/sea_datasets/belebele/__init__.py b/seacrowd/sea_datasets/belebele/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/burapha_th/burapha_th.py b/seacrowd/sea_datasets/burapha_th/burapha_th.py index 3f476f24e..f71bae77a 100644 --- a/seacrowd/sea_datasets/burapha_th/burapha_th.py +++ b/seacrowd/sea_datasets/burapha_th/burapha_th.py @@ -38,6 +38,7 @@ _LICENSE = Licenses.UNKNOWN.value _LOCAL = False +_LANGUAGES = ["tha"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) _URLS = { "character": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-test.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-train.zip"}, diff --git a/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py b/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py index e14cc16a4..acc7cc21d 100644 --- a/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py +++ b/seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py @@ -27,6 +27,9 @@ are required at least 10 words, without any information on subcategories and actions. """ +_LOCAL=False +_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + _HOMEPAGE = "https://github.com/share424/Indonesian-Text-to-Image-synthesis-with-Sentence-BERT-and-FastGAN" _LICENSE = Licenses.UNKNOWN.value _URLS = { diff --git a/seacrowd/sea_datasets/culturax/culturax.py b/seacrowd/sea_datasets/culturax/culturax.py index 1a90f92e5..db5899492 100644 --- a/seacrowd/sea_datasets/culturax/culturax.py +++ b/seacrowd/sea_datasets/culturax/culturax.py @@ -29,8 +29,12 @@ mC4 and OSCAR corpora, emphasizing non-English languages to support multilingual model training. For data cleaning validation, CulturaX employs a SentencePiece tokenizer and KenLM language models, utilizing recent Wikipedia dumps for perplexity scoring. +Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/uonlp/CulturaX and use huggingface-cli login for authentication. """ +_LOCAL=False +_LANGUAGES = ["ind", "jav", "khm", "lao", "tgl", "min", "mya", "sun", "tha", "vie", "zlm", "ceb", "war", "cbk", "bcl"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + _HOMEPAGE = "https://huggingface.co/datasets/uonlp/CulturaX" _LICENSE = f"""{Licenses.OTHERS.value} | \ The licence terms for CulturaX strictly follows those of mC4 and OSCAR. \ diff --git a/seacrowd/sea_datasets/glotstorybook/glotstorybook.py b/seacrowd/sea_datasets/glotstorybook/glotstorybook.py index e750a89e1..b89ff0d90 100644 --- a/seacrowd/sea_datasets/glotstorybook/glotstorybook.py +++ b/seacrowd/sea_datasets/glotstorybook/glotstorybook.py @@ -39,10 +39,13 @@ 'CC BY-NC-SA', 'CC-BY', 'CC-BY-NC', and 'Public Domain'. We also license the code, actual packaging and the metadata of these data under the cc0-1.0. """ + +_LOCAL=False +_LANGUAGES = ["khg", "khm", "mya", "tet", "tha", "vie"] + _URLS = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook/resolve/main/GlotStoryBook.csv" _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] -_SUPPORTED_LANGS = ["khg", "khm", "mya", "tet", "tha", "vie"] _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" @@ -117,7 +120,7 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" df = pd.read_csv(filepath) - df = df[df["ISO639-3"].isin(_SUPPORTED_LANGS)] + df = df[df["ISO639-3"].isin(_LANGUAGES)] if self.config.schema == "source": for i, row in df.iterrows(): diff --git a/seacrowd/sea_datasets/id_coreference_resolution/__init__.py b/seacrowd/sea_datasets/id_coreference_resolution/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/id_wsd/__init__.py b/seacrowd/sea_datasets/id_wsd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/indocamrest/__init__.py b/seacrowd/sea_datasets/indocamrest/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/kawat/__init__.py b/seacrowd/sea_datasets/kawat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/massive/massive.py b/seacrowd/sea_datasets/massive/massive.py index e2ebe7ca0..6b2fceb5d 100644 --- a/seacrowd/sea_datasets/massive/massive.py +++ b/seacrowd/sea_datasets/massive/massive.py @@ -51,6 +51,8 @@ _HOMEPAGE = "https://github.com/alexa/massive" _LICENSE = Licenses.CC_BY_4_0.value _LOCAL = False +_LANGUAGES = ["ind", "jav", "khm", "zlm", "mya", "tha", "tgl", "vie"] + _URLS = { _DATASETNAME: "https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.1.tar.gz", } diff --git a/seacrowd/sea_datasets/memolon/__init__.py b/seacrowd/sea_datasets/memolon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/miracl/__init__.py b/seacrowd/sea_datasets/miracl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/mlqa/mlqa.py b/seacrowd/sea_datasets/mlqa/mlqa.py index a9b07717e..f2884e0f1 100644 --- a/seacrowd/sea_datasets/mlqa/mlqa.py +++ b/seacrowd/sea_datasets/mlqa/mlqa.py @@ -29,6 +29,7 @@ _HOMEPAGE = "https://github.com/facebookresearch/MLQA" _LICENSE = Licenses.CC_BY_SA_3_0.value +_LANGUAGES = ["vie"] _URL = "https://dl.fbaipublicfiles.com/MLQA/" _DEV_TEST_URL = "MLQA_V1.zip" _TRANSLATE_TEST_URL = "mlqa-translate-test.tar.gz" diff --git a/seacrowd/sea_datasets/ntrex_128/__init__.py b/seacrowd/sea_datasets/ntrex_128/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/oscar_2201/oscar_2201.py b/seacrowd/sea_datasets/oscar_2201/oscar_2201.py index 0b78a445f..89fd66348 100644 --- a/seacrowd/sea_datasets/oscar_2201/oscar_2201.py +++ b/seacrowd/sea_datasets/oscar_2201/oscar_2201.py @@ -216,6 +216,9 @@ _LICENSE = Licenses.CC0_1_0.value _BASE_URL = "https://huggingface.co/datasets/oscar-corpus/OSCAR-2201/resolve/main/compressed/{lang}_meta/" +_LOCAL = False +_LANGUAGES = ["war", "ceb", "min", "vie", "ilo", "tgl", "lao", "khm", "mya", "jav", "ind", "tha", "sun", "zlm"] + _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] _SOURCE_VERSION = "2022.1.0" _SEACROWD_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/phoatis/phoatis.py b/seacrowd/sea_datasets/phoatis/phoatis.py index c3d046c3b..6524711b8 100644 --- a/seacrowd/sea_datasets/phoatis/phoatis.py +++ b/seacrowd/sea_datasets/phoatis/phoatis.py @@ -67,6 +67,9 @@ } } +_LOCAL = False +_LANGUAGES = ["vie"] + _SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.SLOT_FILLING] _SOURCE_VERSION = "1.0.0" @@ -136,7 +139,7 @@ class PhoATIS(datasets.GeneratorBasedBuilder): ] ) - DEFAULT_CONFIG_NAME = "phoatis_intent_cls_syllable_source" + DEFAULT_CONFIG_NAME = "phoatis_source" def _info(self) -> datasets.DatasetInfo: diff --git a/seacrowd/sea_datasets/sampiran/__init__.py b/seacrowd/sea_datasets/sampiran/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py b/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py index f3a76d21e..e0b48d7ed 100644 --- a/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py +++ b/seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py @@ -30,6 +30,9 @@ _HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda" +_LOCAL = False +_LANGUAGES = ["fil"] + _LICENSE = Licenses.CC_BY_4_0.value _ROOT_URL = "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/" diff --git a/seacrowd/sea_datasets/udhr/udhr.py b/seacrowd/sea_datasets/udhr/udhr.py index bb4445da6..dd0a2dcd2 100644 --- a/seacrowd/sea_datasets/udhr/udhr.py +++ b/seacrowd/sea_datasets/udhr/udhr.py @@ -72,6 +72,9 @@ "zlm": "Malay", # default mly_latn } +_LOCAL=False +_LANGUAGES=["ace", "ban", "bcl", "blt", "bug", "ceb", "cfm", "cnh", "ctd", "duu", "hil", "hlt", "hni", "hnj", "ilo", "ind", "jav", "khm", "kkh", "lao", "lus", "mad", "min", "mnw", "mya", "pam", "shn", "sun", "tdt", "tet", "tgl", "tha", "vie", "war", "zlm"] + def seacrowd_config_constructor(src_lang, schema, version): if src_lang == "": raise ValueError(f"Invalid src_lang {src_lang}") diff --git a/seacrowd/sea_datasets/vitext2sql/vitext2sql.py b/seacrowd/sea_datasets/vitext2sql/vitext2sql.py index ad8be6de4..a9b3cd14f 100644 --- a/seacrowd/sea_datasets/vitext2sql/vitext2sql.py +++ b/seacrowd/sea_datasets/vitext2sql/vitext2sql.py @@ -57,6 +57,8 @@ }, } +_LOCAL = False +_LANGUAGES = ["vie"] _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] _SEACROWD_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/xm3600/xm3600.py b/seacrowd/sea_datasets/xm3600/xm3600.py index 9dbf750f3..9dc847013 100644 --- a/seacrowd/sea_datasets/xm3600/xm3600.py +++ b/seacrowd/sea_datasets/xm3600/xm3600.py @@ -56,7 +56,7 @@ _SEACROWD_VERSION = "1.0.0" -_LANGS = ["fil", "id", "th", "vi"] +_LANGUAGES = ["fil", "id", "th", "vi"] class XM3600Dataset(datasets.GeneratorBasedBuilder): @@ -79,7 +79,7 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder): schema="source", subset_id=f"{_DATASETNAME}_{lang}", ) - for lang in _LANGS + for lang in _LANGUAGES ] + [ SEACrowdConfig( name=f"{_DATASETNAME}_{lang}_seacrowd_imtext", @@ -88,10 +88,10 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder): schema="seacrowd_imtext", subset_id=f"{_DATASETNAME}_{lang}", ) - for lang in _LANGS + for lang in _LANGUAGES ] - DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGS)[0]}_source" + DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGUAGES)[0]}_source" def _info(self) -> datasets.DatasetInfo: if self.config.schema == "source": diff --git a/seacrowd/sea_datasets/xquad/xquad.py b/seacrowd/sea_datasets/xquad/xquad.py index 627c0d0ca..113db88eb 100644 --- a/seacrowd/sea_datasets/xquad/xquad.py +++ b/seacrowd/sea_datasets/xquad/xquad.py @@ -32,6 +32,9 @@ _LICENSE = Licenses.CC_BY_SA_4_0.value +_LOCAL = False +_LANGUAGES = ["tha", "vie"] + _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _SOURCE_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py b/seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py index 5b7d34907..9298c7e50 100644 --- a/seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py +++ b/seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py @@ -38,6 +38,9 @@ _HOMEPAGE = "https://github.com/GKLMIP/Yunshan-Cup-2020" +_LOCAL = False +_LANGUAGES = ["lao"] + _LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value _URLS = { diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py index 613700bc4..586f4bfdf 100644 --- a/seacrowd/utils/constants.py +++ b/seacrowd/utils/constants.py @@ -74,6 +74,7 @@ class Tasks(Enum): NAMED_ENTITY_RECOGNITION = "NER" POS_TAGGING = "POS" SENTENCE_ORDERING = "SO" + SLOT_FILLING = "SF" SPAN_BASED_ABSA = "SPAN_ABSA" TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID" @@ -220,14 +221,15 @@ class Licenses(Enum): Tasks.DEPENDENCY_PARSING: "KB", Tasks.CONSTITUENCY_PARSING: "TREE", Tasks.E2E_TASK_ORIENTED_DIALOGUE: "TOD", + Tasks.DIALOGUE_SYSTEM: "T2T", Tasks.WORD_SENSE_DISAMBIGUATION: "T2T", Tasks.WORD_ANALOGY: "T2T", Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL", - Tasks.DIALOGUE_SYSTEM: "T2T", Tasks.KEYWORD_TAGGING: "SEQ_LABEL", Tasks.NAMED_ENTITY_RECOGNITION: "SEQ_LABEL", Tasks.POS_TAGGING: "SEQ_LABEL", Tasks.SENTENCE_ORDERING: "SEQ_LABEL", + Tasks.SLOT_FILLING: "SEQ_LABEL", Tasks.SPAN_BASED_ABSA: "SEQ_LABEL", Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL", Tasks.COMMONSENSE_REASONING: "QA",