Skip to content

Commit

Permalink
Rectify SEACrowd Internal Vars (SEACrowd#386)
Browse files Browse the repository at this point in the history
* Add missing __init__.py

* add init

* Fix bug in PhoATIS dataloader (default config name)

* Add language (`_LANGUAGES`) variables in dataloaders

* Add the dataset-usage acknowledgement from the source HF repo into the description
  • Loading branch information
sabilmakbar authored Jan 29, 2024
1 parent d0c1105 commit 0267efe
Show file tree
Hide file tree
Showing 24 changed files with 44 additions and 8 deletions.
Empty file.
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/burapha_th/burapha_th.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
_LICENSE = Licenses.UNKNOWN.value

_LOCAL = False
_LANGUAGES = ["tha"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_URLS = {
"character": {"test": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-test.zip", "train": "https://services.informatics.buu.ac.th/datasets/Burapha-TH/character/20210306-train.zip"},
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/cub_bahasa/cub_bahasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
are required at least 10 words, without any information on subcategories and actions.
"""

_LOCAL=False
_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_HOMEPAGE = "https://github.com/share424/Indonesian-Text-to-Image-synthesis-with-Sentence-BERT-and-FastGAN"
_LICENSE = Licenses.UNKNOWN.value
_URLS = {
Expand Down
4 changes: 4 additions & 0 deletions seacrowd/sea_datasets/culturax/culturax.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,12 @@
mC4 and OSCAR corpora, emphasizing non-English languages to support multilingual model
training. For data cleaning validation, CulturaX employs a SentencePiece tokenizer and
KenLM language models, utilizing recent Wikipedia dumps for perplexity scoring.
Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/uonlp/CulturaX and use huggingface-cli login for authentication.
"""

_LOCAL=False
_LANGUAGES = ["ind", "jav", "khm", "lao", "tgl", "min", "mya", "sun", "tha", "vie", "zlm", "ceb", "war", "cbk", "bcl"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_HOMEPAGE = "https://huggingface.co/datasets/uonlp/CulturaX"
_LICENSE = f"""{Licenses.OTHERS.value} | \
The licence terms for CulturaX strictly follows those of mC4 and OSCAR. \
Expand Down
7 changes: 5 additions & 2 deletions seacrowd/sea_datasets/glotstorybook/glotstorybook.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,13 @@
'CC BY-NC-SA', 'CC-BY', 'CC-BY-NC', and 'Public Domain'. We also license the code, actual
packaging and the metadata of these data under the cc0-1.0.
"""

_LOCAL=False
_LANGUAGES = ["khg", "khm", "mya", "tet", "tha", "vie"]

_URLS = "https://huggingface.co/datasets/cis-lmu/GlotStoryBook/resolve/main/GlotStoryBook.csv"

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SUPPORTED_LANGS = ["khg", "khm", "mya", "tet", "tha", "vie"]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"

Expand Down Expand Up @@ -117,7 +120,7 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase
def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
df = pd.read_csv(filepath)
df = df[df["ISO639-3"].isin(_SUPPORTED_LANGS)]
df = df[df["ISO639-3"].isin(_LANGUAGES)]

if self.config.schema == "source":
for i, row in df.iterrows():
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/massive/massive.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
_HOMEPAGE = "https://github.com/alexa/massive"
_LICENSE = Licenses.CC_BY_4_0.value
_LOCAL = False
_LANGUAGES = ["ind", "jav", "khm", "zlm", "mya", "tha", "tgl", "vie"]

_URLS = {
_DATASETNAME: "https://amazon-massive-nlu-dataset.s3.amazonaws.com/amazon-massive-dataset-1.1.tar.gz",
}
Expand Down
Empty file.
Empty file.
1 change: 1 addition & 0 deletions seacrowd/sea_datasets/mlqa/mlqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

_HOMEPAGE = "https://github.com/facebookresearch/MLQA"
_LICENSE = Licenses.CC_BY_SA_3_0.value
_LANGUAGES = ["vie"]
_URL = "https://dl.fbaipublicfiles.com/MLQA/"
_DEV_TEST_URL = "MLQA_V1.zip"
_TRANSLATE_TEST_URL = "mlqa-translate-test.tar.gz"
Expand Down
Empty file.
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/oscar_2201/oscar_2201.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@
_LICENSE = Licenses.CC0_1_0.value
_BASE_URL = "https://huggingface.co/datasets/oscar-corpus/OSCAR-2201/resolve/main/compressed/{lang}_meta/"

_LOCAL = False
_LANGUAGES = ["war", "ceb", "min", "vie", "ilo", "tgl", "lao", "khm", "mya", "jav", "ind", "tha", "sun", "zlm"]

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
_SOURCE_VERSION = "2022.1.0"
_SEACROWD_VERSION = "1.0.0"
Expand Down
5 changes: 4 additions & 1 deletion seacrowd/sea_datasets/phoatis/phoatis.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
}
}

_LOCAL = False
_LANGUAGES = ["vie"]

_SUPPORTED_TASKS = [Tasks.INTENT_CLASSIFICATION, Tasks.SLOT_FILLING]

_SOURCE_VERSION = "1.0.0"
Expand Down Expand Up @@ -136,7 +139,7 @@ class PhoATIS(datasets.GeneratorBasedBuilder):
]
)

DEFAULT_CONFIG_NAME = "phoatis_intent_cls_syllable_source"
DEFAULT_CONFIG_NAME = "phoatis_source"

def _info(self) -> datasets.DatasetInfo:

Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@

_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda"

_LOCAL = False
_LANGUAGES = ["fil"]

_LICENSE = Licenses.CC_BY_4_0.value

_ROOT_URL = "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/"
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/udhr/udhr.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
"zlm": "Malay", # default mly_latn
}

_LOCAL=False
_LANGUAGES=["ace", "ban", "bcl", "blt", "bug", "ceb", "cfm", "cnh", "ctd", "duu", "hil", "hlt", "hni", "hnj", "ilo", "ind", "jav", "khm", "kkh", "lao", "lus", "mad", "min", "mnw", "mya", "pam", "shn", "sun", "tdt", "tet", "tgl", "tha", "vie", "war", "zlm"]

def seacrowd_config_constructor(src_lang, schema, version):
if src_lang == "":
raise ValueError(f"Invalid src_lang {src_lang}")
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/sea_datasets/vitext2sql/vitext2sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
},
}

_LOCAL = False
_LANGUAGES = ["vie"]
_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SEACROWD_VERSION = "1.0.0"
Expand Down
8 changes: 4 additions & 4 deletions seacrowd/sea_datasets/xm3600/xm3600.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@

_SEACROWD_VERSION = "1.0.0"

_LANGS = ["fil", "id", "th", "vi"]
_LANGUAGES = ["fil", "id", "th", "vi"]


class XM3600Dataset(datasets.GeneratorBasedBuilder):
Expand All @@ -79,7 +79,7 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder):
schema="source",
subset_id=f"{_DATASETNAME}_{lang}",
)
for lang in _LANGS
for lang in _LANGUAGES
] + [
SEACrowdConfig(
name=f"{_DATASETNAME}_{lang}_seacrowd_imtext",
Expand All @@ -88,10 +88,10 @@ class XM3600Dataset(datasets.GeneratorBasedBuilder):
schema="seacrowd_imtext",
subset_id=f"{_DATASETNAME}_{lang}",
)
for lang in _LANGS
for lang in _LANGUAGES
]

DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGS)[0]}_source"
DEFAULT_CONFIG_NAME = f"xm3600_{sorted(_LANGUAGES)[0]}_source"

def _info(self) -> datasets.DatasetInfo:
if self.config.schema == "source":
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/xquad/xquad.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

_LICENSE = Licenses.CC_BY_SA_4_0.value

_LOCAL = False
_LANGUAGES = ["tha", "vie"]

_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]

_SOURCE_VERSION = "1.0.0"
Expand Down
3 changes: 3 additions & 0 deletions seacrowd/sea_datasets/yunshan_cup_2020/yunshan_cup_2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@

_HOMEPAGE = "https://github.com/GKLMIP/Yunshan-Cup-2020"

_LOCAL = False
_LANGUAGES = ["lao"]

_LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value

_URLS = {
Expand Down
4 changes: 3 additions & 1 deletion seacrowd/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Tasks(Enum):
NAMED_ENTITY_RECOGNITION = "NER"
POS_TAGGING = "POS"
SENTENCE_ORDERING = "SO"
SLOT_FILLING = "SF"
SPAN_BASED_ABSA = "SPAN_ABSA"
TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"

Expand Down Expand Up @@ -220,14 +221,15 @@ class Licenses(Enum):
Tasks.DEPENDENCY_PARSING: "KB",
Tasks.CONSTITUENCY_PARSING: "TREE",
Tasks.E2E_TASK_ORIENTED_DIALOGUE: "TOD",
Tasks.DIALOGUE_SYSTEM: "T2T",
Tasks.WORD_SENSE_DISAMBIGUATION: "T2T",
Tasks.WORD_ANALOGY: "T2T",
Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL",
Tasks.DIALOGUE_SYSTEM: "T2T",
Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
Tasks.NAMED_ENTITY_RECOGNITION: "SEQ_LABEL",
Tasks.POS_TAGGING: "SEQ_LABEL",
Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
Tasks.SLOT_FILLING: "SEQ_LABEL",
Tasks.SPAN_BASED_ABSA: "SEQ_LABEL",
Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
Tasks.COMMONSENSE_REASONING: "QA",
Expand Down

0 comments on commit 0267efe

Please sign in to comment.