# Reconstructed module: seacrowd/config_helper.py, the new file added by patch
# b23bec10725a7a41f0366e577d4f84903e65d6a9
# ("Add config helper (WIP) and missed constants in existing dataloaders").
"""
Utility for filtering and loading SEACrowd datasets.
"""
from collections import Counter
from dataclasses import dataclass
from dataclasses import field
import functools
from importlib.machinery import SourceFileLoader
import logging
import os
import pathlib
from types import ModuleType
from typing import Callable, Dict, Iterable, List, Optional

import datasets
import pandas as pd
from tqdm import tqdm

from .utils.configs import SEACrowdConfig
from .utils.constants import Tasks, SCHEMA_TO_TASKS

# ---------------------------------------------------------------------------
# Config-name registries
# ---------------------------------------------------------------------------

# Building blocks for the very regular KoPI config names.
_KOPI_SCHEMAS = ('source', 'seacrowd_ssp')
_KOPI_CC_SNAPSHOTS = (
    'all',
    '2021_10', '2021_17', '2021_21', '2021_25', '2021_31', '2021_39',
    '2021_43', '2021_49',
    '2022_05', '2022_21', '2022_27',
)
_KOPI_CC_VARIANTS = ('raw', 'dedup', 'neardup', 'neardup_clean')
# These snapshots are listed without a "raw" variant.
_KOPI_CC_SNAPSHOTS_WITHOUT_RAW = ('2021_10', '2021_49')
_KOPI_CC_NEWS_SUBSETS = ('2016', '2017', '2018', '2019', '2020', '2021', '2022', 'all')
_KOPI_NLLB_SUBSETS = (
    'all', 'ace_Latn', 'ban_Latn', 'bjn_Latn', 'ind_Latn', 'jav_Latn',
    'min_Latn', 'sun_Latn',
)
_KOPI_NLLB_VARIANTS = ('raw', 'dedup', 'neardup')


def _kopi_cc_config_names() -> List[str]:
    """Return every `kopi_cc_<snapshot>-<variant>_<schema>` config name."""
    names = []
    for schema in _KOPI_SCHEMAS:
        for snapshot in _KOPI_CC_SNAPSHOTS:
            for variant in _KOPI_CC_VARIANTS:
                if variant == 'raw' and snapshot in _KOPI_CC_SNAPSHOTS_WITHOUT_RAW:
                    continue
                names.append(f'kopi_cc_{snapshot}-{variant}_{schema}')
    return names


def _kopi_cc_news_config_names() -> List[str]:
    """Return every `kopi_cc_news_<subset>_<schema>` config name."""
    return [
        f'kopi_cc_news_{subset}_{schema}'
        for schema in _KOPI_SCHEMAS
        for subset in _KOPI_CC_NEWS_SUBSETS
    ]


def _kopi_nllb_config_names() -> List[str]:
    """Return every `kopi_nllb_<subset>-<variant>_<schema>` config name."""
    return [
        f'kopi_nllb_{subset}-{variant}_{schema}'
        for schema in _KOPI_SCHEMAS
        for subset in _KOPI_NLLB_SUBSETS
        for variant in _KOPI_NLLB_VARIANTS
    ]


# Config names flagged as "large"; `default_is_keeper` excludes them.
_LARGE_CONFIG_NAMES = [
    'covost2_ind_eng_seacrowd_sptext',
    'covost2_eng_ind_seacrowd_sptext',
    'covost2_ind_eng_seacrowd_t2t',
    'covost2_eng_ind_seacrowd_t2t',
    'cc100_ind_source',
    'cc100_jav_source',
    'cc100_sun_source',
    'cc100_ind_seacrowd_ssp',
    'cc100_jav_seacrowd_ssp',
    'cc100_sun_seacrowd_ssp',
    'indo4b_source', 'indo4b_seacrowd_ssp',
    'indo4b_plus_source', 'indo4b_plus_seacrowd_ssp',
    *_kopi_cc_config_names(),
    *_kopi_cc_news_config_names(),
    *_kopi_nllb_config_names(),
]

# Config names flagged as "resource"; `default_is_keeper` excludes them.
_RESOURCE_CONFIG_NAMES = [
    'inset_lexicon_seacrowd_text',
    'kamus_alay_seacrowd_t2t',
]

# Config names currently known to be broken (dropped unless keep_broken=True).
_CURRENTLY_BROKEN_NAMES = []

# Language codes used to expand the regular benchmark families below.
_NUSAX_MT_LANGS = (
    'ace', 'ban', 'bjn', 'bug', 'eng', 'jav', 'mad', 'min', 'nij', 'sun', 'bbc',
)
_NUSATRANSLATION_LANGS = (
    'abs', 'btk', 'bew', 'bhp', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun',
)
_NUSAPARAGRAPH_LANGS = (
    'btk', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun',
)

_NUSAX_MT_CONFIGS = (
    # Ind - XXX
    [f'nusax_mt_ind_{lang}_seacrowd_t2t' for lang in _NUSAX_MT_LANGS]
    # XXX - Ind
    + [f'nusax_mt_{lang}_ind_seacrowd_t2t' for lang in _NUSAX_MT_LANGS]
)

_NUSATRANSLATION_CONFIGS = (
    # NusaTranslation Senti
    [f'nusatranslation_senti_{lang}_seacrowd_text' for lang in _NUSATRANSLATION_LANGS]
    # NusaTranslation Emot
    + [f'nusatranslation_emot_{lang}_seacrowd_text' for lang in _NUSATRANSLATION_LANGS]
    # NusaTranslation MT Ind-XXX
    + [f'nusatranslation_mt_ind_{lang}_seacrowd_t2t' for lang in _NUSATRANSLATION_LANGS]
    # NusaTranslation MT XXX-Ind
    + [f'nusatranslation_mt_{lang}_ind_seacrowd_t2t' for lang in _NUSATRANSLATION_LANGS]
)

_NUSAPARAGRAPH_CONFIGS = (
    # NusaParagraph Topic
    [f'nusaparagraph_topic_{lang}_seacrowd_text' for lang in _NUSAPARAGRAPH_LANGS]
    # NusaParagraph Rhetoric
    + [f'nusaparagraph_rhetoric_{lang}_seacrowd_text' for lang in _NUSAPARAGRAPH_LANGS]
    # NusaParagraph Emot
    + [f'nusaparagraph_emot_{lang}_seacrowd_text' for lang in _NUSAPARAGRAPH_LANGS]
)

# Benchmark name -> config names that make up the benchmark.
BENCHMARK_DICT = {
    'IndoNLU': [
        'emot_seacrowd_text',
        'smsa_seacrowd_text',
        'wrete_seacrowd_pairs',
        'casa_seacrowd_text_multi',
        'hoasa_seacrowd_text_multi',
        'facqa_seacrowd_qa',
        'indonlu_nergrit_seacrowd_seq_label',
        'nerp_seacrowd_seq_label',
        'posp_seacrowd_seq_label',
        'term_a_seacrowd_seq_label',
        'keps_seacrowd_seq_label',
        'idn_tagged_corpus_csui_seacrowd_seq_label',
    ],
    'IndoNLG': [
        # MT
        'bible_en_id_seacrowd_t2t',
        'bible_su_id_seacrowd_t2t',
        'bible_jv_id_seacrowd_t2t',
        'ted_en_id_seacrowd_t2t',
        'indo_general_mt_en_id_seacrowd_t2t',
        'news_en_id_seacrowd_t2t',
        # Summarization
        'indosum_fold0_seacrowd_t2t',
        'liputan6_canonical_seacrowd_t2t',
        'liputan6_xtreme_seacrowd_t2t',
        # Chit Chat
        'xpersona_id_seacrowd_t2t',
        # QA
        'tydiqa_id_seacrowd_qa',
    ],
    'IndoLEM': [
        'indolem_ntp_seacrowd_pairs',
        'indolem_sentiment_seacrowd_text',
        'indolem_ner_ugm_fold0_seacrowd_seq_label',
        'indolem_ner_ugm_fold1_seacrowd_seq_label',
        'indolem_ner_ugm_fold2_seacrowd_seq_label',
        'indolem_ner_ugm_fold3_seacrowd_seq_label',
        'indolem_ner_ugm_fold4_seacrowd_seq_label',
        'indolem_ud_id_gsd_seacrowd_kb',
        'indolem_ud_id_pud_seacrowd_kb',
        'indolem_tweet_ordering_seacrowd_seq_label',
        'indolem_nerui_fold0_seacrowd_seq_label',
        'indolem_nerui_fold1_seacrowd_seq_label',
        'indolem_nerui_fold2_seacrowd_seq_label',
        'indolem_nerui_fold3_seacrowd_seq_label',
        'indolem_nerui_fold4_seacrowd_seq_label',
    ],
    'NusaX': list(_NUSAX_MT_CONFIGS),
    'NusaNLU': [
        'emot_seacrowd_text',
        'emotcmt_seacrowd_text',
        'emotion_id_opinion_seacrowd_text',
        'id_abusive_seacrowd_text',
        'id_google_play_review_seacrowd_text',
        'id_google_play_review_posneg_seacrowd_text',
        'id_hatespeech_seacrowd_text',
        'imdb_jv_seacrowd_text',
        'indolem_sentiment_seacrowd_text',
        'jadi_ide_seacrowd_text',
        'nusax_senti_ace_seacrowd_text',
        'nusax_senti_ban_seacrowd_text',
        'nusax_senti_bjn_seacrowd_text',
        'nusax_senti_bug_seacrowd_text',
        'nusax_senti_eng_seacrowd_text',
        'nusax_senti_ind_seacrowd_text',
        'nusax_senti_jav_seacrowd_text',
        'nusax_senti_mad_seacrowd_text',
        'nusax_senti_min_seacrowd_text',
        'nusax_senti_nij_seacrowd_text',
        'nusax_senti_sun_seacrowd_text',
        'nusax_senti_bbc_seacrowd_text',
        'sentiment_nathasa_review_seacrowd_text',
        'smsa_seacrowd_text',
        'indolem_ntp_seacrowd_pairs',
        'indonli_seacrowd_pairs',
        'code_mixed_jv_id_jv_seacrowd_text',
        'code_mixed_jv_id_id_seacrowd_text',
        'id_am2ico_seacrowd_pairs',
        'id_abusive_news_comment_seacrowd_text',
        'id_hoax_news_seacrowd_text',
        'id_hsd_nofaaulia_seacrowd_text',
        'id_stance_seacrowd_pairs',
        'indo_law_seacrowd_text',
        'indotacos_seacrowd_text',
        'karonese_sentiment_seacrowd_text',
        'su_emot_seacrowd_text',
        'wrete_seacrowd_pairs',
        'id_short_answer_grading_seacrowd_pairs',
    ],
    'NusaNLG': [
        'bible_en_id_seacrowd_t2t',
        'bible_jv_id_seacrowd_t2t',
        'bible_su_id_seacrowd_t2t',
        'id_panl_bppt_seacrowd_t2t',
        'indo_general_mt_en_id_seacrowd_t2t',
        'indo_religious_mt_en_id_seacrowd_t2t',
        'minangnlp_mt_seacrowd_t2t',
        'news_en_id_seacrowd_t2t',
        'nusax_mt_ace_ind_seacrowd_t2t',
        'nusax_mt_ban_ind_seacrowd_t2t',
        'nusax_mt_bjn_ind_seacrowd_t2t',
        'nusax_mt_bug_ind_seacrowd_t2t',
        'nusax_mt_eng_ind_seacrowd_t2t',
        'nusax_mt_ind_ace_seacrowd_t2t',
        'nusax_mt_ind_ban_seacrowd_t2t',
        'nusax_mt_ind_bjn_seacrowd_t2t',
        'nusax_mt_ind_bug_seacrowd_t2t',
        'nusax_mt_ind_eng_seacrowd_t2t',
        'nusax_mt_ind_jav_seacrowd_t2t',
        'nusax_mt_ind_mad_seacrowd_t2t',
        'nusax_mt_ind_min_seacrowd_t2t',
        'nusax_mt_ind_nij_seacrowd_t2t',
        'nusax_mt_ind_sun_seacrowd_t2t',
        'nusax_mt_ind_bbc_seacrowd_t2t',
        'nusax_mt_jav_ind_seacrowd_t2t',
        'nusax_mt_mad_ind_seacrowd_t2t',
        'nusax_mt_min_ind_seacrowd_t2t',
        'nusax_mt_nij_ind_seacrowd_t2t',
        'nusax_mt_sun_ind_seacrowd_t2t',
        'nusax_mt_bbc_ind_seacrowd_t2t',
        'parallel_su_id_seacrowd_t2t',
        'ted_en_id_seacrowd_t2t',
        'ud_id_csui_seacrowd_t2t',
        'korpus_seacrowd_ind_jav_seacrowd_t2t',
        'korpus_seacrowd_ind_xdy_seacrowd_t2t',
        'korpus_seacrowd_ind_bug_seacrowd_t2t',
        'korpus_seacrowd_ind_sun_seacrowd_t2t',
        'korpus_seacrowd_ind_mad_seacrowd_t2t',
        'korpus_seacrowd_ind_bjn_seacrowd_t2t',
        'korpus_seacrowd_ind_bbc_seacrowd_t2t',
        'korpus_seacrowd_ind_khek_seacrowd_t2t',
        'korpus_seacrowd_ind_msa_seacrowd_t2t',
        'korpus_seacrowd_ind_min_seacrowd_t2t',
        'korpus_seacrowd_ind_tiociu_seacrowd_t2t',
        'korpus_seacrowd_jav_ind_seacrowd_t2t',
        'korpus_seacrowd_xdy_ind_seacrowd_t2t',
        'korpus_seacrowd_bug_ind_seacrowd_t2t',
        'korpus_seacrowd_sun_ind_seacrowd_t2t',
        'korpus_seacrowd_mad_ind_seacrowd_t2t',
        'korpus_seacrowd_bjn_ind_seacrowd_t2t',
        'korpus_seacrowd_bbc_ind_seacrowd_t2t',
        'korpus_seacrowd_khek_ind_seacrowd_t2t',
        'korpus_seacrowd_msa_ind_seacrowd_t2t',
        'korpus_seacrowd_min_ind_seacrowd_t2t',
        'korpus_seacrowd_tiociu_ind_seacrowd_t2t',
        'indosum_fold0_seacrowd_t2t',
        'liputan6_canonical_seacrowd_t2t',
        'xl_sum_seacrowd_t2t',
        'id_qqp_seacrowd_t2t',
        'multilexnorm_seacrowd_t2t',
        'paracotta_id_seacrowd_t2t',
        'stif_indonesia_seacrowd_t2t',
        # The comma after the next entry was missing, which silently fused it
        # with 'facqa_seacrowd_qa' into a single non-existent config name.
        'xpersona_id_seacrowd_t2t',
        'facqa_seacrowd_qa',
        'idk_mrc_seacrowd_qa',
        'tydiqa_id_seacrowd_qa',
    ],
    'NusaASR': [
        # Ind
        'indspeech_digit_cdsr_seacrowd_sptext',
        'indspeech_news_lvcsr_seacrowd_sptext',
        'indspeech_teldialog_lvcsr_seacrowd_sptext',
        'indspeech_teldialog_svcsr_seacrowd_sptext',
        'librivox_indonesia_ind_seacrowd_sptext',
        # The comma after the next entry was missing, which silently fused it
        # with the first Sundanese config into a single non-existent name.
        'titml_idn_seacrowd_sptext',
        # Sun
        'indspeech_newstra_ethnicsr_nooverlap_sun_seacrowd_sptext',
        'indspeech_news_ethnicsr_su_nooverlap_seacrowd_sptext',
        'librivox_indonesia_sun_seacrowd_sptext',
        'su_id_asr_seacrowd_sptext',
        # Jav
        'indspeech_newstra_ethnicsr_nooverlap_jav_seacrowd_sptext',
        'indspeech_news_ethnicsr_jv_nooverlap_seacrowd_sptext',
        'librivox_indonesia_jav_seacrowd_sptext',
        'jv_id_asr_seacrowd_sptext',
        # Ban
        'indspeech_newstra_ethnicsr_nooverlap_ban_seacrowd_sptext',
        'librivox_indonesia_ban_seacrowd_sptext',
        # Btk
        'indspeech_newstra_ethnicsr_nooverlap_btk_seacrowd_sptext',
        # Ace
        'librivox_indonesia_ace_seacrowd_sptext',
        # Bug
        'librivox_indonesia_bug_seacrowd_sptext',
        # Min
        'librivox_indonesia_min_seacrowd_sptext',
    ],
    'NusaTranslation': list(_NUSATRANSLATION_CONFIGS),
    'NusaParagraph': list(_NUSAPARAGRAPH_CONFIGS),
    # NusaWrites is exactly NusaTranslation followed by NusaParagraph.
    'NusaWrites': _NUSATRANSLATION_CONFIGS + _NUSAPARAGRAPH_CONFIGS,
}

# Spreadsheet export read by SEACrowdMetadataHelper when no frame is supplied.
_METADATA_SHEET_URL = 'https://docs.google.com/spreadsheets/d/17o83IvWxmtGLYridZis0nEprHhsZIMeFtHGtXV35h6M/export?format=csv&gid=879729812'

# Spreadsheet column header -> column name used in the metadata frame.
_METADATA_COLUMN_NAMES = {
    'No.': 'id', 'Name': 'name', 'Subsets': 'subsets', 'Link': 'source_link', 'Description': 'description',
    'HF Link': 'hf_link', 'License': 'license', 'Year': 'year', 'Collection Style': 'collection_style',
    'Language': 'language', 'Dialect': 'dialect', 'Domain': 'domain', 'Form': 'modality', 'Tasks': 'tasks',
    'Volume': 'volume', 'Unit': 'unit', 'Ethical Risks': 'ethical_risk', 'Provider': 'provider',
    'Paper Title': 'paper_title', 'Paper Link': 'paper_link', 'Access': 'access', 'Derived From': 'derived_from',
    'Test Split': 'is_splitted', 'Notes': 'notes', 'Dataloader': 'dataloader', 'Implemented': 'implemented',
}

# Frame columns that are overwritten from each dataloader config.
_CONFIG_DERIVED_COLUMNS = [
    'is_large', 'is_resource', 'is_default', 'is_broken',
    'is_local', 'citation', 'license', 'homepage', 'tasks',
]


@dataclass
class SEACrowdMetadata:
    """Metadata for one config of a dataset."""

    script: pathlib.Path
    dataset_name: str
    tasks: List[Tasks]
    languages: List[str]
    config: SEACrowdConfig
    is_local: bool
    is_seacrowd_schema: bool
    seacrowd_schema_caps: Optional[str]
    is_large: bool
    is_resource: bool
    is_default: bool
    is_broken: bool
    seacrowd_version: str
    source_version: str
    citation: str
    description: str
    homepage: str
    license: str

    _ds_module: datasets.load.DatasetModule = field(repr=False)
    _py_module: ModuleType = field(repr=False)
    _ds_cls: type = field(repr=False)

    def get_load_dataset_kwargs(
        self,
        **extra_load_dataset_kwargs,
    ):
        """Return the keyword arguments that select this config in `datasets.load_dataset`.

        Any extra keyword arguments are merged in after `path` and `name`.
        """
        return {
            "path": self.script,
            "name": self.config.name,
            **extra_load_dataset_kwargs,
        }

    def load_dataset(
        self,
        trust_remote_code=True,
        **extra_load_dataset_kwargs,
    ):
        """Load this config through `datasets.load_dataset`.

        Args:
            trust_remote_code: Forwarded to `datasets.load_dataset`.
            **extra_load_dataset_kwargs: Forwarded unchanged.
        """
        return datasets.load_dataset(
            path=self.script,
            name=self.config.name,
            trust_remote_code=trust_remote_code,
            **extra_load_dataset_kwargs,
        )

    def get_metadata(self, **extra_load_dataset_kwargs):
        """Return a `{split: metadata}` mapping for this config.

        Raises:
            ValueError: If this config does not use a seacrowd schema.
            NotImplementedError: If no `SCHEMA_TO_METADATA_CLS` mapping is
                defined in this module.
        """
        if not self.is_seacrowd_schema:
            raise ValueError("only supported for seacrowd schemas")
        # NOTE(review): this module never defines or imports
        # SCHEMA_TO_METADATA_CLS, so the lookup used to end in a NameError after
        # the (potentially expensive) dataset load. Fail fast and explicitly.
        schema_to_metadata_cls = globals().get("SCHEMA_TO_METADATA_CLS")
        if schema_to_metadata_cls is None:
            raise NotImplementedError(
                "SCHEMA_TO_METADATA_CLS is not defined; per-split metadata is unavailable"
            )
        dsd = self.load_dataset(**extra_load_dataset_kwargs)
        metadata_cls = schema_to_metadata_cls[self.config.schema]
        return {split: metadata_cls.from_dataset(ds) for split, ds in dsd.items()}


def default_is_keeper(metadata: SEACrowdMetadata) -> bool:
    """Keep seacrowd-schema configs that are neither large nor resource configs."""
    return not metadata.is_large and not metadata.is_resource and metadata.is_seacrowd_schema


class SEACrowdConfigHelper:
    """
    Handles creating and filtering SEACrowdMetadata instances.
    """

    def __init__(
        self,
        helpers: Optional[Iterable[SEACrowdMetadata]] = None,
        keep_broken: bool = False,
        trust_remote_code: bool = True,
    ):
        """Collect one SEACrowdMetadata per dataloader config.

        Args:
            helpers: Pre-built metadata entries. When given, no dataloader
                script is imported and these entries are used directly.
            keep_broken: Keep entries whose `is_broken` flag is set.
            trust_remote_code: Forwarded to `datasets.load.dataset_module_factory`.
        """
        path_to_here = pathlib.Path(__file__).parent.absolute()
        self.path_to_sea_datasets = (path_to_here / "sea_datasets").resolve()
        self.dataloader_scripts = [
            script
            for script in sorted(self.path_to_sea_datasets.glob(os.path.join("*", "*.py")))
            if script.name != "__init__.py"
        ]

        if helpers is None:
            # No entries supplied: create every helper available in the package.
            helpers = self._build_helpers(trust_remote_code)

        # Materialise so that len() and indexing work for any iterable input.
        helpers = list(helpers)
        if not keep_broken:
            helpers = [helper for helper in helpers if not helper.is_broken]
        self._helpers = helpers

    def _build_helpers(self, trust_remote_code: bool) -> List[SEACrowdMetadata]:
        """Import every dataloader script and describe each of its configs."""
        helpers = []
        for dataloader_script in tqdm(self.dataloader_scripts):
            # Only `<name>/<name>.py` is treated as a dataloader entry point.
            if dataloader_script.stem != dataloader_script.parent.stem:
                continue
            dataset_name = dataloader_script.stem
            # NOTE(review): Loader.load_module() is deprecated in the standard
            # library; kept as-is because it also registers the module in
            # sys.modules, which other code may rely on.
            py_module = SourceFileLoader(
                dataset_name, dataloader_script.as_posix()
            ).load_module()
            ds_module = datasets.load.dataset_module_factory(
                dataloader_script.as_posix(), trust_remote_code=trust_remote_code,
            )
            ds_cls = datasets.load.import_main_class(ds_module.module_path)

            for config in ds_cls.BUILDER_CONFIGS:
                is_seacrowd_schema = config.schema.startswith("seacrowd")
                if is_seacrowd_schema:
                    # "seacrowd_seq_label" -> "SEQ_LABEL"
                    seacrowd_schema_caps = '_'.join(config.schema.split("_")[1:]).upper()
                    tasks = SCHEMA_TO_TASKS[seacrowd_schema_caps] & set(
                        py_module._SUPPORTED_TASKS
                    )
                else:
                    tasks = py_module._SUPPORTED_TASKS
                    seacrowd_schema_caps = None

                helpers.append(
                    SEACrowdMetadata(
                        script=dataloader_script.as_posix(),
                        dataset_name=dataset_name,
                        tasks=tasks,
                        languages=py_module._LANGUAGES,
                        config=config,
                        is_local=py_module._LOCAL,
                        is_seacrowd_schema=is_seacrowd_schema,
                        seacrowd_schema_caps=seacrowd_schema_caps,
                        is_large=config.name in _LARGE_CONFIG_NAMES,
                        is_resource=config.name in _RESOURCE_CONFIG_NAMES,
                        is_default=config.name == ds_cls.DEFAULT_CONFIG_NAME,
                        is_broken=config.name in _CURRENTLY_BROKEN_NAMES,
                        seacrowd_version=py_module._SEACROWD_VERSION,
                        source_version=py_module._SOURCE_VERSION,
                        citation=py_module._CITATION,
                        description=py_module._DESCRIPTION,
                        homepage=py_module._HOMEPAGE,
                        license=py_module._LICENSE,
                        _ds_module=ds_module,
                        _py_module=py_module,
                        _ds_cls=ds_cls,
                    )
                )
        return helpers

    def _subset(self, helpers: List[SEACrowdMetadata]) -> "SEACrowdConfigHelper":
        """Wrap `helpers` (drawn from this helper) in a new SEACrowdConfigHelper."""
        # The entries were already filtered by this helper's own broken-config
        # policy, so they must not be filtered a second time.
        return SEACrowdConfigHelper(helpers=helpers, keep_broken=True)

    @staticmethod
    def _matches_schema(helper: SEACrowdMetadata, schema: str) -> bool:
        """Return True if `helper` is on the requested side of the schema split.

        `schema == 'seacrowd'` selects seacrowd-schema configs; any other value
        selects the non-seacrowd configs.
        """
        if schema == 'seacrowd':
            return helper.is_seacrowd_schema
        return not helper.is_seacrowd_schema

    def _load_shortest_match(self, is_match: Callable[[SEACrowdMetadata], bool], description: str):
        """Load the matching config with the shortest name.

        Raises:
            ValueError: If nothing matches, or if loading the match fails (the
                original exception is chained as the cause).
        """
        candidates = sorted(
            (helper for helper in self if is_match(helper)),
            key=lambda helper: len(helper.config.name),
        )
        if not candidates:
            raise ValueError(f"Couldn't find dataset with {description}")
        try:
            return candidates[0].load_dataset()
        except Exception as err:
            raise ValueError(f"Couldn't load dataset with {description}") from err

    @property
    def available_dataset_names(self) -> List[str]:
        """Sorted, de-duplicated dataset names."""
        return sorted({helper.dataset_name for helper in self})

    @property
    def available_config_names(self) -> List[str]:
        """Sorted, de-duplicated config names."""
        return sorted({helper.config.name for helper in self})

    def for_dataset(self, dataset_name: str) -> "SEACrowdConfigHelper":
        """Return a helper restricted to the configs of `dataset_name`.

        Raises:
            ValueError: If no config belongs to `dataset_name`.
        """
        helpers = [helper for helper in self if helper.dataset_name == dataset_name]
        if len(helpers) == 0:
            raise ValueError(f"no helper with helper.dataset_name = {dataset_name}")
        return self._subset(helpers)

    def for_config_name(self, config_name: str) -> "SEACrowdMetadata":
        """Return the single entry whose config name is `config_name`.

        Raises:
            ValueError: If zero or more than one entry matches.
        """
        helpers = [helper for helper in self if helper.config.name == config_name]
        if len(helpers) == 0:
            raise ValueError(f"no helper with helper.config.name = {config_name}")
        if len(helpers) > 1:
            raise ValueError(
                f"multiple helpers with helper.config.name = {config_name}"
            )
        return helpers[0]

    def default_for_dataset(self, dataset_name: str) -> "SEACrowdMetadata":
        """Return the default config entry of `dataset_name`."""
        helpers = [
            helper
            for helper in self
            if helper.is_default and helper.dataset_name == dataset_name
        ]
        assert len(helpers) == 1, (
            f"expected exactly one default config for {dataset_name}, found {len(helpers)}"
        )
        return helpers[0]

    def filtered(
        self, is_keeper: Callable[[SEACrowdMetadata], bool]
    ) -> "SEACrowdConfigHelper":
        """Return dataset config helpers that match is_keeper."""
        return self._subset([helper for helper in self if is_keeper(helper)])

    def __repr__(self):
        return f"SEACrowd Config Helper. Datasets include: {self.available_dataset_names}."

    def __str__(self):
        return self.__repr__()

    def __iter__(self):
        yield from self._helpers

    def __len__(self):
        return len(self._helpers)

    def __getitem__(self, key):
        """Return one entry for an int key, or a sub-helper for a slice."""
        if isinstance(key, slice):
            return self._subset(self._helpers[key])
        if isinstance(key, int):
            if key < 0:  # Handle negative indices
                key += len(self)
            if key < 0 or key >= len(self):
                raise IndexError(f"The index ({key}) is out of range.")
            return self._helpers[key]
        raise TypeError("Invalid argument type.")

    def list_datasets(self, with_config=False):
        """List dataset names, or a `{dataset name: [config names]}` mapping.

        Args:
            with_config: Return the mapping instead of the plain name list.
        """
        name_to_configs = {}
        for helper in self:
            name_to_configs.setdefault(helper.dataset_name, []).append(helper.config.name)
        if with_config:
            return name_to_configs
        return list(name_to_configs)

    def load_dataset(self, dataset_name, schema='seacrowd'):
        """Load one config of `dataset_name`, preferring the shortest config name.

        Raises:
            ValueError: If no config matches or the matching config fails to load.
        """
        return self._load_shortest_match(
            lambda x: dataset_name == x.dataset_name and self._matches_schema(x, schema),
            f"name=`{dataset_name}` and schema=`{schema}`",
        )

    def load_datasets(self, dataset_names, schema='seacrowd'):
        """Load every matching config of the given datasets, keyed by config name."""
        return {
            helper.config.name: helper.load_dataset()
            for helper in self
            if helper.dataset_name in dataset_names and self._matches_schema(helper, schema)
        }

    def load_config_name(self, config_name, schema='seacrowd'):
        """Load the config called `config_name`.

        Raises:
            ValueError: If no config matches or the matching config fails to load.
        """
        return self._load_shortest_match(
            lambda x: config_name == x.config.name and self._matches_schema(x, schema),
            f"config.name=`{config_name}` and schema=`{schema}`",
        )

    def load_config_names(self, config_names, schema='seacrowd'):
        """Load every matching config in `config_names`, keyed by config name."""
        return {
            helper.config.name: helper.load_dataset()
            for helper in self
            if helper.config.name in config_names and self._matches_schema(helper, schema)
        }

    def list_benchmarks(self):
        """Return the benchmark names defined in BENCHMARK_DICT."""
        return list(BENCHMARK_DICT.keys())

    def load_benchmark(self, benchmark_name):
        """Load every available config of `benchmark_name`, keyed by config name.

        Raises:
            KeyError: If `benchmark_name` is not a key of BENCHMARK_DICT.
        """
        # Resolve the benchmark once instead of once per candidate config.
        benchmark_configs = set(BENCHMARK_DICT[benchmark_name])
        return {
            helper.config.name: helper.load_dataset()
            for helper in self
            if helper.config.name in benchmark_configs
        }


# Metadata Helper
@dataclass
class MetaDict:
    """Wrapper that lets a `{schema: [SEACrowdMetadata]}` dict sit in one frame cell."""

    data: Optional[dict] = None


class SEACrowdMetadataHelper:
    """
    Handles creating and filtering dataset-level metadata held in a DataFrame.
    """

    def __init__(
        self,
        meta_df: Optional[pd.DataFrame] = None,
        keep_broken: bool = False,
        conhelps: Optional[SEACrowdConfigHelper] = None,
    ):
        """Build the metadata frame, or wrap an existing one.

        Args:
            meta_df: Existing metadata frame. When given, nothing is downloaded.
            keep_broken: Keep rows whose `is_broken` value is truthy.
            conhelps: Config helper to reuse. A new SEACrowdConfigHelper is
                created when omitted.
        """
        # Load Config Helper
        self._conhelps = SEACrowdConfigHelper() if conhelps is None else conhelps

        # If a frame is passed in, just attach and go.
        if meta_df is not None:
            self._meta_df = meta_df if keep_broken else self._drop_broken(meta_df)
            return

        meta_df = self._read_sheet()
        meta_df = self._merge_config_metadata(meta_df, self._conhelps)
        self._meta_df = meta_df if keep_broken else self._drop_broken(meta_df)

    @staticmethod
    def _drop_broken(meta_df: pd.DataFrame) -> pd.DataFrame:
        """Return `meta_df` without the rows flagged as broken."""
        if 'is_broken' not in meta_df.columns:
            return meta_df
        # The column can be object-typed, where a bare `~` is not a boolean
        # negation; cast to bool before inverting.
        return meta_df[~meta_df['is_broken'].astype(bool)]

    @staticmethod
    def _read_sheet() -> pd.DataFrame:
        """Download the metadata sheet and normalise its column names."""
        meta_df = pd.read_csv(_METADATA_SHEET_URL, skiprows=1)
        meta_df = meta_df[meta_df['Implemented'] != 0].rename(columns=_METADATA_COLUMN_NAMES)
        meta_df['is_splitted'] = meta_df['is_splitted'] == 'Yes'
        return meta_df

    @staticmethod
    def _merge_config_metadata(meta_df: pd.DataFrame, conhelps: SEACrowdConfigHelper) -> pd.DataFrame:
        """Copy per-config attributes onto the rows of the matching dataloader."""
        # Make sure every target column exists before the row-wise assignment.
        for column in _CONFIG_DERIVED_COLUMNS:
            if column not in meta_df.columns:
                meta_df[column] = None

        name_to_meta_map = {}
        for cfg_meta in conhelps:
            # Every config of a dataset writes the same row(s), so the last
            # config visited determines the stored values.
            meta_df.loc[meta_df['dataloader'] == cfg_meta.dataset_name, _CONFIG_DERIVED_COLUMNS] = [
                cfg_meta.is_large, cfg_meta.is_resource, cfg_meta.is_default, cfg_meta.is_broken,
                cfg_meta.is_local, cfg_meta.citation, cfg_meta.license, cfg_meta.homepage,
                '|'.join([task.value for task in cfg_meta.tasks]),
            ]
            schema_map = name_to_meta_map.setdefault(cfg_meta.dataset_name, {})
            schema_map.setdefault(cfg_meta.config.schema, []).append(cfg_meta)

        meta_df = meta_df.fillna(False)
        for dset_name, schema_map in name_to_meta_map.items():
            meta_df.loc[meta_df['dataloader'] == dset_name, 'metadata'] = MetaDict(data=schema_map)
        return meta_df

    def filtered(
        self, is_keeper: Callable[[pd.Series], bool]
    ) -> "SEACrowdMetadataHelper":
        """Return a metadata helper holding only the rows that match is_keeper.

        Args:
            is_keeper: Called with each row; truthy results keep the row.
        """
        if self._meta_df.empty:
            meta_df = self._meta_df
        else:
            keep_mask = self._meta_df.apply(is_keeper, axis=1).astype(bool)
            meta_df = self._meta_df[keep_mask]
        # Rows were already filtered by this helper's broken-row policy, and the
        # config helper is reused rather than rebuilt.
        return SEACrowdMetadataHelper(meta_df=meta_df, keep_broken=True, conhelps=self._conhelps)

    def filter_and_load(
        self,
        is_keeper: Callable[[pd.Series], bool],
        schema: Optional[str] = None,
        lang: Optional[str] = None,
    ) -> "Dict":
        """Return the loaded datasets of the rows that match is_keeper.

        Args:
            is_keeper: Row predicate, as for `filtered`.
            schema: Only load configs with exactly this `config.schema`; all
                schemas when None.
            lang: For configs that declare more than one language, only load
                those whose config name contains `lang`; no language filtering
                when None.

        Returns:
            A `{config name: loaded dataset}` dict.
        """
        loaded = {}
        metadata_cells = self.filtered(is_keeper)._meta_df.get('metadata')
        if metadata_cells is None:
            return loaded
        for metas in metadata_cells:
            # Rows without a matching dataloader carry no MetaDict.
            if not isinstance(metas, MetaDict) or not metas.data:
                continue
            for schema_name, config_metas in metas.data.items():
                if schema is not None and schema_name != schema:
                    continue
                for meta in config_metas:
                    is_multilingual = len(meta.languages) > 1
                    if lang is not None and is_multilingual and lang not in meta.config.name:
                        continue
                    loaded[meta.config.name] = meta.load_dataset()
        return loaded

    @property
    def available_dataset_names(self) -> List[str]:
        """Sorted values of the `name` column."""
        return sorted(self._meta_df['name'])

    def __repr__(self):
        return self._meta_df.to_string()

    def __str__(self):
        return self.__repr__()

    def __iter__(self):
        yield from self._meta_df.iterrows()

    def __len__(self):
        return len(self._meta_df)


###
# SEACrowd Interface
###

@functools.lru_cache(maxsize=1)
def _default_config_helper() -> SEACrowdConfigHelper:
    """Return a shared SEACrowdConfigHelper, built on first use.

    Building the helper imports every dataloader script, so the module-level
    convenience functions share one instance instead of rebuilding it per call.
    """
    return SEACrowdConfigHelper()


def list_datasets(with_config=False):
    """List dataset names, or `{dataset name: [config names]}` if with_config."""
    return _default_config_helper().list_datasets(with_config=with_config)


def load_dataset(dataset_name, schema='seacrowd'):
    """Load one config of `dataset_name`; see SEACrowdConfigHelper.load_dataset."""
    return _default_config_helper().load_dataset(dataset_name=dataset_name, schema=schema)


def load_datasets(dataset_names, schema='seacrowd'):
    """Load all matching configs; see SEACrowdConfigHelper.load_datasets."""
    return _default_config_helper().load_datasets(dataset_names=dataset_names, schema=schema)


def list_benchmarks():
    """Return the benchmark names defined in BENCHMARK_DICT."""
    # Needs no dataloader import, so no helper is built here.
    return list(BENCHMARK_DICT.keys())


def load_benchmark(benchmark_name):
    """Load all configs of a benchmark; see SEACrowdConfigHelper.load_benchmark."""
    return _default_config_helper().load_benchmark(benchmark_name=benchmark_name)


def main():
    """Smoke-test the module-level interface by listing and loading a few datasets."""
    print('LIST DATASETS')
    dset_names = list_datasets()
    print(dset_names[:10])
    print()

    print(f'LOAD DATASET `{dset_names[1]}`')
    dset = load_dataset(dset_names[1])
    print(dset)
    print()

    print(f'LOAD DATASETS [{dset_names[1:4]}]')
    dsets = load_datasets(dset_names[1:4])
    print(dsets)
    print()

    print('LIST BENCHMARKS')
    benchmark_names = list_benchmarks()
    print(benchmark_names[:3])
    print()

    print(f'LOAD BENCHMARK `{benchmark_names[0]}`')
    benchmark_dsets = load_benchmark(benchmark_names[0])
    print(benchmark_dsets)
    print()


if __name__ == "__main__":
    main()
b/seacrowd/sea_datasets/belebele/belebele.py @@ -82,6 +82,8 @@ _DEFAULT_LANG = "zsm" +_LOCAL = False + def config_constructor(belebele_subset: str, schema: str, version: str) -> SEACrowdConfig: lang = _LANGUAGES[_SOURCE_NAMES.index(belebele_subset)] return SEACrowdConfig( diff --git a/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py b/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py index ab7e1fc1c..628330aeb 100644 --- a/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py +++ b/seacrowd/sea_datasets/bhinneka_korpus/bhinneka_korpus.py @@ -44,6 +44,8 @@ "mkn": "kupang-malay", } +_LOCAL = False + class BhinnekaKorpusDataset(datasets.GeneratorBasedBuilder): """A Collection of Multilingual Parallel Datasets for 5 Indonesian Local Languages.""" diff --git a/seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py b/seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py index e2ff64c0b..06d82bfec 100644 --- a/seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py +++ b/seacrowd/sea_datasets/burmese_romanize/burmese_romanize.py @@ -40,6 +40,8 @@ _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + class BurmeseRomanizeDataset(datasets.GeneratorBasedBuilder): """Romanization of names in Burmese script""" diff --git a/seacrowd/sea_datasets/coco_35l/coco_35l.py b/seacrowd/sea_datasets/coco_35l/coco_35l.py index 78770aea3..4346dadcc 100644 --- a/seacrowd/sea_datasets/coco_35l/coco_35l.py +++ b/seacrowd/sea_datasets/coco_35l/coco_35l.py @@ -61,6 +61,8 @@ _LANGUAGES = {"fil": "fil", "ind": "id", "tha": "th", "vie": "vi"} +_LOCAL = False + class Coco35LDataset(datasets.GeneratorBasedBuilder): """ COCO-35L is a machine-generated image caption dataset, constructed by translating COCO Captions (Chen et al., 2015) to the other 34 languages using Google’s machine translation API. 
diff --git a/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py b/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py index 3aaa30d05..493691dfb 100644 --- a/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py +++ b/seacrowd/sea_datasets/dengue_filipino/dengue_filipino.py @@ -38,6 +38,8 @@ _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + class DengueFilipinoDataset(datasets.GeneratorBasedBuilder): """Dengue Dataset Low-Resource Multi-label Text Classification Dataset in Filipino""" diff --git a/seacrowd/sea_datasets/id_msvd/id_msvd.py b/seacrowd/sea_datasets/id_msvd/id_msvd.py index 6e0bd928a..a560f9aae 100644 --- a/seacrowd/sea_datasets/id_msvd/id_msvd.py +++ b/seacrowd/sea_datasets/id_msvd/id_msvd.py @@ -36,6 +36,8 @@ _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + class IdMsvdDataset(datasets.GeneratorBasedBuilder): """MSVD dataset with Indonesian translation.""" diff --git a/seacrowd/sea_datasets/indommlu/indommlu.py b/seacrowd/sea_datasets/indommlu/indommlu.py index 7d7959cff..b204b3fc0 100644 --- a/seacrowd/sea_datasets/indommlu/indommlu.py +++ b/seacrowd/sea_datasets/indommlu/indommlu.py @@ -199,7 +199,7 @@ class IndoMMLUDataset(datasets.GeneratorBasedBuilder): name=f"{_DATASETNAME}_{lang}_seacrowd_qa", version=SEACROWD_VERSION, description=f"{_DATASETNAME} {lang} SEACrowd schema", - schema=f"seacrowd_{lang}_qa", + schema=f"seacrowd_qa", subset_id=_DATASETNAME, ) BUILDER_CONFIGS.append(lang_config) diff --git a/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py b/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py index 55d083783..dec3c425d 100644 --- a/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py +++ b/seacrowd/sea_datasets/indonesian_news_dataset/indonesian_news_dataset.py @@ -19,6 +19,8 @@ } """ +_LANGUAGES = ["ind"] + _DATASETNAME = "indonesian_news_dataset" _DESCRIPTION = """An imbalanced dataset to classify Indonesian 
News articles. @@ -44,6 +46,8 @@ _TAGS = ["bola", "news", "bisnis", "tekno", "otomotif"] +_LOCAL = False + class IndonesianNewsDataset(datasets.GeneratorBasedBuilder): """The dataset contains 5 Indonesian News articles with imbalanced classes""" diff --git a/seacrowd/sea_datasets/mc4_indo/mc4_indo.py b/seacrowd/sea_datasets/mc4_indo/mc4_indo.py index 6349eaa41..6c5d6d4d3 100644 --- a/seacrowd/sea_datasets/mc4_indo/mc4_indo.py +++ b/seacrowd/sea_datasets/mc4_indo/mc4_indo.py @@ -52,6 +52,7 @@ # "full": {"train": 1, "validation": 1} # } +_LOCAL = False _SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING] _SOURCE_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/memolon/memolon.py b/seacrowd/sea_datasets/memolon/memolon.py index 5b3c6b97c..5b83f3bb1 100644 --- a/seacrowd/sea_datasets/memolon/memolon.py +++ b/seacrowd/sea_datasets/memolon/memolon.py @@ -57,6 +57,7 @@ } _SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" _LANGUAGES = ["ceb", "tgl", "ind", "sun", "jav", "zsm", "vie", "tha", "mya"] @@ -64,6 +65,8 @@ _SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] +_LOCAL = False + def seacrowd_config_constructor(lang: str, schema: str, version: str) -> SEACrowdConfig: if lang not in _LANGUAGE_MAP: diff --git a/seacrowd/sea_datasets/miracl/miracl.py b/seacrowd/sea_datasets/miracl/miracl.py index 0320cfd67..ad08bcd2e 100644 --- a/seacrowd/sea_datasets/miracl/miracl.py +++ b/seacrowd/sea_datasets/miracl/miracl.py @@ -96,8 +96,11 @@ _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + def load_topic(fn): + qid2topic = {} with open(fn, encoding="utf-8") as f: for line in f: diff --git a/seacrowd/sea_datasets/mlqa/mlqa.py b/seacrowd/sea_datasets/mlqa/mlqa.py index f2884e0f1..8064c65e0 100644 --- a/seacrowd/sea_datasets/mlqa/mlqa.py +++ b/seacrowd/sea_datasets/mlqa/mlqa.py @@ -39,6 +39,8 @@ _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + class MLQADataset(datasets.GeneratorBasedBuilder): """ diff --git 
a/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py b/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py index a8ff6ef4f..f0423d2c0 100644 --- a/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py +++ b/seacrowd/sea_datasets/mtop_intent_classification/mtop_intent_classification.py @@ -67,7 +67,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder): version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema for {subset} subset", schema="source", - subset_id=subset, + subset_id=f"{_DATASETNAME}_{subset}", ) for subset in SUBSETS ] + [ @@ -76,7 +76,7 @@ class MTOPIntentClassificationDataset(datasets.GeneratorBasedBuilder): version=datasets.Version(_SEACROWD_VERSION), description=f"{_DATASETNAME} SEACrowd schema for {subset} subset", schema="seacrowd_text", - subset_id=subset, + subset_id=f"{_DATASETNAME}_{subset}", ) for subset in SUBSETS ] diff --git a/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py b/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py index 486ece5a6..44fc09d0a 100644 --- a/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py +++ b/seacrowd/sea_datasets/my_paraphrase/my_paraphrase.py @@ -74,42 +74,42 @@ class MyParaphraseDataset(datasets.GeneratorBasedBuilder): name=f"{_DATASETNAME}_source", # source version=SOURCE_VERSION, description=f"{_DATASETNAME} source schema", - schema="paraphrase_source", + schema="source", subset_id=f"{_DATASETNAME}_paraphrase", ), SEACrowdConfig( name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd schema", - schema=f"seacrowd_paraphrase_{SEACROWD_SCHEMA_NAME}", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", subset_id=f"{_DATASETNAME}_paraphrase", ), SEACrowdConfig( name=f"{_DATASETNAME}_non_paraphrase_source", # source version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd schema", - 
schema="non_paraphrase_source", + schema="source", subset_id=f"{_DATASETNAME}_non_paraphrase", ), SEACrowdConfig( name=f"{_DATASETNAME}_non_paraphrase_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd schema", - schema=f"seacrowd_non_paraphrase_{SEACROWD_SCHEMA_NAME}", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", subset_id=f"{_DATASETNAME}_non_paraphrase", ), SEACrowdConfig( name=f"{_DATASETNAME}_all_source", # source version=SOURCE_VERSION, description=f"{_DATASETNAME} source schema", - schema="all_source", + schema="source", subset_id=f"{_DATASETNAME}_all", ), SEACrowdConfig( name=f"{_DATASETNAME}_all_seacrowd_{SEACROWD_SCHEMA_NAME}", # schema version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd schema", - schema=f"seacrowd_all_{SEACROWD_SCHEMA_NAME}", + schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}", subset_id=f"{_DATASETNAME}_all", ), ] diff --git a/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py b/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py index a11a420c3..4718d626f 100644 --- a/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py +++ b/seacrowd/sea_datasets/ph_fake_news_corpus/ph_fake_news_corpus.py @@ -51,6 +51,7 @@ _SUPPORTED_TASKS = [Tasks.FACT_CHECKING] _SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" class PhilippineFakeNewsDataset(datasets.GeneratorBasedBuilder): diff --git a/seacrowd/sea_datasets/sap_wat/sap_wat.py b/seacrowd/sea_datasets/sap_wat/sap_wat.py index b638e17af..0ff04d7fc 100644 --- a/seacrowd/sea_datasets/sap_wat/sap_wat.py +++ b/seacrowd/sea_datasets/sap_wat/sap_wat.py @@ -74,6 +74,8 @@ _SUBSET = ["id", "ms", "th", "vi"] +_LOCAL = False + class SapWatDataset(datasets.GeneratorBasedBuilder): """SAP WAT is a software documentation dataset for machine translation. The current language scope is English to Hindi, Indonesian, Japanese, Korean, Malay, Thai, Vietnamese, Simplified Chinese and Traditional Chinese. 
Here, we only consider diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py index 2379144e6..10da64b06 100644 --- a/seacrowd/sea_datasets/tydiqa/tydiqa.py +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -81,6 +81,7 @@ _SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] _LANGUAGES = ["ind", "tha"] _LOCAL = False +_SOURCE_VERSION = "1.0.0" _SOURCE_VERSION_P = "1.0.0" _SOURCE_VERSION_S = "1.1.0" _SEACROWD_VERSION = "1.0.0" diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index 5e18a4211..0c2e562c0 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -54,6 +54,8 @@ _SEACROWD_VERSION = "1.0.0" +_LOCAL = False + def construct_label_classes(): IOB_tag = ["I", "O", "B"] diff --git a/seacrowd/sea_datasets/xm3600/xm3600.py b/seacrowd/sea_datasets/xm3600/xm3600.py index 9dc847013..c54d985e2 100644 --- a/seacrowd/sea_datasets/xm3600/xm3600.py +++ b/seacrowd/sea_datasets/xm3600/xm3600.py @@ -58,6 +58,8 @@ _LANGUAGES = ["fil", "id", "th", "vi"] +_LOCAL = False + class XM3600Dataset(datasets.GeneratorBasedBuilder): """