Skip to content

Commit

Permalink
Add fix for load_benchmark and load_datasets_by_config_names
Browse files Browse the repository at this point in the history
  • Loading branch information
holylovenia committed Jun 19, 2024
1 parent 9db6d22 commit b6301d2
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 78 deletions.
2 changes: 1 addition & 1 deletion seacrowd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .config_helper import list_datasets, load_dataset, load_datasets, list_benchmarks, load_benchmark
from .config_helper import SEACrowdMetadata, SEACrowdConfigHelper, SEACrowdMetadataHelper

__version__ = "0.1.0"
__version__ = "0.1.3"
17 changes: 6 additions & 11 deletions seacrowd/config_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,11 +1078,11 @@ def for_config_name(self, config_name: str) -> "SEACrowdMetadata":
)
return helpers[0]

def for_config_names(self, config_names: list[str]) -> list["SEACrowdMetadata"]:
    """Return every helper whose config name appears in *config_names*.

    Args:
        config_names: The config names to look up.

    Returns:
        A list of matching helpers (one or more).

    Raises:
        ValueError: If no helper matches any of the given names.
    """
    # NOTE(review): the original annotated the return as a single
    # "SEACrowdMetadata" although a list is returned; annotation fixed.
    helpers = [helper for helper in self if helper.config.name in config_names]
    if not helpers:
        raise ValueError(f"No helper with helper.config.name = {config_names}.")
    return helpers

def default_for_dataset(self, dataset_name: str) -> "SEACrowdMetadata":
Expand Down Expand Up @@ -1164,14 +1164,9 @@ def list_benchmarks(self):
return list(BENCHMARK_DICT.keys())

def load_benchmark(self, benchmark_name):
    """Load every dataset belonging to the named benchmark.

    Args:
        benchmark_name: Key into ``BENCHMARK_DICT`` identifying the benchmark.

    Returns:
        A list of loaded datasets, one per config in the benchmark.

    Raises:
        ValueError: If *benchmark_name* is not a known benchmark, or if none
            of its configs match a registered helper.
    """
    # NOTE(review): this commit changed the return type from a
    # {config_name: dataset} dict to a plain list; callers that keyed results
    # by config name must be updated accordingly.
    try:
        config_list = BENCHMARK_DICT[benchmark_name]
    except KeyError:
        # Surface the valid benchmark names instead of an opaque KeyError.
        raise ValueError(
            f"Unknown benchmark {benchmark_name!r}. "
            f"Available benchmarks: {list(BENCHMARK_DICT.keys())}."
        ) from None
    helpers = self.for_config_names(config_list)
    return [helper.load_dataset() for helper in helpers]

# Metadata Helper
@dataclass
Expand Down
15 changes: 6 additions & 9 deletions seacrowd/sea_datasets/glotstorybook/glotstorybook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@
from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES

_CITATION = """\
@inproceedings{kargaran2023glotlid,
title = {{GlotLID}: Language Identification for Low-Resource Languages},
author = {Kargaran, Amir Hossein and
Imani, Ayyoob and
Yvon, Fran{\c{c}}ois
and Sch{\"u}tze, Hinrich},
booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing},
year = {2023},
url = {https://openreview.net/forum?id=dl4e3EBz5j}
@inproceedings{kargaran2023glotlid,
title = {{GlotLID: Language Identification for Low-Resource Languages}},
author = {Kargaran, Amir Hossein and Imani, Ayyoob and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
year = 2023,
booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing},
url = {https://openreview.net/forum?id=dl4e3EBz5j}
}
"""

Expand Down
20 changes: 10 additions & 10 deletions seacrowd/sea_datasets/indocoref/indocoref.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@

_CITATION = """\
@inproceedings{artari-etal-2021-multi,
title = {A Multi-Pass Sieve Coreference Resolution for {I}ndonesian},
author = {Artari, Valentina Kania Prameswara and Mahendra, Rahmad and Jiwanggi, Meganingrum Arista and Anggraito, Adityo and Budi, Indra},
year = 2021,
month = sep,
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
publisher = {INCOMA Ltd.},
address = {Held Online},
pages = {79--85},
url = {https://aclanthology.org/2021.ranlp-1.10},
abstract = {Coreference resolution is an NLP task to find out whether the set of referring expressions belong to the same concept in discourse. A multi-pass sieve is a deterministic coreference model that implements several layers of sieves, where each sieve takes a pair of correlated mentions from a collection of non-coherent mentions. The multi-pass sieve is based on the principle of high precision, followed by increased recall in each sieve. In this work, we examine the portability of the multi-pass sieve coreference resolution model to the Indonesian language. We conduct the experiment on 201 Wikipedia documents and the multi-pass sieve system yields 72.74{\%} of MUC F-measure and 52.18{\%} of BCUBED F-measure.}
title = {{A Multi-Pass Sieve Coreference Resolution for Indonesian}},
author = {Artari, Valentina Kania Prameswara and Mahendra, Rahmad and Jiwanggi, Meganingrum Arista and Anggraito, Adityo and Budi, Indra},
year = 2021,
month = {Sep},
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
publisher = {INCOMA Ltd.},
address = {Held Online},
pages = {79--85},
url = {https://aclanthology.org/2021.ranlp-1.10},
abstract = {Coreference resolution is an NLP task to find out whether the set of referring expressions belong to the same concept in discourse. A multi-pass sieve is a deterministic coreference model that implements several layers of sieves, where each sieve takes a pair of correlated mentions from a collection of non-coherent mentions. The multi-pass sieve is based on the principle of high precision, followed by increased recall in each sieve. In this work, we examine the portability of the multi-pass sieve coreference resolution model to the Indonesian language. We conduct the experiment on 201 Wikipedia documents and the multi-pass sieve system yields 72.74{\%} of MUC F-measure and 52.18{\%} of BCUBED F-measure.}
}
"""

Expand Down
26 changes: 13 additions & 13 deletions seacrowd/sea_datasets/miracl/miracl.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@
from collections import defaultdict

_CITATION = """\
@article{10.1162/tacl_a_00595,
author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}",
journal = {Transactions of the Association for Computational Linguistics},
volume = {11},
pages = {1114-1131},
year = {2023},
month = {09},
abstract = "{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}",
issn = {2307-387X},
doi = {10.1162/tacl_a_00595},
url = {https://doi.org/10.1162/tacl\_a\_00595},
eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf},
@article{10.1162/tacl_a_00595,
title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}},
author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
year = 2023,
month = {09},
journal = {Transactions of the Association for Computational Linguistics},
volume = 11,
pages = {1114--1131},
doi = {10.1162/tacl\_a\_00595},
issn = {2307-387X},
url = {https://doi.org/10.1162/tacl\%5Fa\%5F00595},
abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}},
eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}
}
"""

Expand Down
56 changes: 24 additions & 32 deletions seacrowd/sea_datasets/mysentence/mysentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,39 +9,31 @@
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{Aung_Kyaw Thu_Hlaing_2023, place={Nonthaburi, Thailand}, title={mySentence: Sentence Segmentation for Myanmar Language
using Neural Machine Translation Approach}, volume={9}, url={https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87},
number={October},
abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
ically experimented with twelve neural sequence
labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
journal={Journal of Intelligent Informatics
and Smart Technology}, author={Aung, Thura and Kyaw Thu , Ye and Hlaing , Zar Zar}, year={2023}, month={Nov.}, pages={e001} };
@InProceedings{10.1007/978-3-031-36886-8_24,
author="Thu, Ye Kyaw
and Aung, Thura
and Supnithi, Thepchai",
editor="Nguyen, Ngoc Thanh
and Le-Minh, Hoa
and Huynh, Cong-Phap
and Nguyen, Quang-Vu",
title="Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language",
booktitle="The 12th Conference on Information Technology and Its Applications",
year="2023",
publisher="Springer Nature Switzerland",
address="Cham",
pages="285--296",
abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
ically experimented with twelve neural sequence
labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
isbn="978-3-031-36886-8"
@article{Aung_Kyaw_Thu_Hlaing_2023,
title = {{mySentence: Sentence Segmentation for Myanmar Language using Neural Machine Translation Approach}},
author = {Aung, Thura and Kyaw Thu, Ye and Hlaing, Zar Zar},
year = 2023,
month = {Nov.},
journal = {Journal of Intelligent Informatics and Smart Technology},
volume = 9,
number = {October},
pages = {e001},
url = {https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87},
place = {Nonthaburi, Thailand},
abstract = {In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systematically experimented with twelve neural sequence labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.}
}
@inproceedings{10.1007/978-3-031-36886-8_24,
title = {{Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language}},
author = {Thu, Ye Kyaw and Aung, Thura and Supnithi, Thepchai},
year = 2023,
booktitle = {The 12th Conference on Information Technology and Its Applications},
publisher = {Springer Nature Switzerland},
address = {Cham},
pages = {285--296},
isbn = {978-3-031-36886-8},
editor = {Nguyen, Ngoc Thanh and Le-Minh, Hoa and Huynh, Cong-Phap and Nguyen, Quang-Vu},
abstract = {In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systematically experimented with twelve neural sequence labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.}
}
"""

_DATASETNAME = "mysentence"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

# Optional audio-processing dependencies: only this dataloader needs them, so
# fail softly at import time instead of breaking the whole package import.
# Catch ImportError specifically -- the original bare `except:` would also
# swallow KeyboardInterrupt and SystemExit.
try:
    import audiosegment
except ImportError:
    print("Please install audiosegment to use the `national_speech_corpus_sg_imda` dataloader.")

try:
    import textgrid
except ImportError:
    print("Please install textgrid to use the `national_speech_corpus_sg_imda` dataloader.")

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
Expand Down

0 comments on commit b6301d2

Please sign in to comment.