Skip to content

Commit

Permalink
Add fix for load_benchmark and load_datasets_by_config_names
Browse files Browse the repository at this point in the history
  • Loading branch information
holylovenia committed Jun 19, 2024
1 parent 9db6d22 commit b6301d2
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 78 deletions.
2 changes: 1 addition & 1 deletion seacrowd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .config_helper import list_datasets, load_dataset, load_datasets, list_benchmarks, load_benchmark
from .config_helper import SEACrowdMetadata, SEACrowdConfigHelper, SEACrowdMetadataHelper

__version__ = "0.1.0"
__version__ = "0.1.3"
17 changes: 6 additions & 11 deletions seacrowd/config_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,11 +1078,11 @@ def for_config_name(self, config_name: str) -> "SEACrowdMetadata":
)
return helpers[0]

def for_config_names(self, config_names: list[str]) -> list["SEACrowdMetadata"]:
    """Return every helper whose config name appears in *config_names*.

    Args:
        config_names: The config names to look up.

    Returns:
        A list of matching helpers (one or more).

    Raises:
        ValueError: If no helper matches any of the given names.
    """
    # NOTE(review): the original annotated the return as a single
    # "SEACrowdMetadata" although a list is returned; annotation fixed.
    helpers = [helper for helper in self if helper.config.name in config_names]
    if not helpers:
        raise ValueError(f"No helper with helper.config.name = {config_names}.")
    return helpers

def default_for_dataset(self, dataset_name: str) -> "SEACrowdMetadata":
Expand Down Expand Up @@ -1164,14 +1164,9 @@ def list_benchmarks(self):
return list(BENCHMARK_DICT.keys())

def load_benchmark(self, benchmark_name):
    """Load every dataset belonging to the named benchmark.

    Args:
        benchmark_name: Key into ``BENCHMARK_DICT`` identifying the benchmark.

    Returns:
        A list of loaded datasets, one per config in the benchmark.

    Raises:
        ValueError: If *benchmark_name* is not a known benchmark, or if none
            of its configs match a registered helper.
    """
    # NOTE(review): this commit changed the return type from a
    # {config_name: dataset} dict to a plain list; callers that keyed results
    # by config name must be updated accordingly.
    try:
        config_list = BENCHMARK_DICT[benchmark_name]
    except KeyError:
        # Surface the valid benchmark names instead of an opaque KeyError.
        raise ValueError(
            f"Unknown benchmark {benchmark_name!r}. "
            f"Available benchmarks: {list(BENCHMARK_DICT.keys())}."
        ) from None
    helpers = self.for_config_names(config_list)
    return [helper.load_dataset() for helper in helpers]

# Metadata Helper
@dataclass
Expand Down
15 changes: 6 additions & 9 deletions seacrowd/sea_datasets/glotstorybook/glotstorybook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@
from seacrowd.utils.constants import Tasks, Licenses, TASK_TO_SCHEMA, SCHEMA_TO_FEATURES

_CITATION = """\
@inproceedings{kargaran2023glotlid,
title = {{GlotLID}: Language Identification for Low-Resource Languages},
author = {Kargaran, Amir Hossein and
Imani, Ayyoob and
Yvon, Fran{\c{c}}ois
and Sch{\"u}tze, Hinrich},
booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing},
year = {2023},
url = {https://openreview.net/forum?id=dl4e3EBz5j}
@inproceedings{kargaran2023glotlid,
title = {{GlotLID: Language Identification for Low-Resource Languages}},
author = {Kargaran, Amir Hossein and Imani, Ayyoob and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
year = 2023,
booktitle = {The 2023 Conference on Empirical Methods in Natural Language Processing},
url = {https://openreview.net/forum?id=dl4e3EBz5j}
}
"""

Expand Down
20 changes: 10 additions & 10 deletions seacrowd/sea_datasets/indocoref/indocoref.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@

_CITATION = """\
@inproceedings{artari-etal-2021-multi,
title = {A Multi-Pass Sieve Coreference Resolution for {I}ndonesian},
author = {Artari, Valentina Kania Prameswara and Mahendra, Rahmad and Jiwanggi, Meganingrum Arista and Anggraito, Adityo and Budi, Indra},
year = 2021,
month = sep,
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
publisher = {INCOMA Ltd.},
address = {Held Online},
pages = {79--85},
url = {https://aclanthology.org/2021.ranlp-1.10},
abstract = {Coreference resolution is an NLP task to find out whether the set of referring expressions belong to the same concept in discourse. A multi-pass sieve is a deterministic coreference model that implements several layers of sieves, where each sieve takes a pair of correlated mentions from a collection of non-coherent mentions. The multi-pass sieve is based on the principle of high precision, followed by increased recall in each sieve. In this work, we examine the portability of the multi-pass sieve coreference resolution model to the Indonesian language. We conduct the experiment on 201 Wikipedia documents and the multi-pass sieve system yields 72.74{\%} of MUC F-measure and 52.18{\%} of BCUBED F-measure.}
title = {{A Multi-Pass Sieve Coreference Resolution for Indonesian}},
author = {Artari, Valentina Kania Prameswara and Mahendra, Rahmad and Jiwanggi, Meganingrum Arista and Anggraito, Adityo and Budi, Indra},
year = 2021,
month = {Sep},
booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)},
publisher = {INCOMA Ltd.},
address = {Held Online},
pages = {79--85},
url = {https://aclanthology.org/2021.ranlp-1.10},
abstract = {Coreference resolution is an NLP task to find out whether the set of referring expressions belong to the same concept in discourse. A multi-pass sieve is a deterministic coreference model that implements several layers of sieves, where each sieve takes a pair of correlated mentions from a collection of non-coherent mentions. The multi-pass sieve is based on the principle of high precision, followed by increased recall in each sieve. In this work, we examine the portability of the multi-pass sieve coreference resolution model to the Indonesian language. We conduct the experiment on 201 Wikipedia documents and the multi-pass sieve system yields 72.74{\%} of MUC F-measure and 52.18{\%} of BCUBED F-measure.}
}
"""

Expand Down
26 changes: 13 additions & 13 deletions seacrowd/sea_datasets/miracl/miracl.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@
from collections import defaultdict

_CITATION = """\
@article{10.1162/tacl_a_00595,
author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
title = "{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}",
journal = {Transactions of the Association for Computational Linguistics},
volume = {11},
pages = {1114-1131},
year = {2023},
month = {09},
abstract = "{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}",
issn = {2307-387X},
doi = {10.1162/tacl_a_00595},
url = {https://doi.org/10.1162/tacl\_a\_00595},
eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf},
@article{10.1162/tacl_a_00595,
title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}},
author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
year = 2023,
month = {09},
journal = {Transactions of the Association for Computational Linguistics},
volume = 11,
pages = {1114--1131},
doi = {10.1162/tacl\_a\_00595},
issn = {2307-387X},
url = {https://doi.org/10.1162/tacl\%5Fa\%5F00595},
abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}},
eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}
}
"""

Expand Down
56 changes: 24 additions & 32 deletions seacrowd/sea_datasets/mysentence/mysentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,39 +9,31 @@
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{Aung_Kyaw Thu_Hlaing_2023, place={Nonthaburi, Thailand}, title={mySentence: Sentence Segmentation for Myanmar Language
using Neural Machine Translation Approach}, volume={9}, url={https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87},
number={October},
abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
ically experimented with twelve neural sequence
labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
journal={Journal of Intelligent Informatics
and Smart Technology}, author={Aung, Thura and Kyaw Thu , Ye and Hlaing , Zar Zar}, year={2023}, month={Nov.}, pages={e001} };
@InProceedings{10.1007/978-3-031-36886-8_24,
author="Thu, Ye Kyaw
and Aung, Thura
and Supnithi, Thepchai",
editor="Nguyen, Ngoc Thanh
and Le-Minh, Hoa
and Huynh, Cong-Phap
and Nguyen, Quang-Vu",
title="Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language",
booktitle="The 12th Conference on Information Technology and Its Applications",
year="2023",
publisher="Springer Nature Switzerland",
address="Cham",
pages="285--296",
abstract="In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence.
Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systemat
ically experimented with twelve neural sequence
labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%}
while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.",
isbn="978-3-031-36886-8"
@article{Aung_Kyaw_Thu_Hlaing_2023,
title = {{mySentence: Sentence Segmentation for Myanmar Language using Neural Machine Translation Approach}},
author = {Aung, Thura and Kyaw Thu, Ye and Hlaing, Zar Zar},
year = 2023,
month = {Nov.},
journal = {Journal of Intelligent Informatics and Smart Technology},
volume = 9,
number = {October},
pages = {e001},
url = {https://ph05.tci-thaijo.org/index.php/JIIST/article/view/87},
place = {Nonthaburi, Thailand},
abstract = {In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systematically experimented with twelve neural sequence labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.}
}
@inproceedings{10.1007/978-3-031-36886-8_24,
title = {{Neural Sequence Labeling Based Sentence Segmentation for Myanmar Language}},
author = {Thu, Ye Kyaw and Aung, Thura and Supnithi, Thepchai},
year = 2023,
booktitle = {The 12th Conference on Information Technology and Its Applications},
publisher = {Springer Nature Switzerland},
address = {Cham},
pages = {285--296},
isbn = {978-3-031-36886-8},
editor = {Nguyen, Ngoc Thanh and Le-Minh, Hoa and Huynh, Cong-Phap and Nguyen, Quang-Vu},
abstract = {In the informal Myanmar language, for which most NLP applications are used, there is no predefined rule to mark the end of the sentence. Therefore, in this paper, we contributed the first Myanmar sentence segmentation corpus and systematically experimented with twelve neural sequence labeling architectures trained and tested on both sentence and sentence+paragraph data. The word LSTM + Softmax achieved the highest accuracy of 99.95{\%} while trained and tested on sentence-only data and 97.40{\%} while trained and tested on sentence + paragraph data.}
}
"""

_DATASETNAME = "mysentence"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

# Optional audio-processing dependencies: only this dataloader needs them, so
# fail softly at import time instead of breaking the whole package import.
# Catch ImportError specifically -- the original bare `except:` would also
# swallow KeyboardInterrupt and SystemExit.
try:
    import audiosegment
except ImportError:
    print("Please install audiosegment to use the `national_speech_corpus_sg_imda` dataloader.")

try:
    import textgrid
except ImportError:
    print("Please install textgrid to use the `national_speech_corpus_sg_imda` dataloader.")

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
Expand Down

0 comments on commit b6301d2

Please sign in to comment.