From e95f83e09c53de88b1389441157e3286951bbb4d Mon Sep 17 00:00:00 2001 From: Lj Miranda Date: Fri, 17 Nov 2023 22:01:10 +0800 Subject: [PATCH] Change delimiter from period to underscore --- seacrowd/sea_datasets/tatoeba/tatoeba.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/seacrowd/sea_datasets/tatoeba/tatoeba.py b/seacrowd/sea_datasets/tatoeba/tatoeba.py index 56f7b0851..5974ded91 100644 --- a/seacrowd/sea_datasets/tatoeba/tatoeba.py +++ b/seacrowd/sea_datasets/tatoeba/tatoeba.py @@ -49,7 +49,7 @@ class TatoebaDataset(datasets.GeneratorBasedBuilder): SEACROWD_SCHEMA_NAME = "t2t" - dataset_names = sorted([f"tatoeba.{lang}" for lang in _LANGUAGES]) + dataset_names = sorted([f"tatoeba_{lang}" for lang in _LANGUAGES]) BUILDER_CONFIGS = [] for name in dataset_names: source_config = SEACrowdConfig( @@ -94,8 +94,7 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]: """Return SplitGenerators.""" - lang_source = self.config.name.split(".")[1] - lang = lang_source.split("_")[0] + lang = self.config.name.split("_")[1] tatoeba_source_data = dl_manager.download_and_extract(_URL + f"tatoeba.{lang}-eng.{lang}") tatoeba_eng_data = dl_manager.download_and_extract(_URL + f"tatoeba.{lang}-eng.eng") return [