Skip to content

Commit

Permalink
[wip] Update
Browse files Browse the repository at this point in the history
  • Loading branch information
ljvmiranda921 committed Nov 17, 2023
1 parent c90a2b7 commit a434bfa
Showing 1 changed file with 39 additions and 21 deletions.
60 changes: 39 additions & 21 deletions seacrowd/sea_datasets/tatoeba/tatoeba.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,37 +117,55 @@ def _info(self) -> datasets.DatasetInfo:

def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
    """Return SplitGenerators.

    Downloads the Tatoeba sentence-pair files (source language + English).
    When the config name carries a language code found in ``_LANGUAGES``,
    only that single pair is fetched; otherwise — i.e. when the aggregate
    ``tatoeba_source`` / ``tatoeba_seacrowd_t2t`` config was chosen — the
    pairs for every language in ``_LANGUAGES`` are fetched at once.

    Args:
        dl_manager: HuggingFace download manager used to fetch and cache
            the remote files.

    Returns:
        A single VALIDATION split ("dev") whose ``gen_kwargs`` carry the
        parallel lists of downloaded source/English file paths plus the
        matching language codes, consumed by ``_generate_examples``.
    """
    lang = self.config.name.split("_")[1]
    # A recognized per-language config loads just that language; any other
    # config name (the aggregate ones) loads examples from all languages.
    # Using one list here removes the previously duplicated download logic
    # in the if/else branches and avoids rebinding `lang` in a loop.
    languages = [lang] if lang in _LANGUAGES else list(_LANGUAGES)

    tatoeba_source_data = []
    tatoeba_eng_data = []
    for code in languages:
        tatoeba_source_data.append(dl_manager.download_and_extract(_URL + f"tatoeba.{code}-eng.{code}"))
        tatoeba_eng_data.append(dl_manager.download_and_extract(_URL + f"tatoeba.{code}-eng.eng"))

    return [
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={
                "filepaths": (tatoeba_source_data, tatoeba_eng_data),
                "split": "dev",
                "languages": languages,
            },
        )
    ]

def _generate_examples(self, filepath: Tuple[Path, Path], split: str, lang: str) -> Tuple[int, Dict]:
def _generate_examples(self, filepaths: Tuple[List[Path], List[Path]], split: str, languages: List[str]) -> Tuple[int, Dict]:
"""Yield examples as (key, example) tuples"""
source_file = filepath[0]
target_file = filepath[1]
source_sentences = []
target_sentences = []
with open(source_file, encoding="utf-8") as f1:
for row in f1:
source_sentences.append(row.strip())
with open(target_file, encoding="utf-8") as f2:
for row in f2:
target_sentences.append(row.strip())
for idx in range(len(source_sentences)):
source_files, target_files = filepaths
source_sents = []
target_sents = []
source_langs = []

for source_file, target_file, lang in zip(source_files, target_files, languages):
with open(source_file, encoding="utf-8") as f1:
for row in f1:
source_sents.append(row.strip())
source_langs.append(lang)
with open(target_file, encoding="utf-8") as f2:
for row in f2:
target_sents.append(row.strip())

for idx, (source, target, lang) in enumerate(zip(source_sents, target_sents, source_langs)):
if self.config.schema == "source":
example = {
"source_sentence": source_sentences[idx],
"target_sentence": target_sentences[idx],
"source_sentence": source,
"target_sentence": target,
# The source_lang in the HuggingFace source seems incorrect
# I am overriding it with the actual language code.
"source_lang": lang,
Expand All @@ -156,8 +174,8 @@ def _generate_examples(self, filepath: Tuple[Path, Path], split: str, lang: str)
elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
example = {
"id": str(idx),
"text_1": source_sentences[idx],
"text_2": target_sentences[idx],
"text_1": source,
"text_2": target,
# The source_lang in the HuggingFace source seems incorrect
# I am overriding it with the actual language code.
"text_1_name": lang,
Expand Down

0 comments on commit a434bfa

Please sign in to comment.