Closes SEACrowd#49 | Updated existing TICO_19 dataloader to support more SEA languages (SEACrowd#414)

* Updated existing TICO_19 dataloader to support more SEA languages

* Added SEA languages to _LANGUAGES

---------

Co-authored-by: ssfei81 <[email protected]>
ssun32 and ssfei81 authored Mar 16, 2024
1 parent 8c12a33 commit 6db6bee
1 changed file with 58 additions and 70 deletions: seacrowd/sea_datasets/tico_19/tico_19.py
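The loader names each configuration by joining the language pair and schema with underscores (the regex tico_19_(.+?)_(source|seacrowd_t2t) in _split_generators below parses the pair back out), so the new pairs load like the existing Indonesian ones. A minimal sketch, assuming a local checkout of the repository and the script-loading form of datasets.load_dataset; the config name is inferred from that pattern rather than shown in this diff:

import datasets

# Hypothetical usage: load the new Khmer-English pair in the source schema.
dset = datasets.load_dataset(
    "seacrowd/sea_datasets/tico_19/tico_19.py",
    name="tico_19_khm_eng_source",
)
print(dset)  # English-centric pairs expose test/validation splits; the rest expose train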
@@ -14,14 +14,13 @@
 # limitations under the License.
 
 import csv
-from fnmatch import translate
 import os
 import re
 from pathlib import Path
 from typing import Dict, List, Tuple
-from translate.storage.tmx import tmxfile
 
 import datasets
+from translate.storage.tmx import tmxfile
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
@@ -59,45 +58,58 @@
 """
 
 # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
-_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng"]
+_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng", "khm", "zlm", "mya", "tgl", "tam"]
 _LOCAL = False
 _SUPPORTED_LANG_PAIRS = [
-    ("ind", "ara"), ("ind", "spa"), ("ind", "fra"), ("ind", "hin"), ("ind", "por"), ("ind", "rus"), ("ind", "zho"), ("ind", "eng"),
-    ("ara", "ind"), ("spa", "ind"), ("fra", "ind"), ("hin", "ind"), ("por", "ind"), ("rus", "ind"), ("zho", "ind"), ("eng", "ind")
+    ("ind", "ara"),
+    ("ind", "spa"),
+    ("ind", "fra"),
+    ("ind", "hin"),
+    ("ind", "por"),
+    ("ind", "rus"),
+    ("ind", "zho"),
+    ("ind", "eng"),
+    ("ara", "ind"),
+    ("spa", "ind"),
+    ("fra", "ind"),
+    ("hin", "ind"),
+    ("por", "ind"),
+    ("rus", "ind"),
+    ("zho", "ind"),
+    ("eng", "ind"),
+    ("khm", "eng"),
+    ("eng", "khm"),
+    ("mya", "eng"),
+    ("eng", "mya"),
+    ("zlm", "eng"),
+    ("eng", "zlm"),
+    ("tgl", "eng"),
+    ("eng", "tgl"),
+    ("tam", "eng"),
+    ("eng", "tam"),
 ]
 
-_LANG_CODE_MAP = {
-    "ind": "id",
-    "ara": "ar",
-    "spa": "es-LA",
-    "fra": "fr",
-    "hin": "hi",
-    "por": "pt-BR",
-    "rus": "ru",
-    "zho": "zh",
-    "eng": "en"
-}
+_LANG_CODE_MAP = {"ind": "id", "ara": "ar", "spa": "es-LA", "fra": "fr", "hin": "hi", "por": "pt-BR", "rus": "ru", "zho": "zh", "eng": "en", "khm": "km", "zlm": "ms", "mya": "my", "tgl": "tl", "tam": "ta"}
 
+_DEVTEST_LANG_PAIRS = [_LANG_CODE_MAP[source_lang] + "-" + _LANG_CODE_MAP[target_lang] for (source_lang, target_lang) in _SUPPORTED_LANG_PAIRS if (source_lang == "eng" or target_lang == "eng")]
+
 _DATASETNAME = "tico_19"
 
 _DESCRIPTION = """\
-TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing
-COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19
-includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic,
-Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa,
-Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala,
-Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian),
+TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing
+COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19
+includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic,
+Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa,
+Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala,
+Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian),
 Russian, Somali, Spanish (Latin American), Swahili, Congolese Swahili, Tagalog, Tamil, Tigrinya, Urdu, Zulu.
 """
 
 _HOMEPAGE = "https://tico-19.github.io"
 
 _LICENSE = "CC0"
 
-_URLS = {
-    "evaluation": "https://tico-19.github.io/data/tico19-testset.zip",
-    "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"
-}
+_URLS = {"evaluation": "https://tico-19.github.io/data/tico19-testset.zip", "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"}
 
 _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
 
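The added _DEVTEST_LANG_PAIRS constant determines which pairs use the official dev/test TSVs. A standalone check of what the comprehension evaluates to, with the constants copied from above (non-English-centric pairs omitted, since the filter discards them anyway):

# Reproducing _DEVTEST_LANG_PAIRS with the values defined above.
lang_code_map = {"ind": "id", "eng": "en", "khm": "km", "zlm": "ms", "mya": "my", "tgl": "tl", "tam": "ta"}
supported = [("ind", "eng"), ("eng", "ind"), ("khm", "eng"), ("eng", "khm"), ("mya", "eng"), ("eng", "mya"),
             ("zlm", "eng"), ("eng", "zlm"), ("tgl", "eng"), ("eng", "tgl"), ("tam", "eng"), ("eng", "tam")]
devtest = [lang_code_map[s] + "-" + lang_code_map[t] for (s, t) in supported if s == "eng" or t == "eng"]
print(devtest)
# ['id-en', 'en-id', 'km-en', 'en-km', 'my-en', 'en-my', 'ms-en', 'en-ms', 'tl-en', 'en-tl', 'ta-en', 'en-ta']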
@@ -128,16 +140,14 @@ def seacrowd_config_constructor(lang_source, lang_target, schema, version):
         subset_id="tico_19",
     )
 
+
 class Tico19(datasets.GeneratorBasedBuilder):
     """TICO-19 is MT dataset sampled from a variety of public sources containing COVID-19 related content"""
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
-    BUILDER_CONFIGS = [
-        seacrowd_config_constructor(src, tgt, schema, version)
-        for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION])
-    ]
+    BUILDER_CONFIGS = [seacrowd_config_constructor(src, tgt, schema, version) for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION])]
 
     DEFAULT_CONFIG_NAME = "tico_19_source"
 
Expand Down Expand Up @@ -168,7 +178,7 @@ def _info(self) -> datasets.DatasetInfo:

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""

try:
lang_pairs_config = re.search("tico_19_(.+?)_(source|seacrowd_t2t)", self.config.name).group(1)
lang_src, lang_tgt = lang_pairs_config.split("_")
@@ -177,46 +187,36 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
 
         lang_pairs = _LANG_CODE_MAP[lang_src] + "-" + _LANG_CODE_MAP[lang_tgt]
 
-        # dev & test split only applicable to eng-ind language pair
-        if lang_pairs in ["en-id", "id-en"]:
+        # dev & test split only applicable to eng-[sea language] language pair
+        if lang_pairs in set(_DEVTEST_LANG_PAIRS):
+            lang_sea = _LANG_CODE_MAP[lang_tgt] if lang_src == "eng" else _LANG_CODE_MAP[lang_src]
+
             data_dir = dl_manager.download_and_extract(_URLS["evaluation"])
             return [
                 datasets.SplitGenerator(
                     name=datasets.Split.TEST,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-id.tsv"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt},
                 ),
                 datasets.SplitGenerator(
                     name=datasets.Split.VALIDATION,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-id.tsv"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt},
                 ),
             ]
         else:
             data_dir = dl_manager.download_and_extract(_URLS["all"].format(lang_pairs=lang_pairs))
             return [
                 datasets.SplitGenerator(
                     name=datasets.Split.TRAIN,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"), "lang_source": lang_src, "lang_target": lang_tgt},
                 )
             ]
 
     def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
 
         if self.config.schema == "source":
-            # eng-ind language pair dataset provided in .tsv format
-            if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"):
+            # eng-[sea language] language pair dataset provided in .tsv format
+            if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS):
                 with open(filepath, encoding="utf-8") as f:
                     reader = csv.reader(f, delimiter="\t", quotechar='"')
                     for id_, row in enumerate(reader):
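The two branches resolve their downloads differently: English-centric pairs share one evaluation zip, while every other pair formats the TM archive URL with its short pair code. For example, using the _URLS constant defined earlier:

# _URLS["all"] from the constants hunk above.
urls_all = "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"
print(urls_all.format(lang_pairs="id-ar"))
# https://tico-19.github.io/data/TM/all.id-ar.tmx.zip

Whether a given pair's archive actually exists upstream is a property of the tico-19.github.io hosting, not of the loader.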
@@ -242,16 +242,16 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                             "license": row[6],
                             "translatorId": row[7],
                         }
 
            # all language pairs except eng-ind dataset provided in .tmx format
             else:
                 with open(filepath, "rb") as f:
                     tmx_file = tmxfile(f)
 
                 for id_, node in enumerate(tmx_file.unit_iter()):
                     try:
-                        url = [text for text in node.xmlelement.itertext('prop')][0]
-                    except:
+                        url = [text for text in node.xmlelement.itertext("prop")][0]
+                    except Exception:
                         url = ""
                     yield id_, {
                         "sourceLang": _LANG_CODE_MAP[lang_source],
@@ -265,7 +265,7 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                     }
 
         elif self.config.schema == "seacrowd_t2t":
-            if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"):
+            if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS):
                 with open(filepath, encoding="utf-8") as f:
                     reader = csv.reader(f, delimiter="\t", quotechar='"')
                     for id_, row in enumerate(reader):
@@ -277,22 +277,10 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                         else:
                             source_string = row[3]
                             target_string = row[2]
-                        yield id_, {
-                            "id": row[4],
-                            "text_1": source_string,
-                            "text_2": target_string,
-                            "text_1_name": lang_source,
-                            "text_2_name": lang_target
-                        }
+                        yield id_, {"id": row[4], "text_1": source_string, "text_2": target_string, "text_1_name": lang_source, "text_2_name": lang_target}
             else:
                 with open(filepath, "rb") as f:
                     tmx_file = tmxfile(f)
 
                 for id_, node in enumerate(tmx_file.unit_iter()):
-                    yield id_, {
-                        "id": node.getid(),
-                        "text_1": node.source,
-                        "text_2": node.target,
-                        "text_1_name": lang_source,
-                        "text_2_name": lang_target
-                    }
+                    yield id_, {"id": node.getid(), "text_1": node.source, "text_2": node.target, "text_1_name": lang_source, "text_2_name": lang_target}
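For the .tmx branch, the loader relies on translate-toolkit's tmxfile, iterating translation units and reading the source and target segments. A minimal standalone sketch using only the calls the diff itself exercises; the file name is hypothetical:

from translate.storage.tmx import tmxfile

# Hypothetical local copy of one extracted TM file.
with open("all.en-km.tmx", "rb") as f:
    tmx = tmxfile(f)

for id_, node in enumerate(tmx.unit_iter()):
    # node.source and node.target hold the segment pair; getid() returns the
    # unit id, mirroring what _generate_examples yields in the seacrowd_t2t schema.
    print(id_, node.getid(), node.source, node.target)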
