Closes SEACrowd#49 | Updated existing TICO_19 dataloader to support more SEA languages (SEACrowd#414)

* Updated existing TICO_19 dataloader to support more SEA languages

* Added SEA languages to _LANGUAGES

---------

Co-authored-by: ssfei81 <[email protected]>
ssun32 and ssfei81 authored Mar 16, 2024
1 parent 8c12a33 commit 6db6bee
1 changed file with 58 additions and 70 deletions: seacrowd/sea_datasets/tico_19/tico_19.py
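The loader names each configuration by joining the language pair and schema with underscores (the regex tico_19_(.+?)_(source|seacrowd_t2t) in _split_generators below parses the pair back out), so the new pairs load like the existing Indonesian ones. A minimal sketch, assuming a local checkout of the repository and the script-loading form of datasets.load_dataset; the config name is inferred from that pattern rather than shown in this diff:

import datasets

# Hypothetical usage: load the new Khmer-English pair in the source schema.
dset = datasets.load_dataset(
    "seacrowd/sea_datasets/tico_19/tico_19.py",
    name="tico_19_khm_eng_source",
)
print(dset)  # English-centric pairs expose test/validation splits; the rest expose train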
@@ -14,14 +14,13 @@
 # limitations under the License.
 
 import csv
-from fnmatch import translate
 import os
 import re
 from pathlib import Path
 from typing import Dict, List, Tuple
-from translate.storage.tmx import tmxfile
 
 import datasets
+from translate.storage.tmx import tmxfile
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
@@ -59,45 +58,58 @@
 """
 
 # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
-_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng"]
+_LANGUAGES = ["ind", "ara", "spa", "fra", "hin", "por", "rus", "zho", "eng", "khm", "zlm", "mya", "tgl", "tam"]
 _LOCAL = False
 _SUPPORTED_LANG_PAIRS = [
-    ("ind", "ara"), ("ind", "spa"), ("ind", "fra"), ("ind", "hin"), ("ind", "por"), ("ind", "rus"), ("ind", "zho"), ("ind", "eng"),
-    ("ara", "ind"), ("spa", "ind"), ("fra", "ind"), ("hin", "ind"), ("por", "ind"), ("rus", "ind"), ("zho", "ind"), ("eng", "ind")
+    ("ind", "ara"),
+    ("ind", "spa"),
+    ("ind", "fra"),
+    ("ind", "hin"),
+    ("ind", "por"),
+    ("ind", "rus"),
+    ("ind", "zho"),
+    ("ind", "eng"),
+    ("ara", "ind"),
+    ("spa", "ind"),
+    ("fra", "ind"),
+    ("hin", "ind"),
+    ("por", "ind"),
+    ("rus", "ind"),
+    ("zho", "ind"),
+    ("eng", "ind"),
+    ("khm", "eng"),
+    ("eng", "khm"),
+    ("mya", "eng"),
+    ("eng", "mya"),
+    ("zlm", "eng"),
+    ("eng", "zlm"),
+    ("tgl", "eng"),
+    ("eng", "tgl"),
+    ("tam", "eng"),
+    ("eng", "tam"),
 ]
 
-_LANG_CODE_MAP = {
-    "ind": "id",
-    "ara": "ar",
-    "spa": "es-LA",
-    "fra": "fr",
-    "hin": "hi",
-    "por": "pt-BR",
-    "rus": "ru",
-    "zho": "zh",
-    "eng": "en"
-}
+_LANG_CODE_MAP = {"ind": "id", "ara": "ar", "spa": "es-LA", "fra": "fr", "hin": "hi", "por": "pt-BR", "rus": "ru", "zho": "zh", "eng": "en", "khm": "km", "zlm": "ms", "mya": "my", "tgl": "tl", "tam": "ta"}
 
+_DEVTEST_LANG_PAIRS = [_LANG_CODE_MAP[source_lang] + "-" + _LANG_CODE_MAP[target_lang] for (source_lang, target_lang) in _SUPPORTED_LANG_PAIRS if (source_lang == "eng" or target_lang == "eng")]
+
 _DATASETNAME = "tico_19"
 
 _DESCRIPTION = """\
-TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing
-COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19
-includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic,
-Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa,
-Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala,
-Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian),
+TICO-19 (Translation Initiative for COVID-19) is sampled from a variety of public sources containing
+COVID-19 related content, representing different domains (e.g., news, wiki articles, and others). TICO-19
+includes 30 documents (3071 sentences, 69.7k words) translated from English into 36 languages: Amharic,
+Arabic (Modern Standard), Bengali, Chinese (Simplified), Dari, Dinka, Farsi, French (European), Hausa,
+Hindi, Indonesian, Kanuri, Khmer (Central), Kinyarwanda, Kurdish Kurmanji, Kurdish Sorani, Lingala,
+Luganda, Malay, Marathi, Myanmar, Nepali, Nigerian Fulfulde, Nuer, Oromo, Pashto, Portuguese (Brazilian),
 Russian, Somali, Spanish (Latin American), Swahili, Congolese Swahili, Tagalog, Tamil, Tigrinya, Urdu, Zulu.
 """
 
 _HOMEPAGE = "https://tico-19.github.io"
 
 _LICENSE = "CC0"
 
-_URLS = {
-    "evaluation": "https://tico-19.github.io/data/tico19-testset.zip",
-    "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"
-}
+_URLS = {"evaluation": "https://tico-19.github.io/data/tico19-testset.zip", "all": "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"}
 
 _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
 
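The added _DEVTEST_LANG_PAIRS constant determines which pairs use the official dev/test TSVs. A standalone check of what the comprehension evaluates to, with the constants copied from above (non-English-centric pairs omitted, since the filter discards them anyway):

# Reproducing _DEVTEST_LANG_PAIRS with the values defined above.
lang_code_map = {"ind": "id", "eng": "en", "khm": "km", "zlm": "ms", "mya": "my", "tgl": "tl", "tam": "ta"}
supported = [("ind", "eng"), ("eng", "ind"), ("khm", "eng"), ("eng", "khm"), ("mya", "eng"), ("eng", "mya"),
             ("zlm", "eng"), ("eng", "zlm"), ("tgl", "eng"), ("eng", "tgl"), ("tam", "eng"), ("eng", "tam")]
devtest = [lang_code_map[s] + "-" + lang_code_map[t] for (s, t) in supported if s == "eng" or t == "eng"]
print(devtest)
# ['id-en', 'en-id', 'km-en', 'en-km', 'my-en', 'en-my', 'ms-en', 'en-ms', 'tl-en', 'en-tl', 'ta-en', 'en-ta']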
@@ -128,16 +140,14 @@ def seacrowd_config_constructor(lang_source, lang_target, schema, version):
         subset_id="tico_19",
     )
 
+
 class Tico19(datasets.GeneratorBasedBuilder):
     """TICO-19 is MT dataset sampled from a variety of public sources containing COVID-19 related content"""
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
-    BUILDER_CONFIGS = [
-        seacrowd_config_constructor(src, tgt, schema, version)
-        for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION])
-    ]
+    BUILDER_CONFIGS = [seacrowd_config_constructor(src, tgt, schema, version) for src, tgt in [("", "")] + _SUPPORTED_LANG_PAIRS for schema, version in zip(["source", "seacrowd_t2t"], [_SOURCE_VERSION, _SEACROWD_VERSION])]
 
     DEFAULT_CONFIG_NAME = "tico_19_source"
 
Expand Down Expand Up @@ -168,7 +178,7 @@ def _info(self) -> datasets.DatasetInfo:

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""

try:
lang_pairs_config = re.search("tico_19_(.+?)_(source|seacrowd_t2t)", self.config.name).group(1)
lang_src, lang_tgt = lang_pairs_config.split("_")
@@ -177,46 +187,36 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
 
         lang_pairs = _LANG_CODE_MAP[lang_src] + "-" + _LANG_CODE_MAP[lang_tgt]
 
-        # dev & test split only applicable to eng-ind language pair
-        if lang_pairs in ["en-id", "id-en"]:
+        # dev & test split only applicable to eng-[sea language] language pair
+        if lang_pairs in set(_DEVTEST_LANG_PAIRS):
+            lang_sea = _LANG_CODE_MAP[lang_tgt] if lang_src == "eng" else _LANG_CODE_MAP[lang_src]
+
             data_dir = dl_manager.download_and_extract(_URLS["evaluation"])
             return [
                 datasets.SplitGenerator(
                     name=datasets.Split.TEST,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-id.tsv"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "test", f"test.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt},
                 ),
                 datasets.SplitGenerator(
                     name=datasets.Split.VALIDATION,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-id.tsv"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, "tico19-testset", "dev", f"dev.en-{lang_sea}.tsv"), "lang_source": lang_src, "lang_target": lang_tgt},
                 ),
             ]
         else:
             data_dir = dl_manager.download_and_extract(_URLS["all"].format(lang_pairs=lang_pairs))
             return [
                 datasets.SplitGenerator(
                     name=datasets.Split.TRAIN,
-                    gen_kwargs={
-                        "filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"),
-                        "lang_source": lang_src,
-                        "lang_target": lang_tgt
-                    },
+                    gen_kwargs={"filepath": os.path.join(data_dir, f"all.{lang_pairs}.tmx"), "lang_source": lang_src, "lang_target": lang_tgt},
                 )
             ]
 
     def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
 
         if self.config.schema == "source":
-            # eng-ind language pair dataset provided in .tsv format
-            if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"):
+            # eng-[sea language] language pair dataset provided in .tsv format
+            if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS):
                 with open(filepath, encoding="utf-8") as f:
                     reader = csv.reader(f, delimiter="\t", quotechar='"')
                     for id_, row in enumerate(reader):
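The two branches resolve their downloads differently: English-centric pairs share one evaluation zip, while every other pair formats the TM archive URL with its short pair code. For example, using the _URLS constant defined earlier:

# _URLS["all"] from the constants hunk above.
urls_all = "https://tico-19.github.io/data/TM/all.{lang_pairs}.tmx.zip"
print(urls_all.format(lang_pairs="id-ar"))
# https://tico-19.github.io/data/TM/all.id-ar.tmx.zip

Whether a given pair's archive actually exists upstream is a property of the tico-19.github.io hosting, not of the loader.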
@@ -242,16 +242,16 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                             "license": row[6],
                             "translatorId": row[7],
                         }
 
            # all language pairs except eng-ind dataset provided in .tmx format
             else:
                 with open(filepath, "rb") as f:
                     tmx_file = tmxfile(f)
 
                 for id_, node in enumerate(tmx_file.unit_iter()):
                     try:
-                        url = [text for text in node.xmlelement.itertext('prop')][0]
-                    except:
+                        url = [text for text in node.xmlelement.itertext("prop")][0]
+                    except Exception:
                         url = ""
                     yield id_, {
                         "sourceLang": _LANG_CODE_MAP[lang_source],
@@ -265,7 +265,7 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                     }
 
         elif self.config.schema == "seacrowd_t2t":
-            if (lang_source == "eng" and lang_target == "ind") or (lang_source == "ind" and lang_target == "eng"):
+            if f"{_LANG_CODE_MAP[lang_source]}-{_LANG_CODE_MAP[lang_target]}" in set(_DEVTEST_LANG_PAIRS):
                 with open(filepath, encoding="utf-8") as f:
                     reader = csv.reader(f, delimiter="\t", quotechar='"')
                     for id_, row in enumerate(reader):
@@ -277,22 +277,10 @@ def _generate_examples(self, filepath: Path, lang_source: str, lang_target: str) -> Tuple[int, Dict]:
                         else:
                             source_string = row[3]
                             target_string = row[2]
-                        yield id_, {
-                            "id": row[4],
-                            "text_1": source_string,
-                            "text_2": target_string,
-                            "text_1_name": lang_source,
-                            "text_2_name": lang_target
-                        }
+                        yield id_, {"id": row[4], "text_1": source_string, "text_2": target_string, "text_1_name": lang_source, "text_2_name": lang_target}
             else:
                 with open(filepath, "rb") as f:
                     tmx_file = tmxfile(f)
 
                 for id_, node in enumerate(tmx_file.unit_iter()):
-                    yield id_, {
-                        "id": node.getid(),
-                        "text_1": node.source,
-                        "text_2": node.target,
-                        "text_1_name": lang_source,
-                        "text_2_name": lang_target
-                    }
+                    yield id_, {"id": node.getid(), "text_1": node.source, "text_2": node.target, "text_1_name": lang_source, "text_2_name": lang_target}
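For the .tmx branch, the loader relies on translate-toolkit's tmxfile, iterating translation units and reading the source and target segments. A minimal standalone sketch using only the calls the diff itself exercises; the file name is hypothetical:

from translate.storage.tmx import tmxfile

# Hypothetical local copy of one extracted TM file.
with open("all.en-km.tmx", "rb") as f:
    tmx = tmxfile(f)

for id_, node in enumerate(tmx.unit_iter()):
    # node.source and node.target hold the segment pair; getid() returns the
    # unit id, mirroring what _generate_examples yields in the seacrowd_t2t schema.
    print(id_, node.getid(), node.source, node.target)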
