forked from SEACrowd/seacrowd-datahub
Commit
Closes SEACrowd#314 | Add dataloader for Indonesia Chinese MT robust eval (SEACrowd#388)

* add dataloader for indonesian_madurese_bible_translation
* update dataloader for indonesia_chinese_mtrobusteval
* Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py
* Update indonesia_chinese_mtrobusteval.py
* update code based on the reviewer comments
* add __init__.py
* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py
* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py

Co-authored-by: Jennifer Santoso <[email protected]>
Showing 2 changed files with 151 additions and 0 deletions.
seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py — Empty file.
seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py (151 additions, 0 deletions)
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{,
    author = {supryzhu},
    title = {Indonesia-Chinese-MTRobustEval},
    journal = {None},
    volume = {None},
    year = {2023},
    url = {https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval},
    doi = {None},
    biburl = {None},
    bibsource = {None}
}
"""

_DATASETNAME = "indonesia_chinese_mtrobusteval"

_DESCRIPTION = """\
The dataset is curated for evaluating the robustness of Neural Machine Translation (NMT) to naturally occurring noise
(typos, slang, code-switching, etc.). The data was crawled from Twitter, then pre-processed to obtain noisy sentences.
The dataset consists of a thousand noisy sentences, manually translated into Chinese as a benchmark for evaluating the robustness of NMT.
"""

_HOMEPAGE = "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval"

_LANGUAGES = ["ind", "cmn"]  # We follow the ISO 639-3 language codes (https://iso639-3.sil.org/code_tables/639/data)

_LICENSE = Licenses.MIT.value

_LOCAL = False

_URLS = {
    _DATASETNAME: "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval/raw/main/data/Indonesia-Chinese.xlsx",
}

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class IndonesiaChineseMtRobustEval(datasets.GeneratorBasedBuilder):
    """A thousand noisy Indonesian sentences, manually translated into Chinese as a benchmark for evaluating the robustness of NMT."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description="indonesia_chinese_mtrobusteval source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_t2t",
            version=SEACROWD_VERSION,
            description="indonesia_chinese_mtrobusteval SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id=f"{_DATASETNAME}",
        ),
    ]
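
    # The "*_source" config preserves the original column layout (id, src, tgt);
    # "*_seacrowd_t2t" exposes the same rows through SEACrowd's shared text-to-text schema.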

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "src": datasets.Value("string"),
                    "tgt": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features
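            # text2text_features supplies the shared T2T fields filled in
            # _generate_examples below: id, text_1, text_2, text_1_name, text_2_name.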

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download(urls)

        # Flatten the two-column Excel sheet into JSONL records so that
        # _generate_examples can stream them back row by row.
        df = pd.read_excel(file_path)
        src = df["Indonesia"].tolist()
        tgt = df["Chinese"].tolist()
        results = [{"id": str(i), "src": item, "tgt": tgt[i]} for i, item in enumerate(src)]
        self._write_jsonl(file_path + ".jsonl", results)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": file_path + ".jsonl",
                    "split": "train",
                },
            )
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        if self.config.schema == "source":
            with jsonlines.open(filepath) as f:
                for i, each_data in enumerate(f.iter()):
                    ex = {
                        "id": each_data["id"],
                        "src": each_data["src"],
                        "tgt": each_data["tgt"],
                    }
                    yield i, ex

        elif self.config.schema == "seacrowd_t2t":
            with jsonlines.open(filepath) as f:
                for i, each_data in enumerate(f.iter()):
                    ex = {
                        "id": each_data["id"],
                        "text_1": each_data["src"],
                        "text_2": each_data["tgt"],
                        "text_1_name": "ind",
                        "text_2_name": "cmn",
                    }
                    yield i, ex

    def _write_jsonl(self, filepath, values):
        with jsonlines.open(filepath, "w") as writer:
            for line in values:
                writer.write(line)
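
For context, a dataloader like this is normally exercised by pointing datasets.load_dataset at the script and selecting one of the two configs defined above. A minimal sketch, assuming a local checkout of seacrowd-datahub and a datasets version that still supports script-based loading; the script path and config names come from the diff, everything else is illustrative:

from datasets import load_dataset

# Hypothetical local-script invocation; adjust the path to your checkout.
# Newer `datasets` releases may additionally require trust_remote_code=True.
script = "seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py"

# Source schema: one record per row of the Excel sheet, keyed id/src/tgt.
source = load_dataset(script, name="indonesia_chinese_mtrobusteval_source", split="train")

# SEACrowd T2T schema: the same rows as text_1 (ind) / text_2 (cmn) pairs.
t2t = load_dataset(script, name="indonesia_chinese_mtrobusteval_seacrowd_t2t", split="train")

print(source[0]["src"], "->", source[0]["tgt"])

Both configs yield a single "train" split, since _split_generators registers only datasets.Split.TRAIN.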