-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* [Feat] Add Generated Review ENTH dataloader * [Chore] Tidy up: run formatter, remove args, use f-string * [Fix] Change citation to follow scb-mt-en-th-2020
- Loading branch information
Showing
2 changed files
with
153 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
153 changes: 153 additions & 0 deletions
153
seacrowd/sea_datasets/generated_review_enth/generated_review_enth.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
import os | ||
from pathlib import Path | ||
from typing import Dict, List, Tuple | ||
|
||
import datasets | ||
import jsonlines | ||
|
||
from seacrowd.utils import schemas | ||
from seacrowd.utils.configs import SEACrowdConfig | ||
from seacrowd.utils.constants import Licenses, Tasks | ||
|
||
# BibTeX entry for the scb-mt-en-th-2020 paper; this dataset is cited through it.
# The entry must end with a closing brace to be valid BibTeX.
_CITATION = """\
@article{Lowphansirikul2021,
author={Lowphansirikul, Lalita
and Polpanumas, Charin
and Rutherford, Attapol T.
and Nutanong, Sarana},
title={A large English--Thai parallel corpus from the web and machine-generated text},
journal={Language Resources and Evaluation},
year={2021},
month={Mar},
day={30},
issn={1574-0218},
doi={10.1007/s10579-021-09536-6},
url={https://doi.org/10.1007/s10579-021-09536-6}
}
"""

# Identifier used to build config names and to look up the download URL.
_DATASETNAME = "generated_review_enth"

_DESCRIPTION = """\
Generated Reviews ENTH is created as part of scb-mt-en-th-2020 for machine translation task. This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are English product reviews generated by CTRL, translated by Google Translate API and annotated as accepted or rejected (correct) based on fluency and adequacy of the translation by human annotators. This allows it to be used for English-to-Thai translation quality estimation (binary label), machine translation, and sentiment analysis. For SEACrowd, we use data with correct = 1.
"""

_HOMEPAGE = "https://github.com/vistec-ai/generated_reviews_enth"

# Language codes covered by the parallel data (Thai and English).
_LANGUAGES = ["tha", "eng"]

_LICENSE = Licenses.CC_BY_SA_4_0.value

# The data is fetched from a public URL rather than supplied by the user.
_LOCAL = False

# Zip archive that is downloaded and extracted by the builder.
_URLS = {_DATASETNAME: "https://github.com/vistec-AI/generated_reviews_enth/raw/main/data.zip"}

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"
|
||
|
||
class GeneratedReviewENTHDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Generated Reviews EN-TH corpus.

    This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are
    English product reviews generated by CTRL, translated by Google Translate
    API and annotated as accepted or rejected (correct) based on fluency and
    adequacy of the translation by human annotators. This allows it to be used
    for English-to-Thai translation quality estimation (binary label), machine
    translation, and sentiment analysis.

    Two configurations are exposed:

    * ``source`` -- every record with its original fields.
    * ``seacrowd_t2t`` -- only records whose ``correct`` field is truthy,
      mapped onto the SEACrowd text-to-text schema (English -> Thai).
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description="Generated Review EN-TH source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_t2t",
            version=SEACROWD_VERSION,
            description="Generated Review EN-TH SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the dataset metadata for the active configuration.

        Returns:
            A ``datasets.DatasetInfo`` whose features match the selected schema.

        Raises:
            ValueError: If the configuration's schema is neither ``source``
                nor ``seacrowd_t2t``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "en_segment": datasets.Value("string"),
                    "th_segment": datasets.Value("string"),
                    "review_star": datasets.Value("int32"),
                    "correct": datasets.Value("bool"),
                }
            )
        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Unsupported schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the archive and describe the train/test/validation splits.

        Args:
            dl_manager: Download manager used to fetch and extract the zip
                archive listed in ``_URLS``.

        Returns:
            One ``SplitGenerator`` per split, each pointing at the matching
            ``.jsonl`` file inside the extracted ``data`` directory.
        """
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)
        # The split files are read from a "data" folder inside the extracted archive.
        data_dir = os.path.join(data_dir, "data")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "valid.jsonl"),
                },
            ),
        ]

    def _generate_examples(self, filepath: Path) -> Tuple[str, Dict]:
        """Yield examples as ``(key, example)`` tuples.

        Args:
            filepath: Path to a JSON Lines file for one split.

        Yields:
            ``(key, example)`` pairs where ``key`` is a zero-based running
            index rendered as a string. For the ``seacrowd_t2t`` schema only
            records with a truthy ``correct`` field are emitted, and the index
            counts emitted records so the ids stay consecutive.
        """
        if self.config.schema == "source":
            with jsonlines.open(filepath) as reader:
                for idx, example in enumerate(reader.iter()):
                    yield str(idx), {
                        "en_segment": example["en_segment"],
                        "th_segment": example["th_segment"],
                        "review_star": example["review_star"],
                        "correct": example["correct"],
                    }

        elif self.config.schema == "seacrowd_t2t":
            with jsonlines.open(filepath) as reader:
                # SEACrowd only uses translations that annotators accepted.
                accepted = (record for record in reader.iter() if record["correct"])
                for idx, example in enumerate(accepted):
                    yield str(idx), {
                        "id": str(idx),
                        "text_1": example["en_segment"],
                        "text_2": example["th_segment"],
                        "text_1_name": "eng",
                        "text_2_name": "tha",
                    }