Skip to content

Commit

Permalink
Closes #392 | Create dataset loader for Generated Review ENTH (#435)
Browse files Browse the repository at this point in the history
* [Feat] Add Generated Review ENTH dataloader

* [Chore] Tidy up: run formatter, remove args, use f-string

* [Fix] Change citation to follow scb-mt-en-th-2020
  • Loading branch information
khelli07 authored Apr 19, 2024
1 parent 1959846 commit a19097e
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 0 deletions.
Empty file.
153 changes: 153 additions & 0 deletions seacrowd/sea_datasets/generated_review_enth/generated_review_enth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import os
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{Lowphansirikul2021,
author={Lowphansirikul, Lalita
and Polpanumas, Charin
and Rutherford, Attapol T.
and Nutanong, Sarana},
title={A large English--Thai parallel corpus from the web and machine-generated text},
journal={Language Resources and Evaluation},
year={2021},
month={Mar},
day={30},
issn={1574-0218},
doi={10.1007/s10579-021-09536-6},
url={https://doi.org/10.1007/s10579-021-09536-6}
"""

_DATASETNAME = "generated_review_enth"

_DESCRIPTION = """\
Generated Reviews ENTH is created as part of scb-mt-en-th-2020 for machine translation task. This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are English product reviews generated by CTRL, translated by Google Translate API and annotated as accepted or rejected (correct) based on fluency and adequacy of the translation by human annotators. This allows it to be used for English-to-Thai translation quality esitmation (binary label), machine translation, and sentiment analysis. For SEACrowd, we use data with correct = 1.
"""

# Upstream repository that publishes the dataset.
_HOMEPAGE = "https://github.com/vistec-ai/generated_reviews_enth"

# Language codes of the two sides of each pair (Thai and English).
_LANGUAGES = [
    "tha",
    "eng",
]

_LICENSE = Licenses.CC_BY_SA_4_0.value

# NOTE(review): presumably False means the data is fetched from _URLS rather
# than supplied from a local path -- confirm against the SEACrowd conventions.
_LOCAL = False

# Location of the zipped data, keyed by dataset name.
_URLS = {
    _DATASETNAME: "https://github.com/vistec-AI/generated_reviews_enth/raw/main/data.zip",
}

_SUPPORTED_TASKS = [
    Tasks.MACHINE_TRANSLATION,
]

# Version strings wrapped in datasets.Version by the builder class.
_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class GeneratedReviewENTHDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for Generated Reviews EN-TH.

    This dataset (referred to as generated_reviews_yn in scb-mt-en-th-2020) are
    English product reviews generated by CTRL, translated by Google Translate
    API and annotated as accepted or rejected (correct) based on fluency and
    adequacy of the translation by human annotators. This allows it to be used
    for English-to-Thai translation quality estimation (binary label), machine
    translation, and sentiment analysis.

    Two configurations are exposed:

    * ``<dataset>_source``: every record, with the fields ``en_segment``,
      ``th_segment``, ``review_star`` and ``correct``.
    * ``<dataset>_seacrowd_t2t``: only records whose ``correct`` field is
      truthy, mapped onto the SEACrowd text-to-text schema.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description="Generated Review EN-TH source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_t2t",
            version=SEACROWD_VERSION,
            description="Generated Review EN-TH SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the dataset metadata for the selected schema.

        Returns:
            A ``datasets.DatasetInfo`` carrying the feature spec for the active
            config together with the module-level description, homepage,
            license and citation.

        Raises:
            ValueError: If the config schema is neither ``"source"`` nor
                ``"seacrowd_t2t"``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "en_segment": datasets.Value("string"),
                    "th_segment": datasets.Value("string"),
                    "review_star": datasets.Value("int32"),
                    "correct": datasets.Value("bool"),
                }
            )
        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features
        else:
            # Without this branch `features` would be unbound and the failure
            # would surface as an opaque UnboundLocalError below.
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the archive and describe the train/test/validation splits.

        Args:
            dl_manager: Download manager used to fetch and extract the archive.

        Returns:
            One ``datasets.SplitGenerator`` per split, each pointing at the
            corresponding ``.jsonl`` file inside the extracted archive.
        """
        urls = _URLS[_DATASETNAME]
        extracted_dir = dl_manager.download_and_extract(urls)
        # The split files are read from a "data" directory inside the archive.
        data_dir = os.path.join(extracted_dir, "data")

        # (split name, file name) pairs, in the order the splits are returned.
        split_files = (
            (datasets.Split.TRAIN, "train.jsonl"),
            (datasets.Split.TEST, "test.jsonl"),
            (datasets.Split.VALIDATION, "valid.jsonl"),
        )

        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"filepath": os.path.join(data_dir, file_name)},
            )
            for split_name, file_name in split_files
        ]

    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
        """Yield examples as (key, example) tuples.

        Args:
            filepath: Path to the JSON Lines file of one split.

        Yields:
            ``(key, example)`` pairs where ``key`` is a stringified running
            index. For the ``seacrowd_t2t`` schema only records with a truthy
            ``correct`` field are emitted, and the index counts emitted
            records only.

        Raises:
            ValueError: If the config schema is neither ``"source"`` nor
                ``"seacrowd_t2t"``.
        """
        schema = self.config.schema
        if schema not in ("source", "seacrowd_t2t"):
            # Fail loudly instead of silently producing an empty split.
            raise ValueError(f"Unsupported schema: {schema!r}")

        with jsonlines.open(filepath) as reader:
            if schema == "source":
                for idx, example in enumerate(reader.iter()):
                    yield str(idx), {
                        "en_segment": example["en_segment"],
                        "th_segment": example["th_segment"],
                        "review_star": example["review_star"],
                        "correct": example["correct"],
                    }
            else:
                # SEACrowd only uses the translations annotated as correct.
                accepted = (example for example in reader.iter() if example["correct"])
                for idx, example in enumerate(accepted):
                    yield str(idx), {
                        "id": str(idx),
                        "text_1": example["en_segment"],
                        "text_2": example["th_segment"],
                        "text_1_name": "eng",
                        "text_2_name": "tha",
                    }

0 comments on commit a19097e

Please sign in to comment.