# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NOTE: consolidated from a three-commit patch series by Yuze GAO
# ("add vispamreviews dataloader", 2024-02-21; "update vispamreviews" and
# "update schema", 2024-03-26) into the resulting module.
"""SEACrowd dataloader for ViSpamReviews (Vietnamese e-commerce spam reviews).

Four configurations are exposed: two with the raw ``source`` schema and two with
the ``seacrowd_text`` schema, whose label is either the binary ``Label`` column
or the four-way ``SpamLabel`` column of the dataset CSV files.
"""
import os
from typing import Dict, Iterator, List, Tuple

import datasets
import pandas

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

# The backslash in the BibTeX accent is doubled on purpose: in a non-raw string
# literal ``\'`` collapses to a plain apostrophe and the accent command is lost.
_CITATION = """
@InProceedings{10.1007/978-3-031-21743-2_48,
author="Van Dinh, Co
and Luu, Son T.
and Nguyen, Anh Gia-Tuan",
editor="Nguyen, Ngoc Thanh
and Tran, Tien Khoa
and Tukayev, Ualsher
and Hong, Tzung-Pei
and Trawi{\\'{n}}ski, Bogdan
and Szczerbicki, Edward",
title="Detecting Spam Reviews on Vietnamese E-Commerce Websites",
booktitle="Intelligent Information and Database Systems",
year="2022",
publisher="Springer International Publishing",
address="Cham",
pages="595--607",
abstract="The reviews of customers play an essential role in online shopping.
People often refer to reviews or comments of previous customers to decide whether
to buy a new product. Catching up with this behavior, some people create untruths and
illegitimate reviews to hoax customers about the fake quality of products. These are called
spam reviews, confusing consumers on online shopping platforms and negatively affecting online
shopping behaviors. We propose the dataset called ViSpamReviews, which has a strict annotation
procedure for detecting spam reviews on e-commerce platforms. Our dataset consists of two tasks:
the binary classification task for detecting whether a review is spam or not and the multi-class
classification task for identifying the type of spam. The PhoBERT obtained the highest results on
both tasks, 86.89%, and 72.17%, respectively, by macro average F1 score.",
isbn="978-3-031-21743-2"
}
"""

_LOCAL = False
_LANGUAGES = ["vie"]
_DATASETNAME = "vispamreviews"
_DESCRIPTION = """
The dataset was collected from leading online shopping platforms in Vietnam. Some of the most recent
selling products for each product category were selected and up to 15 reviews per product were collected.
Each review was then labeled as either NO-SPAM, SPAM-1 (fake review), SPAM-2 (review on brand only), or
SPAM-3 (irrelevant content).
"""

_HOMEPAGE = "https://github.com/sonlam1102/vispamdetection/"
_LICENSE = Licenses.CC_BY_NC_4_0.value
_URL = "https://raw.githubusercontent.com/sonlam1102/vispamdetection/main/dataset/vispamdetection_dataset.zip"

# Location of each split's CSV file, relative to the root of the extracted archive.
_Split_Path = {
    "train": "dataset/train.csv",
    "validation": "dataset/dev.csv",
    "test": "dataset/test.csv",
}

_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]  # Text Classification
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
    """SEACrowd dataloader for ViSpamReviews, spam-annotated reviews from Vietnamese shopping platforms.

    Configurations:
        * ``vispamreviews_source`` / ``vispamreviews_spam_source``: every CSV column
          (comment, binary label, spam-type label, rating).
        * ``vispamreviews_seacrowd_text``: text classification over the ``Label`` column.
        * ``vispamreviews_spam_seacrowd_text``: text classification over the ``SpamLabel`` column.
    """

    # Label inventories handed to the seacrowd_text schema.
    # NOTE(review): presumably 0 = not spam / 1 = spam, and 0..3 = NO-SPAM, SPAM-1,
    # SPAM-2, SPAM-3 as listed in _DESCRIPTION -- confirm against the dataset files.
    CLASS_LABELS = [0, 1]
    SPAM_TYPE_LABELS = [0, 1, 2, 3]

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_spam_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} source schema",
            schema="source",
            # Same subset id as the matching `_spam_seacrowd_text` config below.
            subset_id=f"{_DATASETNAME}_spam",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_text",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME} SEACrowd schema ",
            schema="seacrowd_text",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_spam_seacrowd_text",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME} SEACrowd schema ",
            schema="seacrowd_text",
            subset_id=f"{_DATASETNAME}_spam",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the dataset metadata, choosing the feature schema from the config name.

        Raises:
            ValueError: if the active config name matches none of the supported configs.
        """
        config_name = self.config.name
        if config_name.endswith("source"):
            features = datasets.Features(
                {
                    "id": datasets.Value("int32"),
                    "text": datasets.Value("string"),
                    "label": datasets.Value("string"),
                    "spam_label": datasets.Value("string"),
                    "rating": datasets.Value("int32"),
                }
            )
        elif config_name == f"{_DATASETNAME}_seacrowd_text":
            features = schemas.text_features(label_names=self.CLASS_LABELS)
        elif config_name == f"{_DATASETNAME}_spam_seacrowd_text":
            features = schemas.text_features(label_names=self.SPAM_TYPE_LABELS)
        else:
            raise ValueError(f"Invalid schema {config_name}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download and extract the archive, then return train/validation/test generators.

        Each generator receives the path of its split's CSV file as ``filepath``.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "validation"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={"filepath": os.path.join(data_dir, _Split_Path[key])},
            )
            for split, key in splits
        ]

    def _generate_examples(self, filepath) -> Iterator[Tuple[int, Dict]]:
        """Yield ``(key, example)`` pairs read from one split's CSV file.

        Args:
            filepath: path to a CSV file with ``Comment``, ``Label``, ``SpamLabel``
                and ``Rating`` columns.

        Yields:
            The zero-based row position and the example dict for the active config.

        Raises:
            ValueError: if the active config name matches none of the supported configs.
        """
        config_name = self.config.name
        # Resolve the config once, before touching the file, rather than per row.
        # `label_column` stays None for the source schema, which emits every column.
        if config_name.endswith("source"):
            label_column = None
        elif config_name == f"{_DATASETNAME}_seacrowd_text":
            label_column = "Label"
        elif config_name == f"{_DATASETNAME}_spam_seacrowd_text":
            label_column = "SpamLabel"
        else:
            raise ValueError(f"Invalid schema {config_name}")

        data_lines = pandas.read_csv(filepath)
        for rid, row in enumerate(data_lines.itertuples()):
            if label_column is None:
                # NOTE(review): the id is emitted as a string while the source schema
                # declares int32; this presumably relies on `datasets` casting -- confirm.
                example = {
                    "id": str(rid),
                    "text": row.Comment,
                    "label": row.Label,
                    "spam_label": row.SpamLabel,
                    "rating": row.Rating,
                }
            else:
                example = {"id": str(rid), "text": row.Comment, "label": getattr(row, label_column)}
            yield rid, example