From fee0d5427168b3340d93f548b4093ab42aac1319 Mon Sep 17 00:00:00 2001
From: Yuze GAO <yuze.gao@outlook.com>
Date: Wed, 21 Feb 2024 16:28:25 +0800
Subject: [PATCH 1/3] add vispamreviews dataloader

---
 .../sea_datasets/vispamreviews/__init__.py    |   0
 .../vispamreviews/vispamreviews.py            | 150 ++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 seacrowd/sea_datasets/vispamreviews/__init__.py
 create mode 100644 seacrowd/sea_datasets/vispamreviews/vispamreviews.py

diff --git a/seacrowd/sea_datasets/vispamreviews/__init__.py b/seacrowd/sea_datasets/vispamreviews/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
new file mode 100644
index 000000000..0277c17db
--- /dev/null
+++ b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@InProceedings{10.1007/978-3-031-21743-2_48,
+author="Van Dinh, Co
+and Luu, Son T.
+and Nguyen, Anh Gia-Tuan",
+editor="Nguyen, Ngoc Thanh
+and Tran, Tien Khoa
+and Tukayev, Ualsher
+and Hong, Tzung-Pei
+and Trawi{\'{n}}ski, Bogdan
+and Szczerbicki, Edward",
+title="Detecting Spam Reviews on Vietnamese E-Commerce Websites",
+booktitle="Intelligent Information and Database Systems",
+year="2022",
+publisher="Springer International Publishing",
+address="Cham",
+pages="595--607",
+abstract="The reviews of customers play an essential role in online shopping.
+People often refer to reviews or comments of previous customers to decide whether
+to buy a new product. Catching up with this behavior, some people create untruths and
+illegitimate reviews to hoax customers about the fake quality of products. These are called
+spam reviews, confusing consumers on online shopping platforms and negatively affecting online
+shopping behaviors. We propose the dataset called ViSpamReviews, which has a strict annotation
+procedure for detecting spam reviews on e-commerce platforms. Our dataset consists of two tasks:
+the binary classification task for detecting whether a review is spam or not and the multi-class
+classification task for identifying the type of spam. The PhoBERT obtained the highest results on
+both tasks, 86.89%, and 72.17%, respectively, by macro average F1 score.",
+isbn="978-3-031-21743-2"
+}
+"""
+
+_LOCAL = False
+_LANGUAGES = ["vie"]
+_DATASETNAME = "vispamreviews"
+_DESCRIPTION = """
+The dataset was collected from leading online shopping platforms in Vietnam. Some of the most recent
+selling products for each product category were selected and up to 15 reviews per product were collected.
+Each review was then labeled as either NO-SPAM, SPAM-1 (fake review), SPAM-2 (review on brand only), or
+SPAM-3 (irrelevant content).
+"""
+
+_HOMEPAGE = "https://github.com/sonlam1102/vispamdetection/"
+_LICENSE = Licenses.CC_BY_NC_4_0.value
+_URL = "https://raw.githubusercontent.com/sonlam1102/vispamdetection/main/dataset/vispamdetection_dataset.zip"
+
+_Split_Path = {
+    "train": "dataset/train.csv",
+    "validation": "dataset/dev.csv",
+    "test": "dataset/test.csv",
+}
+
+_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]  # Text Classification
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
+    """
+    The SEACrowd dataloader for ViSpamReviews, a dataset of product reviews from online shopping platforms in Vietnam.
+    """
+
+    CLASS_LABELS = [0, 1]
+    SPAM_LABELS = [0, 1, 2, 3]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_text",
+            version=datasets.Version(_SEACROWD_VERSION),
+            description=f"{_DATASETNAME} SEACrowd schema ",
+            schema="seacrowd_text",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features({"id": datasets.Value("int32"), "text": datasets.Value("string"), "label": datasets.Value("string"), "spam_label": datasets.Value("string"), "rating": datasets.Value("int32")})
+
+        elif self.config.schema == "seacrowd_text":
+            features = schemas.text_features(label_names=self.CLASS_LABELS)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        file_paths = dl_manager.download_and_extract(_URL)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["train"])},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["validation"])},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": os.path.join(file_paths, _Split_Path["test"])},
+            ),
+        ]
+
+    def _generate_examples(self, filepath) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        data_lines = pandas.read_csv(filepath)
+        if self.config.schema == "source":
+            for rid, row in enumerate(data_lines.itertuples()):
+                example = {"id": str(rid), "text": row.Comment, "label": row.Label, "spam_label": row.SpamLabel, "rating": row.Rating}
+                yield rid, example
+        elif self.config.schema == "seacrowd_text":
+            for rid, row in enumerate(data_lines.itertuples()):
+                example = {"id": str(rid), "text": row.Comment, "label": row.Label}
+                yield rid, example

From f4188577c3569fc132237c39e9e5f6dc741614d0 Mon Sep 17 00:00:00 2001
From: Yuze GAO <yuze.gao@outlook.com>
Date: Tue, 26 Mar 2024 18:27:39 +0800
Subject: [PATCH 2/3] update vispamreviews: add spam-type seacrowd_text config

---
 .../vispamreviews/vispamreviews.py            | 44 ++++++++++++++-----
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
index 0277c17db..53046e231 100644
--- a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
+++ b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
@@ -84,7 +84,7 @@ class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
     """
 
     CLASS_LABELS = [0, 1]
-    SPAM_LABELS = [0, 1, 2, 3]
+    SPAM_TYPE_LABELS = [0, 1, 2, 3]
 
     BUILDER_CONFIGS = [
         SEACrowdConfig(
@@ -101,16 +101,35 @@ class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
             schema="seacrowd_text",
             subset_id=f"{_DATASETNAME}",
         ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_spam_seacrowd_text",
+            version=datasets.Version(_SEACROWD_VERSION),
+            description=f"{_DATASETNAME} SEACrowd schema ",
+            schema="seacrowd_text",
+            subset_id=f"{_DATASETNAME}_spam",
+        ),
     ]
 
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
         if self.config.schema == "source":
-            features = datasets.Features({"id": datasets.Value("int32"), "text": datasets.Value("string"), "label": datasets.Value("string"), "spam_label": datasets.Value("string"), "rating": datasets.Value("int32")})
-
-        elif self.config.schema == "seacrowd_text":
+            features = (datasets.Features
+                (
+                {"id": datasets.Value("int32"),
+                 "text": datasets.Value("string"),
+                 "label": datasets.Value("string"),
+                 "spam_label": datasets.Value("string"),
+                 "rating": datasets.Value("int32")
+                 }
+            ))
+
+        elif self.config.name == "vispamreviews_seacrowd_text":
             features = schemas.text_features(label_names=self.CLASS_LABELS)
+        elif self.config.name == "vispamreviews_spam_seacrowd_text":
+            features = schemas.text_features(label_names=self.SPAM_TYPE_LABELS)
+        else:
+            raise ValueError("Invalid schema")
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -140,11 +159,14 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase
     def _generate_examples(self, filepath) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
         data_lines = pandas.read_csv(filepath)
-        if self.config.schema == "source":
-            for rid, row in enumerate(data_lines.itertuples()):
-                example = {"id": str(rid), "text": row.Comment, "label": row.Label, "spam_label": row.SpamLabel, "rating": row.Rating}
-                yield rid, example
-        elif self.config.schema == "seacrowd_text":
-            for rid, row in enumerate(data_lines.itertuples()):
+        for rid, row in enumerate(data_lines.itertuples()):
+            if self.config.schema == "source":
+                example = {"id": str(rid), "text": row.Comment, "label": row.Label, "spam_label": row.SpamLabel,
+                           "rating": row.Rating}
+            elif self.config.name == "vispamreviews_seacrowd_text":
                 example = {"id": str(rid), "text": row.Comment, "label": row.Label}
-                yield rid, example
+            elif self.config.schema == "vispamreviews_spam_seacrowd_text":
+                example = {"id": str(rid), "text": row.Comment, "label": row.SpamLabel}
+            else:
+                raise ValueError("Invalid schema")
+            yield rid, example

From 8ba1ce1b9261f23f2ef21308bfcfb632f1e5c544 Mon Sep 17 00:00:00 2001
From: Yuze GAO <yuze.gao@outlook.com>
Date: Tue, 26 Mar 2024 22:38:29 +0800
Subject: [PATCH 3/3] update schema: add spam source config, dispatch on config name

---
 .../sea_datasets/vispamreviews/vispamreviews.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
index 53046e231..4bbd1871d 100644
--- a/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
+++ b/seacrowd/sea_datasets/vispamreviews/vispamreviews.py
@@ -94,6 +94,13 @@ class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
             schema="source",
             subset_id=f"{_DATASETNAME}",
         ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_spam_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}_spam",
+        ),
         SEACrowdConfig(
             name=f"{_DATASETNAME}_seacrowd_text",
             version=datasets.Version(_SEACROWD_VERSION),
@@ -113,7 +120,7 @@ class ViSpamReviewsDataset(datasets.GeneratorBasedBuilder):
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
-        if self.config.schema == "source":
+        if self.config.name.endswith("source"):
             features = (datasets.Features
                 (
                 {"id": datasets.Value("int32"),
@@ -129,7 +136,7 @@ def _info(self) -> datasets.DatasetInfo:
         elif self.config.name == "vispamreviews_spam_seacrowd_text":
             features = schemas.text_features(label_names=self.SPAM_TYPE_LABELS)
         else:
-            raise ValueError("Invalid schema")
+            raise ValueError(f"Invalid schema {self.config.name}")
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -160,13 +167,13 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
         data_lines = pandas.read_csv(filepath)
         for rid, row in enumerate(data_lines.itertuples()):
-            if self.config.schema == "source":
+            if self.config.name.endswith("source"):
                 example = {"id": str(rid), "text": row.Comment, "label": row.Label, "spam_label": row.SpamLabel,
                            "rating": row.Rating}
             elif self.config.name == "vispamreviews_seacrowd_text":
                 example = {"id": str(rid), "text": row.Comment, "label": row.Label}
-            elif self.config.schema == "vispamreviews_spam_seacrowd_text":
+            elif self.config.name == "vispamreviews_spam_seacrowd_text":
                 example = {"id": str(rid), "text": row.Comment, "label": row.SpamLabel}
             else:
-                raise ValueError("Invalid schema")
+                raise ValueError(f"Invalid schema {self.config.name}")
             yield rid, example