From d13b987de1f4bbfa2bd400aaae6d81715377dad8 Mon Sep 17 00:00:00 2001
From: Akhdan Fadhilah <akhdan.fadh@gmail.com>
Date: Tue, 28 Nov 2023 18:59:31 +0900
Subject: [PATCH 1/5] implement xstorycloze dataloader

---
 seacrowd/sea_datasets/xstorycloze/__init__.py |   0
 .../sea_datasets/xstorycloze/xstorycloze.py   | 188 ++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 seacrowd/sea_datasets/xstorycloze/__init__.py
 create mode 100644 seacrowd/sea_datasets/xstorycloze/xstorycloze.py

diff --git a/seacrowd/sea_datasets/xstorycloze/__init__.py b/seacrowd/sea_datasets/xstorycloze/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
new file mode 100644
index 000000000..df0783cca
--- /dev/null
+++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
@@ -0,0 +1,188 @@
+import csv
+import itertools
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA,
+                                      Licenses, Tasks)
+
+_CITATION = """\
+@inproceedings{lin2022fewshot,
+    author       = {Xi Victoria Lin and
+                    Todor Mihaylov and
+                    Mikel Artetxe and
+                    Tianlu Wang and
+                    Shuohui Chen and
+                    Daniel Simig and
+                    Myle Ott and
+                    Naman Goyal and
+                    Shruti Bhosale and
+                    Jingfei Du and
+                    Ramakanth Pasunuru and
+                    Sam Shleifer and
+                    Punit Singh Koura and
+                    Vishrav Chaudhary and
+                    Brian O'Horo and
+                    Jeff Wang and
+                    Luke Zettlemoyer and
+                    Zornitsa Kozareva and
+                    Mona T. Diab and
+                    Veselin Stoyanov and
+                    Xian Li},
+    editor       = {Yoav Goldberg and
+                    Zornitsa Kozareva and
+                    Yue Zhang},
+    title        = {Few-shot Learning with Multilingual Generative Language Models},
+    booktitle    = {Proceedings of the 2022 Conference on Empirical Methods in Natural
+                    Language Processing, {EMNLP} 2022, Abu Dhabi, United Arab Emirates,
+                    December 7-11, 2022},
+    pages        = {9019--9052},
+    publisher    = {Association for Computational Linguistics},
+    year         = {2022},
+    url          = {https://doi.org/10.18653/v1/2022.emnlp-main.616},
+    doi          = {10.18653/V1/2022.EMNLP-MAIN.616},
+}
+"""
+
+_DATASETNAME = "xstorycloze"
+_DESCRIPTION = """\
+XStoryCloze consists of the professionally translated version of the English StoryCloze
+dataset (Spring 2016 version) to 10 non-English languages. This dataset is released by
+Meta AI.
+"""
+_HOMEPAGE = "https://huggingface.co/datasets/juletxara/xstory_cloze"
+_LICENSE = Licenses.CC_BY_SA_4_0.value
+
+_LOCAL = False
+_BASE_URL = "https://huggingface.co/datasets/juletxara/xstory_cloze/resolve/main/spring2016.val.{lang}.tsv.split_20_80_{split}.tsv"
+_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.SELF_SUPERVISED_PRETRAINING]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class xStoryClozeDataset(datasets.GeneratorBasedBuilder):
+    """XStoryCloze subset for Indonesian and Burmese language."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = [TASK_TO_SCHEMA[task].lower() for task in _SUPPORTED_TASKS]
+    SEACROWD_SUBSET = ["id", "my"]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_{subset}_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description=f"{_DATASETNAME} {subset} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}_{subset}",
+        ) 
+        for subset in SEACROWD_SUBSET
+    ] + [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_{subset}_seacrowd_{schema}",
+            version=datasets.Version(_SEACROWD_VERSION),
+            description=f"{_DATASETNAME} {subset} SEACrowd schema",
+            schema=f"seacrowd_{schema}",
+            subset_id=f"{_DATASETNAME}_{subset}",
+        )
+        for subset, schema in list(itertools.product(SEACROWD_SUBSET, SEACROWD_SCHEMA_NAME))
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{SEACROWD_SUBSET[0]}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "story_id": datasets.Value("string"),
+                    "input_sentence_1": datasets.Value("string"),
+                    "input_sentence_2": datasets.Value("string"),
+                    "input_sentence_3": datasets.Value("string"),
+                    "input_sentence_4": datasets.Value("string"),
+                    "sentence_quiz1": datasets.Value("string"),
+                    "sentence_quiz2": datasets.Value("string"),
+                    "answer_right_ending": datasets.Value("int32"),
+                }
+            )
+        elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[0])}":
+            features = SCHEMA_TO_FEATURES[schema.upper()]  # qa_features
+        elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[1])}":
+            features = SCHEMA_TO_FEATURES[schema.upper()]  # ssp_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        lang = self.config.name.split("_")[1]
+        filepaths = dl_manager.download_and_extract(
+            {
+                "train": _BASE_URL.format(lang=lang, split="train"),
+                "test": _BASE_URL.format(lang=lang, split="eval"),
+            }
+        )
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": filepaths["train"],
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": filepaths["test"],
+                    "split": "test",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        with open(filepath, encoding="utf-8") as f:
+            data = csv.reader(f, quotechar='"', delimiter="\t", quoting=csv.QUOTE_ALL, skipinitialspace=True)
+            _ = next(data)  # skip header
+            if self.config.schema == "source":
+                for id, row in enumerate(data):
+                    yield id, {
+                        "story_id": row[0],
+                        "input_sentence_1": row[1],
+                        "input_sentence_2": row[2],
+                        "input_sentence_3": row[3],
+                        "input_sentence_4": row[4],
+                        "sentence_quiz1": row[5],
+                        "sentence_quiz2": row[6],
+                        "answer_right_ending": int(row[7]),
+                    }
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[0]}":
+                for id, row in enumerate(data):
+                    question = " ".join(row[1:5])
+                    choices = [row[5], row[6]]
+                    yield id, {
+                        "id": str(id),
+                        "question_id": row[0],
+                        "document_id": None,
+                        "question": question,
+                        "type": "multiple_choice",
+                        "choices": choices,
+                        "context": None,
+                        "answer": [choices[int(row[7]) - 1]],
+                        "meta": {},
+                    }
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[1]}":
+                for id, row in enumerate(data):
+                    question = " ".join(row[1:5])
+                    correct = row[5] if int(row[7]) == 1 else row[6]
+                    yield id, {
+                        "id": str(id),
+                        "text": question + " " + correct,
+                    }
\ No newline at end of file

From 51140e440f16567f0dd8e6e8a9ae7048d56bbaf7 Mon Sep 17 00:00:00 2001
From: Chenxi <chenxi.whitehouse@gmail.com>
Date: Wed, 14 Feb 2024 18:20:11 +0000
Subject: [PATCH 2/5] Closes #183 | Implement `wongnai_reviews` dataloader
 (#325)

* Implement dataloader for wongnai_reviews

* add __init__.py

* update

* update
---
 .../sea_datasets/wongnai_reviews/__init__.py  |   0
 .../wongnai_reviews/wongnai_reviews.py        | 116 ++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 seacrowd/sea_datasets/wongnai_reviews/__init__.py
 create mode 100644 seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py

diff --git a/seacrowd/sea_datasets/wongnai_reviews/__init__.py b/seacrowd/sea_datasets/wongnai_reviews/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py
new file mode 100644
index 000000000..e52741897
--- /dev/null
+++ b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py
@@ -0,0 +1,116 @@
+import csv
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+# no BibTeX citation
+_CITATION = ""
+
+_DATASETNAME = "wongnai_reviews"
+
+_DESCRIPTION = """
+Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed
+information about each merchant and user reviews. Its over two million registered users can search for what’s top rated
+in Bangkok, follow their friends, upload photos, and do quick write-ups about the places they visit. Each write-up
+(review) also comes with a rating score ranging from 1-5 stars. The task here is to create a rating prediction model
+using only textual information.
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/wongnai_reviews"
+
+_LANGUAGES = ["tha"]
+
+_LICENSE = Licenses.LGPL_3_0.value
+
+_LOCAL = False
+
+_URLS = {_DATASETNAME: "https://archive.org/download/wongnai_reviews/wongnai_reviews_withtest.zip"}
+
+_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+_CLASSES = ["1", "2", "3", "4", "5"]
+
+
+class WongnaiReviewsDataset(datasets.GeneratorBasedBuilder):
+    """WongnaiReviews consists reviews for over 200,000 restaurants, beauty salons, and spas across Thailand."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_text",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema="seacrowd_text",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "review_body": datasets.Value("string"),
+                    "star_rating": datasets.ClassLabel(names=_CLASSES),
+                }
+            )
+
+        elif self.config.schema == "seacrowd_text":
+            features = schemas.text_features(label_names=_CLASSES)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": os.path.join(data_dir, "w_review_train.csv"), "split": "train"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": os.path.join(data_dir, "w_review_test.csv"), "split": "test"},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        if self.config.schema == "source":
+            with open(filepath, encoding="utf-8") as f:
+                spamreader = csv.reader(f, delimiter=";", quotechar='"')
+                for i, row in enumerate(spamreader):
+                    yield i, {"review_body": row[0], "star_rating": row[1]}
+
+        elif self.config.schema == "seacrowd_text":
+            with open(filepath, encoding="utf-8") as f:
+                spamreader = csv.reader(f, delimiter=";", quotechar='"')
+                for i, row in enumerate(spamreader):
+                    yield i, {"id": str(i), "text": row[0], "label": _CLASSES[int(row[1].strip()) - 1]}

From 0e1089eaa9992ffc3921a6ec180849b46db626d0 Mon Sep 17 00:00:00 2001
From: akhdanfadh <akhdan.fadh@gmail.com>
Date: Thu, 15 Feb 2024 20:16:31 +0900
Subject: [PATCH 3/5] remove ssp schema; add _LANGUAGES

---
 .../sea_datasets/xstorycloze/xstorycloze.py   | 31 ++++++-------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
index df0783cca..c0639db17 100644
--- a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
+++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
@@ -6,8 +6,7 @@
 import datasets
 
 from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA,
-                                      Licenses, Tasks)
+from seacrowd.utils.constants import SCHEMA_TO_FEATURES, Licenses, Tasks
 
 _CITATION = """\
 @inproceedings{lin2022fewshot,
@@ -54,11 +53,12 @@
 Meta AI.
 """
 _HOMEPAGE = "https://huggingface.co/datasets/juletxara/xstory_cloze"
+_LANGUAGES = ["ind", "mya"]
 _LICENSE = Licenses.CC_BY_SA_4_0.value
 
 _LOCAL = False
 _BASE_URL = "https://huggingface.co/datasets/juletxara/xstory_cloze/resolve/main/spring2016.val.{lang}.tsv.split_20_80_{split}.tsv"
-_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.SELF_SUPERVISED_PRETRAINING]
+_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING]
 _SOURCE_VERSION = "1.0.0"
 _SEACROWD_VERSION = "1.0.0"
 
@@ -69,7 +69,6 @@ class xStoryClozeDataset(datasets.GeneratorBasedBuilder):
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
-    SEACROWD_SCHEMA_NAME = [TASK_TO_SCHEMA[task].lower() for task in _SUPPORTED_TASKS]
     SEACROWD_SUBSET = ["id", "my"]
 
     BUILDER_CONFIGS = [
@@ -79,17 +78,17 @@ class xStoryClozeDataset(datasets.GeneratorBasedBuilder):
             description=f"{_DATASETNAME} {subset} source schema",
             schema="source",
             subset_id=f"{_DATASETNAME}_{subset}",
-        ) 
+        )
         for subset in SEACROWD_SUBSET
     ] + [
         SEACrowdConfig(
-            name=f"{_DATASETNAME}_{subset}_seacrowd_{schema}",
+            name=f"{_DATASETNAME}_{subset}_seacrowd_qa",
             version=datasets.Version(_SEACROWD_VERSION),
             description=f"{_DATASETNAME} {subset} SEACrowd schema",
-            schema=f"seacrowd_{schema}",
+            schema="seacrowd_qa",
             subset_id=f"{_DATASETNAME}_{subset}",
         )
-        for subset, schema in list(itertools.product(SEACROWD_SUBSET, SEACROWD_SCHEMA_NAME))
+        for subset in SEACROWD_SUBSET
     ]
 
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{SEACROWD_SUBSET[0]}_source"
@@ -108,10 +107,8 @@ def _info(self) -> datasets.DatasetInfo:
                     "answer_right_ending": datasets.Value("int32"),
                 }
             )
-        elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[0])}":
-            features = SCHEMA_TO_FEATURES[schema.upper()]  # qa_features
-        elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[1])}":
-            features = SCHEMA_TO_FEATURES[schema.upper()]  # ssp_features
+        elif self.config.schema == "seacrowd_qa":
+            features = SCHEMA_TO_FEATURES["QA"]
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -163,7 +160,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
                         "sentence_quiz2": row[6],
                         "answer_right_ending": int(row[7]),
                     }
-            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[0]}":
+            elif self.config.schema == "seacrowd_qa":
                 for id, row in enumerate(data):
                     question = " ".join(row[1:5])
                     choices = [row[5], row[6]]
@@ -178,11 +175,3 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
                         "answer": [choices[int(row[7]) - 1]],
                         "meta": {},
                     }
-            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[1]}":
-                for id, row in enumerate(data):
-                    question = " ".join(row[1:5])
-                    correct = row[5] if int(row[7]) == 1 else row[6]
-                    yield id, {
-                        "id": str(id),
-                        "text": question + " " + correct,
-                    }
\ No newline at end of file

From 9a6bcd6a0155eb13308002728d2a86d8788174fb Mon Sep 17 00:00:00 2001
From: Akhdan Fadhilah <akhdan.fadh@gmail.com>
Date: Thu, 29 Feb 2024 20:00:08 +0900
Subject: [PATCH 4/5] Revert "Closes #183 | Implement `wongnai_reviews`
 dataloader (#325)"

This reverts commit 51140e440f16567f0dd8e6e8a9ae7048d56bbaf7.
---
 .../sea_datasets/wongnai_reviews/__init__.py  |   0
 .../wongnai_reviews/wongnai_reviews.py        | 116 ------------------
 2 files changed, 116 deletions(-)
 delete mode 100644 seacrowd/sea_datasets/wongnai_reviews/__init__.py
 delete mode 100644 seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py

diff --git a/seacrowd/sea_datasets/wongnai_reviews/__init__.py b/seacrowd/sea_datasets/wongnai_reviews/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py
deleted file mode 100644
index e52741897..000000000
--- a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import csv
-import os
-from pathlib import Path
-from typing import Dict, List, Tuple
-
-import datasets
-
-from seacrowd.utils import schemas
-from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import Licenses, Tasks
-
-# no BibTeX citation
-_CITATION = ""
-
-_DATASETNAME = "wongnai_reviews"
-
-_DESCRIPTION = """
-Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed
-information about each merchant and user reviews. Its over two million registered users can search for what’s top rated
-in Bangkok, follow their friends, upload photos, and do quick write-ups about the places they visit. Each write-up
-(review) also comes with a rating score ranging from 1-5 stars. The task here is to create a rating prediction model
-using only textual information.
-"""
-
-_HOMEPAGE = "https://huggingface.co/datasets/wongnai_reviews"
-
-_LANGUAGES = ["tha"]
-
-_LICENSE = Licenses.LGPL_3_0.value
-
-_LOCAL = False
-
-_URLS = {_DATASETNAME: "https://archive.org/download/wongnai_reviews/wongnai_reviews_withtest.zip"}
-
-_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]
-
-_SOURCE_VERSION = "1.0.0"
-
-_SEACROWD_VERSION = "1.0.0"
-
-_CLASSES = ["1", "2", "3", "4", "5"]
-
-
-class WongnaiReviewsDataset(datasets.GeneratorBasedBuilder):
-    """WongnaiReviews consists reviews for over 200,000 restaurants, beauty salons, and spas across Thailand."""
-
-    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
-    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
-
-    BUILDER_CONFIGS = [
-        SEACrowdConfig(
-            name=f"{_DATASETNAME}_source",
-            version=SOURCE_VERSION,
-            description=f"{_DATASETNAME} source schema",
-            schema="source",
-            subset_id=_DATASETNAME,
-        ),
-        SEACrowdConfig(
-            name=f"{_DATASETNAME}_seacrowd_text",
-            version=SEACROWD_VERSION,
-            description=f"{_DATASETNAME} SEACrowd schema",
-            schema="seacrowd_text",
-            subset_id=_DATASETNAME,
-        ),
-    ]
-
-    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
-
-    def _info(self) -> datasets.DatasetInfo:
-        if self.config.schema == "source":
-            features = datasets.Features(
-                {
-                    "review_body": datasets.Value("string"),
-                    "star_rating": datasets.ClassLabel(names=_CLASSES),
-                }
-            )
-
-        elif self.config.schema == "seacrowd_text":
-            features = schemas.text_features(label_names=_CLASSES)
-
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
-        """Returns SplitGenerators."""
-        urls = _URLS[_DATASETNAME]
-        data_dir = dl_manager.download_and_extract(urls)
-
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={"filepath": os.path.join(data_dir, "w_review_train.csv"), "split": "train"},
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={"filepath": os.path.join(data_dir, "w_review_test.csv"), "split": "test"},
-            ),
-        ]
-
-    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
-        if self.config.schema == "source":
-            with open(filepath, encoding="utf-8") as f:
-                spamreader = csv.reader(f, delimiter=";", quotechar='"')
-                for i, row in enumerate(spamreader):
-                    yield i, {"review_body": row[0], "star_rating": row[1]}
-
-        elif self.config.schema == "seacrowd_text":
-            with open(filepath, encoding="utf-8") as f:
-                spamreader = csv.reader(f, delimiter=";", quotechar='"')
-                for i, row in enumerate(spamreader):
-                    yield i, {"id": str(i), "text": row[0], "label": _CLASSES[int(row[1].strip()) - 1]}

From d507653905cad856ecc5051cc97efeb3a223be6b Mon Sep 17 00:00:00 2001
From: Akhdan Fadhilah <akhdan.fadh@gmail.com>
Date: Thu, 29 Feb 2024 20:04:41 +0900
Subject: [PATCH 5/5] remove unnecessary import; pascal case for class name

---
 seacrowd/sea_datasets/xstorycloze/xstorycloze.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
index c0639db17..3922a4ca5 100644
--- a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
+++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py
@@ -1,5 +1,4 @@
 import csv
-import itertools
 from pathlib import Path
 from typing import Dict, List, Tuple
 
@@ -63,7 +62,7 @@
 _SEACROWD_VERSION = "1.0.0"
 
 
-class xStoryClozeDataset(datasets.GeneratorBasedBuilder):
+class XStoryClozeDataset(datasets.GeneratorBasedBuilder):
     """XStoryCloze subset for Indonesian and Burmese language."""
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)