From d13b987de1f4bbfa2bd400aaae6d81715377dad8 Mon Sep 17 00:00:00 2001 From: Akhdan Fadhilah Date: Tue, 28 Nov 2023 18:59:31 +0900 Subject: [PATCH 1/5] implement xstorycloze dataloader --- seacrowd/sea_datasets/xstorycloze/__init__.py | 0 .../sea_datasets/xstorycloze/xstorycloze.py | 188 ++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100644 seacrowd/sea_datasets/xstorycloze/__init__.py create mode 100644 seacrowd/sea_datasets/xstorycloze/xstorycloze.py diff --git a/seacrowd/sea_datasets/xstorycloze/__init__.py b/seacrowd/sea_datasets/xstorycloze/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py new file mode 100644 index 000000000..df0783cca --- /dev/null +++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py @@ -0,0 +1,188 @@ +import csv +import itertools +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, + Licenses, Tasks) + +_CITATION = """\ +@inproceedings{lin2022fewshot, + author = {Xi Victoria Lin and + Todor Mihaylov and + Mikel Artetxe and + Tianlu Wang and + Shuohui Chen and + Daniel Simig and + Myle Ott and + Naman Goyal and + Shruti Bhosale and + Jingfei Du and + Ramakanth Pasunuru and + Sam Shleifer and + Punit Singh Koura and + Vishrav Chaudhary and + Brian O'Horo and + Jeff Wang and + Luke Zettlemoyer and + Zornitsa Kozareva and + Mona T. Diab and + Veselin Stoyanov and + Xian Li}, + editor = {Yoav Goldberg and + Zornitsa Kozareva and + Yue Zhang}, + title = {Few-shot Learning with Multilingual Generative Language Models}, + booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural + Language Processing, {EMNLP} 2022, Abu Dhabi, United Arab Emirates, + December 7-11, 2022}, + pages = {9019--9052}, + publisher = {Association for Computational Linguistics}, + year = {2022}, + url = {https://doi.org/10.18653/v1/2022.emnlp-main.616}, + doi = {10.18653/V1/2022.EMNLP-MAIN.616}, +} +""" + +_DATASETNAME = "xstorycloze" +_DESCRIPTION = """\ +XStoryCloze consists of the professionally translated version of the English StoryCloze +dataset (Spring 2016 version) to 10 non-English languages. This dataset is released by +Meta AI. +""" +_HOMEPAGE = "https://huggingface.co/datasets/juletxara/xstory_cloze" +_LICENSE = Licenses.CC_BY_SA_4_0.value + +_LOCAL = False +_BASE_URL = "https://huggingface.co/datasets/juletxara/xstory_cloze/resolve/main/spring2016.val.{lang}.tsv.split_20_80_{split}.tsv" +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.SELF_SUPERVISED_PRETRAINING] +_SOURCE_VERSION = "1.0.0" +_SEACROWD_VERSION = "1.0.0" + + +class xStoryClozeDataset(datasets.GeneratorBasedBuilder): + """XStoryCloze subset for Indonesian and Burmese language.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + SEACROWD_SCHEMA_NAME = [TASK_TO_SCHEMA[task].lower() for task in _SUPPORTED_TASKS] + SEACROWD_SUBSET = ["id", "my"] + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} {subset} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset in SEACROWD_SUBSET + ] + [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset}_seacrowd_{schema}", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} {subset} SEACrowd schema", + schema=f"seacrowd_{schema}", + subset_id=f"{_DATASETNAME}_{subset}", + ) + for subset, schema in list(itertools.product(SEACROWD_SUBSET, SEACROWD_SCHEMA_NAME)) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{SEACROWD_SUBSET[0]}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "story_id": datasets.Value("string"), + "input_sentence_1": datasets.Value("string"), + "input_sentence_2": datasets.Value("string"), + "input_sentence_3": datasets.Value("string"), + "input_sentence_4": datasets.Value("string"), + "sentence_quiz1": datasets.Value("string"), + "sentence_quiz2": datasets.Value("string"), + "answer_right_ending": datasets.Value("int32"), + } + ) + elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[0])}": + features = SCHEMA_TO_FEATURES[schema.upper()] # qa_features + elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[1])}": + features = SCHEMA_TO_FEATURES[schema.upper()] # ssp_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + lang = self.config.name.split("_")[1] + filepaths = dl_manager.download_and_extract( + { + "train": _BASE_URL.format(lang=lang, split="train"), + "test": _BASE_URL.format(lang=lang, split="eval"), + } + ) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": filepaths["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": filepaths["test"], + "split": "test", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + with open(filepath, encoding="utf-8") as f: + data = csv.reader(f, quotechar='"', delimiter="\t", quoting=csv.QUOTE_ALL, skipinitialspace=True) + _ = next(data) # skip header + if self.config.schema == "source": + for id, row in enumerate(data): + yield id, { + "story_id": row[0], + "input_sentence_1": row[1], + "input_sentence_2": row[2], + "input_sentence_3": row[3], + "input_sentence_4": row[4], + "sentence_quiz1": row[5], + "sentence_quiz2": row[6], + "answer_right_ending": int(row[7]), + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[0]}": + for id, row in enumerate(data): + question = " ".join(row[1:5]) + choices = [row[5], row[6]] + yield id, { + "id": str(id), + "question_id": row[0], + "document_id": None, + "question": question, + "type": "multiple_choice", + "choices": choices, + "context": None, + "answer": [choices[int(row[7]) - 1]], + "meta": {}, + } + elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[1]}": + for id, row in enumerate(data): + question = " ".join(row[1:5]) + correct = row[5] if int(row[7]) == 1 else row[6] + yield id, { + "id": str(id), + "text": question + " " + correct, + } \ No newline at end of file From 51140e440f16567f0dd8e6e8a9ae7048d56bbaf7 Mon Sep 17 00:00:00 2001 From: Chenxi Date: Wed, 14 Feb 2024 18:20:11 +0000 Subject: [PATCH 2/5] Closes #183 | Implement `wongnai_reviews` dataloader (#325) * Implement dataloader for wongnai_reviews * add __init__.py * update * update --- .../sea_datasets/wongnai_reviews/__init__.py | 0 .../wongnai_reviews/wongnai_reviews.py | 116 ++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 seacrowd/sea_datasets/wongnai_reviews/__init__.py create mode 100644 seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py diff --git a/seacrowd/sea_datasets/wongnai_reviews/__init__.py b/seacrowd/sea_datasets/wongnai_reviews/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py new file mode 100644 index 000000000..e52741897 --- /dev/null +++ b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py @@ -0,0 +1,116 @@ +import csv +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses, Tasks + +# no BibTeX citation +_CITATION = "" + +_DATASETNAME = "wongnai_reviews" + +_DESCRIPTION = """ +Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed +information about each merchant and user reviews. Its over two million registered users can search for what’s top rated +in Bangkok, follow their friends, upload photos, and do quick write-ups about the places they visit. Each write-up +(review) also comes with a rating score ranging from 1-5 stars. The task here is to create a rating prediction model +using only textual information. +""" + +_HOMEPAGE = "https://huggingface.co/datasets/wongnai_reviews" + +_LANGUAGES = ["tha"] + +_LICENSE = Licenses.LGPL_3_0.value + +_LOCAL = False + +_URLS = {_DATASETNAME: "https://archive.org/download/wongnai_reviews/wongnai_reviews_withtest.zip"} + +_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + +_CLASSES = ["1", "2", "3", "4", "5"] + + +class WongnaiReviewsDataset(datasets.GeneratorBasedBuilder): + """WongnaiReviews consists reviews for over 200,000 restaurants, beauty salons, and spas across Thailand.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=_DATASETNAME, + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_text", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_text", + subset_id=_DATASETNAME, + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "review_body": datasets.Value("string"), + "star_rating": datasets.ClassLabel(names=_CLASSES), + } + ) + + elif self.config.schema == "seacrowd_text": + features = schemas.text_features(label_names=_CLASSES) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(data_dir, "w_review_train.csv"), "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(data_dir, "w_review_test.csv"), "split": "test"}, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + if self.config.schema == "source": + with open(filepath, encoding="utf-8") as f: + spamreader = csv.reader(f, delimiter=";", quotechar='"') + for i, row in enumerate(spamreader): + yield i, {"review_body": row[0], "star_rating": row[1]} + + elif self.config.schema == "seacrowd_text": + with open(filepath, encoding="utf-8") as f: + spamreader = csv.reader(f, delimiter=";", quotechar='"') + for i, row in enumerate(spamreader): + yield i, {"id": str(i), "text": row[0], "label": _CLASSES[int(row[1].strip()) - 1]} From 0e1089eaa9992ffc3921a6ec180849b46db626d0 Mon Sep 17 00:00:00 2001 From: akhdanfadh Date: Thu, 15 Feb 2024 20:16:31 +0900 Subject: [PATCH 3/5] remove ssp schema; add _LANGUAGES --- .../sea_datasets/xstorycloze/xstorycloze.py | 31 ++++++------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py index df0783cca..c0639db17 100644 --- a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py +++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py @@ -6,8 +6,7 @@ import datasets from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import (SCHEMA_TO_FEATURES, TASK_TO_SCHEMA, - Licenses, Tasks) +from seacrowd.utils.constants import SCHEMA_TO_FEATURES, Licenses, Tasks _CITATION = """\ @inproceedings{lin2022fewshot, @@ -54,11 +53,12 @@ Meta AI. """ _HOMEPAGE = "https://huggingface.co/datasets/juletxara/xstory_cloze" +_LANGUAGES = ["ind", "mya"] _LICENSE = Licenses.CC_BY_SA_4_0.value _LOCAL = False _BASE_URL = "https://huggingface.co/datasets/juletxara/xstory_cloze/resolve/main/spring2016.val.{lang}.tsv.split_20_80_{split}.tsv" -_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING, Tasks.SELF_SUPERVISED_PRETRAINING] +_SUPPORTED_TASKS = [Tasks.COMMONSENSE_REASONING] _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" @@ -69,7 +69,6 @@ class xStoryClozeDataset(datasets.GeneratorBasedBuilder): SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - SEACROWD_SCHEMA_NAME = [TASK_TO_SCHEMA[task].lower() for task in _SUPPORTED_TASKS] SEACROWD_SUBSET = ["id", "my"] BUILDER_CONFIGS = [ @@ -79,17 +78,17 @@ class xStoryClozeDataset(datasets.GeneratorBasedBuilder): description=f"{_DATASETNAME} {subset} source schema", schema="source", subset_id=f"{_DATASETNAME}_{subset}", - ) + ) for subset in SEACROWD_SUBSET ] + [ SEACrowdConfig( - name=f"{_DATASETNAME}_{subset}_seacrowd_{schema}", + name=f"{_DATASETNAME}_{subset}_seacrowd_qa", version=datasets.Version(_SEACROWD_VERSION), description=f"{_DATASETNAME} {subset} SEACrowd schema", - schema=f"seacrowd_{schema}", + schema="seacrowd_qa", subset_id=f"{_DATASETNAME}_{subset}", ) - for subset, schema in list(itertools.product(SEACROWD_SUBSET, SEACROWD_SCHEMA_NAME)) + for subset in SEACROWD_SUBSET ] DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{SEACROWD_SUBSET[0]}_source" @@ -108,10 +107,8 @@ def _info(self) -> datasets.DatasetInfo: "answer_right_ending": datasets.Value("int32"), } ) - elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[0])}": - features = SCHEMA_TO_FEATURES[schema.upper()] # qa_features - elif self.config.schema == f"seacrowd_{(schema := self.SEACROWD_SCHEMA_NAME[1])}": - features = SCHEMA_TO_FEATURES[schema.upper()] # ssp_features + elif self.config.schema == "seacrowd_qa": + features = SCHEMA_TO_FEATURES["QA"] return datasets.DatasetInfo( description=_DESCRIPTION, @@ -163,7 +160,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: "sentence_quiz2": row[6], "answer_right_ending": int(row[7]), } - elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[0]}": + elif self.config.schema == "seacrowd_qa": for id, row in enumerate(data): question = " ".join(row[1:5]) choices = [row[5], row[6]] @@ -178,11 +175,3 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: "answer": [choices[int(row[7]) - 1]], "meta": {}, } - elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME[1]}": - for id, row in enumerate(data): - question = " ".join(row[1:5]) - correct = row[5] if int(row[7]) == 1 else row[6] - yield id, { - "id": str(id), - "text": question + " " + correct, - } \ No newline at end of file From 9a6bcd6a0155eb13308002728d2a86d8788174fb Mon Sep 17 00:00:00 2001 From: Akhdan Fadhilah Date: Thu, 29 Feb 2024 20:00:08 +0900 Subject: [PATCH 4/5] Revert "Closes #183 | Implement `wongnai_reviews` dataloader (#325)" This reverts commit 51140e440f16567f0dd8e6e8a9ae7048d56bbaf7. --- .../sea_datasets/wongnai_reviews/__init__.py | 0 .../wongnai_reviews/wongnai_reviews.py | 116 ------------------ 2 files changed, 116 deletions(-) delete mode 100644 seacrowd/sea_datasets/wongnai_reviews/__init__.py delete mode 100644 seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py diff --git a/seacrowd/sea_datasets/wongnai_reviews/__init__.py b/seacrowd/sea_datasets/wongnai_reviews/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py b/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py deleted file mode 100644 index e52741897..000000000 --- a/seacrowd/sea_datasets/wongnai_reviews/wongnai_reviews.py +++ /dev/null @@ -1,116 +0,0 @@ -import csv -import os -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets - -from seacrowd.utils import schemas -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Licenses, Tasks - -# no BibTeX citation -_CITATION = "" - -_DATASETNAME = "wongnai_reviews" - -_DESCRIPTION = """ -Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed -information about each merchant and user reviews. Its over two million registered users can search for what’s top rated -in Bangkok, follow their friends, upload photos, and do quick write-ups about the places they visit. Each write-up -(review) also comes with a rating score ranging from 1-5 stars. The task here is to create a rating prediction model -using only textual information. -""" - -_HOMEPAGE = "https://huggingface.co/datasets/wongnai_reviews" - -_LANGUAGES = ["tha"] - -_LICENSE = Licenses.LGPL_3_0.value - -_LOCAL = False - -_URLS = {_DATASETNAME: "https://archive.org/download/wongnai_reviews/wongnai_reviews_withtest.zip"} - -_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - -_CLASSES = ["1", "2", "3", "4", "5"] - - -class WongnaiReviewsDataset(datasets.GeneratorBasedBuilder): - """WongnaiReviews consists reviews for over 200,000 restaurants, beauty salons, and spas across Thailand.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME} source schema", - schema="source", - subset_id=_DATASETNAME, - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_seacrowd_text", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME} SEACrowd schema", - schema="seacrowd_text", - subset_id=_DATASETNAME, - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" - - def _info(self) -> datasets.DatasetInfo: - if self.config.schema == "source": - features = datasets.Features( - { - "review_body": datasets.Value("string"), - "star_rating": datasets.ClassLabel(names=_CLASSES), - } - ) - - elif self.config.schema == "seacrowd_text": - features = schemas.text_features(label_names=_CLASSES) - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[_DATASETNAME] - data_dir = dl_manager.download_and_extract(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "w_review_train.csv"), "split": "train"}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "w_review_test.csv"), "split": "test"}, - ), - ] - - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - if self.config.schema == "source": - with open(filepath, encoding="utf-8") as f: - spamreader = csv.reader(f, delimiter=";", quotechar='"') - for i, row in enumerate(spamreader): - yield i, {"review_body": row[0], "star_rating": row[1]} - - elif self.config.schema == "seacrowd_text": - with open(filepath, encoding="utf-8") as f: - spamreader = csv.reader(f, delimiter=";", quotechar='"') - for i, row in enumerate(spamreader): - yield i, {"id": str(i), "text": row[0], "label": _CLASSES[int(row[1].strip()) - 1]} From d507653905cad856ecc5051cc97efeb3a223be6b Mon Sep 17 00:00:00 2001 From: Akhdan Fadhilah Date: Thu, 29 Feb 2024 20:04:41 +0900 Subject: [PATCH 5/5] remove unnecessary import; pascal case for class name --- seacrowd/sea_datasets/xstorycloze/xstorycloze.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py index c0639db17..3922a4ca5 100644 --- a/seacrowd/sea_datasets/xstorycloze/xstorycloze.py +++ b/seacrowd/sea_datasets/xstorycloze/xstorycloze.py @@ -1,5 +1,4 @@ import csv -import itertools from pathlib import Path from typing import Dict, List, Tuple @@ -63,7 +62,7 @@ _SEACROWD_VERSION = "1.0.0" -class xStoryClozeDataset(datasets.GeneratorBasedBuilder): +class XStoryClozeDataset(datasets.GeneratorBasedBuilder): """XStoryCloze subset for Indonesian and Burmese language.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)