"""SEACrowd dataloader for the Mongabay conservation datasets (Indonesian)."""

from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks

_CITATION = """\
@misc{fransiska2023utilizing,
 title={Utilizing Weak Supervision To Generate Indonesian Conservation Dataset},
 author={Mega Fransiska and Diah Pitaloka and Saripudin and Satrio Putra and Lintang Sutawika},
 year={2023},
 eprint={2310.11258},
 archivePrefix={arXiv},
 primaryClass={cs.CL}
}
"""

_DATASETNAME = "mongabay"

_DESCRIPTION = """\
Conservation dataset that was collected from mongabay.co.id contains
 topic-classification task (multi-label format) and sentiment classification.
 The dataset consists of 31 important topics that are commonly found in
 Indonesian conservation articles or general news, and each article can
 belong to more than one topic. After gathering topics for each article,
 each article will be classified into one of author's sentiments
 (positive, neutral, negative) based on related topics.
"""

_HOMEPAGE = ""

_LICENSE = "The Unlicense (unlicense)"

# Subset id -> Hugging Face Hub dataset page. The last two path components of
# each URL ("<owner>/<dataset>") are what gets handed to `datasets.load_dataset`.
_URLS = {
    "mongabay-tag-classification": "https://huggingface.co/datasets/Datasaur/Mongabay-tags-classification",
    "mongabay-sentiment-classification": "https://huggingface.co/datasets/Datasaur/Mongabay-sentiment-classification",
}

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"

_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS, Tasks.TOPIC_MODELING]

_LANGUAGES = ["ind"]


class Mongabay(datasets.GeneratorBasedBuilder):
    """Builder for the Mongabay tag- and sentiment-classification subsets.

    mongabay is a dataset sourced from mongabay.co.id's Indonesian articles
    from 2012-2023. Each article is chunked to maximum 512 tokens to ease
    experiment process.

    Every subset is exposed under two schemas: ``source`` (the columns as
    they are stored upstream) and ``seacrowd_t2t`` (SEACrowd text-to-text,
    with the article text as ``text_1`` and the label column as ``text_2``).
    """

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name="mongabay-tag-classification_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="mongabay-tag-classification source schema",
            schema="source",
            subset_id="mongabay-tag-classification",
        ),
        SEACrowdConfig(
            name="mongabay-tag-classification_seacrowd_t2t",
            version=datasets.Version(_SEACROWD_VERSION),
            description="mongabay-tag-classification SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id="mongabay-tag-classification",
        ),
        SEACrowdConfig(
            name="mongabay-sentiment-classification_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="mongabay-sentiment-classification source schema",
            schema="source",
            subset_id="mongabay-sentiment-classification",
        ),
        SEACrowdConfig(
            name="mongabay-sentiment-classification_seacrowd_t2t",
            version=datasets.Version(_SEACROWD_VERSION),
            description="mongabay-sentiment-classification SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id="mongabay-sentiment-classification",
        ),
    ]

    # Must be the name of one of the configs above; "<_DATASETNAME>_source"
    # ("mongabay_source") is not declared, so default to the first subset.
    DEFAULT_CONFIG_NAME = "mongabay-tag-classification_source"

    def _info(self) -> datasets.DatasetInfo:
        """Describe the features of the selected config.

        Returns:
            The dataset metadata, with string-typed source columns for the
            ``source`` schema or the shared text-to-text features for
            ``seacrowd_t2t``.

        Raises:
            ValueError: If the config's schema or subset is not one this
                builder declares.
        """
        schema = self.config.schema
        subset = self.config.subset_id

        if schema == "source":
            # All source columns are strings; only the sentiment subset
            # carries the extra "tags" column.
            if subset == "mongabay-sentiment-classification":
                columns = ("text", "tags", "label")
            elif subset == "mongabay-tag-classification":
                columns = ("text", "label")
            else:
                raise ValueError(f"Unsupported subset for {_DATASETNAME}: {subset!r}")
            features = datasets.Features({column: datasets.Value("string") for column in columns})
        elif schema == "seacrowd_t2t":
            features = schemas.text2text_features
        else:
            raise ValueError(f"Unsupported schema for {_DATASETNAME}: {schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Declare the train/validation/test splits of the selected subset.

        Nothing is downloaded through ``dl_manager``; each split generator
        only receives the Hub repository id and the split name, and the data
        is loaded later in ``_generate_examples``.
        """
        url = _URLS[self.config.subset_id]
        # "https://huggingface.co/datasets/<owner>/<dataset>" -> "<owner>/<dataset>"
        repo_id = "/".join(url.split("/")[-2:])

        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "validation"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                # The "filename" key is dictated by the parameter name of
                # `_generate_examples`.
                gen_kwargs={"filename": repo_id, "split": split},
            )
            for split_name, split in splits
        ]

    def _generate_examples(self, filename: str, split: str) -> Iterator[Tuple[int, Dict]]:
        """Yield ``(idx, example)`` pairs for one split.

        Args:
            filename: Identifier passed to ``datasets.load_dataset``; this
                builder supplies the ``<owner>/<dataset>`` part of the
                subset's URL.
            split: Name of the split to read from the loaded dataset.

        Yields:
            For the ``source`` schema, each upstream row unchanged. For
            ``seacrowd_t2t``, a text-to-text sample whose ``text_1`` is the
            row's ``text`` and whose ``text_2`` is the row's ``label``.

        Raises:
            ValueError: If the config's schema is not supported.

        Loading errors are deliberately not caught here: swallowing them
        would produce an empty split without any failure being reported.
        """
        dataset = datasets.load_dataset(filename)[split]
        schema = self.config.schema

        if schema == "source":
            yield from enumerate(dataset)
        elif schema == "seacrowd_t2t":
            for idx, row in enumerate(dataset):
                yield idx, {
                    "id": str(idx),
                    "text_1": row["text"],
                    "text_2": row["label"],
                    "text_1_name": "text",
                    "text_2_name": "weak_label",
                }
        else:
            raise ValueError(f"Unsupported schema for {_DATASETNAME}: {schema!r}")
++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 seacrowd/sea_datasets/mongabay/README.md diff --git a/seacrowd/sea_datasets/mongabay/README.md b/seacrowd/sea_datasets/mongabay/README.md new file mode 100644 index 000000000..e833ca7d9 --- /dev/null +++ b/seacrowd/sea_datasets/mongabay/README.md @@ -0,0 +1,45 @@ +### PROVIDED DATA +- "mongabay-tag-classification" +- "mongabay-sentiment-classification" + +### DATA CALLING EXAMPLE + +- seacrowd format + + - mongabay-tag-classification + ``` + from datasets import load_dataset + + data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-tag-classification_seacrowd_t2t") + + >>> data["train"][0] + {'id': '0', 'text_1': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'text_2': '[0.1111111119389534, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0]', 'text_1_name': 'text', 'text_2_name': 'weak_label'} + ``` + + - mongabay-sentiment-classification + ``` + from datasets import load_dataset + + data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-sentiment-classification_seacrowd_t2t") + + >>> data["train"][0] + {'id': '0', 'text_1': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'text_2': '[1.0, 1.4414156535025313e-09, 1.320]', 'text_1_name': 'text', 'text_2_name': 'weak_label'} + ``` + +- source format + - mongabay-tag-classification + ``` + from datasets import load_dataset + + data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-tag-classification_source") + + data['train'][0] + {'text': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'label': '[0.1111111119389534, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 
0.1111111119389534, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0]'} + ``` + - mongabay-sentiment-classification + ``` + from datasets import load_dataset + + data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-sentiment-classification_source") + {'text': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'tags': "['Aparatur Sipil Negara' 'masyarakat desa' 'konflik' 'perusahaan' 'tambang']", 'label': '[1.0, 1.4414156535025313e-09, 1.3204033422198336e-09]'} + ``` From 6c1cce864b4aa5bddfa442aa34c46c7256912d6c Mon Sep 17 00:00:00 2001 From: mega Date: Mon, 18 Mar 2024 15:33:53 +0700 Subject: [PATCH 3/3] change _SUPOORTED_TASKS --- seacrowd/sea_datasets/mongabay/mongabay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/mongabay/mongabay.py b/seacrowd/sea_datasets/mongabay/mongabay.py index 4a4f86613..57e67ff39 100644 --- a/seacrowd/sea_datasets/mongabay/mongabay.py +++ b/seacrowd/sea_datasets/mongabay/mongabay.py @@ -40,7 +40,7 @@ _SEACROWD_VERSION = "1.0.0" -_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS, Tasks.TOPIC_MODELING] +_SUPPORTED_TASKS = [Tasks.PARAPHRASING] _LANGUAGES = ["ind"]