Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #63 | Create dataloader for MongabayConservation #538

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions seacrowd/sea_datasets/mongabay/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
### PROVIDED DATA
- "mongabay-tag-classification"
- "mongabay-sentiment-classification"

### DATA CALLING EXAMPLE

- seacrowd format

- mongabay-tag-classification
```
from datasets import load_dataset

data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-tag-classification_seacrowd_t2t")

>>> data["train"][0]
{'id': '0', 'text_1': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'text_2': '[0.1111111119389534, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0]', 'text_1_name': 'text', 'text_2_name': 'weak_label'}
```

- mongabay-sentiment-classification
```
from datasets import load_dataset

data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-sentiment-classification_seacrowd_t2t")

>>> data["train"][0]
{'id': '0', 'text_1': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'text_2': '[1.0, 1.4414156535025313e-09, 1.3204033422198336e-09]', 'text_1_name': 'text', 'text_2_name': 'weak_label'}
```

- source format
- mongabay-tag-classification
```
from datasets import load_dataset

data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-tag-classification_source")

data['train'][0]
{'text': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'label': '[0.1111111119389534, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.1111111119389534, 0.0, 0.0, 0.1111111119389534, 0.0, 0.0, 0.0]'}
```
- mongabay-sentiment-classification
```
from datasets import load_dataset

data = load_dataset("seacrowd/sea_datasets/mongabay/mongabay.py", name="mongabay-sentiment-classification_source")
{'text': 'Pandemi, Momentum bagi Negara Serius Lindungi Hak Masyarakat Adat | ...', 'tags': "['Aparatur Sipil Negara' 'masyarakat desa' 'konflik' 'perusahaan' 'tambang']", 'label': '[1.0, 1.4414156535025313e-09, 1.3204033422198336e-09]'}
```
Empty file.
159 changes: 159 additions & 0 deletions seacrowd/sea_datasets/mongabay/mongabay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from seacrowd.utils.constants import Tasks
from seacrowd.utils.constants import Tasks, Licenses


_CITATION = """\
@misc{fransiska2023utilizing,
title={Utilizing Weak Supervision To Generate Indonesian Conservation Dataset},
author={Mega Fransiska and Diah Pitaloka and Saripudin and Satrio Putra and Lintang Sutawika},
year={2023},
eprint={2310.11258},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

_DATASETNAME = "mongabay"

_DESCRIPTION = """\
Conservation dataset that was collected from mongabay.co.id contains
topic-classification task (multi-label format) and sentiment classification.
The dataset consists of 31 important topics that are commonly found in
Indonesian conservation articles or general news, and each article can
belong to more than one topic. After gathering topics for each article,
each article will be classified into one of author's sentiments
(positive, neutral, negative) based on related topics.
"""

_HOMEPAGE = ""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_HOMEPAGE = ""
_HOMEPAGE = "https://huggingface.co/datasets/Datasaur/mongabay-experiment"


_LICENSE = "The Unlicense (unlicense)"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_LICENSE = "The Unlicense (unlicense)"
_LICENSE = Licenses.UNLICENSE.value


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_LOCAL = False

_URLS = {"mongabay-tag-classification": "https://huggingface.co/datasets/Datasaur/Mongabay-tags-classification", "mongabay-sentiment-classification": "https://huggingface.co/datasets/Datasaur/Mongabay-sentiment-classification"}

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"

_SUPPORTED_TASKS = [Tasks.PARAPHRASING]

_LANGUAGES = ["ind"]


class Mongabay(datasets.GeneratorBasedBuilder):
    """Mongabay conservation dataloader.

    mongabay is a dataset sourced from mongabay.co.id's Indonesian articles
    from 2012-2023. Each article is chunked to maximum 512 tokens to ease
    the experiment process. Two subsets are provided (tag classification and
    sentiment classification), each in both "source" and "seacrowd_t2t"
    schemas.
    """

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name="mongabay-tag-classification_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="mongabay-tag-classification source schema",
            schema="source",
            subset_id="mongabay-tag-classification",
        ),
        SEACrowdConfig(
            name="mongabay-tag-classification_seacrowd_t2t",
            version=datasets.Version(_SEACROWD_VERSION),
            description="mongabay-tag-classification SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id="mongabay-tag-classification",
        ),
        SEACrowdConfig(
            name="mongabay-sentiment-classification_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="mongabay-sentiment-classification source schema",
            schema="source",
            subset_id="mongabay-sentiment-classification",
        ),
        SEACrowdConfig(
            name="mongabay-sentiment-classification_seacrowd_t2t",
            version=datasets.Version(_SEACROWD_VERSION),
            description="mongabay-sentiment-classification SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id="mongabay-sentiment-classification",
        ),
    ]

    # Must be one of the config names declared in BUILDER_CONFIGS;
    # the former f"{_DATASETNAME}_source" ("mongabay_source") matched none of them.
    DEFAULT_CONFIG_NAME = "mongabay-tag-classification_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo for the active config.

        The "source" schema mirrors the upstream columns (the sentiment subset
        additionally carries a raw "tags" column); the "seacrowd_t2t" schema
        uses the shared text-to-text feature layout.
        """
        if self.config.schema == "source":
            if "mongabay-sentiment-classification" in self.config.name:
                features = datasets.Features(
                    {
                        "text": datasets.Value("string"),
                        "tags": datasets.Value("string"),
                        "label": datasets.Value("string"),
                    }
                )
            elif "mongabay-tag-classification" in self.config.name:
                features = datasets.Features(
                    {
                        "text": datasets.Value("string"),
                        "label": datasets.Value("string"),
                    }
                )
        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Return train/validation/test split generators.

        The subset's Hub URL is reduced to its trailing "org/repo" segment,
        which datasets.load_dataset accepts as a Hub dataset path.
        """
        name = self.config.name.replace("_" + self.config.schema, "")
        url = _URLS[name]
        filename = "/".join(url.split("/")[-2:])

        output = [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filename": filename,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filename": filename,
                    "split": "validation",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filename": filename,
                    "split": "test",
                },
            ),
        ]

        return output

    def _generate_examples(self, filename: Path, split: str) -> Tuple[int, Dict]:
        """Yield examples as tuples of idx, (text, tags[optional], and label)"""
        try:
            # Re-load the upstream Hub dataset and re-emit its rows under the
            # requested schema.
            dataset = datasets.load_dataset(filename)[split]

            if self.config.schema == "source":
                for idx, row in enumerate(dataset):
                    yield idx, row

            elif self.config.schema == "seacrowd_t2t":
                for idx, row in enumerate(dataset):
                    # "label" holds stringified weak-label probabilities, mapped
                    # to text_2 of the t2t schema.
                    sample = {"id": str(idx), "text_1": row["text"], "text_2": row["label"], "text_1_name": "text", "text_2_name": "weak_label"}
                    yield idx, sample
        except datasets.exceptions.DatasetGenerationError as e:
            # NOTE(review): best-effort — a failed Hub load is reported but the
            # generator then yields nothing; consider re-raising instead.
            print(e)