"""SEACrowd dataloader for the Thai depression detection corpus (W-NUT 2021)."""

# Provenance: reconstructed from a two-commit patch series against
# seacrowd/sea_datasets/thai_depression/thai_depression.py
# ("Add thai depression", followed by a commit that collapses the
# seacrowd.utils.constants import onto a single line). The module below is the
# file state after both commits, with the review fixes described inline.

import json
import logging
from pathlib import Path
from typing import List

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import DEFAULT_SEACROWD_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Licenses, Tasks

logger = logging.getLogger(__name__)

_DATASETNAME = "thai_depression"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME

_LANGUAGES = ["tha"]  # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
_LOCAL = False

# NOTE: backslashes are doubled on purpose. In a non-raw string literal `\"`
# evaluates to a bare `"`, which would silently strip the BibTeX umlaut accents
# from the author name; likewise `%` must be escaped for BibTeX/LaTeX.
_CITATION = """\
@inproceedings{hamalainen-etal-2021-detecting,
    title = "Detecting Depression in Thai Blog Posts: a Dataset and a Baseline",
    author = {H{\\"a}m{\\"a}l{\\"a}inen, Mika and
      Patpong, Pattama and
      Alnajjar, Khalid and
      Partanen, Niko and
      Rueter, Jack},
    editor = "Xu, Wei and
      Ritter, Alan and
      Baldwin, Tim and
      Rahimi, Afshin",
    booktitle = "Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021)",
    month = nov,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.wnut-1.3",
    doi = "10.18653/v1/2021.wnut-1.3",
    pages = "20--25",
    abstract = "We present the first openly available corpus for detecting depression in Thai. Our corpus is compiled by expert verified cases of depression in several online blogs.
    We experiment with two different LSTM based models and two different BERT based models. We achieve a 77.53{\\%} accuracy with a Thai BERT model in detecting depression.
    This establishes a good baseline for future researcher on the same corpus. Furthermore, we identify a need for Thai embeddings that have been trained on a more varied corpus than Wikipedia.
    Our corpus, code and trained models have been released openly on Zenodo.",
}
"""

# The description is never %-formatted, so a single "%" is the correct spelling.
_DESCRIPTION = """\
We present the first openly available corpus for detecting depression in Thai. Our corpus is compiled by expert verified cases of depression in several online blogs.
We experiment with two different LSTM based models and two different BERT based models. We achieve a 77.53% accuracy with a Thai BERT model in detecting depression.
This establishes a good baseline for future researcher on the same corpus. Furthermore, we identify a need for Thai embeddings that have been trained on a more varied corpus than Wikipedia.
Our corpus, code and trained models have been released openly on Zenodo.
"""

_HOMEPAGE = "https://zenodo.org/records/4734552"

_LICENSE = Licenses.CC_BY_NC_ND_4_0.value

_URLs = "https://zenodo.org/records/4734552/files/data.zip?download=1"

_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION]

_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class ThaiDepressionDataset(datasets.GeneratorBasedBuilder):
    """Thai depression detection dataset.

    Two configurations are exposed:

    * ``thai_depression_source`` -- rows with ``text`` and ``label`` strings.
    * ``thai_depression_seacrowd_text`` -- rows following
      ``schemas.text_features`` with the label set
      ``["depression", "no_depression"]``.
    """

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_text",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME} seacrowd schema",
            schema="seacrowd_text",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the ``DatasetInfo`` for the selected schema.

        Raises:
            ValueError: if the active config uses a schema other than
                ``"source"`` or ``"seacrowd_text"``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_text":
            features = schemas.text_features(["depression", "no_depression"])
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Invalid config: {self.config.name}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the archive and map each split to its JSON file.

        The extracted archive is expected to contain ``splits/train.json``,
        ``splits/valid.json`` and ``splits/test.json``.
        """
        splits_dir = Path(dl_manager.download_and_extract(_URLs)) / "splits"
        split_files = (
            (datasets.Split.TRAIN, "train.json"),
            (datasets.Split.VALIDATION, "valid.json"),
            (datasets.Split.TEST, "test.json"),
        )
        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"filepath": splits_dir / filename})
            for split, filename in split_files
        ]

    def _parse_and_label(self, file_path):
        """Load one split file and return its rows as dicts.

        The file is read as a JSON array whose items are indexable; element 0
        of each item is used as the text and element 1 as the label.

        Args:
            file_path: path to the split's JSON file.

        Returns:
            A list of ``{"text": ..., "label": ...}`` dicts in file order.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        return [{"text": item[0], "label": item[1]} for item in data]

    def _generate_examples(self, filepath: Path):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            filepath: path to the split's JSON file.

        Yields:
            The running row index as key, together with the example dict; the
            ``seacrowd_text`` schema additionally carries the index as a
            string ``id``.

        Raises:
            ValueError: if the active config uses an unsupported schema.
        """
        schema = self.config.schema
        if schema not in ("source", "seacrowd_text"):
            raise ValueError(f"Invalid config: {self.config.name}")

        # Logged rather than printed so library users control the verbosity.
        logger.info("Reading %s", filepath)
        for idx, row in enumerate(self._parse_and_label(filepath)):
            if schema == "source":
                yield idx, {"text": row["text"], "label": row["label"]}
            else:
                yield idx, {"id": str(idx), "text": row["text"], "label": row["label"]}