"""SEACrowd dataloader for the Typhoon Yolanda Tweets dataset.

NOTE(review): the original file was a whitespace-mangled ``git format-patch``
series (PATCH 1/5 .. 5/5). This module is the reconstructed final state of
``seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py``
after all five patches are applied — confirm against the upstream SEACrowd
repository before relying on it.
"""

import os  # noqa: F401  # present in the original module; kept for parity
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@misc{imperial2019sentiment,
    title={Sentiment Analysis of Typhoon Related Tweets using Standard and Bidirectional Recurrent Neural Networks},
    author={Joseph Marvin Imperial and Jeyrome Orosco and Shiela Mae Mazo and Lany Maceda},
    year={2019},
    eprint={1908.01765},
    archivePrefix={arXiv},
    primaryClass={cs.NE}
}
"""

_DATASETNAME = "typhoon_yolanda_tweets"

_DESCRIPTION = """\
The dataset contains annotated typhoon and disaster-related tweets in Filipino collected before, during,
and after one month of Typhoon Yolanda in 2013. The dataset has been annotated by an expert into three
sentiment categories: positive, negative, and neutral.
"""

_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda"

_LICENSE = Licenses.CC_BY_4_0.value

# Raw data lives on GitHub: one plain-text file per sentiment class per split,
# named after the class label (-1 = negative, 0 = neutral, 1 = positive).
_ROOT_URL = "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/"
_URLS = {
    "train": {-1: _ROOT_URL + "train/-1.txt", 0: _ROOT_URL + "train/0.txt", 1: _ROOT_URL + "train/1.txt"},
    "test": {-1: _ROOT_URL + "test/-1.txt", 0: _ROOT_URL + "test/0.txt", 1: _ROOT_URL + "test/1.txt"},
}

_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class TyphoonYolandaTweets(datasets.GeneratorBasedBuilder):
    """Dataloader for annotated Typhoon Yolanda (2013) disaster tweets in Filipino.

    Tweets were collected before, during, and one month after the typhoon and
    expert-annotated into three sentiment classes, exposed here as the string
    labels "-1" (negative), "0" (neutral), and "1" (positive).
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name="typhoon_yolanda_tweets_source",
            version=SOURCE_VERSION,
            description="Typhoon Yolanda Tweets source schema",
            schema="source",
            subset_id="typhoon_yolanda_tweets",
        ),
        SEACrowdConfig(
            name="typhoon_yolanda_tweets_seacrowd_text",
            version=SEACROWD_VERSION,
            description="Typhoon Yolanda Tweets SEACrowd schema",
            schema="seacrowd_text",
            subset_id="typhoon_yolanda_tweets",
        ),
    ]

    DEFAULT_CONFIG_NAME = "typhoon_yolanda_tweets_source"

    def _info(self) -> datasets.DatasetInfo:
        """Declare the feature schema for the selected config (source or seacrowd_text)."""
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "label": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_text":
            # SEACrowd shared text-classification schema with the three class labels.
            features = schemas.text_features(["-1", "0", "1"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the per-class text files and declare the train/test splits.

        ``filepath`` passed to ``_generate_examples`` is a dict mapping the
        integer class label (-1/0/1) to the local path of its downloaded file.
        """
        emos = [-1, 0, 1]
        if self.config.name in ("typhoon_yolanda_tweets_source", "typhoon_yolanda_tweets_seacrowd_text"):
            train_path = dl_manager.download_and_extract({emo: _URLS["train"][emo] for emo in emos})
            test_path = dl_manager.download_and_extract({emo: _URLS["test"][emo] for emo in emos})

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": train_path,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": test_path,
                    "split": "test",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield (key, example) pairs, one per tweet line in the class files.

        Args:
            filepath: dict mapping integer class label to the local file path
                for this split (see ``_split_generators``).
            split: split name ("train" or "test"); unused beyond the
                ``datasets`` calling convention.

        Raises:
            ValueError: if the active config uses an unknown schema.
        """
        if self.config.schema not in ("source", "seacrowd_text"):
            raise ValueError(f"Invalid config: {self.config.name}")

        df = pd.DataFrame(columns=["text", "label"])

        if self.config.name in ("typhoon_yolanda_tweets_source", "typhoon_yolanda_tweets_seacrowd_text"):
            for emo, file in filepath.items():
                # NOTE(review): readlines() keeps the trailing newline on each
                # tweet, so emitted "text" values end with "\n" — confirm this
                # is the intended behavior before stripping.
                with open(file) as f:
                    texts = f.readlines()
                labels = [str(emo)] * len(texts)
                tmp_df = pd.DataFrame.from_dict({"text": texts, "label": labels})
                df = pd.concat([df, tmp_df], ignore_index=True)

        for row in df.itertuples():
            ex = {"id": str(row.Index), "text": row.text, "label": row.label}
            yield row.Index, ex