"""SEACrowd dataloader for the Typhoon Yolanda Tweets dataset.

NOTE(review): the original file was a whitespace-mangled ``git format-patch``
series (PATCH 1/5 .. 5/5). This module is the reconstructed final state of
``seacrowd/sea_datasets/typhoon_yolanda_tweets/typhoon_yolanda_tweets.py``
after all five patches are applied — confirm against the upstream SEACrowd
repository before relying on it.
"""

import os  # noqa: F401  # present in the original module; kept for parity
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@misc{imperial2019sentiment,
    title={Sentiment Analysis of Typhoon Related Tweets using Standard and Bidirectional Recurrent Neural Networks},
    author={Joseph Marvin Imperial and Jeyrome Orosco and Shiela Mae Mazo and Lany Maceda},
    year={2019},
    eprint={1908.01765},
    archivePrefix={arXiv},
    primaryClass={cs.NE}
}
"""

_DATASETNAME = "typhoon_yolanda_tweets"

_DESCRIPTION = """\
The dataset contains annotated typhoon and disaster-related tweets in Filipino collected before, during,
and after one month of Typhoon Yolanda in 2013. The dataset has been annotated by an expert into three
sentiment categories: positive, negative, and neutral.
"""

_HOMEPAGE = "https://github.com/imperialite/Philippine-Languages-Online-Corpora/tree/master/Tweets/Annotated%20Yolanda"

_LICENSE = Licenses.CC_BY_4_0.value

# Raw data lives on GitHub: one plain-text file per sentiment class per split,
# named after the class label (-1 = negative, 0 = neutral, 1 = positive).
_ROOT_URL = "https://raw.githubusercontent.com/imperialite/Philippine-Languages-Online-Corpora/master/Tweets/Annotated%20Yolanda/"
_URLS = {
    "train": {-1: _ROOT_URL + "train/-1.txt", 0: _ROOT_URL + "train/0.txt", 1: _ROOT_URL + "train/1.txt"},
    "test": {-1: _ROOT_URL + "test/-1.txt", 0: _ROOT_URL + "test/0.txt", 1: _ROOT_URL + "test/1.txt"},
}

_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class TyphoonYolandaTweets(datasets.GeneratorBasedBuilder):
    """Dataloader for annotated Typhoon Yolanda (2013) disaster tweets in Filipino.

    Tweets were collected before, during, and one month after the typhoon and
    expert-annotated into three sentiment classes, exposed here as the string
    labels "-1" (negative), "0" (neutral), and "1" (positive).
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name="typhoon_yolanda_tweets_source",
            version=SOURCE_VERSION,
            description="Typhoon Yolanda Tweets source schema",
            schema="source",
            subset_id="typhoon_yolanda_tweets",
        ),
        SEACrowdConfig(
            name="typhoon_yolanda_tweets_seacrowd_text",
            version=SEACROWD_VERSION,
            description="Typhoon Yolanda Tweets SEACrowd schema",
            schema="seacrowd_text",
            subset_id="typhoon_yolanda_tweets",
        ),
    ]

    DEFAULT_CONFIG_NAME = "typhoon_yolanda_tweets_source"

    def _info(self) -> datasets.DatasetInfo:
        """Declare the feature schema for the selected config (source or seacrowd_text)."""
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "label": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_text":
            # SEACrowd shared text-classification schema with the three class labels.
            features = schemas.text_features(["-1", "0", "1"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the per-class text files and declare the train/test splits.

        ``filepath`` passed to ``_generate_examples`` is a dict mapping the
        integer class label (-1/0/1) to the local path of its downloaded file.
        """
        emos = [-1, 0, 1]
        if self.config.name in ("typhoon_yolanda_tweets_source", "typhoon_yolanda_tweets_seacrowd_text"):
            train_path = dl_manager.download_and_extract({emo: _URLS["train"][emo] for emo in emos})
            test_path = dl_manager.download_and_extract({emo: _URLS["test"][emo] for emo in emos})

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": train_path,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": test_path,
                    "split": "test",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield (key, example) pairs, one per tweet line in the class files.

        Args:
            filepath: dict mapping integer class label to the local file path
                for this split (see ``_split_generators``).
            split: split name ("train" or "test"); unused beyond the
                ``datasets`` calling convention.

        Raises:
            ValueError: if the active config uses an unknown schema.
        """
        if self.config.schema not in ("source", "seacrowd_text"):
            raise ValueError(f"Invalid config: {self.config.name}")

        df = pd.DataFrame(columns=["text", "label"])

        if self.config.name in ("typhoon_yolanda_tweets_source", "typhoon_yolanda_tweets_seacrowd_text"):
            for emo, file in filepath.items():
                # NOTE(review): readlines() keeps the trailing newline on each
                # tweet, so emitted "text" values end with "\n" — confirm this
                # is the intended behavior before stripping.
                with open(file) as f:
                    texts = f.readlines()
                labels = [str(emo)] * len(texts)
                tmp_df = pd.DataFrame.from_dict({"text": texts, "label": labels})
                df = pd.concat([df, tmp_df], ignore_index=True)

        for row in df.itertuples():
            ex = {"id": str(row.Index), "text": row.text, "label": row.label}
            yield row.Index, ex