forked from SEACrowd/seacrowd-datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
smsa.py
137 lines (115 loc) · 5.45 KB
/
smsa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from pathlib import Path
from typing import List
import datasets
import pandas as pd
from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME
_DATASETNAME = "smsa"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_SEACROWD_VIEW_NAME
_LANGUAGES = ["ind"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
_LOCAL = False
_CITATION = """\
@INPROCEEDINGS{8904199,
author={Purwarianti, Ayu and Crisdayanti, Ida Ayu Putu Ari},
booktitle={2019 International Conference of Advanced Informatics: Concepts, Theory and Applications (ICAICTA)},
title={Improving Bi-LSTM Performance for Indonesian Sentiment Analysis Using Paragraph Vector},
year={2019},
pages={1-5},
doi={10.1109/ICAICTA.2019.8904199}
}
@inproceedings{wilie2020indonlu,
title={IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding},
author={Wilie, Bryan and Vincentio, Karissa and Winata, Genta Indra and Cahyawijaya, Samuel and Li, Xiaohong and Lim, Zhi Yuan and Soleman, Sidik and Mahendra, Rahmad and Fung, Pascale and Bahar, Syafri and others},
booktitle={Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing},
pages={843--857},
year={2020}
}
"""
_DESCRIPTION = """\
SmSA is a sentence-level sentiment analysis dataset (Purwarianti and Crisdayanti, 2019) is a collection of comments and reviews
in Indonesian obtained from multiple online platforms. The text was crawled and then annotated by several Indonesian linguists
to construct this dataset. There are three possible sentiments on the SmSA dataset: positive, negative, and neutral
"""
_HOMEPAGE = "https://github.com/IndoNLP/indonlu"
_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"
_URLs = {
"train": "https://github.com/IndoNLP/indonlu/raw/master/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv",
"validation": "https://github.com/IndoNLP/indonlu/raw/master/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv",
"test": "https://github.com/IndoNLP/indonlu/raw/master/dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv",
}
_SUPPORTED_TASKS = [Tasks.SENTIMENT_ANALYSIS]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "2024.06.20"
class SMSA(datasets.GeneratorBasedBuilder):
"""SMSA is a sentiment analysis dataset consisting of 3 labels (positive, neutral, and negative) which comes from comments and reviews collected from multiple online platforms."""
BUILDER_CONFIGS = [
SEACrowdConfig(
name="smsa_source",
version=datasets.Version(_SOURCE_VERSION),
description="SMSA source schema",
schema="source",
subset_id="smsa",
),
SEACrowdConfig(
name="smsa_seacrowd_text",
version=datasets.Version(_SEACROWD_VERSION),
description="SMSA Nusantara schema",
schema="seacrowd_text",
subset_id="smsa",
),
]
DEFAULT_CONFIG_NAME = "smsa_source"
def _info(self):
if self.config.schema == "source":
features = datasets.Features({"index": datasets.Value("string"), "sentence": datasets.Value("string"), "label": datasets.Value("string")})
elif self.config.schema == "seacrowd_text":
features = schemas.text_features(["negative", "neutral", "positive"])
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
train_tsv_path = Path(dl_manager.download_and_extract(_URLs["train"]))
validation_tsv_path = Path(dl_manager.download_and_extract(_URLs["validation"]))
test_tsv_path = Path(dl_manager.download_and_extract(_URLs["test"]))
data_files = {
"train": train_tsv_path,
"validation": validation_tsv_path,
"test": test_tsv_path,
}
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": data_files["train"]},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": data_files["validation"]},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filepath": data_files["test"]},
),
]
def _generate_examples(self, filepath: Path):
df = pd.read_csv(filepath, sep="\t", header=None).reset_index()
df.columns = ["id", "sentence", "label"]
if self.config.schema == "source":
for row in df.itertuples():
ex = {"index": str(row.id), "sentence": row.sentence, "label": row.label}
yield row.id, ex
elif self.config.schema == "seacrowd_text":
for row in df.itertuples():
ex = {
"id": str(row.id),
"text": row.sentence,
"label": row.label
}
yield row.id, ex
else:
raise ValueError(f"Invalid config: {self.config.name}")