From 492d19eb3dab4fc5918398de28f90aa4164863b7 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Sun, 19 Nov 2023 17:14:32 +0900 Subject: [PATCH 1/8] Add dataset loader for UIT-ViSD4SA --- seacrowd/sea_datasets/uit_visd4sa/__init__.py | 0 .../sea_datasets/uit_visd4sa/uit_visd4sa.py | 184 ++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 seacrowd/sea_datasets/uit_visd4sa/__init__.py create mode 100644 seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py diff --git a/seacrowd/sea_datasets/uit_visd4sa/__init__.py b/seacrowd/sea_datasets/uit_visd4sa/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py new file mode 100644 index 000000000..f6fe31d0a --- /dev/null +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -0,0 +1,184 @@ +# coding=utf-8 + +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses +from seacrowd.utils.constants import Tasks + +_CITATION = """\ +@inproceedings{thanh-etal-2021-span, + title = "Span Detection for Aspect-Based Sentiment Analysis in Vietnamese", + author = "Thanh, Kim Nguyen Thi and + Khai, Sieu Huynh and + Huynh, Phuc Pham and + Luc, Luong Phan and + Nguyen, Duc-Vu and + Van, Kiet Nguyen", + booktitle = "Proceedings of the 35th Pacific Asia Conference on Language, Information and Computation", + year = "2021", + publisher = "Association for Computational Lingustics", + url = "https://aclanthology.org/2021.paclic-1.34", + pages = "318--328", +} +""" + +_DATASETNAME = "uit_visd4sa" + +_DESCRIPTION = """\ +This dataset is designed for span detection for aspect-based sentiment analysis NLP task. +A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback +comments for evaluating span detection for aspect-based sentiment analysis for mobile e-commerce +""" + +_HOMEPAGE = "https://github.com/kimkim00/UIT-ViSD4SA" + +_LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +_URLS = { + "train": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/train.jsonl", + "dev": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/dev.jsonl", + "test": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/test.jsonl" +} + +_SUPPORTED_TASKS = [ + Tasks.ASPECT_BASED_SENTIMENT_ANALYSIS] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UITViSD4SADataset(datasets.GeneratorBasedBuilder): + """This dataset is designed for span detection for aspect-based sentiment analysis NLP task. +A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback +comments for evaluating span detection for aspect-based sentiment analysis for mobile e-commerce""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="uit_visd4sa_source", + version=SOURCE_VERSION, + description="uit_visd4sa source schema", + schema="source", + subset_id="uit_visd4sa", + ), + SEACrowdConfig( + name="uit_visd4sa_seacrowd_kb", + version=SEACROWD_VERSION, + description="uit_visd4sa SEACrowd schema", + schema="seacrowd_kb", + subset_id="uit_visd4sa", + ), + ] + + DEFAULT_CONFIG_NAME = "uit_visd4sa_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "text": datasets.Value("string"), + "label": datasets.Sequence({ + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "aspect": datasets.Value("string"), + "rating": datasets.Value("string") + }), + } + ) + + elif self.config.schema == "seacrowd_kb": + # e.g. features = schemas.kb_features + features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + train_path = dl_manager.download_and_extract(_URLS["train"]) + dev_path = dl_manager.download_and_extract(_URLS["dev"]) + test_path = dl_manager.download_and_extract(_URLS["test"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": train_path, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": test_path, + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": dev_path, + "split": "dev", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + with open(filepath, "r") as f: + df = [json.loads(line) for line in f.readlines()] + f.close() + if self.config.schema == "source": + for _id, row in enumerate(df): + labels = row["labels"] + entry_label = [] + for lb in labels: + entry_label.append({ + "start": lb[0], + "end": lb[1], + "aspect": lb[-1].split('#')[0], + "rating": lb[-1].split('#')[-1] + }) + entry = { + "text": row["text"], + "label": entry_label, + } + yield _id, entry + + elif self.config.schema == "seacrowd_kb": + for _id, row in enumerate(df): + entry = { + "id": _id, + "passages": [{ + "id": _id, + "type": "text", + "text": [row["text"]], + "offsets": [[0, len(row["text"])]], + }], + "entities": [{ + "id": str(_id) + '-' + str(lbl_id), + "type": label[-1].split('#')[0], # ASPECT NAME + "text": [label[-1].split('#')[-1]], # RATING (POSITIVE / NEGATIVE) + "offsets": [label[:2]], # [START, END] + "normalized": [], + } for lbl_id, label in enumerate(row["labels"])], + "events": [], + "coreferences": [], + "relations": [], + } + yield _id, entry From a431cc1afa96b4c67c8eebc9ef528b1c248ee4b3 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Mon, 20 Nov 2023 18:34:20 +0900 Subject: [PATCH 2/8] Adjusting the task closest to the fitting schema and change the entity/coref setting --- .../sea_datasets/uit_visd4sa/uit_visd4sa.py | 61 ++++++++----------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index f6fe31d0a..1d182ca52 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -8,8 +8,7 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Licenses -from seacrowd.utils.constants import Tasks +from seacrowd.utils.constants import Licenses, Tasks _CITATION = """\ @inproceedings{thanh-etal-2021-span, @@ -32,7 +31,7 @@ _DESCRIPTION = """\ This dataset is designed for span detection for aspect-based sentiment analysis NLP task. -A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback +A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback comments for evaluating span detection for aspect-based sentiment analysis for mobile e-commerce """ @@ -43,11 +42,10 @@ _URLS = { "train": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/train.jsonl", "dev": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/dev.jsonl", - "test": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/test.jsonl" + "test": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/test.jsonl", } -_SUPPORTED_TASKS = [ - Tasks.ASPECT_BASED_SENTIMENT_ANALYSIS] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.COREFERENCE_RESOLUTION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] _SOURCE_VERSION = "1.0.0" @@ -56,8 +54,8 @@ class UITViSD4SADataset(datasets.GeneratorBasedBuilder): """This dataset is designed for span detection for aspect-based sentiment analysis NLP task. -A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback -comments for evaluating span detection for aspect-based sentiment analysis for mobile e-commerce""" + A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback + comments for evaluating span detection for aspect-based sentiment analysis for mobile e-commerce""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) @@ -87,12 +85,7 @@ def _info(self) -> datasets.DatasetInfo: features = datasets.Features( { "text": datasets.Value("string"), - "label": datasets.Sequence({ - "start": datasets.Value("int32"), - "end": datasets.Value("int32"), - "aspect": datasets.Value("string"), - "rating": datasets.Value("string") - }), + "label": datasets.Sequence({"start": datasets.Value("int32"), "end": datasets.Value("int32"), "aspect": datasets.Value("string"), "rating": datasets.Value("string")}), } ) @@ -148,12 +141,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: labels = row["labels"] entry_label = [] for lb in labels: - entry_label.append({ - "start": lb[0], - "end": lb[1], - "aspect": lb[-1].split('#')[0], - "rating": lb[-1].split('#')[-1] - }) + entry_label.append({"start": lb[0], "end": lb[1], "aspect": lb[-1].split("#")[0], "rating": lb[-1].split("#")[-1]}) entry = { "text": row["text"], "label": entry_label, @@ -164,21 +152,26 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: for _id, row in enumerate(df): entry = { "id": _id, - "passages": [{ - "id": _id, - "type": "text", - "text": [row["text"]], - "offsets": [[0, len(row["text"])]], - }], - "entities": [{ - "id": str(_id) + '-' + str(lbl_id), - "type": label[-1].split('#')[0], # ASPECT NAME - "text": [label[-1].split('#')[-1]], # RATING (POSITIVE / NEGATIVE) - "offsets": [label[:2]], # [START, END] - "normalized": [], - } for lbl_id, label in enumerate(row["labels"])], + "passages": [ + { + "id": "text-" + str(_id), + "type": "text", + "text": [row["text"]], + "offsets": [[0, len(row["text"])]], + } + ], + "entities": [ + { + "id": str(_id) + "-aspect-rating-" + str(lbl_id), + "type": label[-1], # (ASPECT NAME # RATING (POSITIVE / NEGATIVE)) + "text": [row["text"][label[0] : label[1]]], # PART OF TEXT AFFECTED BY THE TYPE, + "offsets": [label[:2]], # [START, END] + "normalized": [], + } + for lbl_id, label in enumerate(row["labels"]) + ], "events": [], - "coreferences": [], + "coreferences": [{"id": str(_id) + "-0", "entity_ids": [str(_id) + "-aspect-rating-" + str(lbl_id) for lbl_id, _ in enumerate(row["labels"])]}], "relations": [], } yield _id, entry From dd5ecea62a04469f658f719865694693f18d61aa Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Mon, 4 Dec 2023 00:05:09 +0900 Subject: [PATCH 3/8] Change task to SPAN_BASED_ABSA --- seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index 1d182ca52..2cb75a60d 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -45,7 +45,7 @@ "test": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/test.jsonl", } -_SUPPORTED_TASKS = [Tasks.COREFERENCE_RESOLUTION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.SPAN_BASED_ABSA] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] _SOURCE_VERSION = "1.0.0" From e614dbc4ee89f0ff355cc1845b42c2db1890ce30 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Mon, 4 Dec 2023 00:07:07 +0900 Subject: [PATCH 4/8] Change config name with _DATASETNAME --- seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index 2cb75a60d..d3ea28be1 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -37,7 +37,7 @@ _HOMEPAGE = "https://github.com/kimkim00/UIT-ViSD4SA" -_LICENSE = Licenses.UNKNOWN.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value +_LICENSE = Licenses.UNKNOWN.value _URLS = { "train": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/train.jsonl", @@ -45,7 +45,7 @@ "test": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/test.jsonl", } -_SUPPORTED_TASKS = [Tasks.SPAN_BASED_ABSA] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] +_SUPPORTED_TASKS = [Tasks.SPAN_BASED_ABSA] _SOURCE_VERSION = "1.0.0" @@ -62,14 +62,14 @@ class UITViSD4SADataset(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ SEACrowdConfig( - name="uit_visd4sa_source", + name=f"{_DATASETNAME}_source", version=SOURCE_VERSION, description="uit_visd4sa source schema", schema="source", subset_id="uit_visd4sa", ), SEACrowdConfig( - name="uit_visd4sa_seacrowd_kb", + name=f"{_DATASETNAME}_seacrowd_kb", version=SEACROWD_VERSION, description="uit_visd4sa SEACrowd schema", schema="seacrowd_kb", @@ -77,7 +77,7 @@ class UITViSD4SADataset(datasets.GeneratorBasedBuilder): ), ] - DEFAULT_CONFIG_NAME = "uit_visd4sa_source" + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" def _info(self) -> datasets.DatasetInfo: From 4322a8c5514526d1eae149c70927888774c2fae0 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Mon, 4 Dec 2023 00:08:21 +0900 Subject: [PATCH 5/8] Remove unecessary split kwargs and simplify path --- seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index d3ea28be1..e4b28e06e 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -103,35 +103,31 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - train_path = dl_manager.download_and_extract(_URLS["train"]) - dev_path = dl_manager.download_and_extract(_URLS["dev"]) - test_path = dl_manager.download_and_extract(_URLS["test"]) + path_dict = dl_manager.download_and_extract(_URLS) + train_path, dev_path, test_path = path_dict["train"], path_dict["dev"], path_dict["test"] return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ "filepath": train_path, - "split": "train", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "filepath": test_path, - "split": "test", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "filepath": dev_path, - "split": "dev", }, ), ] - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" with open(filepath, "r") as f: df = [json.loads(line) for line in f.readlines()] From c9d218931bc75a717b014031e2b8d3682d444ca5 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Wed, 13 Dec 2023 17:03:53 +0900 Subject: [PATCH 6/8] change the schema to seacrowd_seq_label and implement the IOB sequence label --- .../sea_datasets/uit_visd4sa/uit_visd4sa.py | 91 ++++++++++++------- 1 file changed, 58 insertions(+), 33 deletions(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index e4b28e06e..befd1bddb 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -2,6 +2,7 @@ import json from pathlib import Path +import re from typing import Dict, List, Tuple import datasets @@ -52,6 +53,52 @@ _SEACROWD_VERSION = "1.0.0" +def construct_label_classes(): + IOB_tag = ["I", "O", "B"] + aspects = ["SCREEN", "CAMERA", "FEATURES", "BATTERY", "PERFORMANCE", "STORAGE", "DESIGN", "PRICE", "GENERAL", "SER&ACC"] + ratings = ["POSITIVE", "NEUTRAL", "NEGATIVE"] + label_classes = [] + for iob in IOB_tag: + if iob == "O": + label_classes.append("O") + else: + for aspect in aspects: + for rating in ratings: + label_classes.append("{iob}-{aspect}#{rating}".format(iob=iob, aspect=aspect, rating=rating)) + return label_classes + + +def construct_IOB_sequences(text, labels): + labels.sort() + word_start = [0] + [match.start() + 1 for match in re.finditer(" ", text)] + is_not_O = False + iob_sequence = [] + word_count = 0 + lb_count = 0 + + while word_count < len(word_start): + if lb_count == len(labels): + for x in range(word_count, len(word_start)): + iob_sequence.append("O") + break + if not is_not_O: + if word_start[word_count] >= labels[lb_count][0]: + is_not_O = True + iob_sequence.append("B-" + labels[lb_count][-1]) + word_count += 1 + else: + iob_sequence.append("O") + word_count += 1 + else: + if word_start[word_count] > labels[lb_count][1]: + is_not_O = False + lb_count += 1 + else: + iob_sequence.append("I-" + labels[lb_count][-1]) + word_count += 1 + return iob_sequence + + class UITViSD4SADataset(datasets.GeneratorBasedBuilder): """This dataset is designed for span detection for aspect-based sentiment analysis NLP task. A Vietnamese dataset consisting of 35,396 human-annotated spans on 11,122 feedback @@ -69,10 +116,10 @@ class UITViSD4SADataset(datasets.GeneratorBasedBuilder): subset_id="uit_visd4sa", ), SEACrowdConfig( - name=f"{_DATASETNAME}_seacrowd_kb", + name=f"{_DATASETNAME}_seacrowd_seq_label", version=SEACROWD_VERSION, description="uit_visd4sa SEACrowd schema", - schema="seacrowd_kb", + schema="seacrowd_seq_label", subset_id="uit_visd4sa", ), ] @@ -80,7 +127,6 @@ class UITViSD4SADataset(datasets.GeneratorBasedBuilder): DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" def _info(self) -> datasets.DatasetInfo: - if self.config.schema == "source": features = datasets.Features( { @@ -89,9 +135,8 @@ def _info(self) -> datasets.DatasetInfo: } ) - elif self.config.schema == "seacrowd_kb": - # e.g. features = schemas.kb_features - features = schemas.kb_features + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(construct_label_classes()) return datasets.DatasetInfo( description=_DESCRIPTION, @@ -135,39 +180,19 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: if self.config.schema == "source": for _id, row in enumerate(df): labels = row["labels"] - entry_label = [] + entry_labels = [] for lb in labels: - entry_label.append({"start": lb[0], "end": lb[1], "aspect": lb[-1].split("#")[0], "rating": lb[-1].split("#")[-1]}) + entry_labels.append({"start": lb[0], "end": lb[1], "aspect": lb[-1].split("#")[0], "rating": lb[-1].split("#")[-1]}) entry = { "text": row["text"], - "label": entry_label, + "label": entry_labels, } yield _id, entry - - elif self.config.schema == "seacrowd_kb": + elif self.config.schema == "seacrowd_seq_label": for _id, row in enumerate(df): entry = { - "id": _id, - "passages": [ - { - "id": "text-" + str(_id), - "type": "text", - "text": [row["text"]], - "offsets": [[0, len(row["text"])]], - } - ], - "entities": [ - { - "id": str(_id) + "-aspect-rating-" + str(lbl_id), - "type": label[-1], # (ASPECT NAME # RATING (POSITIVE / NEGATIVE)) - "text": [row["text"][label[0] : label[1]]], # PART OF TEXT AFFECTED BY THE TYPE, - "offsets": [label[:2]], # [START, END] - "normalized": [], - } - for lbl_id, label in enumerate(row["labels"]) - ], - "events": [], - "coreferences": [{"id": str(_id) + "-0", "entity_ids": [str(_id) + "-aspect-rating-" + str(lbl_id) for lbl_id, _ in enumerate(row["labels"])]}], - "relations": [], + "id": str(_id), + "tokens": row["text"].split(" "), + "labels": construct_IOB_sequences(row["text"], row["labels"]), } yield _id, entry From 564f7bdb4112526dceb1fe9124d2355166b46ae2 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Tue, 19 Dec 2023 13:51:22 +0900 Subject: [PATCH 7/8] Add _LANGUAGES constant --- seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index befd1bddb..61e199633 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -40,6 +40,8 @@ _LICENSE = Licenses.UNKNOWN.value +_LANGUAGES = "vie" + _URLS = { "train": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/train.jsonl", "dev": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/dev.jsonl", From b83b43a52b4a68e854fa5efef20b74effc5cdfc3 Mon Sep 17 00:00:00 2001 From: Jennifer Santoso Date: Tue, 19 Dec 2023 14:14:46 +0900 Subject: [PATCH 8/8] _LANGUAGES constant changed to array --- seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py index 61e199633..5e18a4211 100644 --- a/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py +++ b/seacrowd/sea_datasets/uit_visd4sa/uit_visd4sa.py @@ -40,7 +40,7 @@ _LICENSE = Licenses.UNKNOWN.value -_LANGUAGES = "vie" +_LANGUAGES = ["vie"] _URLS = { "train": "https://raw.githubusercontent.com/kimkim00/UIT-ViSD4SA/main/data/train.jsonl",