From 7a4d766d3728456d8b74b57ddaf6a0cceab363a4 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Thu, 9 Nov 2023 12:00:16 +0800
Subject: [PATCH 1/8] Add CebuaNER data loader

Closes #23
---
 seacrowd/sea_datasets/cebuaner/__init__.py |   0
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 151 +++++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 seacrowd/sea_datasets/cebuaner/__init__.py
 create mode 100644 seacrowd/sea_datasets/cebuaner/cebuaner.py

diff --git a/seacrowd/sea_datasets/cebuaner/__init__.py b/seacrowd/sea_datasets/cebuaner/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
new file mode 100644
index 000000000..f9de39944
--- /dev/null
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -0,0 +1,151 @@
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+import datasets
+from datasets.download.download_manager import DownloadManager
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """
+@misc{pilar2023cebuaner,
+    title={CebuaNER - A New Baseline Cebuano Named Entity Recognition Model},
+    author={Ma. Beatrice Emanuela Pilar and Ellyza Mari Papas and Mary Loise Buenaventura and Dane Dedoroy and Myron Darrel Montefalcon and Jay Rhald Padilla and Lany Maceda and Mideth Abisado and Joseph Marvin Imperial},
+    year={2023},
+    eprint={2310.00679},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+_LOCAL = False
+_LANGUAGES = ["ceb"]
+_DATASETNAME = "cebuaner"
+_DESCRIPTION = """\
+The CebuaNER dataset contains 4000+ news articles that have been tagged by
+native speakers of Cebuano using the BIO encoding scheme for the named entity
+recognition (NER) task.
+"""
+
+_HOMEPAGE = "https://github.com/mebzmoren/CebuaNER"
+_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
+_URL = "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-1.txt"
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+_SOURCE_VERSION = "1.0.0"
+_SEACROWD_VERSION = "1.0.0"
+
+
+class CebuaNERDataset(datasets.GeneratorBasedBuilder):
+    """CebuaNER dataset from https://github.com/mebzmoren/CebuaNER"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    SEACROWD_SCHEMA_NAME = "seq_label"
+    LABEL_CLASSES = [
+        "O",
+        "B-PER",
+        "I-PER",
+        "B-ORG",
+        "I-ORG",
+        "B-LOC",
+        "I-LOC",
+        "B-OTHER",
+        "I-OTHER",
+    ]
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "tokens": datasets.Sequence(datasets.Value("string")),
+                    "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=self.LABEL_CLASSES)),
+                }
+            )
+        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.seq_label_features(self.LABEL_CLASSES)
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
+        data_file = Path(dl_manager.download_and_extract(_URL))
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_file, "split": "train"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": data_file, "split": "dev"},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": data_file, "split": "test"},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        label_key = "ner_tags" if self.config.schema == "source" else "labels"
+        examples: Iterable[Dict[str, List[str]]] = []
+        with open(filepath, encoding="utf-8") as f:
+            tokens = []
+            ner_tags = []
+            for line in f:
+                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                    if tokens:
+                        examples.append({"tokens": tokens, label_key: ner_tags})
+                        if len(tokens) != len(ner_tags):
+                            raise ValueError(f"Tokens and tags are not aligned! {len(tokens)} != {len(ner_tags)}")
+                        tokens = []
+                        ner_tags = []
+                else:
+                    # CebuaNER IOB columns are separated by spaces
+                    token, _, _, ner_tag = line.split(" ")
+                    tokens.append(token)
+                    ner_tags.append(ner_tag.rstrip())
+            if tokens:
+                examples.append({"tokens": tokens, label_key: ner_tags})
+                if len(tokens) != len(ner_tags):
+                    raise ValueError(f"Tokens and tags are not aligned! {len(tokens)} != {len(ner_tags)}")
+
+        # The CebuaNER paper doesn't provide a recommended split. However, the GitHub repository
+        # contains a notebook example of the split they used in the report:
+        # https://github.com/mebzmoren/CebuaNER/blob/main/notebooks/Named-Entity-Recognition-with-Conditional-Random-Fields.ipynb
+        if split == "train":
+            final_examples = examples[0:2980]
+        if split == "test":
+            final_examples = examples[2980:3831]
+        if split == "dev":
+            final_examples = examples[3831:]
+
+        for idx, eg in enumerate(final_examples):
+            eg["id"] = idx
+            yield idx, eg

From 75455c74f6cfbaa8b0be918e903a52ed5d341cfb Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Sun, 19 Nov 2023 15:56:22 +0800
Subject: [PATCH 2/8] Turn _CITATION into a raw string

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index f9de39944..925cd5931 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -8,9 +8,9 @@
 from seacrowd.utils.configs import SEACrowdConfig
 from seacrowd.utils.constants import Licenses, Tasks
 
-_CITATION = """
+_CITATION = r"""
 @misc{pilar2023cebuaner,
-    title={CebuaNER - A New Baseline Cebuano Named Entity Recognition Model},
+    title={CebuaNER: A New Baseline Cebuano Named Entity Recognition Model},
     author={Ma. Beatrice Emanuela Pilar and Ellyza Mari Papas and Mary Loise Buenaventura and Dane Dedoroy and Myron Darrel Montefalcon and Jay Rhald Padilla and Lany Maceda and Mideth Abisado and Joseph Marvin Imperial},
     year={2023},
     eprint={2310.00679},
     archivePrefix={arXiv},

From b146acc9e928ff4bfb56136e7a365cf2895ad798 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Sun, 19 Nov 2023 15:59:46 +0800
Subject: [PATCH 3/8] Add comment on how each document is separated in the file

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index 925cd5931..8cc3b24d7 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -119,6 +119,8 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
             tokens = []
             ner_tags = []
             for line in f:
+                # There's no clear delimiter in the IOB file so I'm separating each example based on the newline.
+                # The -DOCSTART- delimiter only shows up in the very first example.
                 if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                     if tokens:
                         examples.append({"tokens": tokens, label_key: ner_tags})

From f35b0c9cff3ba6270845152307a3bcdd5d7b4c96 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Sun, 19 Nov 2023 16:35:28 +0800
Subject: [PATCH 4/8] Create different configs for each annotator

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index 8cc3b24d7..02e39cf5e 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -30,7 +30,10 @@
 _HOMEPAGE = "https://github.com/mebzmoren/CebuaNER"
 _LICENSE = Licenses.CC_BY_NC_SA_4_0.value
-_URL = "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-1.txt"
+_URLS = {
+    "annotator_0": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-1.txt",
+    "annotator_1": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-2.txt",
+}
 
 _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
 _SOURCE_VERSION = "1.0.0"
 _SEACROWD_VERSION = "1.0.0"
@@ -56,24 +59,31 @@ class CebuaNERDataset(datasets.GeneratorBasedBuilder):
         "I-OTHER",
     ]
 
-    BUILDER_CONFIGS = [
-        SEACrowdConfig(
-            name=f"{_DATASETNAME}_source",
+    # There are two annotators in the CebuaNER dataset but there's no canonical
+    # label. Here, we decided to create loaders for both annotators. The
+    # inter-annotator reliability is high so it's possible to treat either as
+    # gold-standard data.
+    dataset_names = sorted([f"{_DATASETNAME}_{annot}" for annot in _URLS.keys()])
+    BUILDER_CONFIGS = []
+    for name in dataset_names:
+        source_config = SEACrowdConfig(
+            name=f"{name}_source",
             version=SOURCE_VERSION,
             description=f"{_DATASETNAME} source schema",
             schema="source",
-            subset_id=_DATASETNAME,
-        ),
-        SEACrowdConfig(
-            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=name,
+        )
+        BUILDER_CONFIGS.append(source_config)
+        seacrowd_config = SEACrowdConfig(
+            name=f"{name}_seacrowd_{SEACROWD_SCHEMA_NAME}",
             version=SEACROWD_VERSION,
             description=f"{_DATASETNAME} SEACrowd schema",
             schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
-            subset_id=_DATASETNAME,
-        ),
-    ]
+            subset_id=name,
+        )
+        BUILDER_CONFIGS.append(seacrowd_config)
 
-    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_1_source"
 
     def _info(self) -> datasets.DatasetInfo:
         if self.config.schema == "source":
@@ -96,7 +106,9 @@ def _info(self) -> datasets.DatasetInfo:
         )
 
     def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
-        data_file = Path(dl_manager.download_and_extract(_URL))
+        _, annotator = self.config.subset_id.split("_", 1)
+        url = _URLS[annotator]
+        data_file = Path(dl_manager.download_and_extract(url))
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,

From e18deefe25b45aa810126e7936a7b38d342e35eb Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 20 Nov 2023 09:02:43 +0800
Subject: [PATCH 5/8] Expand comment on article delimiters

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index 02e39cf5e..a2235a640 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -132,7 +132,9 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
             ner_tags = []
             for line in f:
                 # There's no clear delimiter in the IOB file so I'm separating each example based on the newline.
-                # The -DOCSTART- delimiter only shows up in the very first example.
+                # The -DOCSTART- delimiter only shows up in the very first example. In their notebook example
+                # https://github.com/mebzmoren/CebuaNER/blob/main/notebooks/Named-Entity-Recognition-with-Conditional-Random-Fields.ipynb,
+                # they used '' as their article delimiter.
                 if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                     if tokens:
                         examples.append({"tokens": tokens, label_key: ner_tags})

From 46c255ad0beb95c258f9fa2f36e2475b96d09098 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 20 Nov 2023 09:22:17 +0800
Subject: [PATCH 6/8] Reimplement base config using annotator_1

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index a2235a640..fcdce46bd 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -83,6 +83,25 @@ class CebuaNERDataset(datasets.GeneratorBasedBuilder):
         )
         BUILDER_CONFIGS.append(seacrowd_config)
 
+    # Create a configuration that loads the annotations of the first annotator
+    # and treat that as the default.
+    BUILDER_CONFIGS.extend([
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=_DATASETNAME,
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=_DATASETNAME,
+        ),
+    ])
+
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_1_source"
 
     def _info(self) -> datasets.DatasetInfo:
@@ -106,8 +125,11 @@ def _info(self) -> datasets.DatasetInfo:
         )
 
     def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
-        _, annotator = self.config.subset_id.split("_", 1)
-        url = _URLS[annotator]
+        if self.config.subset_id == _DATASETNAME:
+            url = _URLS["annotator_0"]
+        else:
+            _, annotator = self.config.subset_id.split("_", 1)
+            url = _URLS[annotator]
         data_file = Path(dl_manager.download_and_extract(url))
         return [
             datasets.SplitGenerator(
@@ -145,6 +167,8 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
                 else:
                     # CebuaNER IOB columns are separated by spaces
                     token, _, _, ner_tag = line.split(" ")
+                    if ner_tag.rstrip() not in self.LABEL_CLASSES:
+                        print(token, ner_tag, line)
                     tokens.append(token)
                     ner_tags.append(ner_tag.rstrip())
             if tokens:

From 841c6f136cc167aab8e1fb2d74179edd92e7b063 Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 20 Nov 2023 18:20:26 +0800
Subject: [PATCH 7/8] Remove debug line

---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index fcdce46bd..f7fabdeaf 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -167,8 +167,6 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
                 else:
                     # CebuaNER IOB columns are separated by spaces
                     token, _, _, ner_tag = line.split(" ")
-                    if ner_tag.rstrip() not in self.LABEL_CLASSES:
-                        print(token, ner_tag, line)
                     tokens.append(token)
                     ner_tags.append(ner_tag.rstrip())
             if tokens:

From b0c0226ad4415ccba2aaf7fcaa1727f9a6d9118f Mon Sep 17 00:00:00 2001
From: Lj Miranda
Date: Mon, 20 Nov 2023 20:11:39 +0800
Subject: [PATCH 8/8] Implement minor updates

- Don't hardcode "annotator_1" and use _DEFAULT_ANNOTATOR with comments.
- Fix some issues with the default config
---
 seacrowd/sea_datasets/cebuaner/cebuaner.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/seacrowd/sea_datasets/cebuaner/cebuaner.py b/seacrowd/sea_datasets/cebuaner/cebuaner.py
index f7fabdeaf..55a1a525f 100644
--- a/seacrowd/sea_datasets/cebuaner/cebuaner.py
+++ b/seacrowd/sea_datasets/cebuaner/cebuaner.py
@@ -31,10 +31,14 @@
 _HOMEPAGE = "https://github.com/mebzmoren/CebuaNER"
 _LICENSE = Licenses.CC_BY_NC_SA_4_0.value
 _URLS = {
-    "annotator_0": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-1.txt",
-    "annotator_1": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-2.txt",
+    "annotator_1": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-1.txt",
+    "annotator_2": "https://github.com/mebzmoren/CebuaNER/raw/main/data/annotated_data/final-2.txt",
 }
 
+# The alignment between annotators is high, and both can be used as gold-standard data.
+# Hence, we default to the first annotator.
+_DEFAULT_ANNOTATOR = "annotator_1"
+
 _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
 _SOURCE_VERSION = "1.0.0"
 _SEACROWD_VERSION = "1.0.0"
@@ -102,7 +106,7 @@ class CebuaNERDataset(datasets.GeneratorBasedBuilder):
         ),
     ])
 
-    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_1_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
         if self.config.schema == "source":
@@ -126,7 +130,7 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
         if self.config.subset_id == _DATASETNAME:
-            url = _URLS["annotator_0"]
+            url = _URLS[_DEFAULT_ANNOTATOR]
         else:
             _, annotator = self.config.subset_id.split("_", 1)
             url = _URLS[annotator]
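
A minimal usage sketch (not part of the patch series above): this is how the
finished loader could be called through the Hugging Face `datasets` library.
The config names follow the BUILDER_CONFIGS patterns defined in the loader;
the local script path is an assumption that depends on where the SEACrowd
checkout lives, and recent `datasets` releases may additionally require
trust_remote_code=True for script-based loaders.

    import datasets

    # Default config: annotator_1's annotations in the source schema
    # (DEFAULT_CONFIG_NAME is "cebuaner_source" after PATCH 8/8).
    ds = datasets.load_dataset(
        "seacrowd/sea_datasets/cebuaner/cebuaner.py",  # assumed local path
        name="cebuaner_source",
    )

    # Annotator-specific config in the SEACrowd seq_label schema.
    ds_a2 = datasets.load_dataset(
        "seacrowd/sea_datasets/cebuaner/cebuaner.py",  # assumed local path
        name="cebuaner_annotator_2_seacrowd_seq_label",
    )

    # Source-schema examples carry parallel token/tag sequences; the tags are
    # class indices into LABEL_CLASSES.
    example = ds["train"][0]
    print(example["tokens"][:5])
    print(example["ner_tags"][:5])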