From bccbffbd0d6161676adec509665606ed64f0c420 Mon Sep 17 00:00:00 2001 From: jonibek1999 Date: Sat, 6 Jan 2024 22:21:10 +0400 Subject: [PATCH 1/3] Add alt_burmese_treebank dataloader --- .../alt_burmese_treebank/__init__.py | 0 .../alt_burmese_treebank.py | 151 ++++++++++++++++++ .../alt_burmese_treebank/utils/__init__.py | 0 .../utils/alt_burmese_treebank_utils.py | 71 ++++++++ 4 files changed, 222 insertions(+) create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py b/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py new file mode 100644 index 000000000..ad781f105 --- /dev/null +++ b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# --------------------------------------------------------------------------
# NOTE(review): the SOURCE is a three-patch git series whose physical
# newlines were lost in transit; the code below is a clean reconstruction of
# the FINAL state (patch 3/3 applied) of the two files the series adds:
#   seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
#   seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py
# Concrete fixes relative to the series are marked "FIX:".
# --------------------------------------------------------------------------

# ==========================================================================
# File: seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
# (carries the Apache-2.0 license header from patch 1/3)
# ==========================================================================
import os
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data
from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{
    10.1145/3373268,
    author = {Ding, Chenchen and Yee, Sann Su Su and Pa, Win Pa and Soe, Khin Mar and Utiyama, Masao and Sumita, Eiichiro},
    title = {A Burmese (Myanmar) Treebank: Guideline and Analysis},
    year = {2020},
    issue_date = {May 2020},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    volume = {19},
    number = {3},
    issn = {2375-4699},
    url = {https://doi.org/10.1145/3373268},
    doi = {10.1145/3373268},
    abstract = {A 20,000-sentence Burmese (Myanmar) treebank on news articles has been released under a CC BY-NC-SA license.\
 Complete phrase structure annotation was developed for each sentence from the morphologically annotated data\
 prepared in previous work of Ding et al. [1]. As the final result of the Burmese component in the Asian\
 Language Treebank Project, this is the first large-scale, open-access treebank for the Burmese language.\
 The annotation details and features of this treebank are presented.\
 },
    journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
    month = {jan},
    articleno = {40},
    numpages = {13},
    keywords = {Burmese (Myanmar), phrase structure, treebank}
}
"""

_DATASETNAME = "alt_burmese_treebank"

_DESCRIPTION = """\
A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\
As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\
open-access treebank for the Burmese language.
"""

_HOMEPAGE = "https://zenodo.org/records/3463010"

_LANGUAGES = ["mya"]

_LICENSE = Licenses.CC_BY_NC_SA_4_0.value

_LOCAL = False

_URLS = {
    _DATASETNAME: "https://zenodo.org/records/3463010/files/my-alt-190530.zip?download=1",
}

_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class AltBurmeseTreebank(datasets.GeneratorBasedBuilder):
    """A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\
    As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\
    open-access treebank for the Burmese language."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_tree",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema="seacrowd_tree",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Declare the feature schema for each config (source / seacrowd_tree)."""
        if self.config.schema == "source":
            # Source rows are the raw "<id>\t<bracketed tree>" lines.
            features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")})
        elif self.config.schema == "seacrowd_tree":
            features = schemas.tree_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators (the release ships a single train file)."""
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "my-alt-190530/data"),
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        # FIX: the data is Burmese text — force UTF-8 instead of relying on
        # the platform default encoding (which is not UTF-8 on e.g. Windows).
        if self.config.schema == "source":
            with open(filepath, "r", encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    # FIX: split on the FIRST tab only (a literal tab inside
                    # the tree previously truncated the text) and drop the
                    # trailing newline from the text field.
                    sent_id, _, text = line.rstrip("\n").partition("\t")
                    yield idx, {"id": sent_id, "text": text}

        elif self.config.schema == "seacrowd_tree":
            with open(filepath, "r", encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    yield idx, extract_data(line)


# ==========================================================================
# File: seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py
# ==========================================================================
import re


def extract_parts(input_string):
    """Return the top-level "(...)"-delimited constituents of *input_string*,
    each with its outer brackets and surrounding whitespace stripped.

    Text outside any bracket (e.g. the leading "ROOT " tag) is ignored.
    """
    parts = []
    stack = []
    current_part = ""

    for char in input_string:
        if char == "(":
            stack.append("(")
        elif char == ")":
            if stack:
                stack.pop()
                if not stack:
                    # A top-level group just closed; emit it without its "(".
                    parts.append(current_part[1:].strip())
                    current_part = ""
            else:
                parts.append(current_part[1:].strip())
                current_part = ""
        if stack:
            current_part += char

    # FIX: an unbalanced trailing ")" — e.g. the one closing "(ROOT" when the
    # caller passes everything from "ROOT" onwards — previously appended an
    # empty string here, which extract_data turned into a garbage node with
    # type "" and offsets [0, -1]. Drop empty parts; balanced input never
    # produces them.
    return [part for part in parts if part]


def extract_sentence(input_string):
    """Concatenate the terminal tokens of a bracketed constituent.

    Innermost groups look like "(pos token)"; the second whitespace-separated
    field of each is the surface token. A constituent with no brackets at all
    is treated as a single "pos token ..." sequence.
    """
    innermost_pattern = re.compile(r"\(([^()]+)\)")
    innermost_matches = re.findall(innermost_pattern, input_string)
    # NOTE(review): match.split()[1] raises IndexError on a one-field group
    # like "(x)" — assumed not to occur in this treebank; confirm against the
    # released data.
    extracted_sentence = " ".join(match.split()[1] for match in innermost_matches)
    if len(extracted_sentence) == 0:
        extracted_sentence = " ".join(input_string.split()[1:])
    return extracted_sentence


def extract_data(sentence):
    """Convert one treebank line ("<id>\\t<bracketed tree>") into a SEACrowd
    tree-schema dict.

    Node ids are made globally unique by prefixing the sentence id
    ("<sentence_id>.<n>", root is "<sentence_id>.0"), as introduced by
    patches 2/3 and 3/3 of this series.
    """
    nodes = []
    sub_nodes = {}
    sub_node_ids = []

    # FIX: strip the newline explicitly instead of slicing off the last
    # character ([:-1]); on a final line with no trailing "\n" the old code
    # chopped the tree's closing bracket instead.
    sentence = sentence.rstrip("\n")

    # Extract id, sub_nodes and text of ROOT.
    sentence_id = sentence.split("\t")[0]
    root_sent = sentence[sentence.find("ROOT"):]
    root_subnodes = extract_parts(root_sent)
    sub_nodes.update({i + 1: root_subnodes[i] for i in range(len(root_subnodes))})
    sub_node_ids.extend([i + 1 for i in range(len(root_subnodes))])
    root_text = extract_sentence(root_sent)

    nodes.append(
        {
            "id": f"{sentence_id}.0",
            "type": "ROOT",
            "text": root_text,
            "offsets": [0, len(root_text) - 1],
            "subnodes": [f"{sentence_id}.{i}" for i in sub_node_ids],
        }
    )

    # Breadth-first walk over pending constituents. Ids are handed out in
    # strictly increasing order, so the largest pending id (or, when the
    # queue is empty, the id just popped) is the global maximum — new child
    # ids start one past it.
    while sub_node_ids:
        sub_node_id = sub_node_ids.pop(0)
        text = extract_sentence(sub_nodes[sub_node_id])

        cur_subnodes = extract_parts(sub_nodes[sub_node_id])

        if len(cur_subnodes) > 0:
            id_to_add = sub_node_ids[-1] if len(sub_node_ids) > 0 else sub_node_id
            cur_subnode_ids = [id_to_add + i + 1 for i in range(len(cur_subnodes))]
            sub_nodes.update({id_to_add + i + 1: cur_subnodes[i] for i in range(len(cur_subnodes))})
            sub_node_ids.extend(cur_subnode_ids)
        else:
            cur_subnode_ids = []

        node_type = sub_nodes[sub_node_id].split(" ")[0]
        # NOTE(review): str.find locates the FIRST occurrence, so offsets can
        # point at an earlier repetition when two subtrees yield the same
        # token string — confirm whether exact offsets matter downstream.
        start = root_text.find(text)
        end = start + len(text) - 1

        nodes.append(
            {
                "id": f"{sentence_id}.{sub_node_id}",
                "type": node_type,
                "text": text,
                "offsets": [start, end],
                "subnodes": [f"{sentence_id}.{i}" for i in cur_subnode_ids],
            }
        )
    return {
        "id": sentence_id,
        "passage": {"id": sentence_id + "_0", "type": None, "text": [nodes[0]["text"]], "offsets": nodes[0]["offsets"]},
        "nodes": nodes,
    }