diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py b/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
new file mode 100644
index 000000000..ad781f105
--- /dev/null
+++ b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+_CITATION = """\
+@article{10.1145/3373268,
+    author = {Ding, Chenchen and Yee, Sann Su Su and Pa, Win Pa and Soe, Khin Mar and Utiyama, Masao and Sumita, Eiichiro},
+    title = {A Burmese (Myanmar) Treebank: Guideline and Analysis},
+    year = {2020},
+    issue_date = {May 2020},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    volume = {19},
+    number = {3},
+    issn = {2375-4699},
+    url = {https://doi.org/10.1145/3373268},
+    doi = {10.1145/3373268},
+    abstract = {A 20,000-sentence Burmese (Myanmar) treebank on news articles has been released under a CC BY-NC-SA license.\
+    Complete phrase structure annotation was developed for each sentence from the morphologically annotated data\
+    prepared in previous work of Ding et al. [1]. As the final result of the Burmese component in the Asian\
+    Language Treebank Project, this is the first large-scale, open-access treebank for the Burmese language.\
+    The annotation details and features of this treebank are presented.\
+    },
+    journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+    month = {jan},
+    articleno = {40},
+    numpages = {13},
+    keywords = {Burmese (Myanmar), phrase structure, treebank}
+}
+"""
+
+_DATASETNAME = "alt_burmese_treebank"
+
+_DESCRIPTION = """\
+A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation. \
+As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale, \
+open-access treebank for the Burmese language.
+""" + +_HOMEPAGE = "https://zenodo.org/records/3463010" + +_LANGUAGES = ["mya"] + +_LICENSE = Licenses.CC_BY_NC_SA_4_0.value + +_LOCAL = False + +_URLS = { + _DATASETNAME: "https://zenodo.org/records/3463010/files/my-alt-190530.zip?download=1", +} + +_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class AltBurmeseTreebank(datasets.GeneratorBasedBuilder): + """A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\ + As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\ + open-access treebank for the Burmese language.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_tree", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd schema", + schema="seacrowd_tree", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")}) + elif self.config.schema == "seacrowd_tree": + features = schemas.tree_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(data_dir, "my-alt-190530/data"), + "split": "train", + }, + ), + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + with open(filepath, "r") as f: + for idx, line in enumerate(f): + example = {"id": line.split("\t")[0], "text": line.split("\t")[1]} + yield idx, example + + elif self.config.schema == "seacrowd_tree": + with open(filepath, "r") as f: + for idx, line in enumerate(f): + example = extract_data(line) + yield idx, example diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py b/seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py b/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py new file mode 100644 index 000000000..3e78e6cc6 --- /dev/null +++ b/seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py @@ -0,0 +1,70 @@ +import re + + +def extract_parts(input_string): + parts = [] + stack = [] + current_part = "" + + for char in input_string: + if char == "(": + stack.append("(") + elif char == ")": + if stack: + stack.pop() + if not stack: + parts.append(current_part[1:].strip()) + current_part = "" + else: + parts.append(current_part[1:].strip()) + current_part = "" + if stack: + current_part += char + + return parts + + +def extract_sentence(input_string): + 
+    innermost_pattern = re.compile(r"\(([^()]+)\)")
+    innermost_matches = re.findall(innermost_pattern, input_string)
+    extracted_sentence = " ".join(match.split()[1] for match in innermost_matches)
+    if len(extracted_sentence) == 0:
+        extracted_sentence = " ".join(input_string.split()[1:])
+    return extracted_sentence
+
+
+def extract_data(sentence):
+    nodes = []
+    sub_nodes = {}
+    sub_node_ids = []
+
+    # Extract the id, subnodes, and text of the ROOT node
+    sentence_id = sentence.split("\t")[0]
+    root_sent = sentence[sentence.find("ROOT") :].rstrip()[:-1]  # drop trailing whitespace and the closing ")" of ROOT
+    root_subnodes = extract_parts(root_sent)
+    sub_nodes.update({i + 1: root_subnodes[i] for i in range(len(root_subnodes))})
+    sub_node_ids.extend([i + 1 for i in range(len(root_subnodes))])
+    root_text = extract_sentence(root_sent)
+
+    nodes.append({"id": f"{sentence_id}.0", "type": "ROOT", "text": root_text, "offsets": [0, len(root_text) - 1], "subnodes": [f"{sentence_id}.{i}" for i in sub_node_ids]})
+
+    # Traverse the constituents breadth-first: each popped entry becomes a node, and its children are queued under fresh ids.
+    while sub_node_ids:
+        sub_node_id = sub_node_ids.pop(0)
+        text = extract_sentence(sub_nodes[sub_node_id])
+
+        cur_subnodes = extract_parts(sub_nodes[sub_node_id])
+
+        if len(cur_subnodes) > 0:
+            # Ids are assigned in increasing order, so the last queued id (or the current one, if the queue is empty) is the highest so far.
+            id_to_add = sub_node_ids[-1] if len(sub_node_ids) > 0 else sub_node_id
+            cur_subnode_ids = [id_to_add + i + 1 for i in range(len(cur_subnodes))]
+            sub_nodes.update({id_to_add + i + 1: cur_subnodes[i] for i in range(len(cur_subnodes))})
+            sub_node_ids.extend(cur_subnode_ids)
+        else:
+            cur_subnode_ids = []
+
+        node_type = sub_nodes[sub_node_id].split(" ")[0]
+        start = root_text.find(text)
+        end = start + len(text) - 1
+
+        nodes.append({"id": f"{sentence_id}.{sub_node_id}", "type": node_type, "text": text, "offsets": [start, end], "subnodes": [f"{sentence_id}.{i}" for i in cur_subnode_ids]})
+    return {"id": sentence_id, "passage": {"id": sentence_id + "_0", "type": None, "text": [nodes[0]["text"]], "offsets": nodes[0]["offsets"]}, "nodes": nodes}
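
To illustrate the conversion, here is a minimal sketch of `extract_data` on a hypothetical line, assuming the format the loader implies (one sentence per line: a tab-separated sentence id, then a (ROOT ...) bracketing with two-token leaves). The English leaf tokens and the id "SNT.1" are illustrative assumptions for readability; the corpus itself is Burmese.

    from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data

    # Hypothetical input line in the inferred format: "<sentence id>\t(ROOT ...)\n".
    line = "SNT.1\t(ROOT (S (NP (n dog)) (VP (v runs))))\n"

    record = extract_data(line)
    print(record["passage"]["text"])  # ['dog runs']
    for node in record["nodes"]:
        print(node["id"], node["type"], repr(node["text"]), node["subnodes"])
    # SNT.1.0 ROOT 'dog runs' ['SNT.1.1']
    # SNT.1.1 S 'dog runs' ['SNT.1.2', 'SNT.1.3']
    # SNT.1.2 NP 'dog' ['SNT.1.4']
    # SNT.1.3 VP 'runs' ['SNT.1.5']
    # SNT.1.4 n 'dog' []
    # SNT.1.5 v 'runs' []

Node ids are handed out breadth-first from ROOT, and "offsets" are inclusive [start, end] character positions into the ROOT text, located with str.find, i.e. the first occurrence of a subtree's text.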