SEACrowd · MJonibek · Jan 6, 2024 · Jan 6, 2024
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+# BibTeX entry for the treebank paper. The abstract is kept as ordinary
+# multi-line text: trailing backslashes would glue the lines together and leave
+# the indentation of each following line as a long run of spaces mid-sentence.
+_CITATION = """\
+@article{
+    10.1145/3373268,
+    author = {Ding, Chenchen and Yee, Sann Su Su and Pa, Win Pa and Soe, Khin Mar and Utiyama, Masao and Sumita, Eiichiro},
+    title = {A Burmese (Myanmar) Treebank: Guideline and Analysis},
+    year = {2020},
+    issue_date = {May 2020},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    volume = {19},
+    number = {3},
+    issn = {2375-4699},
+    url = {https://doi.org/10.1145/3373268},
+    doi = {10.1145/3373268},
+    abstract = {A 20,000-sentence Burmese (Myanmar) treebank on news articles has been released under a CC BY-NC-SA license.
+               Complete phrase structure annotation was developed for each sentence from the morphologically annotated data
+               prepared in previous work of Ding et al. [1]. As the final result of the Burmese component in the Asian
+               Language Treebank Project, this is the first large-scale, open-access treebank for the Burmese language.
+               The annotation details and features of this treebank are presented.},
+    journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+    month = {jan},
+    articleno = {40},
+    numpages = {13},
+    keywords = {Burmese (Myanmar), phrase structure, treebank}
+}
+"""
+
+# Identifier used for the config names and as the key into _URLS.
+_DATASETNAME = "alt_burmese_treebank"
+
+# Only the opening line uses a backslash (to drop the leading newline). Ending
+# the inner lines with a backslash would join them without any separator and
+# produce "annotation.As" and "large-scale,open-access".
+_DESCRIPTION = """\
+A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.
+As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,
+open-access treebank for the Burmese language.
+"""
+
+# Zenodo record page for the dataset.
+_HOMEPAGE = "https://zenodo.org/records/3463010"
+
+# Language codes covered by the dataset.
+# NOTE(review): "mya" is presumably the ISO 639-3 code for Burmese - confirm.
+_LANGUAGES = ["mya"]
+
+# Licence string taken from the shared Licenses enum.
+_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
+
+# NOTE(review): presumably False means the data is downloaded from _URLS rather
+# than supplied from a local directory - confirm against the loader conventions.
+_LOCAL = False
+
+# Download location of the zip archive, keyed by dataset name.
+_URLS = {
+    _DATASETNAME: "https://zenodo.org/records/3463010/files/my-alt-190530.zip?download=1",
+}
+
+# Tasks this dataset is registered for.
+_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING]
+
+# Version of the original data release, wrapped in datasets.Version by the builder.
+_SOURCE_VERSION = "1.0.0"
+
+# Version of the SEACrowd schema view, wrapped in datasets.Version by the builder.
+_SEACROWD_VERSION = "1.0.0"
+
+
+class AltBurmeseTreebank(datasets.GeneratorBasedBuilder):
+    """Dataset builder for the ALT Burmese (Myanmar) treebank.
+
+    A 20,000-sentence Burmese (Myanmar) treebank on news articles containing
+    complete phrase structure annotation. As the final result of the Burmese
+    component in the Asian Language Treebank Project, this is the first
+    large-scale, open-access treebank for the Burmese language.
+
+    Two configurations are provided:
+
+    * ``alt_burmese_treebank_source``: the first two tab-separated fields of
+      every line, exposed as ``id`` and ``text``.
+    * ``alt_burmese_treebank_seacrowd_tree``: every line converted by
+      ``extract_data`` into the shared tree schema.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_tree",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema="seacrowd_tree",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Return the dataset metadata with the features of the configured schema.
+
+        Raises:
+            ValueError: if the configured schema is neither ``source`` nor
+                ``seacrowd_tree``.
+        """
+        if self.config.schema == "source":
+            features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")})
+        elif self.config.schema == "seacrowd_tree":
+            features = schemas.tree_features
+        else:
+            # Fail with a clear message instead of an UnboundLocalError below.
+            raise ValueError(f"Unsupported schema: {self.config.schema!r}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Download and extract the archive and return its single train split."""
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    # Path components are passed separately so the platform separator is used.
+                    "filepath": os.path.join(data_dir, "my-alt-190530", "data"),
+                    "split": "train",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        """Yield ``(key, example)`` pairs, one per non-blank line of ``filepath``.
+
+        Args:
+            filepath: path of the tab-separated treebank file.
+            split: name of the split; not used because only one split exists.
+
+        Raises:
+            ValueError: if the configured schema is neither ``source`` nor
+                ``seacrowd_tree``.
+        """
+        schema = self.config.schema
+        if schema not in ("source", "seacrowd_tree"):
+            raise ValueError(f"Unsupported schema: {schema!r}")
+
+        # The encoding is explicit so reading does not depend on the platform locale.
+        with open(filepath, "r", encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                if not line.strip():
+                    # A blank line carries no sentence; skipping it keeps the keys of the other lines unchanged.
+                    continue
+
+                if schema == "source":
+                    # Split once and drop the line terminator so it does not end up inside "text".
+                    fields = line.rstrip("\r\n").split("\t")
+                    yield idx, {"id": fields[0], "text": fields[1]}
+                else:
+                    # The raw line is handed over unchanged; extract_data does its own trimming.
+                    yield idx, extract_data(line)
@@ -0,0 +1,71 @@
+import re
+
+
+def extract_parts(input_string: str) -> list:
+    """Return the contents of every top-level bracketed group in ``input_string``.
+
+    Text outside any bracket (for example a leading node label) is ignored and
+    nested brackets are kept verbatim inside their enclosing group, e.g.
+    ``"S (NP (n a)) (VP (v b))"`` gives ``["NP (n a)", "VP (v b)"]``.
+
+    Args:
+        input_string: bracketed tree text.
+
+    Returns:
+        The stripped contents of each top-level ``( ... )`` group, in order of
+        appearance. A group that is never closed is dropped, and a closing
+        bracket without a matching opening bracket is ignored.
+    """
+    parts = []
+    depth = 0
+    group_start = 0
+
+    for index, char in enumerate(input_string):
+        if char == "(":
+            if depth == 0:
+                # Remember where the contents of this top-level group begin.
+                group_start = index + 1
+            depth += 1
+        elif char == ")":
+            if depth == 0:
+                # Unmatched closing bracket: it delimits nothing, so it must
+                # not contribute an (empty) part.
+                continue
+            depth -= 1
+            if depth == 0:
+                parts.append(input_string[group_start:index].strip())
+
+    return parts
+
+
+def extract_sentence(input_string: str) -> str:
+    """Return the surface text covered by a bracketed (sub)tree.
+
+    Every innermost ``(label token ...)`` group contributes the tokens that
+    follow its label. When that yields nothing (for instance the input is a
+    bare ``"label token"`` leaf without brackets), everything after the first
+    whitespace-separated token of the input is returned instead.
+
+    Args:
+        input_string: bracketed tree text or a bare leaf.
+
+    Returns:
+        The tokens joined by single spaces; an empty string if there are none.
+    """
+    innermost_matches = re.findall(r"\(([^()]+)\)", input_string)
+
+    words = []
+    for match in innermost_matches:
+        # Keep all tokens after the label, as the fallback below does. Slicing
+        # also avoids an IndexError on a leaf that consists of a label only.
+        words.extend(match.split()[1:])
+
+    extracted_sentence = " ".join(words)
+    if not extracted_sentence:
+        extracted_sentence = " ".join(input_string.split()[1:])
+    return extracted_sentence
+
+
+def extract_data(sentence):
+    """Convert one treebank line into the tree-schema example dictionary.
+
+    The line must contain a sentence id of the form ``SNT.<digits>.<digits>``
+    and a bracketed tree whose top label is ``ROOT``. Nodes are numbered
+    breadth-first starting with ``"0"`` for ROOT, so ``nodes[i]["id"] == str(i)``.
+
+    Args:
+        sentence: one line of the data file, with or without its line terminator.
+
+    Returns:
+        A dict with keys ``id``, ``passage`` and ``nodes``. Each node has
+        ``id``, ``type`` (its label), ``text`` (the tokens it covers),
+        ``offsets`` (``[start, end]`` character positions in the ROOT text,
+        end inclusive) and ``subnodes`` (ids of its children).
+
+    Raises:
+        ValueError: if the line has no sentence id or no ``ROOT`` label.
+    """
+    id_match = re.search(r"SNT\.\d+\.\d+", sentence)
+    if id_match is None:
+        raise ValueError(f"No sentence id found in line: {sentence!r}")
+    sentence_id = id_match.group()
+
+    root_start = sentence.find("ROOT")
+    if root_start == -1:
+        raise ValueError(f"No ROOT node found in line: {sentence!r}")
+
+    # Trim the line terminator, then remove closing brackets that have no
+    # opening partner inside the slice (typically the bracket that was opened
+    # just before "ROOT"). Otherwise they would be read as an extra empty subnode.
+    root_sent = sentence[root_start:].rstrip()
+    surplus = root_sent.count(")") - root_sent.count("(")
+    while surplus > 0 and root_sent.endswith(")"):
+        root_sent = root_sent[:-1].rstrip()
+        surplus -= 1
+
+    root_text = extract_sentence(root_sent)
+
+    nodes = []
+    # Breadth-first work list of (bracket contents, covered text, start offset).
+    # The position of an entry in this list is the id of its node.
+    pending = [(root_sent, root_text, 0)]
+    position = 0
+
+    while position < len(pending):
+        content, text, start = pending[position]
+
+        # Children cover consecutive stretches of their parent's text, so each
+        # child is searched for from the end of its previous sibling. Searching
+        # from the beginning would give a wrong offset for repeated text.
+        cursor = start
+        child_ids = []
+        for child in extract_parts(content):
+            child_text = extract_sentence(child)
+            child_start = root_text.find(child_text, cursor)
+            if child_start == -1:
+                # Not found after the cursor: fall back to a search from the start.
+                child_start = root_text.find(child_text)
+            else:
+                cursor = child_start + len(child_text)
+            child_ids.append(f"{len(pending)}")
+            pending.append((child, child_text, child_start))
+
+        node_type = "ROOT" if position == 0 else content.split(" ")[0]
+        nodes.append({"id": f"{position}", "type": node_type, "text": text, "offsets": [start, start + len(text) - 1], "subnodes": child_ids})
+        position += 1
+
+    return {"id": sentence_id, "passage": {"id": sentence_id, "type": None, "text": [nodes[0]["text"]], "offsets": nodes[0]["offsets"]}, "nodes": nodes}
@@ -5,6 +5,7 @@
 from seacrowd.utils.schemas import (
     image_text_features,
     kb_features,
+    tree_features,
     pairs_features,
     pairs_features_score,
     pairs_multi_features,
@@ -45,6 +46,9 @@ class Tasks(Enum):
     COREFERENCE_RESOLUTION = "COREF"
     SPAN_BASED_ABSA = "SPAN_ABSA"
 
+    # Tree
+    CONSTITUENCY_PARSING = "CONST_PAR"
+
     # Single Text Classification
     ASPECT_BASED_SENTIMENT_ANALYSIS = "ABSA"
     EMOTION_CLASSIFICATION = "EC"
@@ -202,6 +206,7 @@ class Licenses(Enum):
 
 TASK_TO_SCHEMA = {
     Tasks.DEPENDENCY_PARSING: "KB",
+    Tasks.CONSTITUENCY_PARSING: "TREE",
     Tasks.WORD_SENSE_DISAMBIGUATION: "T2T",
     Tasks.WORD_ANALOGY: "T2T",
     Tasks.KEYWORD_EXTRACTION: "SEQ_LABEL",
@@ -268,6 +273,7 @@ class Licenses(Enum):
 
 SCHEMA_TO_FEATURES = {
     "KB": kb_features,
+    "TREE": tree_features,
     "QA": qa_features,
     "T2T": text2text_features,
     "TEXT": text_features(),

@@ -0,0 +1,35 @@
+"""
+Tree Schema
+
+This schema assumes a document with subnodes elements
+and a tree hierarchy.
+
+For example:
+            NODE1    .....
+        //
+ROOT    -   NODE2    .....
+        \\
+            NODE3    .....
+"""
+import datasets
+
+# Span covered by the whole tree.
+_passage_features = {
+    "id": datasets.Value("string"),
+    "type": datasets.Value("string"),
+    "text": datasets.Sequence(datasets.Value("string")),
+    "offsets": datasets.Sequence(datasets.Value("int32")),
+}
+
+# One node of the tree; "subnodes" holds the ids of the node's subnodes.
+_node_features = {
+    "id": datasets.Value("string"),
+    "type": datasets.Value("string"),
+    "text": datasets.Value("string"),
+    "offsets": datasets.Sequence(datasets.Value("int32")),
+    "subnodes": datasets.Sequence(datasets.Value("string")),
+}
+
+# Full example layout: an id, the passage and the list of nodes.
+features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        "passage": _passage_features,
+        "nodes": [_node_features],
+    }
+)