From bccbffbd0d6161676adec509665606ed64f0c420 Mon Sep 17 00:00:00 2001 From: jonibek1999 Date: Sat, 6 Jan 2024 22:21:10 +0400 Subject: [PATCH 1/3] Add alt_burmese_treebank dataloader --- .../alt_burmese_treebank/__init__.py | 0 .../alt_burmese_treebank.py | 151 ++++++++++++++++++ .../alt_burmese_treebank/utils/__init__.py | 0 .../utils/alt_burmese_treebank_utils.py | 71 ++++++++ 4 files changed, 222 insertions(+) create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/__init__.py create mode 100644 seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py b/seacrowd/sea_datasets/alt_burmese_treebank/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py new file mode 100644 index 000000000..ad781f105 --- /dev/null +++ b/seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# --------------------------------------------------------------------------
# NOTE(review): the SOURCE is a three-patch git series whose physical
# newlines were lost in transit; the code below is a clean reconstruction of
# the FINAL state (patch 3/3 applied) of the two files the series adds:
#   seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
#   seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py
# Concrete fixes relative to the series are marked "FIX:".
# --------------------------------------------------------------------------

# ==========================================================================
# File: seacrowd/sea_datasets/alt_burmese_treebank/alt_burmese_treebank.py
# (carries the Apache-2.0 license header from patch 1/3)
# ==========================================================================
import os
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.sea_datasets.alt_burmese_treebank.utils.alt_burmese_treebank_utils import extract_data
from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{
    10.1145/3373268,
    author = {Ding, Chenchen and Yee, Sann Su Su and Pa, Win Pa and Soe, Khin Mar and Utiyama, Masao and Sumita, Eiichiro},
    title = {A Burmese (Myanmar) Treebank: Guideline and Analysis},
    year = {2020},
    issue_date = {May 2020},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    volume = {19},
    number = {3},
    issn = {2375-4699},
    url = {https://doi.org/10.1145/3373268},
    doi = {10.1145/3373268},
    abstract = {A 20,000-sentence Burmese (Myanmar) treebank on news articles has been released under a CC BY-NC-SA license.\
 Complete phrase structure annotation was developed for each sentence from the morphologically annotated data\
 prepared in previous work of Ding et al. [1]. As the final result of the Burmese component in the Asian\
 Language Treebank Project, this is the first large-scale, open-access treebank for the Burmese language.\
 The annotation details and features of this treebank are presented.\
 },
    journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
    month = {jan},
    articleno = {40},
    numpages = {13},
    keywords = {Burmese (Myanmar), phrase structure, treebank}
}
"""

_DATASETNAME = "alt_burmese_treebank"

_DESCRIPTION = """\
A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\
As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\
open-access treebank for the Burmese language.
"""

_HOMEPAGE = "https://zenodo.org/records/3463010"

_LANGUAGES = ["mya"]

_LICENSE = Licenses.CC_BY_NC_SA_4_0.value

_LOCAL = False

_URLS = {
    _DATASETNAME: "https://zenodo.org/records/3463010/files/my-alt-190530.zip?download=1",
}

_SUPPORTED_TASKS = [Tasks.CONSTITUENCY_PARSING]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class AltBurmeseTreebank(datasets.GeneratorBasedBuilder):
    """A 20,000-sentence Burmese (Myanmar) treebank on news articles containing complete phrase structure annotation.\
    As the final result of the Burmese component in the Asian Language Treebank Project, this is the first large-scale,\
    open-access treebank for the Burmese language."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_tree",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema="seacrowd_tree",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Declare the feature schema for each config (source / seacrowd_tree)."""
        if self.config.schema == "source":
            # Source rows are the raw "<id>\t<bracketed tree>" lines.
            features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")})
        elif self.config.schema == "seacrowd_tree":
            features = schemas.tree_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators (the release ships a single train file)."""
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "my-alt-190530/data"),
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        # FIX: the data is Burmese text — force UTF-8 instead of relying on
        # the platform default encoding (which is not UTF-8 on e.g. Windows).
        if self.config.schema == "source":
            with open(filepath, "r", encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    # FIX: split on the FIRST tab only (a literal tab inside
                    # the tree previously truncated the text) and drop the
                    # trailing newline from the text field.
                    sent_id, _, text = line.rstrip("\n").partition("\t")
                    yield idx, {"id": sent_id, "text": text}

        elif self.config.schema == "seacrowd_tree":
            with open(filepath, "r", encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    yield idx, extract_data(line)


# ==========================================================================
# File: seacrowd/sea_datasets/alt_burmese_treebank/utils/alt_burmese_treebank_utils.py
# ==========================================================================
import re


def extract_parts(input_string):
    """Return the top-level "(...)"-delimited constituents of *input_string*,
    each with its outer brackets and surrounding whitespace stripped.

    Text outside any bracket (e.g. the leading "ROOT " tag) is ignored.
    """
    parts = []
    stack = []
    current_part = ""

    for char in input_string:
        if char == "(":
            stack.append("(")
        elif char == ")":
            if stack:
                stack.pop()
                if not stack:
                    # A top-level group just closed; emit it without its "(".
                    parts.append(current_part[1:].strip())
                    current_part = ""
            else:
                parts.append(current_part[1:].strip())
                current_part = ""
        if stack:
            current_part += char

    # FIX: an unbalanced trailing ")" — e.g. the one closing "(ROOT" when the
    # caller passes everything from "ROOT" onwards — previously appended an
    # empty string here, which extract_data turned into a garbage node with
    # type "" and offsets [0, -1]. Drop empty parts; balanced input never
    # produces them.
    return [part for part in parts if part]


def extract_sentence(input_string):
    """Concatenate the terminal tokens of a bracketed constituent.

    Innermost groups look like "(pos token)"; the second whitespace-separated
    field of each is the surface token. A constituent with no brackets at all
    is treated as a single "pos token ..." sequence.
    """
    innermost_pattern = re.compile(r"\(([^()]+)\)")
    innermost_matches = re.findall(innermost_pattern, input_string)
    # NOTE(review): match.split()[1] raises IndexError on a one-field group
    # like "(x)" — assumed not to occur in this treebank; confirm against the
    # released data.
    extracted_sentence = " ".join(match.split()[1] for match in innermost_matches)
    if len(extracted_sentence) == 0:
        extracted_sentence = " ".join(input_string.split()[1:])
    return extracted_sentence


def extract_data(sentence):
    """Convert one treebank line ("<id>\\t<bracketed tree>") into a SEACrowd
    tree-schema dict.

    Node ids are made globally unique by prefixing the sentence id
    ("<sentence_id>.<n>", root is "<sentence_id>.0"), as introduced by
    patches 2/3 and 3/3 of this series.
    """
    nodes = []
    sub_nodes = {}
    sub_node_ids = []

    # FIX: strip the newline explicitly instead of slicing off the last
    # character ([:-1]); on a final line with no trailing "\n" the old code
    # chopped the tree's closing bracket instead.
    sentence = sentence.rstrip("\n")

    # Extract id, sub_nodes and text of ROOT.
    sentence_id = sentence.split("\t")[0]
    root_sent = sentence[sentence.find("ROOT"):]
    root_subnodes = extract_parts(root_sent)
    sub_nodes.update({i + 1: root_subnodes[i] for i in range(len(root_subnodes))})
    sub_node_ids.extend([i + 1 for i in range(len(root_subnodes))])
    root_text = extract_sentence(root_sent)

    nodes.append(
        {
            "id": f"{sentence_id}.0",
            "type": "ROOT",
            "text": root_text,
            "offsets": [0, len(root_text) - 1],
            "subnodes": [f"{sentence_id}.{i}" for i in sub_node_ids],
        }
    )

    # Breadth-first walk over pending constituents. Ids are handed out in
    # strictly increasing order, so the largest pending id (or, when the
    # queue is empty, the id just popped) is the global maximum — new child
    # ids start one past it.
    while sub_node_ids:
        sub_node_id = sub_node_ids.pop(0)
        text = extract_sentence(sub_nodes[sub_node_id])

        cur_subnodes = extract_parts(sub_nodes[sub_node_id])

        if len(cur_subnodes) > 0:
            id_to_add = sub_node_ids[-1] if len(sub_node_ids) > 0 else sub_node_id
            cur_subnode_ids = [id_to_add + i + 1 for i in range(len(cur_subnodes))]
            sub_nodes.update({id_to_add + i + 1: cur_subnodes[i] for i in range(len(cur_subnodes))})
            sub_node_ids.extend(cur_subnode_ids)
        else:
            cur_subnode_ids = []

        node_type = sub_nodes[sub_node_id].split(" ")[0]
        # NOTE(review): str.find locates the FIRST occurrence, so offsets can
        # point at an earlier repetition when two subtrees yield the same
        # token string — confirm whether exact offsets matter downstream.
        start = root_text.find(text)
        end = start + len(text) - 1

        nodes.append(
            {
                "id": f"{sentence_id}.{sub_node_id}",
                "type": node_type,
                "text": text,
                "offsets": [start, end],
                "subnodes": [f"{sentence_id}.{i}" for i in cur_subnode_ids],
            }
        )
    return {
        "id": sentence_id,
        "passage": {"id": sentence_id + "_0", "type": None, "text": [nodes[0]["text"]], "offsets": nodes[0]["offsets"]},
        "nodes": nodes,
    }