From f3d1b9a3ddd0f31e0cd22034c3d4a0ef0ac6ae38 Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Wed, 1 May 2024 22:30:44 +0900
Subject: [PATCH 1/7] Create dataset loader for UP2.0 (#571)

---
 seacrowd/sea_datasets/up2/__init__.py |   0
 seacrowd/sea_datasets/up2/up2.py      | 203 ++++++++++++++++++++++++++
 2 files changed, 203 insertions(+)
 create mode 100644 seacrowd/sea_datasets/up2/__init__.py
 create mode 100644 seacrowd/sea_datasets/up2/up2.py

diff --git a/seacrowd/sea_datasets/up2/__init__.py b/seacrowd/sea_datasets/up2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
new file mode 100644
index 000000000..48995eb2c
--- /dev/null
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset.
+Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence.
+It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning.
+"""
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Tasks, Licenses
+from seacrowd.utils.common_parser import load_ud_data
+
+_CITATION = """\
+@inproceedings{jindal-etal-2022-universal,
+    title = "Universal {P}roposition {B}ank 2.0",
+    author = "Jindal, Ishan  and
+      Rademaker, Alexandre  and
+      Ulewicz, Micha{\l}  and
+      Linh, Ha  and
+      Nguyen, Huyen  and
+      Tran, Khoi-Nguyen  and
+      Zhu, Huaiyu  and
+      Li, Yunyao",
+    booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
+    month = jun,
+    year = "2022",
+    address = "Marseille, France",
+    publisher = "European Language Resources Association",
+    url = "https://aclanthology.org/2022.lrec-1.181",
+    pages = "1700--1711",
+}
+"""
+
+_DATASETNAME = "up2"
+
+_DESCRIPTION = """\
+Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset.
+Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence.
+It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning.
+"""
+
+_HOMEPAGE = "https://universalpropositions.github.io/"
+
+_LANGUAGES = ["ind", "vie"]  # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)
+
+_LICENSE = Licenses.CDLA_SHARING_1_0.value
+
+_LOCAL = False
+
+_URLS = {
+    split: {
+        "ind": [
+            f"https://raw.githubusercontent.com/UniversalPropositions/UP_Indonesian-GSD/main/id_gsd-up-{split}.conllup",
+            f"https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-{split}.conllu",
+            # f"https://raw.githubusercontent.com/indolem/indolem/main/dependency_parsing/UD_Indonesian_GSD/id_gsd-ud-{split}.conllu",   # sent_id entries are missing from IndoLEM's copy of the dataset.
+        ],
+        "vie": [
+            f"https://raw.githubusercontent.com/UniversalPropositions/UP_Vietnamese-VTB/main/vi_vtb-up-{split}.conllup",
+            # f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-{split}.conllu", # new data => mismatch.
+            f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/0edef6d63df949aea0494c6d4ff4f91bb1959019/vi_vtb-ud-{split}.conllu",  # r2.8
+        ]
+    }
+    for split in ["train", "test", "dev"]
+}
+
+# TODO: add supported task by dataset. One dataset may support multiple tasks --> # TODO: add supported task by dataset. One dataset may support multiple tasks.
+_SUPPORTED_TASKS = []
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class UP2Dataset(datasets.GeneratorBasedBuilder):
+    """
+    Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset.
+    Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence.
+    It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    BUILDER_CONFIGS = [
+        *[SEACrowdConfig(
+            name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
+        ) for _LANG in ['', *_LANGUAGES]],
+        # *[SEACrowdConfig(
+        #     name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_seacrowd_[seacrowd_schema_name]",
+        #     version=datasets.Version(_SEACROWD_VERSION),
+        #     description=f"{_DATASETNAME} SEACrowd schema",
+        #     schema="seacrowd_[seacrowd_schema_name]",
+        #     subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
+        # ) for _LANG in ['', *_LANGUAGES]],
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+               {
+                   "lang": datasets.Value("string"),
+                   "source_sent_id": datasets.Value("string"),
+                   "sent_id": datasets.Value("string"),
+                   "text": datasets.Value("string"),
+                   "id": [datasets.Value("string")],
+                   "up:pred": [datasets.Value("string")],
+                   "up:argheads": [datasets.Value("string")],
+                   "up:argspans": [datasets.Value("string")],
+               }
+            )
+
+        # For example seacrowd_kb, seacrowd_t2t
+        # elif self.config.schema == "seacrowd_[seacrowdschema_name]":
+        #     features = schemas.kb_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        _subset_id = self.config.subset_id.split("_")
+        if len(_subset_id) > 1:
+            _lang = _subset_id[1]
+            urls = {split: {_lang: urls_up_ud[_lang]} for split, urls_up_ud in _URLS.items()}
+        else:
+            urls = _URLS
+        data_dir = dl_manager.download_and_extract(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepaths": data_dir["train"],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepaths": data_dir["test"],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepaths": data_dir["dev"],
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        _subset_id = self.config.subset_id.split("_")
+        _langs = [_subset_id[1]] if (len(_subset_id) > 1) else _LANGUAGES
+
+        for _lang in _langs:
+            data = list(load_ud_data(filepaths[_lang][0]))
+            sentid2text = {_b["sent_id"]: _b["text"] for _b in load_ud_data(filepaths[_lang][1])}
+
+            for cur_data in data:
+                txt_src = sentid2text[cur_data["sent_id"]]
+                txt_up = cur_data["text"].rsplit("..........", 1)[0].rstrip(" -")
+                assert txt_up == txt_src[:len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'"
+                cur_data["text"] = txt_src
+                cur_data["lang"] = _lang
+
+            if self.config.schema == "source":
+                for key, example in enumerate(data):
+                    yield f"{_lang}_{key}", example
+
+            # elif self.config.schema == "seacrowd_[seacrowd_schema_name]":
+            #     for key, example in enumerate(data):
+            #         yield key, {}

From a3281a35b0619552bad718af0baa4a48f5d011a1 Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Tue, 21 May 2024 20:02:07 +0900
Subject: [PATCH 2/7] Update seacrowd/sea_datasets/up2/up2.py

Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com>
---
 seacrowd/sea_datasets/up2/up2.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
index 48995eb2c..ff0af16da 100644
--- a/seacrowd/sea_datasets/up2/up2.py
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -134,9 +134,6 @@ def _info(self) -> datasets.DatasetInfo:
                }
             )
 
-        # For example seacrowd_kb, seacrowd_t2t
-        # elif self.config.schema == "seacrowd_[seacrowdschema_name]":
-        #     features = schemas.kb_features
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,

From fea5abcff748533205e471c9e490ca5fe171e4ca Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Tue, 21 May 2024 20:02:20 +0900
Subject: [PATCH 3/7] Update seacrowd/sea_datasets/up2/up2.py

Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com>
---
 seacrowd/sea_datasets/up2/up2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
index ff0af16da..c0dff04a8 100644
--- a/seacrowd/sea_datasets/up2/up2.py
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -81,7 +81,6 @@
     for split in ["train", "test", "dev"]
 }
 
-# TODO: add supported task by dataset. One dataset may support multiple tasks --> # TODO: add supported task by dataset. One dataset may support multiple tasks.
 _SUPPORTED_TASKS = []
 
 _SOURCE_VERSION = "1.0.0"

From cfd49c3ae54229566f32f72a5a74be83d79ebc6f Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Tue, 21 May 2024 20:02:31 +0900
Subject: [PATCH 4/7] Update seacrowd/sea_datasets/up2/up2.py

Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com>
---
 seacrowd/sea_datasets/up2/up2.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
index c0dff04a8..79700625c 100644
--- a/seacrowd/sea_datasets/up2/up2.py
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -194,6 +194,3 @@ def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dic
                 for key, example in enumerate(data):
                     yield f"{_lang}_{key}", example
 
-            # elif self.config.schema == "seacrowd_[seacrowd_schema_name]":
-            #     for key, example in enumerate(data):
-            #         yield key, {}

From 9d55ba242dabb10fd23a7ea1565540b3e58f3adc Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Tue, 21 May 2024 20:03:23 +0900
Subject: [PATCH 5/7] Update up2.py

---
 seacrowd/sea_datasets/up2/up2.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
index 79700625c..0f7cff8b1 100644
--- a/seacrowd/sea_datasets/up2/up2.py
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -106,13 +106,6 @@ class UP2Dataset(datasets.GeneratorBasedBuilder):
             schema="source",
             subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
         ) for _LANG in ['', *_LANGUAGES]],
-        # *[SEACrowdConfig(
-        #     name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_seacrowd_[seacrowd_schema_name]",
-        #     version=datasets.Version(_SEACROWD_VERSION),
-        #     description=f"{_DATASETNAME} SEACrowd schema",
-        #     schema="seacrowd_[seacrowd_schema_name]",
-        #     subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
-        # ) for _LANG in ['', *_LANGUAGES]],
     ]
 
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source"

From 8ba35b5f780192b94fca0434cfce187be2ca4a5e Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Fri, 31 May 2024 01:40:43 +0900
Subject: [PATCH 6/7] Update up2.py, reformat from makefile.

---
 seacrowd/sea_datasets/up2/up2.py | 50 ++++++++++++++++----------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py
index 0f7cff8b1..010bc9c4e 100644
--- a/seacrowd/sea_datasets/up2/up2.py
+++ b/seacrowd/sea_datasets/up2/up2.py
@@ -23,17 +23,16 @@
 
 import datasets
 
-from seacrowd.utils import schemas
-from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import Tasks, Licenses
 from seacrowd.utils.common_parser import load_ud_data
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses
 
 _CITATION = """\
 @inproceedings{jindal-etal-2022-universal,
     title = "Universal {P}roposition {B}ank 2.0",
     author = "Jindal, Ishan  and
       Rademaker, Alexandre  and
-      Ulewicz, Micha{\l}  and
+      Ulewicz, Micha{\\l}  and
       Linh, Ha  and
       Nguyen, Huyen  and
       Tran, Khoi-Nguyen  and
@@ -76,7 +75,7 @@
             f"https://raw.githubusercontent.com/UniversalPropositions/UP_Vietnamese-VTB/main/vi_vtb-up-{split}.conllup",
             # f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-{split}.conllu", # new data => mismatch.
             f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/0edef6d63df949aea0494c6d4ff4f91bb1959019/vi_vtb-ud-{split}.conllu",  # r2.8
-        ]
+        ],
     }
     for split in ["train", "test", "dev"]
 }
@@ -99,13 +98,16 @@ class UP2Dataset(datasets.GeneratorBasedBuilder):
     SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
 
     BUILDER_CONFIGS = [
-        *[SEACrowdConfig(
-            name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source",
-            version=datasets.Version(_SOURCE_VERSION),
-            description=f"{_DATASETNAME} source schema",
-            schema="source",
-            subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
-        ) for _LANG in ['', *_LANGUAGES]],
+        *[
+            SEACrowdConfig(
+                name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source",
+                version=datasets.Version(_SOURCE_VERSION),
+                description=f"{_DATASETNAME} source schema",
+                schema="source",
+                subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}",
+            )
+            for _LANG in ["", *_LANGUAGES]
+        ],
     ]
 
     DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source"
@@ -114,19 +116,18 @@ def _info(self) -> datasets.DatasetInfo:
 
         if self.config.schema == "source":
             features = datasets.Features(
-               {
-                   "lang": datasets.Value("string"),
-                   "source_sent_id": datasets.Value("string"),
-                   "sent_id": datasets.Value("string"),
-                   "text": datasets.Value("string"),
-                   "id": [datasets.Value("string")],
-                   "up:pred": [datasets.Value("string")],
-                   "up:argheads": [datasets.Value("string")],
-                   "up:argspans": [datasets.Value("string")],
-               }
+                {
+                    "lang": datasets.Value("string"),
+                    "source_sent_id": datasets.Value("string"),
+                    "sent_id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                    "id": [datasets.Value("string")],
+                    "up:pred": [datasets.Value("string")],
+                    "up:argheads": [datasets.Value("string")],
+                    "up:argspans": [datasets.Value("string")],
+                }
             )
 
-
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -179,11 +180,10 @@ def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dic
             for cur_data in data:
                 txt_src = sentid2text[cur_data["sent_id"]]
                 txt_up = cur_data["text"].rsplit("..........", 1)[0].rstrip(" -")
-                assert txt_up == txt_src[:len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'"
+                assert txt_up == txt_src[: len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'"
                 cur_data["text"] = txt_src
                 cur_data["lang"] = _lang
 
             if self.config.schema == "source":
                 for key, example in enumerate(data):
                     yield f"{_lang}_{key}", example
-

From 91d389c8d3a78982de6fb76739d028ffdce78b3d Mon Sep 17 00:00:00 2001
From: Frederikus Hudi <frederikus.hudi@gmail.com>
Date: Fri, 31 May 2024 01:41:52 +0900
Subject: [PATCH 7/7] Update common_parser.py for a safer IO process.

---
 seacrowd/utils/common_parser.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/seacrowd/utils/common_parser.py b/seacrowd/utils/common_parser.py
index 8c05b6e94..efc0a6163 100644
--- a/seacrowd/utils/common_parser.py
+++ b/seacrowd/utils/common_parser.py
@@ -34,7 +34,9 @@ def load_ud_data(filepath, filter_kwargs=None, assert_fn=None):
     :param assert_fn: assertion to make sure raw data is in the expected format
     :return: generator with schema following CONLLU
     """
-    dataset_raw = parse(open(filepath).read())
+    with open(filepath, "r", encoding="utf8") as f:
+        raw_data = f.read()
+    dataset_raw = parse(raw_data)
 
     filter_kwargs = filter_kwargs or dict()
     if callable(assert_fn):