From f3d1b9a3ddd0f31e0cd22034c3d4a0ef0ac6ae38 Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Wed, 1 May 2024 22:30:44 +0900 Subject: [PATCH 1/7] Create dataset loader for UP2.0 (#571) --- seacrowd/sea_datasets/up2/__init__.py | 0 seacrowd/sea_datasets/up2/up2.py | 203 ++++++++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 seacrowd/sea_datasets/up2/__init__.py create mode 100644 seacrowd/sea_datasets/up2/up2.py diff --git a/seacrowd/sea_datasets/up2/__init__.py b/seacrowd/sea_datasets/up2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py new file mode 100644 index 000000000..48995eb2c --- /dev/null +++ b/seacrowd/sea_datasets/up2/up2.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset. +Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence. +It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning. +""" +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, Licenses +from seacrowd.utils.common_parser import load_ud_data + +_CITATION = """\ +@inproceedings{jindal-etal-2022-universal, + title = "Universal {P}roposition {B}ank 2.0", + author = "Jindal, Ishan and + Rademaker, Alexandre and + Ulewicz, Micha{\l} and + Linh, Ha and + Nguyen, Huyen and + Tran, Khoi-Nguyen and + Zhu, Huaiyu and + Li, Yunyao", + booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference", + month = jun, + year = "2022", + address = "Marseille, France", + publisher = "European Language Resources Association", + url = "https://aclanthology.org/2022.lrec-1.181", + pages = "1700--1711", +}} +""" + +_DATASETNAME = "up2" + +_DESCRIPTION = """\ +Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset. +Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence. +It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning. +""" + +_HOMEPAGE = "https://universalpropositions.github.io/" + +_LANGUAGES = ["ind", "vie"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +_LICENSE = Licenses.CDLA_SHARING_1_0.value + +_LOCAL = False + +_URLS = { + split: { + "ind": [ + f"https://raw.githubusercontent.com/UniversalPropositions/UP_Indonesian-GSD/main/id_gsd-up-{split}.conllup", + f"https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-{split}.conllu", + # f"https://raw.githubusercontent.com/indolem/indolem/main/dependency_parsing/UD_Indonesian_GSD/id_gsd-ud-{split}.conllu", # there are missing sent_id from the IndoLEM's dataset. + ], + "vie": [ + f"https://raw.githubusercontent.com/UniversalPropositions/UP_Vietnamese-VTB/main/vi_vtb-up-{split}.conllup", + # f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-{split}.conllu", # new data => mismatch. + f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/0edef6d63df949aea0494c6d4ff4f91bb1959019/vi_vtb-ud-{split}.conllu", # r2.8 + ] + } + for split in ["train", "test", "dev"] +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks --> # TODO: add supported task by dataset. One dataset may support multiple tasks. +_SUPPORTED_TASKS = [] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +class UP2Dataset(datasets.GeneratorBasedBuilder): + """ + Southeast Asian language subsets from Universal Propositions (UP) 2.0 dataset. + Semantic role labeling (SRL) is a shallow semantic parsing task that identifies “who did what to whom when, where etc” for each predicate in a sentence. + It provides an intermediate (shallow) level of a semantic representation that helps the map from syntactic parse structures to more fully-specified representations of meaning. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + *[SEACrowdConfig( + name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", + ) for _LANG in ['', *_LANGUAGES]], + # *[SEACrowdConfig( + # name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_seacrowd_[seacrowd_schema_name]", + # version=datasets.Version(_SEACROWD_VERSION), + # description=f"{_DATASETNAME} SEACrowd schema", + # schema="seacrowd_[seacrowd_schema_name]", + # subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", + # ) for _LANG in ['', *_LANGUAGES]], + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features( + { + "lang": datasets.Value("string"), + "source_sent_id": datasets.Value("string"), + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "id": [datasets.Value("string")], + "up:pred": [datasets.Value("string")], + "up:argheads": [datasets.Value("string")], + "up:argspans": [datasets.Value("string")], + } + ) + + # For example seacrowd_kb, seacrowd_t2t + # elif self.config.schema == "seacrowd_[seacrowdschema_name]": + # features = schemas.kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + _subset_id = self.config.subset_id.split("_") + if len(_subset_id) > 1: + _lang = _subset_id[1] + urls = {split: {_lang: urls_up_ud[_lang]} for split, urls_up_ud in _URLS.items()} + else: + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepaths": data_dir["train"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepaths": data_dir["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepaths": data_dir["dev"], + }, + ), + ] + + def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + _subset_id = self.config.subset_id.split("_") + _langs = [_subset_id[1]] if (len(_subset_id) > 1) else _LANGUAGES + + for _lang in _langs: + data = list(load_ud_data(filepaths[_lang][0])) + sentid2text = {_b["sent_id"]: _b["text"] for _b in load_ud_data(filepaths[_lang][1])} + + for cur_data in data: + txt_src = sentid2text[cur_data["sent_id"]] + txt_up = cur_data["text"].rsplit("..........", 1)[0].rstrip(" -") + assert txt_up == txt_src[:len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'" + cur_data["text"] = txt_src + cur_data["lang"] = _lang + + if self.config.schema == "source": + for key, example in enumerate(data): + yield f"{_lang}_{key}", example + + # elif self.config.schema == "seacrowd_[seacrowd_schema_name]": + # for key, example in enumerate(data): + # yield key, {} From a3281a35b0619552bad718af0baa4a48f5d011a1 Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Tue, 21 May 2024 20:02:07 +0900 Subject: [PATCH 2/7] Update seacrowd/sea_datasets/up2/up2.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/up2/up2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py index 48995eb2c..ff0af16da 100644 --- a/seacrowd/sea_datasets/up2/up2.py +++ b/seacrowd/sea_datasets/up2/up2.py @@ -134,9 +134,6 @@ def _info(self) -> datasets.DatasetInfo: } ) - # For example seacrowd_kb, seacrowd_t2t - # elif self.config.schema == "seacrowd_[seacrowdschema_name]": - # features = schemas.kb_features return datasets.DatasetInfo( description=_DESCRIPTION, From fea5abcff748533205e471c9e490ca5fe171e4ca Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Tue, 21 May 2024 20:02:20 +0900 Subject: [PATCH 3/7] Update seacrowd/sea_datasets/up2/up2.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/up2/up2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py index ff0af16da..c0dff04a8 100644 --- a/seacrowd/sea_datasets/up2/up2.py +++ b/seacrowd/sea_datasets/up2/up2.py @@ -81,7 +81,6 @@ for split in ["train", "test", "dev"] } -# TODO: add supported task by dataset. One dataset may support multiple tasks --> # TODO: add supported task by dataset. One dataset may support multiple tasks. _SUPPORTED_TASKS = [] _SOURCE_VERSION = "1.0.0" From cfd49c3ae54229566f32f72a5a74be83d79ebc6f Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Tue, 21 May 2024 20:02:31 +0900 Subject: [PATCH 4/7] Update seacrowd/sea_datasets/up2/up2.py Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/up2/up2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py index c0dff04a8..79700625c 100644 --- a/seacrowd/sea_datasets/up2/up2.py +++ b/seacrowd/sea_datasets/up2/up2.py @@ -194,6 +194,3 @@ def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dic for key, example in enumerate(data): yield f"{_lang}_{key}", example - # elif self.config.schema == "seacrowd_[seacrowd_schema_name]": - # for key, example in enumerate(data): - # yield key, {} From 9d55ba242dabb10fd23a7ea1565540b3e58f3adc Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Tue, 21 May 2024 20:03:23 +0900 Subject: [PATCH 5/7] Update up2.py --- seacrowd/sea_datasets/up2/up2.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py index 79700625c..0f7cff8b1 100644 --- a/seacrowd/sea_datasets/up2/up2.py +++ b/seacrowd/sea_datasets/up2/up2.py @@ -106,13 +106,6 @@ class UP2Dataset(datasets.GeneratorBasedBuilder): schema="source", subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", ) for _LANG in ['', *_LANGUAGES]], - # *[SEACrowdConfig( - # name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_seacrowd_[seacrowd_schema_name]", - # version=datasets.Version(_SEACROWD_VERSION), - # description=f"{_DATASETNAME} SEACrowd schema", - # schema="seacrowd_[seacrowd_schema_name]", - # subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", - # ) for _LANG in ['', *_LANGUAGES]], ] DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source" From 8ba35b5f780192b94fca0434cfce187be2ca4a5e Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Fri, 31 May 2024 01:40:43 +0900 Subject: [PATCH 6/7] Update up2.py, reformat from makefile. --- seacrowd/sea_datasets/up2/up2.py | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/seacrowd/sea_datasets/up2/up2.py b/seacrowd/sea_datasets/up2/up2.py index 0f7cff8b1..010bc9c4e 100644 --- a/seacrowd/sea_datasets/up2/up2.py +++ b/seacrowd/sea_datasets/up2/up2.py @@ -23,17 +23,16 @@ import datasets -from seacrowd.utils import schemas -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks, Licenses from seacrowd.utils.common_parser import load_ud_data +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses _CITATION = """\ @inproceedings{jindal-etal-2022-universal, title = "Universal {P}roposition {B}ank 2.0", author = "Jindal, Ishan and Rademaker, Alexandre and - Ulewicz, Micha{\l} and + Ulewicz, Micha{l} and Linh, Ha and Nguyen, Huyen and Tran, Khoi-Nguyen and @@ -76,7 +75,7 @@ f"https://raw.githubusercontent.com/UniversalPropositions/UP_Vietnamese-VTB/main/vi_vtb-up-{split}.conllup", # f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-{split}.conllu", # new data => mismatch. f"https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/0edef6d63df949aea0494c6d4ff4f91bb1959019/vi_vtb-ud-{split}.conllu", # r2.8 - ] + ], } for split in ["train", "test", "dev"] } @@ -99,13 +98,16 @@ class UP2Dataset(datasets.GeneratorBasedBuilder): SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) BUILDER_CONFIGS = [ - *[SEACrowdConfig( - name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source", - version=datasets.Version(_SOURCE_VERSION), - description=f"{_DATASETNAME} source schema", - schema="source", - subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", - ) for _LANG in ['', *_LANGUAGES]], + *[ + SEACrowdConfig( + name=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}_source", + version=datasets.Version(_SOURCE_VERSION), + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}{'_' if _LANG else ''}{_LANG}", + ) + for _LANG in ["", *_LANGUAGES] + ], ] DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_LANGUAGES[0]}_source" @@ -114,19 +116,18 @@ def _info(self) -> datasets.DatasetInfo: if self.config.schema == "source": features = datasets.Features( - { - "lang": datasets.Value("string"), - "source_sent_id": datasets.Value("string"), - "sent_id": datasets.Value("string"), - "text": datasets.Value("string"), - "id": [datasets.Value("string")], - "up:pred": [datasets.Value("string")], - "up:argheads": [datasets.Value("string")], - "up:argspans": [datasets.Value("string")], - } + { + "lang": datasets.Value("string"), + "source_sent_id": datasets.Value("string"), + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "id": [datasets.Value("string")], + "up:pred": [datasets.Value("string")], + "up:argheads": [datasets.Value("string")], + "up:argspans": [datasets.Value("string")], + } ) - return datasets.DatasetInfo( description=_DESCRIPTION, features=features, @@ -179,11 +180,10 @@ def _generate_examples(self, filepaths: Dict[str, List[Path]]) -> Tuple[int, Dic for cur_data in data: txt_src = sentid2text[cur_data["sent_id"]] txt_up = cur_data["text"].rsplit("..........", 1)[0].rstrip(" -") - assert txt_up == txt_src[:len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'" + assert txt_up == txt_src[: len(txt_up)], f"Text mismatch. Found '{txt_up}' in conllup but source is '{txt_src[:len(txt_up)]}'" cur_data["text"] = txt_src cur_data["lang"] = _lang if self.config.schema == "source": for key, example in enumerate(data): yield f"{_lang}_{key}", example - From 91d389c8d3a78982de6fb76739d028ffdce78b3d Mon Sep 17 00:00:00 2001 From: Frederikus Hudi Date: Fri, 31 May 2024 01:41:52 +0900 Subject: [PATCH 7/7] Update common_parser.py for a safer IO process. --- seacrowd/utils/common_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/seacrowd/utils/common_parser.py b/seacrowd/utils/common_parser.py index 8c05b6e94..efc0a6163 100644 --- a/seacrowd/utils/common_parser.py +++ b/seacrowd/utils/common_parser.py @@ -34,7 +34,9 @@ def load_ud_data(filepath, filter_kwargs=None, assert_fn=None): :param assert_fn: assertion to make sure raw data is in the expected format :return: generator with schema following CONLLU """ - dataset_raw = parse(open(filepath).read()) + with open(filepath, "r", encoding="utf8") as f: + raw_data = f.read() + dataset_raw = parse(raw_data) filter_kwargs = filter_kwargs or dict() if callable(assert_fn):