Commit
Closes SEACrowd#114 | Implement dataloader for VnDT (SEACrowd#467)
* Implement dataloader for VnDT

* Add utility to impute missing sent_id and text fields from CoNLL files

* Fix imputed outputs

---------

Co-authored-by: Railey Montalan <[email protected]>
raileymontalan and Railey Montalan authored Apr 1, 2024
1 parent b4a3c27 commit 94bc96a
Showing 3 changed files with 258 additions and 0 deletions.
Empty file.
61 changes: 61 additions & 0 deletions seacrowd/sea_datasets/vndt/utils.py
@@ -0,0 +1,61 @@
import typing as T

from conllu.exceptions import ParseException
from conllu.models import Metadata, TokenList
from conllu.parser import (DEFAULT_FIELD_PARSERS, DEFAULT_FIELDS,
                           _FieldParserType, _MetadataParserType,
                           parse_comment_line, parse_line)

# Module-level counter used to assign sequential sentence IDs across parsed sentences.
imputed_sent_id: int = 1


def parse_token_and_impute_metadata(
    data: str,
    fields: T.Optional[T.Sequence[str]] = None,
    field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None,
) -> TokenList:
    """
    Overrides conllu.parse_token_and_metadata via monkey patching.
    This function imputes the following metadata if they are not present in the CoNLL file:
    - sent_id (int): an integer identifier for each sentence.
    - text (str): the token forms concatenated with single spaces. This does not
      consider the `SpaceAfter` field.
    """

    if not data:
        raise ParseException("Can't create TokenList, no data sent to constructor.")

    fields = fields or DEFAULT_FIELDS
    global imputed_sent_id

    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
    elif sorted(field_parsers.keys()) != sorted(fields):
        new_field_parsers = DEFAULT_FIELD_PARSERS.copy()
        new_field_parsers.update(field_parsers)
        field_parsers = new_field_parsers

    tokens = []
    metadata = Metadata()

    for line in data.split('\n'):
        line = line.strip()

        if not line:
            continue

        if line.startswith('#'):
            pairs = parse_comment_line(line, metadata_parsers=metadata_parsers)
            for key, value in pairs:
                metadata[key] = value

        else:
            tokens.append(parse_line(line, fields, field_parsers))

    if 'sent_id' not in metadata:
        metadata['sent_id'] = str(imputed_sent_id)
        imputed_sent_id += 1

    if 'text' not in metadata:
        # Join token forms with single spaces; avoids the trailing space left by
        # naive string concatenation.
        metadata['text'] = " ".join(str(token['form']) for token in tokens)

    return TokenList(tokens, metadata, default_fields=fields)
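
For reference, here is a minimal sketch of how the patched parser behaves on a sentence that lacks `# sent_id` and `# text` comments. The two-token sentence and its field values are hypothetical, made up purely for illustration:

from seacrowd.sea_datasets.vndt.utils import parse_token_and_impute_metadata

# A hypothetical two-token CoNLL-U sentence with no metadata comments.
raw = "1\tXin\t_\tV\tV\t_\t2\tdep\t_\t_\n" "2\tchào\t_\tV\tV\t_\t0\troot\t_\t_"

sentence = parse_token_and_impute_metadata(raw)
print(sentence.metadata["sent_id"])  # "1" on the first call; the module-level counter increments per sentence
print(sentence.metadata["text"])     # "Xin chào" (token forms joined with single spaces)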
197 changes: 197 additions & 0 deletions seacrowd/sea_datasets/vndt/vndt.py
@@ -0,0 +1,197 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Dict, List, Tuple

import conllu
import datasets

from seacrowd.sea_datasets.vndt.utils import parse_token_and_impute_metadata
from seacrowd.utils import schemas
from seacrowd.utils.common_parser import (load_ud_data,
                                          load_ud_data_as_seacrowd_kb)
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@InProceedings{Nguyen2014NLDB,
  author    = {Nguyen, Dat Quoc and Nguyen, Dai Quoc and Pham, Son Bao and Nguyen, Phuong-Thai and Nguyen, Minh Le},
  title     = {{From Treebank Conversion to Automatic Dependency Parsing for Vietnamese}},
  booktitle = {{Proceedings of the 19th International Conference on Application of Natural Language to Information Systems}},
  year      = {2014},
  pages     = {196-207},
  url       = {https://github.com/datquocnguyen/VnDT},
}
"""

_DATASETNAME = "vndt"

_DESCRIPTION = """\
VnDT is a Vietnamese dependency treebank consisting of 10K+ sentences (219K words). The VnDT Treebank is
automatically converted from the constituency-annotated Vietnamese Treebank.
"""

_HOMEPAGE = "https://github.com/datquocnguyen/VnDT"

_LANGUAGES = {"vie": "vi"}

_LICENSE = Licenses.UNKNOWN.value

_LOCAL = False

_URLS = {
    "gold-dev": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-gold-POS-tags-dev.conll",
    "gold-test": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-gold-POS-tags-test.conll",
    "gold-train": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-gold-POS-tags-train.conll",
    "predicted-dev": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-predicted-POS-tags-dev.conll",
    "predicted-test": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-predicted-POS-tags-test.conll",
    "predicted-train": "https://raw.githubusercontent.com/datquocnguyen/VnDT/master/VnDTv1.1-predicted-POS-tags-train.conll",
}

_SUPPORTED_TASKS = [Tasks.DEPENDENCY_PARSING]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class VnDTDataset(datasets.GeneratorBasedBuilder):
    """
    VnDT is a Vietnamese dependency treebank from https://github.com/datquocnguyen/VnDT.
    """

    # Override conllu.parse_token_and_metadata via monkey patching so that
    # load_ud_data imputes missing sent_id and text metadata while parsing.
    conllu.parse_token_and_metadata = parse_token_and_impute_metadata

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_gold_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} gold standard source schema",
            schema="source",
            subset_id="gold",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_gold_seacrowd_kb",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME} gold standard SEACrowd schema",
            schema="seacrowd_kb",
            subset_id="gold",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_predicted_source",
            version=datasets.Version(_SOURCE_VERSION),
            description=f"{_DATASETNAME} predicted source schema",
            schema="source",
            subset_id="predicted",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_predicted_seacrowd_kb",
            version=datasets.Version(_SEACROWD_VERSION),
            description=f"{_DATASETNAME} predicted SEACrowd schema",
            schema="seacrowd_kb",
            subset_id="predicted",
        ),
    ]

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Sequence(datasets.Value("int8")),
                    "form": datasets.Sequence(datasets.Value("string")),
                    "lemma": datasets.Sequence(datasets.Value("string")),
                    "upos": datasets.Sequence(datasets.Value("string")),
                    "xpos": datasets.Sequence(datasets.Value("string")),
                    "feats": datasets.Sequence(datasets.Value("string")),
                    "head": datasets.Sequence(datasets.Value("int8")),
                    "deprel": datasets.Sequence(datasets.Value("string")),
                    "deps": datasets.Sequence(datasets.Value("string")),
                    "misc": datasets.Sequence(datasets.Value("string")),
                }
            )
        elif self.config.schema == "seacrowd_kb":
            features = schemas.kb_features
        else:
            raise ValueError(f"Invalid schema: '{self.config.schema}'")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """
        Returns SplitGenerators.
        """

        paths = {key: dl_manager.download_and_extract(value) for key, value in _URLS.items()}

        if self.config.subset_id == "gold":
            filtered_paths = {key: value for key, value in paths.items() if "gold" in key}
        elif self.config.subset_id == "predicted":
            filtered_paths = {key: value for key, value in paths.items() if "predicted" in key}
        else:
            raise NotImplementedError(f"Invalid subset: '{self.config.subset_id}'.")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepaths": [value for key, value in filtered_paths.items() if "dev" in key],
                    "split": "validation",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepaths": [value for key, value in filtered_paths.items() if "test" in key],
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": [value for key, value in filtered_paths.items() if "train" in key],
                    "split": "train",
                },
            ),
        ]

    def _generate_examples(self, filepaths: List[Path], split: str) -> Tuple[int, Dict]:
        """
        Yields examples as (key, example) tuples.
        """

        dataset = None
        for file in filepaths:
            if self.config.schema == "source":
                dataset = list(load_ud_data(file))
            elif self.config.schema == "seacrowd_kb":
                dataset = list(load_ud_data_as_seacrowd_kb(file, dataset))
            else:
                raise ValueError(f"Invalid config: '{self.config.name}'")

        for idx, example in enumerate(dataset):
            if self.config.schema == "source":
                # Drop the imputed metadata fields, which are not part of the source schema.
                example.pop("sent_id", None)
                example.pop("text", None)
            yield idx, example
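
As a quick usage sketch: the script path below assumes a local checkout of the SEACrowd datahub repository, and the trust_remote_code flag assumes a recent datasets release; both are assumptions, not part of this commit. The config name follows the BUILDER_CONFIGS defined above:

import datasets

# Load the gold-standard source configuration directly from the local script
# (path assumed; adjust to your clone).
vndt = datasets.load_dataset(
    "seacrowd/sea_datasets/vndt/vndt.py",
    name="vndt_gold_source",
    trust_remote_code=True,  # assumed requirement on recent datasets releases
)
print(vndt["train"][0]["form"])  # token forms of the first training sentence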
