From 672d20b8b2fe60cb21189d944cdb0c0f907a4963 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 1 Nov 2023 16:53:41 +0100 Subject: [PATCH 01/15] add hf brat dataset script and card from https://huggingface.co/datasets/DFKI-SLT/brat --- dataset_builders/hf/brat/README.md | 115 ++++++++++ dataset_builders/hf/brat/brat.py | 339 +++++++++++++++++++++++++++++ 2 files changed, 454 insertions(+) create mode 100644 dataset_builders/hf/brat/README.md create mode 100644 dataset_builders/hf/brat/brat.py diff --git a/dataset_builders/hf/brat/README.md b/dataset_builders/hf/brat/README.md new file mode 100644 index 00000000..eee4e282 --- /dev/null +++ b/dataset_builders/hf/brat/README.md @@ -0,0 +1,115 @@ +--- +annotations_creators: + - expert-generated +language_creators: + - found +license: [] +task_categories: + - token-classification +task_ids: + - parsing +--- + +# Information Card for Brat + +## Table of Contents + +- [Description](#description) + - [Summary](#summary) +- [Dataset Structure](#dataset-structure) +- [Data Instances](#data-instances) +- [Data Fields](#data-instances) +- [Usage](#usage) +- [Additional Information](#additional-information) + - [Licensing Information](#licensing-information) + - [Citation Information](#citation-information) + +## Description + +- **Homepage:** https://brat.nlplab.org +- **Paper:** https://aclanthology.org/E12-2021/ +- **Leaderboard:** \[Needs More Information\] +- **Point of Contact:** \[Needs More Information\] + +### Summary + +Brat is an intuitive web-based tool for text annotation supported by Natural Language Processing (NLP) technology. BRAT has been developed for rich structured annota- tion for a variety of NLP tasks and aims to support manual curation efforts and increase annotator productivity using NLP techniques. brat is designed in particular for structured annotation, where the notes are not free form text but have a fixed form that can be automatically processed and interpreted by a computer. + +## Dataset Structure + +Dataset annotated with brat format is processed using this script. Annotations created in brat are stored on disk in a standoff format: annotations are stored separately from the annotated document text, which is never modified by the tool. For each text document in the system, there is a corresponding annotation file. The two are associated by the file naming convention that their base name (file name without suffix) is the same: for example, the file DOC-1000.ann contains annotations for the file DOC-1000.txt. More information can be found [here](https://brat.nlplab.org/standoff.html). + +### Data Instances + +\[Needs More Information\] + +### Data Fields + +``` +-context: html content of data file as string +-file_name: a string name of file +-spans: a sequence containing id, type, location and text of a span +-relations: a sequence containing id, type and arguments of a relation +-equivalence_relations: +-events: +-attributions: +-normalizations: +-notes: +``` + +### Usage + +brat script can be used by calling `load_dataset()` method and passing `kwargs` (arguments to the [BuilderConfig](https://huggingface.co/docs/datasets/v2.2.1/en/package_reference/builder_classes#datasets.BuilderConfig)) which should include at least `url` of the dataset prepared using brat. We provide an example of [SciArg](https://aclanthology.org/W18-5206.pdf) dataset below, + +```python +from datasets import load_dataset +kwargs = { +"description" : + """This dataset is an extension of the Dr. 
Inventor corpus (Fisas et al., 2015, 2016) with an annotation layer containing + fine-grained argumentative components and relations. It is the first argument-annotated corpus of scientific + publications (in English), which allows for joint analyses of argumentation and other rhetorical dimensions of + scientific writing.""", +"citation" : + """@inproceedings{lauscher2018b, + title = {An argument-annotated corpus of scientific publications}, + booktitle = {Proceedings of the 5th Workshop on Mining Argumentation}, + publisher = {Association for Computational Linguistics}, + author = {Lauscher, Anne and Glava\v{s}, Goran and Ponzetto, Simone Paolo}, + address = {Brussels, Belgium}, + year = {2018}, + pages = {40–46} + }""", +"homepage": "https://github.com/anlausch/ArguminSci", +"url": "http://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip", +"file_name_blacklist": ['A28'], +} + +dataset = load_dataset('dfki-nlp/brat', **kwargs) +``` + +## Additional Information + +### Licensing Information + +\[Needs More Information\] + +### Citation Information + +``` +@inproceedings{stenetorp-etal-2012-brat, + title = "brat: a Web-based Tool for {NLP}-Assisted Text Annotation", + author = "Stenetorp, Pontus and + Pyysalo, Sampo and + Topi{\'c}, Goran and + Ohta, Tomoko and + Ananiadou, Sophia and + Tsujii, Jun{'}ichi", + booktitle = "Proceedings of the Demonstrations at the 13th Conference of the {E}uropean Chapter of the Association for Computational Linguistics", + month = apr, + year = "2012", + address = "Avignon, France", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/E12-2021", + pages = "102--107", +} +``` diff --git a/dataset_builders/hf/brat/brat.py b/dataset_builders/hf/brat/brat.py new file mode 100644 index 00000000..4f4146ef --- /dev/null +++ b/dataset_builders/hf/brat/brat.py @@ -0,0 +1,339 @@ +import glob +import logging +from dataclasses import dataclass +from os import listdir, path +from typing import Dict, List, Optional + +import datasets +from datasets import ( + BuilderConfig, + DatasetInfo, + Features, + Sequence, + SplitGenerator, + Value, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class BratConfig(BuilderConfig): + """BuilderConfig for BRAT.""" + + url: str = None # type: ignore + description: Optional[str] = None + citation: Optional[str] = None + homepage: Optional[str] = None + + subdirectory_mapping: Optional[Dict[str, str]] = None + file_name_blacklist: Optional[List[str]] = None + ann_file_extension: str = "ann" + txt_file_extension: str = "txt" + + +class Brat(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = BratConfig + + def _info(self): + return DatasetInfo( + description=self.config.description, + citation=self.config.citation, + homepage=self.config.homepage, + features=Features( + { + "context": Value("string"), + "file_name": Value("string"), + "spans": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "locations": Sequence( + { + "start": Value("int32"), + "end": Value("int32"), + } + ), + "text": Value("string"), + } + ), + "relations": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "arguments": Sequence( + {"type": Value("string"), "target": Value("string")} + ), + } + ), + "equivalence_relations": Sequence( + { + "type": Value("string"), + "targets": Sequence(Value("string")), + } + ), + "events": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "trigger": Value("string"), + "arguments": Sequence( + {"type": 
Value("string"), "target": Value("string")} + ), + } + ), + "attributions": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "target": Value("string"), + "value": Value("string"), + } + ), + "normalizations": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "target": Value("string"), + "resource_id": Value("string"), + "entity_id": Value("string"), + } + ), + "notes": Sequence( + { + "id": Value("string"), + "type": Value("string"), + "target": Value("string"), + "note": Value("string"), + } + ), + } + ), + ) + + @staticmethod + def _get_location(location_string): + parts = location_string.split(" ") + assert ( + len(parts) == 2 + ), f"Wrong number of entries in location string. Expected 2, but found: {parts}" + return {"start": int(parts[0]), "end": int(parts[1])} + + @staticmethod + def _get_span_annotation(annotation_line): + """ + example input: + T1 Organization 0 4 Sony + """ + + _id, remaining, text = annotation_line.split("\t", maxsplit=2) + _type, locations = remaining.split(" ", maxsplit=1) + return { + "id": _id, + "text": text, + "type": _type, + "locations": [Brat._get_location(loc) for loc in locations.split(";")], + } + + @staticmethod + def _get_event_annotation(annotation_line): + """ + example input: + E1 MERGE-ORG:T2 Org1:T1 Org2:T3 + """ + _id, remaining = annotation_line.strip().split("\t") + args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] + return { + "id": _id, + "type": args[0]["type"], + "trigger": args[0]["target"], + "arguments": args[1:], + } + + @staticmethod + def _get_relation_annotation(annotation_line): + """ + example input: + R1 Origin Arg1:T3 Arg2:T4 + """ + + _id, remaining = annotation_line.strip().split("\t") + _type, remaining = remaining.split(" ", maxsplit=1) + args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] + return {"id": _id, "type": _type, "arguments": args} + + @staticmethod + def _get_equivalence_relation_annotation(annotation_line): + """ + example input: + * Equiv T1 T2 T3 + """ + _, remaining = annotation_line.strip().split("\t") + parts = remaining.split(" ") + return {"type": parts[0], "targets": parts[1:]} + + @staticmethod + def _get_attribute_annotation(annotation_line): + """Example input (binary: implicit value is True, if present, False otherwise): + + A1 Negation E1 example input (multi-value: explicit value) A2 Confidence E2 L1 + """ + + _id, remaining = annotation_line.strip().split("\t") + parts = remaining.split(" ") + # if no value is present, it is implicitly "true" + if len(parts) == 2: + parts.append("true") + return { + "id": _id, + "type": parts[0], + "target": parts[1], + "value": parts[2], + } + + @staticmethod + def _get_normalization_annotation(annotation_line): + """ + example input: + N1 Reference T1 Wikipedia:534366 Barack Obama + """ + _id, remaining, text = annotation_line.split("\t", maxsplit=2) + _type, target, ref = remaining.split(" ") + res_id, ent_id = ref.split(":") + return { + "id": _id, + "type": _type, + "target": target, + "resource_id": res_id, + "entity_id": ent_id, + } + + @staticmethod + def _get_note_annotation(annotation_line): + """ + example input: + #1 AnnotatorNotes T1 this annotation is suspect + """ + _id, remaining, note = annotation_line.split("\t", maxsplit=2) + _type, target = remaining.split(" ") + return { + "id": _id, + "type": _type, + "target": target, + "note": note, + } + + @staticmethod + def _read_annotation_file(filename): + """ + reads a BRAT v1.3 annotations file 
(see https://brat.nlplab.org/standoff.html) + """ + + res = { + "spans": [], + "events": [], + "relations": [], + "equivalence_relations": [], + "attributions": [], + "normalizations": [], + "notes": [], + } + + with open(filename) as file: + for i, line in enumerate(file): + if len(line.strip()) == 0: + continue + ann_type = line[0] + + # strip away the new line character + if line.endswith("\n"): + line = line[:-1] + + if ann_type == "T": + res["spans"].append(Brat._get_span_annotation(line)) + elif ann_type == "E": + res["events"].append(Brat._get_event_annotation(line)) + elif ann_type == "R": + res["relations"].append(Brat._get_relation_annotation(line)) + elif ann_type == "*": + res["equivalence_relations"].append( + Brat._get_equivalence_relation_annotation(line) + ) + elif ann_type in ["A", "M"]: + res["attributions"].append(Brat._get_attribute_annotation(line)) + elif ann_type == "N": + res["normalizations"].append(Brat._get_normalization_annotation(line)) + elif ann_type == "#": + res["notes"].append(Brat._get_note_annotation(line)) + else: + raise ValueError( + f'unknown BRAT annotation id type: "{line}" (from file {filename} @line {i}). ' + f"Annotation ids have to start with T (spans), E (events), R (relations), " + f"A (attributions), or N (normalizations). See " + f"https://brat.nlplab.org/standoff.html for the BRAT annotation file " + f"specification." + ) + return res + + def _generate_examples(self, files=None, directory=None): + """Read context (.txt) and annotation (.ann) files.""" + if files is None: + assert ( + directory is not None + ), "If files is None, directory has to be provided, but it is also None." + _files = glob.glob(f"{directory}/*.{self.config.ann_file_extension}") + files = sorted(path.splitext(fn)[0] for fn in _files) + + for filename in files: + basename = path.basename(filename) + if ( + self.config.file_name_blacklist is not None + and basename in self.config.file_name_blacklist + ): + logger.info(f"skip annotation file: {basename} (blacklisted)") + continue + + ann_fn = f"{filename}.{self.config.ann_file_extension}" + brat_annotations = Brat._read_annotation_file(ann_fn) + + txt_fn = f"{filename}.{self.config.txt_file_extension}" + txt_content = open(txt_fn).read() + brat_annotations["context"] = txt_content + brat_annotations["file_name"] = basename + + yield basename, brat_annotations + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + if self.config.data_dir is not None: + data_dir = self.config.data_dir + logging.warning(f"load from data_dir: {data_dir}") + else: + # since subclasses of BuilderConfig are not allowed to define + # attributes without defaults, check here + assert self.config.url is not None, "data url not specified" + + data_dir = dl_manager.download_and_extract(self.config.url) + + subdirectory_mapping = self.config.subdirectory_mapping + # if no subdirectory mapping is provided, ... + if subdirectory_mapping is None: + # ... use available subdirectories as split names ... + subdirs = [f for f in listdir(data_dir) if path.isdir(path.join(data_dir, f))] + if len(subdirs) > 0: + subdirectory_mapping = {subdir: subdir for subdir in subdirs} + else: + # ... 
otherwise, default to a single train split with the base directory + subdirectory_mapping = {"": "train"} + + return [ + SplitGenerator( + name=split, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "directory": path.join(data_dir, subdir), + }, + ) + for subdir, split in subdirectory_mapping.items() + ] From 8270b080190c77d470938263e1d620537e740449 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 1 Nov 2023 16:55:32 +0100 Subject: [PATCH 02/15] add pie brat dataset script https://huggingface.co/datasets/pie/brat --- dataset_builders/pie/brat/brat.py | 302 ++++++++++++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 dataset_builders/pie/brat/brat.py diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py new file mode 100644 index 00000000..945926aa --- /dev/null +++ b/dataset_builders/pie/brat/brat.py @@ -0,0 +1,302 @@ +import dataclasses +import logging +from typing import Any, Dict, List, Optional, Tuple, Union + +import datasets +import pytorch_ie +from pytorch_ie.annotations import ( + BinaryRelation, + LabeledMultiSpan, + LabeledSpan, + _post_init_single_label, +) +from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field + +logger = logging.getLogger(__name__) + + +def dl2ld(dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: + return [dict(zip(dict_of_lists, t)) for t in zip(*dict_of_lists.values())] + + +def ld2dl( + list_fo_dicts: List[Dict[str, Any]], keys: Optional[List[str]] = None +) -> Dict[str, List[Any]]: + keys = keys or list(list_fo_dicts[0]) + return {k: [dic[k] for dic in list_fo_dicts] for k in keys} + + +@dataclasses.dataclass(eq=True, frozen=True) +class Attribute(Annotation): + target_annotation: Annotation + label: str + value: Optional[str] = None + score: float = 1.0 + + def __post_init__(self) -> None: + _post_init_single_label(self) + + +@dataclasses.dataclass +class BratDocument(Document): + text: str + id: Optional[str] = None + metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) + spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text") + relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") + span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") + relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations") + + +@dataclasses.dataclass +class BratDocumentWithMergedSpans(Document): + text: str + id: Optional[str] = None + metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) + spans: AnnotationList[LabeledSpan] = annotation_field(target="text") + relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") + span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") + relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations") + + +def example_to_document( + example: Dict[str, Any], merge_non_contiguous_spans: bool = False +) -> BratDocument: + if merge_non_contiguous_spans: + doc = BratDocumentWithMergedSpans(text=example["context"], id=example["file_name"]) + else: + doc = BratDocument(text=example["context"], id=example["file_name"]) + + spans: Dict[str, LabeledSpan] = dict() + span_locations: List[Tuple[Tuple[int, int]]] = [] + span_texts: List[str] = [] + for span_dict in dl2ld(example["spans"]): + starts: List[int] = span_dict["locations"]["start"] + ends: List[int] = span_dict["locations"]["end"] + slices = tuple(zip(starts, ends)) + 
span_locations.append(slices) + span_texts.append(span_dict["text"]) + # sanity check + span_text_parts = [doc.text[start:end] for start, end in slices] + joined_span_texts_stripped = " ".join(span_text_parts).strip() + span_text_stripped = span_dict["text"].strip() + if joined_span_texts_stripped != span_text_stripped: + logger.warning( + f"joined span parts do not match stripped span text field content. " + f'joined_span_texts_stripped: "{joined_span_texts_stripped}" != stripped "text": "{span_text_stripped}"' + ) + if merge_non_contiguous_spans: + if len(starts) > 1: + # check if the text in between the fragments holds only space + merged_content_texts = [ + doc.text[start:end] for start, end in zip(ends[:-1], starts[1:]) + ] + merged_content_texts_not_empty = [ + text.strip() for text in merged_content_texts if text.strip() != "" + ] + if len(merged_content_texts_not_empty) > 0: + logger.warning( + f"document '{doc.id}' contains a non-contiguous span with text content in between (will be merged into a single span): " + f"newly covered text parts: {merged_content_texts_not_empty}, " + f"merged span text: '{doc.text[starts[0]:ends[-1]]}', " + f"annotation: {span_dict}" + ) + # just take everything + start = min(starts) + end = max(ends) + span = LabeledSpan(start=start, end=end, label=span_dict["type"]) + else: + span = LabeledMultiSpan(slices=slices, label=span_dict["type"]) + spans[span_dict["id"]] = span + + doc.spans.extend(spans.values()) + doc.metadata["span_ids"] = list(spans.keys()) + doc.metadata["span_locations"] = span_locations + doc.metadata["span_texts"] = span_texts + + relations: Dict[str, BinaryRelation] = dict() + for rel_dict in dl2ld(example["relations"]): + arguments = dict(zip(rel_dict["arguments"]["type"], rel_dict["arguments"]["target"])) + assert set(arguments) == {"Arg1", "Arg2"} + head = spans[arguments["Arg1"]] + tail = spans[arguments["Arg2"]] + rel = BinaryRelation(head=head, tail=tail, label=rel_dict["type"]) + relations[rel_dict["id"]] = rel + + doc.relations.extend(relations.values()) + doc.metadata["relation_ids"] = list(relations.keys()) + + equivalence_relations = dl2ld(example["equivalence_relations"]) + if len(equivalence_relations) > 0: + raise NotImplementedError("converting equivalence_relations is not yet implemented") + + events = dl2ld(example["events"]) + if len(events) > 0: + raise NotImplementedError("converting events is not yet implemented") + + span_attributions: Dict[str, Attribute] = dict() + attribution_ids = [] + for attribution_dict in dl2ld(example["attributions"]): + target_id = attribution_dict["target"] + if target_id in spans: + target_layer_name = "spans" + target_annotation = spans[target_id] + elif target_id in relations: + target_layer_name = "relations" + target_annotation = relations[target_id] + else: + raise Exception("only span and relation attributions are supported yet") + attribution = Attribute( + target_annotation=target_annotation, + label=attribution_dict["type"], + value=attribution_dict["value"], + ) + span_attributions[attribution_dict["id"]] = attribution + attribution_ids.append((target_layer_name, attribution_dict["id"])) + + doc.span_attributions.extend(span_attributions.values()) + doc.metadata["attribution_ids"] = attribution_ids + + normalizations = dl2ld(example["normalizations"]) + if len(normalizations) > 0: + raise NotImplementedError("converting normalizations is not yet implemented") + + notes = dl2ld(example["notes"]) + if len(notes) > 0: + raise NotImplementedError("converting notes is not yet 
implemented") + + return doc + + +def document_to_example( + document: Union[BratDocument, BratDocumentWithMergedSpans] +) -> Dict[str, Any]: + example = { + "context": document.text, + "file_name": document.id, + } + span_dicts: Dict[Union[LabeledSpan, LabeledMultiSpan], Dict[str, Any]] = dict() + assert len(document.metadata["span_locations"]) == len(document.spans) + assert len(document.metadata["span_texts"]) == len(document.spans) + assert len(document.metadata["span_ids"]) == len(document.spans) + for i, span in enumerate(document.spans): + locations = tuple((start, end) for start, end in document.metadata["span_locations"][i]) + if isinstance(span, LabeledSpan): + assert locations[0][0] == span.start + assert locations[-1][1] == span.end + elif isinstance(span, LabeledMultiSpan): + assert span.slices == locations + else: + raise TypeError(f"span has unknown type [{type(span)}]: {span}") + + starts, ends = zip(*locations) + span_dict = { + "id": document.metadata["span_ids"][i], + "locations": { + "start": list(starts), + "end": list(ends), + }, + "text": document.metadata["span_texts"][i], + "type": span.label, + } + if span in span_dicts: + prev_ann_dict = span_dicts[span] + ann_dict = span_dict + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + ) + span_dicts[span] = span_dict + example["spans"] = ld2dl(list(span_dicts.values()), keys=["id", "type", "locations", "text"]) + + relation_dicts: Dict[BinaryRelation, Dict[str, Any]] = dict() + assert len(document.metadata["relation_ids"]) == len(document.relations) + for i, rel in enumerate(document.relations): + arg1_id = span_dicts[rel.head]["id"] + arg2_id = span_dicts[rel.tail]["id"] + relation_dict = { + "id": document.metadata["relation_ids"][i], + "type": rel.label, + "arguments": { + "type": ["Arg1", "Arg2"], + "target": [arg1_id, arg2_id], + }, + } + if rel in relation_dicts: + prev_ann_dict = relation_dicts[rel] + ann_dict = relation_dict + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + ) + relation_dicts[rel] = relation_dict + + example["relations"] = ld2dl(list(relation_dicts.values()), keys=["id", "type", "arguments"]) + + example["equivalence_relations"] = ld2dl([], keys=["type", "targets"]) + example["events"] = ld2dl([], keys=["id", "type", "trigger", "arguments"]) + + attribution_dicts: Dict[Annotation, Dict[str, Any]] = dict() + span_attribution_ids = [ + attribution_id + for target_layer, attribution_id in document.metadata["attribution_ids"] + if target_layer == "spans" + ] + assert len(span_attribution_ids) == len(document.span_attributions) + for i, span_attribution in enumerate(document.span_attributions): + target_id = span_dicts[span_attribution.target_annotation]["id"] + attribution_dict = { + "id": span_attribution_ids[i], + "type": span_attribution.label, + "target": target_id, + "value": span_attribution.value, + } + if span_attribution in attribution_dicts: + prev_ann_dict = attribution_dicts[span_attribution] + ann_dict = span_attribution + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + ) + attribution_dicts[span_attribution] = attribution_dict + + example["attributions"] = ld2dl( + list(attribution_dicts.values()), keys=["id", "type", "target", "value"] + ) + example["normalizations"] = ld2dl( + [], keys=["id", "type", "target", "resource_id", "entity_id"] + ) + 
example["notes"] = ld2dl([], keys=["id", "type", "target", "note"]) + + return example + + +class BratConfig(datasets.BuilderConfig): + """BuilderConfig for BratDatasetLoader.""" + + def __init__(self, merge_non_contiguous_spans: bool = False, **kwargs): + """BuilderConfig for DocRED. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super().__init__(**kwargs) + self.merge_non_contiguous_spans = merge_non_contiguous_spans + + +class BratDatasetLoader(pytorch_ie.data.builder.GeneratorBasedBuilder): + # this requires https://github.com/ChristophAlt/pytorch-ie/pull/288 + DOCUMENT_TYPES = { + "default": BratDocument, + "merge_non_contiguous_spans": BratDocumentWithMergedSpans, + } + + DEFAULT_CONFIG_NAME = "default" + BUILDER_CONFIGS = [ + BratConfig(name="default"), + BratConfig(name="merge_non_contiguous_spans", merge_non_contiguous_spans=True), + ] + + BASE_DATASET_PATH = "DFKI-SLT/brat" + + def _generate_document(self, example, **kwargs): + return example_to_document( + example, merge_non_contiguous_spans=self.config.merge_non_contiguous_spans + ) From fd5ab81150a1b0edc61620ad70e885f71a9c0a19 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 2 Nov 2023 21:03:42 +0100 Subject: [PATCH 03/15] remove dataset_builders/hf/brat --- dataset_builders/hf/brat/README.md | 115 ---------- dataset_builders/hf/brat/brat.py | 339 ----------------------------- 2 files changed, 454 deletions(-) delete mode 100644 dataset_builders/hf/brat/README.md delete mode 100644 dataset_builders/hf/brat/brat.py diff --git a/dataset_builders/hf/brat/README.md b/dataset_builders/hf/brat/README.md deleted file mode 100644 index eee4e282..00000000 --- a/dataset_builders/hf/brat/README.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -annotations_creators: - - expert-generated -language_creators: - - found -license: [] -task_categories: - - token-classification -task_ids: - - parsing ---- - -# Information Card for Brat - -## Table of Contents - -- [Description](#description) - - [Summary](#summary) -- [Dataset Structure](#dataset-structure) -- [Data Instances](#data-instances) -- [Data Fields](#data-instances) -- [Usage](#usage) -- [Additional Information](#additional-information) - - [Licensing Information](#licensing-information) - - [Citation Information](#citation-information) - -## Description - -- **Homepage:** https://brat.nlplab.org -- **Paper:** https://aclanthology.org/E12-2021/ -- **Leaderboard:** \[Needs More Information\] -- **Point of Contact:** \[Needs More Information\] - -### Summary - -Brat is an intuitive web-based tool for text annotation supported by Natural Language Processing (NLP) technology. BRAT has been developed for rich structured annota- tion for a variety of NLP tasks and aims to support manual curation efforts and increase annotator productivity using NLP techniques. brat is designed in particular for structured annotation, where the notes are not free form text but have a fixed form that can be automatically processed and interpreted by a computer. - -## Dataset Structure - -Dataset annotated with brat format is processed using this script. Annotations created in brat are stored on disk in a standoff format: annotations are stored separately from the annotated document text, which is never modified by the tool. For each text document in the system, there is a corresponding annotation file. 
The two are associated by the file naming convention that their base name (file name without suffix) is the same: for example, the file DOC-1000.ann contains annotations for the file DOC-1000.txt. More information can be found [here](https://brat.nlplab.org/standoff.html). - -### Data Instances - -\[Needs More Information\] - -### Data Fields - -``` --context: html content of data file as string --file_name: a string name of file --spans: a sequence containing id, type, location and text of a span --relations: a sequence containing id, type and arguments of a relation --equivalence_relations: --events: --attributions: --normalizations: --notes: -``` - -### Usage - -brat script can be used by calling `load_dataset()` method and passing `kwargs` (arguments to the [BuilderConfig](https://huggingface.co/docs/datasets/v2.2.1/en/package_reference/builder_classes#datasets.BuilderConfig)) which should include at least `url` of the dataset prepared using brat. We provide an example of [SciArg](https://aclanthology.org/W18-5206.pdf) dataset below, - -```python -from datasets import load_dataset -kwargs = { -"description" : - """This dataset is an extension of the Dr. Inventor corpus (Fisas et al., 2015, 2016) with an annotation layer containing - fine-grained argumentative components and relations. It is the first argument-annotated corpus of scientific - publications (in English), which allows for joint analyses of argumentation and other rhetorical dimensions of - scientific writing.""", -"citation" : - """@inproceedings{lauscher2018b, - title = {An argument-annotated corpus of scientific publications}, - booktitle = {Proceedings of the 5th Workshop on Mining Argumentation}, - publisher = {Association for Computational Linguistics}, - author = {Lauscher, Anne and Glava\v{s}, Goran and Ponzetto, Simone Paolo}, - address = {Brussels, Belgium}, - year = {2018}, - pages = {40–46} - }""", -"homepage": "https://github.com/anlausch/ArguminSci", -"url": "http://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip", -"file_name_blacklist": ['A28'], -} - -dataset = load_dataset('dfki-nlp/brat', **kwargs) -``` - -## Additional Information - -### Licensing Information - -\[Needs More Information\] - -### Citation Information - -``` -@inproceedings{stenetorp-etal-2012-brat, - title = "brat: a Web-based Tool for {NLP}-Assisted Text Annotation", - author = "Stenetorp, Pontus and - Pyysalo, Sampo and - Topi{\'c}, Goran and - Ohta, Tomoko and - Ananiadou, Sophia and - Tsujii, Jun{'}ichi", - booktitle = "Proceedings of the Demonstrations at the 13th Conference of the {E}uropean Chapter of the Association for Computational Linguistics", - month = apr, - year = "2012", - address = "Avignon, France", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/E12-2021", - pages = "102--107", -} -``` diff --git a/dataset_builders/hf/brat/brat.py b/dataset_builders/hf/brat/brat.py deleted file mode 100644 index 4f4146ef..00000000 --- a/dataset_builders/hf/brat/brat.py +++ /dev/null @@ -1,339 +0,0 @@ -import glob -import logging -from dataclasses import dataclass -from os import listdir, path -from typing import Dict, List, Optional - -import datasets -from datasets import ( - BuilderConfig, - DatasetInfo, - Features, - Sequence, - SplitGenerator, - Value, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class BratConfig(BuilderConfig): - """BuilderConfig for BRAT.""" - - url: str = None # type: ignore - description: Optional[str] = None - citation: Optional[str] = 
None - homepage: Optional[str] = None - - subdirectory_mapping: Optional[Dict[str, str]] = None - file_name_blacklist: Optional[List[str]] = None - ann_file_extension: str = "ann" - txt_file_extension: str = "txt" - - -class Brat(datasets.GeneratorBasedBuilder): - BUILDER_CONFIG_CLASS = BratConfig - - def _info(self): - return DatasetInfo( - description=self.config.description, - citation=self.config.citation, - homepage=self.config.homepage, - features=Features( - { - "context": Value("string"), - "file_name": Value("string"), - "spans": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "locations": Sequence( - { - "start": Value("int32"), - "end": Value("int32"), - } - ), - "text": Value("string"), - } - ), - "relations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "equivalence_relations": Sequence( - { - "type": Value("string"), - "targets": Sequence(Value("string")), - } - ), - "events": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "trigger": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "attributions": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "value": Value("string"), - } - ), - "normalizations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "resource_id": Value("string"), - "entity_id": Value("string"), - } - ), - "notes": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "note": Value("string"), - } - ), - } - ), - ) - - @staticmethod - def _get_location(location_string): - parts = location_string.split(" ") - assert ( - len(parts) == 2 - ), f"Wrong number of entries in location string. 
Expected 2, but found: {parts}" - return {"start": int(parts[0]), "end": int(parts[1])} - - @staticmethod - def _get_span_annotation(annotation_line): - """ - example input: - T1 Organization 0 4 Sony - """ - - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, locations = remaining.split(" ", maxsplit=1) - return { - "id": _id, - "text": text, - "type": _type, - "locations": [Brat._get_location(loc) for loc in locations.split(";")], - } - - @staticmethod - def _get_event_annotation(annotation_line): - """ - example input: - E1 MERGE-ORG:T2 Org1:T1 Org2:T3 - """ - _id, remaining = annotation_line.strip().split("\t") - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return { - "id": _id, - "type": args[0]["type"], - "trigger": args[0]["target"], - "arguments": args[1:], - } - - @staticmethod - def _get_relation_annotation(annotation_line): - """ - example input: - R1 Origin Arg1:T3 Arg2:T4 - """ - - _id, remaining = annotation_line.strip().split("\t") - _type, remaining = remaining.split(" ", maxsplit=1) - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return {"id": _id, "type": _type, "arguments": args} - - @staticmethod - def _get_equivalence_relation_annotation(annotation_line): - """ - example input: - * Equiv T1 T2 T3 - """ - _, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - return {"type": parts[0], "targets": parts[1:]} - - @staticmethod - def _get_attribute_annotation(annotation_line): - """Example input (binary: implicit value is True, if present, False otherwise): - - A1 Negation E1 example input (multi-value: explicit value) A2 Confidence E2 L1 - """ - - _id, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - # if no value is present, it is implicitly "true" - if len(parts) == 2: - parts.append("true") - return { - "id": _id, - "type": parts[0], - "target": parts[1], - "value": parts[2], - } - - @staticmethod - def _get_normalization_annotation(annotation_line): - """ - example input: - N1 Reference T1 Wikipedia:534366 Barack Obama - """ - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, target, ref = remaining.split(" ") - res_id, ent_id = ref.split(":") - return { - "id": _id, - "type": _type, - "target": target, - "resource_id": res_id, - "entity_id": ent_id, - } - - @staticmethod - def _get_note_annotation(annotation_line): - """ - example input: - #1 AnnotatorNotes T1 this annotation is suspect - """ - _id, remaining, note = annotation_line.split("\t", maxsplit=2) - _type, target = remaining.split(" ") - return { - "id": _id, - "type": _type, - "target": target, - "note": note, - } - - @staticmethod - def _read_annotation_file(filename): - """ - reads a BRAT v1.3 annotations file (see https://brat.nlplab.org/standoff.html) - """ - - res = { - "spans": [], - "events": [], - "relations": [], - "equivalence_relations": [], - "attributions": [], - "normalizations": [], - "notes": [], - } - - with open(filename) as file: - for i, line in enumerate(file): - if len(line.strip()) == 0: - continue - ann_type = line[0] - - # strip away the new line character - if line.endswith("\n"): - line = line[:-1] - - if ann_type == "T": - res["spans"].append(Brat._get_span_annotation(line)) - elif ann_type == "E": - res["events"].append(Brat._get_event_annotation(line)) - elif ann_type == "R": - res["relations"].append(Brat._get_relation_annotation(line)) - elif ann_type == "*": - 
res["equivalence_relations"].append( - Brat._get_equivalence_relation_annotation(line) - ) - elif ann_type in ["A", "M"]: - res["attributions"].append(Brat._get_attribute_annotation(line)) - elif ann_type == "N": - res["normalizations"].append(Brat._get_normalization_annotation(line)) - elif ann_type == "#": - res["notes"].append(Brat._get_note_annotation(line)) - else: - raise ValueError( - f'unknown BRAT annotation id type: "{line}" (from file {filename} @line {i}). ' - f"Annotation ids have to start with T (spans), E (events), R (relations), " - f"A (attributions), or N (normalizations). See " - f"https://brat.nlplab.org/standoff.html for the BRAT annotation file " - f"specification." - ) - return res - - def _generate_examples(self, files=None, directory=None): - """Read context (.txt) and annotation (.ann) files.""" - if files is None: - assert ( - directory is not None - ), "If files is None, directory has to be provided, but it is also None." - _files = glob.glob(f"{directory}/*.{self.config.ann_file_extension}") - files = sorted(path.splitext(fn)[0] for fn in _files) - - for filename in files: - basename = path.basename(filename) - if ( - self.config.file_name_blacklist is not None - and basename in self.config.file_name_blacklist - ): - logger.info(f"skip annotation file: {basename} (blacklisted)") - continue - - ann_fn = f"{filename}.{self.config.ann_file_extension}" - brat_annotations = Brat._read_annotation_file(ann_fn) - - txt_fn = f"{filename}.{self.config.txt_file_extension}" - txt_content = open(txt_fn).read() - brat_annotations["context"] = txt_content - brat_annotations["file_name"] = basename - - yield basename, brat_annotations - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - if self.config.data_dir is not None: - data_dir = self.config.data_dir - logging.warning(f"load from data_dir: {data_dir}") - else: - # since subclasses of BuilderConfig are not allowed to define - # attributes without defaults, check here - assert self.config.url is not None, "data url not specified" - - data_dir = dl_manager.download_and_extract(self.config.url) - - subdirectory_mapping = self.config.subdirectory_mapping - # if no subdirectory mapping is provided, ... - if subdirectory_mapping is None: - # ... use available subdirectories as split names ... - subdirs = [f for f in listdir(data_dir) if path.isdir(path.join(data_dir, f))] - if len(subdirs) > 0: - subdirectory_mapping = {subdir: subdir for subdir in subdirs} - else: - # ... 
otherwise, default to a single train split with the base directory - subdirectory_mapping = {"": "train"} - - return [ - SplitGenerator( - name=split, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "directory": path.join(data_dir, subdir), - }, - ) - for subdir, split in subdirectory_mapping.items() - ] From 1377d821d633b939efd01f042906abf04d0c9ba3 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 15:41:40 +0100 Subject: [PATCH 04/15] add fixture data --- tests/fixtures/dataset_builders/pie/brat/brat/1.ann | 2 ++ tests/fixtures/dataset_builders/pie/brat/brat/1.txt | 1 + tests/fixtures/dataset_builders/pie/brat/brat/2.ann | 3 +++ tests/fixtures/dataset_builders/pie/brat/brat/2.txt | 1 + 4 files changed, 7 insertions(+) create mode 100644 tests/fixtures/dataset_builders/pie/brat/brat/1.ann create mode 100644 tests/fixtures/dataset_builders/pie/brat/brat/1.txt create mode 100644 tests/fixtures/dataset_builders/pie/brat/brat/2.ann create mode 100644 tests/fixtures/dataset_builders/pie/brat/brat/2.txt diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/1.ann b/tests/fixtures/dataset_builders/pie/brat/brat/1.ann new file mode 100644 index 00000000..2586acc6 --- /dev/null +++ b/tests/fixtures/dataset_builders/pie/brat/brat/1.ann @@ -0,0 +1,2 @@ +T1 person 0 4 Jane +T2 city 14 20 Berlin diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/1.txt b/tests/fixtures/dataset_builders/pie/brat/brat/1.txt new file mode 100644 index 00000000..a83d9d48 --- /dev/null +++ b/tests/fixtures/dataset_builders/pie/brat/brat/1.txt @@ -0,0 +1 @@ +Jane lives in Berlin. diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/2.ann b/tests/fixtures/dataset_builders/pie/brat/brat/2.ann new file mode 100644 index 00000000..cc9805ba --- /dev/null +++ b/tests/fixtures/dataset_builders/pie/brat/brat/2.ann @@ -0,0 +1,3 @@ +T1 city 0 7 Seattle +T2 person 25 37 Jenny Durkan +R1 mayor_of Arg1:T2 Arg2:T1 diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/2.txt b/tests/fixtures/dataset_builders/pie/brat/brat/2.txt new file mode 100644 index 00000000..02859e37 --- /dev/null +++ b/tests/fixtures/dataset_builders/pie/brat/brat/2.txt @@ -0,0 +1 @@ +Seattle is a rainy city. Jenny Durkan is the city's mayor. 
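For orientation, the fixture annotations above follow the BRAT standoff conventions parsed by the HF loader from the first patch. Below is a minimal, self-contained sketch (illustrative only, not part of the committed files) of that tab- and space-splitting, assuming single-fragment spans without `;`-separated locations:

```python
# Illustrative sketch mirroring the splitting logic of Brat._get_span_annotation
# and Brat._get_relation_annotation from the HF loader shown above.
span_line = "T2\tperson 25 37\tJenny Durkan"  # span line from fixture 2.ann (tab-separated)
_id, remaining, text = span_line.split("\t", maxsplit=2)
_type, locations = remaining.split(" ", maxsplit=1)
start, end = (int(part) for part in locations.split(" "))
print({"id": _id, "type": _type, "start": start, "end": end, "text": text})
# {'id': 'T2', 'type': 'person', 'start': 25, 'end': 37, 'text': 'Jenny Durkan'}

relation_line = "R1\tmayor_of Arg1:T2 Arg2:T1"  # relation line from fixture 2.ann
_id, remaining = relation_line.split("\t", maxsplit=1)
_type, *raw_args = remaining.split(" ")
arguments = [dict(zip(["type", "target"], arg.split(":"))) for arg in raw_args]
print({"id": _id, "type": _type, "arguments": arguments})
# {'id': 'R1', 'type': 'mayor_of', 'arguments': [{'type': 'Arg1', 'target': 'T2'},
#                                                {'type': 'Arg2', 'target': 'T1'}]}
```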
From bc9603d993b87f2800f29e7c346f91c8486732fe Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 15:45:15 +0100 Subject: [PATCH 05/15] move fixture data --- tests/fixtures/dataset_builders/pie/brat/{brat => train}/1.ann | 0 tests/fixtures/dataset_builders/pie/brat/{brat => train}/1.txt | 0 tests/fixtures/dataset_builders/pie/brat/{brat => train}/2.ann | 0 tests/fixtures/dataset_builders/pie/brat/{brat => train}/2.txt | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/fixtures/dataset_builders/pie/brat/{brat => train}/1.ann (100%) rename tests/fixtures/dataset_builders/pie/brat/{brat => train}/1.txt (100%) rename tests/fixtures/dataset_builders/pie/brat/{brat => train}/2.ann (100%) rename tests/fixtures/dataset_builders/pie/brat/{brat => train}/2.txt (100%) diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/1.ann b/tests/fixtures/dataset_builders/pie/brat/train/1.ann similarity index 100% rename from tests/fixtures/dataset_builders/pie/brat/brat/1.ann rename to tests/fixtures/dataset_builders/pie/brat/train/1.ann diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/1.txt b/tests/fixtures/dataset_builders/pie/brat/train/1.txt similarity index 100% rename from tests/fixtures/dataset_builders/pie/brat/brat/1.txt rename to tests/fixtures/dataset_builders/pie/brat/train/1.txt diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/2.ann b/tests/fixtures/dataset_builders/pie/brat/train/2.ann similarity index 100% rename from tests/fixtures/dataset_builders/pie/brat/brat/2.ann rename to tests/fixtures/dataset_builders/pie/brat/train/2.ann diff --git a/tests/fixtures/dataset_builders/pie/brat/brat/2.txt b/tests/fixtures/dataset_builders/pie/brat/train/2.txt similarity index 100% rename from tests/fixtures/dataset_builders/pie/brat/brat/2.txt rename to tests/fixtures/dataset_builders/pie/brat/train/2.txt From 7d3ed069c4ee1060997b8e57da63eabe94a20063 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 18:09:46 +0100 Subject: [PATCH 06/15] derive BratDocument(WithMergedSpans) from TextBasedDocument; rename parameter merge_non_contiguous_spans to merge_fragmented_spans; rename Attribute.target_annotation to Attribute.annotation; --- dataset_builders/pie/brat/brat.py | 47 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index 945926aa..8e6fc75d 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -10,7 +10,8 @@ LabeledSpan, _post_init_single_label, ) -from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field +from pytorch_ie.core import Annotation, AnnotationList, annotation_field +from pytorch_ie.documents import TextBasedDocument logger = logging.getLogger(__name__) @@ -28,20 +29,17 @@ def ld2dl( @dataclasses.dataclass(eq=True, frozen=True) class Attribute(Annotation): - target_annotation: Annotation + annotation: Annotation label: str value: Optional[str] = None - score: float = 1.0 + score: Optional[float] = dataclasses.field(default=None, compare=False) def __post_init__(self) -> None: _post_init_single_label(self) @dataclasses.dataclass -class BratDocument(Document): - text: str - id: Optional[str] = None - metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) +class BratDocument(TextBasedDocument): spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text") relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") 
span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") @@ -49,10 +47,7 @@ class BratDocument(Document): @dataclasses.dataclass -class BratDocumentWithMergedSpans(Document): - text: str - id: Optional[str] = None - metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) +class BratDocumentWithMergedSpans(TextBasedDocument): spans: AnnotationList[LabeledSpan] = annotation_field(target="text") relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") @@ -60,9 +55,9 @@ class BratDocumentWithMergedSpans(Document): def example_to_document( - example: Dict[str, Any], merge_non_contiguous_spans: bool = False + example: Dict[str, Any], merge_fragmented_spans: bool = False ) -> BratDocument: - if merge_non_contiguous_spans: + if merge_fragmented_spans: doc = BratDocumentWithMergedSpans(text=example["context"], id=example["file_name"]) else: doc = BratDocument(text=example["context"], id=example["file_name"]) @@ -85,7 +80,7 @@ def example_to_document( f"joined span parts do not match stripped span text field content. " f'joined_span_texts_stripped: "{joined_span_texts_stripped}" != stripped "text": "{span_text_stripped}"' ) - if merge_non_contiguous_spans: + if merge_fragmented_spans: if len(starts) > 1: # check if the text in between the fragments holds only space merged_content_texts = [ @@ -96,7 +91,8 @@ def example_to_document( ] if len(merged_content_texts_not_empty) > 0: logger.warning( - f"document '{doc.id}' contains a non-contiguous span with text content in between (will be merged into a single span): " + f"document '{doc.id}' contains a non-contiguous span with text content in between " + f"(will be merged into a single span): " f"newly covered text parts: {merged_content_texts_not_empty}, " f"merged span text: '{doc.text[starts[0]:ends[-1]]}', " f"annotation: {span_dict}" @@ -140,14 +136,14 @@ def example_to_document( target_id = attribution_dict["target"] if target_id in spans: target_layer_name = "spans" - target_annotation = spans[target_id] + annotation = spans[target_id] elif target_id in relations: target_layer_name = "relations" - target_annotation = relations[target_id] + annotation = relations[target_id] else: raise Exception("only span and relation attributions are supported yet") attribution = Attribute( - target_annotation=target_annotation, + annotation=annotation, label=attribution_dict["type"], value=attribution_dict["value"], ) @@ -242,7 +238,7 @@ def document_to_example( ] assert len(span_attribution_ids) == len(document.span_attributions) for i, span_attribution in enumerate(document.span_attributions): - target_id = span_dicts[span_attribution.target_annotation]["id"] + target_id = span_dicts[span_attribution.annotation]["id"] attribution_dict = { "id": span_attribution_ids[i], "type": span_attribution.label, @@ -253,7 +249,8 @@ def document_to_example( prev_ann_dict = attribution_dicts[span_attribution] ann_dict = span_attribution logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" ) attribution_dicts[span_attribution] = attribution_dict @@ -271,32 +268,32 @@ def document_to_example( class BratConfig(datasets.BuilderConfig): """BuilderConfig for BratDatasetLoader.""" - def __init__(self, merge_non_contiguous_spans: bool = False, **kwargs): + def 
__init__(self, merge_fragmented_spans: bool = False, **kwargs): """BuilderConfig for DocRED. Args: **kwargs: keyword arguments forwarded to super. """ super().__init__(**kwargs) - self.merge_non_contiguous_spans = merge_non_contiguous_spans + self.merge_fragmented_spans = merge_fragmented_spans class BratDatasetLoader(pytorch_ie.data.builder.GeneratorBasedBuilder): # this requires https://github.com/ChristophAlt/pytorch-ie/pull/288 DOCUMENT_TYPES = { "default": BratDocument, - "merge_non_contiguous_spans": BratDocumentWithMergedSpans, + "merge_fragmented_spans": BratDocumentWithMergedSpans, } DEFAULT_CONFIG_NAME = "default" BUILDER_CONFIGS = [ BratConfig(name="default"), - BratConfig(name="merge_non_contiguous_spans", merge_non_contiguous_spans=True), + BratConfig(name="merge_fragmented_spans", merge_fragmented_spans=True), ] BASE_DATASET_PATH = "DFKI-SLT/brat" def _generate_document(self, example, **kwargs): return example_to_document( - example, merge_non_contiguous_spans=self.config.merge_non_contiguous_spans + example, merge_fragmented_spans=self.config.merge_fragmented_spans ) From 5e6683008839fafa4f317f76b54d093c6b0508e3 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 18:15:41 +0100 Subject: [PATCH 07/15] rename span_attributions to span_attributes and relation_attributions to relation_attributes --- dataset_builders/pie/brat/brat.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index 8e6fc75d..ce92511e 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -42,16 +42,16 @@ def __post_init__(self) -> None: class BratDocument(TextBasedDocument): spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text") relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") - span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") - relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations") + span_attributes: AnnotationList[Attribute] = annotation_field(target="spans") + relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations") @dataclasses.dataclass class BratDocumentWithMergedSpans(TextBasedDocument): spans: AnnotationList[LabeledSpan] = annotation_field(target="text") relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") - span_attributions: AnnotationList[Attribute] = annotation_field(target="spans") - relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations") + span_attributes: AnnotationList[Attribute] = annotation_field(target="spans") + relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations") def example_to_document( @@ -130,7 +130,7 @@ def example_to_document( if len(events) > 0: raise NotImplementedError("converting events is not yet implemented") - span_attributions: Dict[str, Attribute] = dict() + span_attributes: Dict[str, Attribute] = dict() attribution_ids = [] for attribution_dict in dl2ld(example["attributions"]): target_id = attribution_dict["target"] @@ -147,10 +147,10 @@ def example_to_document( label=attribution_dict["type"], value=attribution_dict["value"], ) - span_attributions[attribution_dict["id"]] = attribution + span_attributes[attribution_dict["id"]] = attribution attribution_ids.append((target_layer_name, attribution_dict["id"])) - doc.span_attributions.extend(span_attributions.values()) + 
doc.span_attributes.extend(span_attributes.values()) doc.metadata["attribution_ids"] = attribution_ids normalizations = dl2ld(example["normalizations"]) @@ -236,8 +236,8 @@ def document_to_example( for target_layer, attribution_id in document.metadata["attribution_ids"] if target_layer == "spans" ] - assert len(span_attribution_ids) == len(document.span_attributions) - for i, span_attribution in enumerate(document.span_attributions): + assert len(span_attribution_ids) == len(document.span_attributes) + for i, span_attribution in enumerate(document.span_attributes): target_id = span_dicts[span_attribution.annotation]["id"] attribution_dict = { "id": span_attribution_ids[i], From c5115940c36833ef809586f05daad44dde33fd55 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 18:36:06 +0100 Subject: [PATCH 08/15] handle relation attributes --- dataset_builders/pie/brat/brat.py | 93 ++++++++++++++++++------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index ce92511e..a7f8e350 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -1,5 +1,6 @@ import dataclasses import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple, Union import datasets @@ -130,10 +131,10 @@ def example_to_document( if len(events) > 0: raise NotImplementedError("converting events is not yet implemented") - span_attributes: Dict[str, Attribute] = dict() - attribution_ids = [] - for attribution_dict in dl2ld(example["attributions"]): - target_id = attribution_dict["target"] + attribute_annotations: Dict[str, Dict[str, Attribute]] = defaultdict(dict) + attribute_ids = [] + for attribute_dict in dl2ld(example["attributes"]): + target_id = attribute_dict["target"] if target_id in spans: target_layer_name = "spans" annotation = spans[target_id] @@ -141,17 +142,18 @@ def example_to_document( target_layer_name = "relations" annotation = relations[target_id] else: - raise Exception("only span and relation attributions are supported yet") - attribution = Attribute( + raise Exception("only span and relation attributes are supported yet") + attribute = Attribute( annotation=annotation, - label=attribution_dict["type"], - value=attribution_dict["value"], + label=attribute_dict["type"], + value=attribute_dict["value"], ) - span_attributes[attribution_dict["id"]] = attribution - attribution_ids.append((target_layer_name, attribution_dict["id"])) + attribute_annotations[target_layer_name][attribute_dict["id"]] = attribute + attribute_ids.append((target_layer_name, attribute_dict["id"])) - doc.span_attributes.extend(span_attributes.values()) - doc.metadata["attribution_ids"] = attribution_ids + doc.span_attributes.extend(attribute_annotations["spans"].values()) + doc.relation_attributes.extend(attribute_annotations["relations"].values()) + doc.metadata["attribute_ids"] = attribute_ids normalizations = dl2ld(example["normalizations"]) if len(normalizations) > 0: @@ -199,7 +201,8 @@ def document_to_example( prev_ann_dict = span_dicts[span] ann_dict = span_dict logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" ) span_dicts[span] = span_dict example["spans"] = ld2dl(list(span_dicts.values()), keys=["id", "type", "locations", "text"]) @@ -221,7 +224,8 @@ def 
document_to_example( prev_ann_dict = relation_dicts[rel] ann_dict = relation_dict logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} are identical" + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" ) relation_dicts[rel] = relation_dict @@ -230,32 +234,41 @@ def document_to_example( example["equivalence_relations"] = ld2dl([], keys=["type", "targets"]) example["events"] = ld2dl([], keys=["id", "type", "trigger", "arguments"]) - attribution_dicts: Dict[Annotation, Dict[str, Any]] = dict() - span_attribution_ids = [ - attribution_id - for target_layer, attribution_id in document.metadata["attribution_ids"] - if target_layer == "spans" - ] - assert len(span_attribution_ids) == len(document.span_attributes) - for i, span_attribution in enumerate(document.span_attributes): - target_id = span_dicts[span_attribution.annotation]["id"] - attribution_dict = { - "id": span_attribution_ids[i], - "type": span_attribution.label, - "target": target_id, - "value": span_attribution.value, - } - if span_attribution in attribution_dicts: - prev_ann_dict = attribution_dicts[span_attribution] - ann_dict = span_attribution - logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " - f"are identical" - ) - attribution_dicts[span_attribution] = attribution_dict - - example["attributions"] = ld2dl( - list(attribution_dicts.values()), keys=["id", "type", "target", "value"] + annotation_dicts = { + "spans": span_dicts, + "relations": relation_dicts, + } + all_attribute_annotations = { + "spans": document.span_attributes, + "relations": document.relation_attributes, + } + attribute_dicts: Dict[Annotation, Dict[str, Any]] = dict() + attribute_ids_per_target = defaultdict(list) + for target_layer, attribute_id in document.metadata["attribute_ids"]: + attribute_ids_per_target[target_layer].append(attribute_id) + + for target_layer, attribute_ids in attribute_ids_per_target.items(): + attribute_annotations = all_attribute_annotations[target_layer] + assert len(attribute_ids) == len(attribute_annotations) + for i, attribute_annotation in enumerate(document.span_attributes): + target_id = annotation_dicts[target_layer][attribute_annotation.annotation]["id"] + attribute_dict = { + "id": attribute_ids_per_target[target_layer][i], + "type": attribute_annotation.label, + "target": target_id, + "value": attribute_annotation.value, + } + if attribute_annotation in attribute_dicts: + prev_ann_dict = attribute_dicts[attribute_annotation] + ann_dict = attribute_annotation + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" + ) + attribute_dicts[attribute_annotation] = attribute_dict + + example["attributes"] = ld2dl( + list(attribute_dicts.values()), keys=["id", "type", "target", "value"] ) example["normalizations"] = ld2dl( [], keys=["id", "type", "target", "resource_id", "entity_id"] From 84f75766c1d5159a9f8801e8e7cbce1a639c887f Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 18:36:30 +0100 Subject: [PATCH 09/15] add README.md --- dataset_builders/pie/brat/README.md | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 dataset_builders/pie/brat/README.md diff --git a/dataset_builders/pie/brat/README.md b/dataset_builders/pie/brat/README.md new file mode 100644 index 00000000..979f321f --- /dev/null +++ 
b/dataset_builders/pie/brat/README.md
@@ -0,0 +1,80 @@
+# PIE Dataset Card for "brat"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[BRAT Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/brat).
+
+## Data Schema
+
+The document type for this dataset is `BratDocument` or `BratDocumentWithMergedSpans`, depending on whether the
+data was loaded with `merge_fragmented_spans=True` (default: `False`). They define the following data fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, optional)
+
+and the following annotation layers:
+
+- `spans` (annotation type: `LabeledMultiSpan` in the case of `BratDocument` and `LabeledSpan` in the case of `BratDocumentWithMergedSpans`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `spans`)
+- `span_attributes` (annotation type: `Attribute`, target: `spans`)
+- `relation_attributes` (annotation type: `Attribute`, target: `relations`)
+
+The `Attribute` annotation type is defined as follows:
+
+- `annotation` (type: `Annotation`): the annotation to which the attribute is attached
+- `label` (type: `str`)
+- `value` (type: `str`, optional)
+- `score` (type: `float`, optional, not included in comparison)
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the remaining annotation type definitions.
+
+## Document Converters
+
+The dataset provides no predefined document converters because the BRAT format is very flexible and can be used
+for many different tasks. You can add your own document converter by doing the following:
+
+```python
+import dataclasses
+from typing import Optional
+
+from pytorch_ie.core import AnnotationList, annotation_field
+from pytorch_ie.documents import TextBasedDocument
+from pytorch_ie.annotations import LabeledSpan
+
+from pie_datasets import DatasetDict
+
+# define your document class
+@dataclasses.dataclass
+class MyDocument(TextBasedDocument):
+    my_field: Optional[str] = None
+    my_span_annotations: AnnotationList[LabeledSpan] = annotation_field(target="text")
+
+# define your document converter
+def my_converter(document: BratDocumentWithMergedSpans) -> MyDocument:
+    # create your document with the data from the original document.
+    # The fields "text", "id" and "metadata" are derived from the TextBasedDocument.
+    my_document = MyDocument(id=document.id, text=document.text, metadata=document.metadata, my_field="my_value")
+
+    # create a new span annotation
+    new_span = LabeledSpan(label="my_label", start=2, end=10)
+    # add the new span annotation to your document
+    my_document.my_span_annotations.append(new_span)
+
+    # add annotations from the document to your document
+    for span in document.spans:
+        # we need to copy the span because an annotation can only be attached to one document
+        my_document.my_span_annotations.append(span.copy())
+
+    return my_document
+
+
+# load the dataset. We use the "merge_fragmented_spans" dataset variant here
+# because it provides documents of type BratDocumentWithMergedSpans.
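+# note: BratDocumentWithMergedSpans (the type hint of my_converter above) is defined in this
+# dataset's loading script (brat.py); when working inside this repository it can be imported via
+# `from dataset_builders.pie.brat.brat import BratDocumentWithMergedSpans`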
+dataset = DatasetDict.load_dataset("pie/brat", name="merge_fragmented_spans", data_dir="path/to/brat/data") + +# attach your document converter to the dataset +dataset.register_document_converter(my_converter) + +# convert the dataset +converted_dataset = dataset.to_document_type(MyDocument) +``` From 624fef01b7430ab43bc57dea68287ee953862d18 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 18:41:27 +0100 Subject: [PATCH 10/15] add attributes to BRAT fixture data --- tests/fixtures/dataset_builders/pie/brat/train/2.ann | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/fixtures/dataset_builders/pie/brat/train/2.ann b/tests/fixtures/dataset_builders/pie/brat/train/2.ann index cc9805ba..d46cf3d6 100644 --- a/tests/fixtures/dataset_builders/pie/brat/train/2.ann +++ b/tests/fixtures/dataset_builders/pie/brat/train/2.ann @@ -1,3 +1,5 @@ T1 city 0 7 Seattle T2 person 25 37 Jenny Durkan R1 mayor_of Arg1:T2 Arg2:T1 +A1 factuality T1 actual +A2 statement R1 From bd0ef771f48dd42fa9835db6c5693fbaf1515bcc Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 20:11:47 +0100 Subject: [PATCH 11/15] remove __post_init__ from Attribute to allow for score=None --- dataset_builders/pie/brat/brat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index a7f8e350..ba40adf8 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -35,9 +35,6 @@ class Attribute(Annotation): value: Optional[str] = None score: Optional[float] = dataclasses.field(default=None, compare=False) - def __post_init__(self) -> None: - _post_init_single_label(self) - @dataclasses.dataclass class BratDocument(TextBasedDocument): From 55d3791d10e405e5a05785e3b4f46b97c7ae6932 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 20:12:03 +0100 Subject: [PATCH 12/15] fix document_to_example() --- dataset_builders/pie/brat/brat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index ba40adf8..aa70cca1 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -247,7 +247,7 @@ def document_to_example( for target_layer, attribute_ids in attribute_ids_per_target.items(): attribute_annotations = all_attribute_annotations[target_layer] assert len(attribute_ids) == len(attribute_annotations) - for i, attribute_annotation in enumerate(document.span_attributes): + for i, attribute_annotation in enumerate(attribute_annotations): target_id = annotation_dicts[target_layer][attribute_annotation.annotation]["id"] attribute_dict = { "id": attribute_ids_per_target[target_layer][i], From 4a9c871098466ad0c7a3b323a6771815646b658a Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 20:13:13 +0100 Subject: [PATCH 13/15] fix example_to_document() / document_to_example(): use "attributions" to access the attributes because the base dataset loader still produces this --- dataset_builders/pie/brat/brat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index aa70cca1..56b55db6 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -130,7 +130,7 @@ def example_to_document( attribute_annotations: Dict[str, Dict[str, Attribute]] = defaultdict(dict) attribute_ids = [] - for attribute_dict in dl2ld(example["attributes"]): + for attribute_dict in 
dl2ld(example["attributions"]): target_id = attribute_dict["target"] if target_id in spans: target_layer_name = "spans" @@ -264,7 +264,7 @@ def document_to_example( ) attribute_dicts[attribute_annotation] = attribute_dict - example["attributes"] = ld2dl( + example["attributions"] = ld2dl( list(attribute_dicts.values()), keys=["id", "type", "target", "value"] ) example["normalizations"] = ld2dl( From 76ab4e67c41efc87e7ee7cb8e907a507d253e216 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 20:21:02 +0100 Subject: [PATCH 14/15] add tests --- tests/dataset_builders/common.py | 1 + tests/dataset_builders/pie/test_brat.py | 227 ++++++++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 tests/dataset_builders/pie/test_brat.py diff --git a/tests/dataset_builders/common.py b/tests/dataset_builders/common.py index 57291fd2..70af75a7 100644 --- a/tests/dataset_builders/common.py +++ b/tests/dataset_builders/common.py @@ -11,6 +11,7 @@ HF_BASE_PATH = DATASET_BUILDER_BASE_PATH / "hf" PIE_BASE_PATH = DATASET_BUILDER_BASE_PATH / "pie" HF_DS_FIXTURE_DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "hf" +PIE_DS_FIXTURE_DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "pie" logger = logging.getLogger(__name__) diff --git a/tests/dataset_builders/pie/test_brat.py b/tests/dataset_builders/pie/test_brat.py new file mode 100644 index 00000000..8b036592 --- /dev/null +++ b/tests/dataset_builders/pie/test_brat.py @@ -0,0 +1,227 @@ +from typing import Any, Union + +import datasets +import pytest +from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan +from pytorch_ie.core import Annotation +from pytorch_ie.documents import TextBasedDocument + +from dataset_builders.pie.brat.brat import ( + BratDatasetLoader, + BratDocument, + BratDocumentWithMergedSpans, + document_to_example, + example_to_document, +) +from tests.dataset_builders.common import PIE_BASE_PATH, PIE_DS_FIXTURE_DATA_PATH + +datasets.disable_caching() + +DATASET_NAME = "brat" +PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME +HF_DATASET_PATH = BratDatasetLoader.BASE_DATASET_PATH +FIXTURE_DATA_PATH = PIE_DS_FIXTURE_DATA_PATH / DATASET_NAME +SPLIT_SIZES = {"train": 2} + + +def resolve_annotation(annotation: Annotation) -> Any: + if annotation.target is None: + return None + if isinstance(annotation, LabeledMultiSpan): + return ( + [annotation.target[start:end] for start, end in annotation.slices], + annotation.label, + ) + elif isinstance(annotation, LabeledSpan): + return (annotation.target[annotation.start : annotation.end], annotation.label) + elif isinstance(annotation, BinaryRelation): + return ( + resolve_annotation(annotation.head), + annotation.label, + resolve_annotation(annotation.tail), + ) + elif isinstance(annotation, Annotation) and str(type(annotation)).endswith("brat.Attribute'>"): + result = (resolve_annotation(annotation.annotation), annotation.label) + if annotation.value is not None: + return result + (annotation.value,) + else: + return result + else: + raise TypeError(f"Unknown annotation type: {type(annotation)}") + + +@pytest.fixture(scope="module") +def hf_dataset(): + return datasets.load_dataset(str(HF_DATASET_PATH), data_dir=str(FIXTURE_DATA_PATH)) + + +def test_hf_dataset(hf_dataset): + assert set(hf_dataset) == set(SPLIT_SIZES) + split_sizes = {split_name: len(ds) for split_name, ds in hf_dataset.items()} + assert split_sizes == SPLIT_SIZES + + +@pytest.fixture(params=range(SPLIT_SIZES["train"])) +def sample_idx(request): + return request.param + + 
+@pytest.fixture() +def hf_example(hf_dataset, sample_idx): + return hf_dataset["train"][sample_idx] + + +def test_hf_example(hf_example, sample_idx): + if sample_idx == 0: + assert hf_example == { + "context": "Jane lives in Berlin.\n", + "file_name": "1", + "spans": { + "id": ["T1", "T2"], + "type": ["person", "city"], + "locations": [{"start": [0], "end": [4]}, {"start": [14], "end": [20]}], + "text": ["Jane", "Berlin"], + }, + "relations": {"id": [], "type": [], "arguments": []}, + "equivalence_relations": {"type": [], "targets": []}, + "events": {"id": [], "type": [], "trigger": [], "arguments": []}, + "attributions": {"id": [], "type": [], "target": [], "value": []}, + "normalizations": { + "id": [], + "type": [], + "target": [], + "resource_id": [], + "entity_id": [], + }, + "notes": {"id": [], "type": [], "target": [], "note": []}, + } + elif sample_idx == 1: + assert hf_example == { + "context": "Seattle is a rainy city. Jenny Durkan is the city's mayor.\n", + "file_name": "2", + "spans": { + "id": ["T1", "T2"], + "type": ["city", "person"], + "locations": [{"start": [0], "end": [7]}, {"start": [25], "end": [37]}], + "text": ["Seattle", "Jenny Durkan"], + }, + "relations": { + "id": ["R1"], + "type": ["mayor_of"], + "arguments": [{"type": ["Arg1", "Arg2"], "target": ["T2", "T1"]}], + }, + "equivalence_relations": {"type": [], "targets": []}, + "events": {"id": [], "type": [], "trigger": [], "arguments": []}, + "attributions": { + "id": ["A1", "A2"], + "type": ["factuality", "statement"], + "target": ["T1", "R1"], + "value": ["actual", "true"], + }, + "normalizations": { + "id": [], + "type": [], + "target": [], + "resource_id": [], + "entity_id": [], + }, + "notes": {"id": [], "type": [], "target": [], "note": []}, + } + else: + raise ValueError(f"Unknown sample index: {sample_idx}") + + +@pytest.fixture( + params=[config.name for config in BratDatasetLoader.BUILDER_CONFIGS], # scope="module" +) +def pie_dataset_variant(request): + return request.param + + +@pytest.fixture() +def generated_document( + hf_example, hf_dataset, pie_dataset_variant +) -> Union[BratDocument, BratDocumentWithMergedSpans]: + builder = BratDatasetLoader(name=pie_dataset_variant) + kwargs = builder._generate_document_kwargs(hf_dataset["train"]) or {} + document = builder._generate_document(example=hf_example, **kwargs) + assert document is not None + return document + + +def test_generate_document(generated_document, pie_dataset_variant, sample_idx): + assert generated_document is not None + resolved_spans = [resolve_annotation(annotation=span) for span in generated_document.spans] + resolved_relations = [ + resolve_annotation(relation) for relation in generated_document.relations + ] + if sample_idx == 0: + assert len(generated_document.spans) == 2 + assert len(generated_document.relations) == 0 + assert len(generated_document.span_attributes) == 0 + assert len(generated_document.relation_attributes) == 0 + + if pie_dataset_variant == "default": + assert resolved_spans[0] == (["Jane"], "person") + assert resolved_spans[1] == (["Berlin"], "city") + elif pie_dataset_variant == "merge_fragmented_spans": + assert resolved_spans[0] == ("Jane", "person") + assert resolved_spans[1] == ("Berlin", "city") + else: + raise ValueError(f"Unknown dataset variant: {pie_dataset_variant}") + + elif sample_idx == 1: + assert len(generated_document.spans) == 2 + assert len(generated_document.relations) == 1 + assert len(generated_document.span_attributes) == 1 + assert len(generated_document.relation_attributes) == 1 + + 
resolved_span_attributes = [ + resolve_annotation(attribute) for attribute in generated_document.span_attributes + ] + resolved_relation_attributes = [ + resolve_annotation(attribute) for attribute in generated_document.relation_attributes + ] + + if pie_dataset_variant == "default": + assert resolved_spans[0] == (["Seattle"], "city") + assert resolved_spans[1] == (["Jenny Durkan"], "person") + assert resolved_relations[0] == ( + (["Jenny Durkan"], "person"), + "mayor_of", + (["Seattle"], "city"), + ) + assert resolved_span_attributes[0] == ((["Seattle"], "city"), "factuality", "actual") + assert resolved_relation_attributes[0] == ( + ((["Jenny Durkan"], "person"), "mayor_of", (["Seattle"], "city")), + "statement", + "true", + ) + elif pie_dataset_variant == "merge_fragmented_spans": + assert resolved_spans[0] == ("Seattle", "city") + assert resolved_spans[1] == ("Jenny Durkan", "person") + assert resolved_relations[0] == ( + ("Jenny Durkan", "person"), + "mayor_of", + ("Seattle", "city"), + ) + assert resolved_span_attributes[0] == (("Seattle", "city"), "factuality", "actual") + assert resolved_relation_attributes[0] == ( + (("Jenny Durkan", "person"), "mayor_of", ("Seattle", "city")), + "statement", + "true", + ) + else: + raise ValueError(f"Unknown dataset variant: {pie_dataset_variant}") + else: + raise ValueError(f"Unknown sample index: {sample_idx}") + + +@pytest.mark.parametrize("merge_fragmented_spans", [True, False]) +def test_example_to_document_and_back_all(hf_dataset, merge_fragmented_spans): + for split_name, split in hf_dataset.items(): + for hf_example in split: + doc = example_to_document(hf_example, merge_fragmented_spans=merge_fragmented_spans) + assert isinstance(doc, TextBasedDocument) + hf_example_back = document_to_example(doc) + assert hf_example == hf_example_back From 5cfc0091a521c30928757f9d1a1d089700f09aa4 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 7 Nov 2023 20:23:52 +0100 Subject: [PATCH 15/15] use GeneratorBasedBuilder from pie_datasets --- dataset_builders/pie/brat/brat.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index 56b55db6..752de772 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -4,16 +4,12 @@ from typing import Any, Dict, List, Optional, Tuple, Union import datasets -import pytorch_ie -from pytorch_ie.annotations import ( - BinaryRelation, - LabeledMultiSpan, - LabeledSpan, - _post_init_single_label, -) +from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan from pytorch_ie.core import Annotation, AnnotationList, annotation_field from pytorch_ie.documents import TextBasedDocument +from pie_datasets import GeneratorBasedBuilder + logger = logging.getLogger(__name__) @@ -288,7 +284,7 @@ def __init__(self, merge_fragmented_spans: bool = False, **kwargs): self.merge_fragmented_spans = merge_fragmented_spans -class BratDatasetLoader(pytorch_ie.data.builder.GeneratorBasedBuilder): +class BratDatasetLoader(GeneratorBasedBuilder): # this requires https://github.com/ChristophAlt/pytorch-ie/pull/288 DOCUMENT_TYPES = { "default": BratDocument,
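As an illustration of the end state after patches 08-15 (not part of any patch): a minimal sketch of loading the wrapper and reading the new attribute layers. It assumes the hub path, config name, and `data_dir` keyword from the README added in patch 09, a placeholder data directory, and that integer indexing into a split returns a document, as exercised by the tests in patch 14.

```python
from pie_datasets import DatasetDict

# load the PIE wrapper with merged spans so that documents are of type BratDocumentWithMergedSpans;
# "path/to/brat/data" is a placeholder for a directory of .txt/.ann pairs
dataset = DatasetDict.load_dataset(
    "pie/brat", name="merge_fragmented_spans", data_dir="path/to/brat/data"
)

train = dataset["train"]
for i in range(len(train)):
    doc = train[i]
    # span attributes, e.g. "factuality" -> "actual" on the "Seattle" span (A1 in the fixture 2.ann)
    for attr in doc.span_attributes:
        print(doc.id, "span attribute:", attr.label, attr.value)
    # relation attributes, e.g. "statement" on the mayor_of relation (A2 in the fixture 2.ann)
    for attr in doc.relation_attributes:
        print(doc.id, "relation attribute:", attr.label, attr.value)
```

With the `default` configuration, `doc.spans` would instead hold `LabeledMultiSpan` annotations, as covered by the `pie_dataset_variant` parametrization in the tests above.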