From d5dad39bf8aed27094a49ba524793986dd9d87ff Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Wed, 22 Nov 2023 17:35:52 +0100 Subject: [PATCH] add re-usable BratBuilder within builders package; move Brat-related types into that --- dataset_builders/pie/brat/brat.py | 286 +--------------------- src/pie_datasets/builders/__init__.py | 1 + src/pie_datasets/builders/brat.py | 305 ++++++++++++++++++++++++ src/pie_datasets/document/types.py | 31 +-- tests/dataset_builders/pie/test_brat.py | 20 +- 5 files changed, 321 insertions(+), 322 deletions(-) create mode 100644 src/pie_datasets/builders/__init__.py create mode 100644 src/pie_datasets/builders/brat.py diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index 60070384..134e82f4 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -1,285 +1,5 @@ -import logging -from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple, Union +from pie_datasets.builders import BratBuilder -import datasets -from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan -from pytorch_ie.core import Annotation -from pie_datasets import GeneratorBasedBuilder -from pie_datasets.document.types import ( - Attribute, - BratDocument, - BratDocumentWithMergedSpans, -) - -logger = logging.getLogger(__name__) - - -def dl2ld(dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: - return [dict(zip(dict_of_lists, t)) for t in zip(*dict_of_lists.values())] - - -def ld2dl( - list_fo_dicts: List[Dict[str, Any]], keys: Optional[List[str]] = None -) -> Dict[str, List[Any]]: - keys = keys or list(list_fo_dicts[0]) - return {k: [dic[k] for dic in list_fo_dicts] for k in keys} - - -def example_to_document( - example: Dict[str, Any], merge_fragmented_spans: bool = False -) -> Union[BratDocument, BratDocumentWithMergedSpans]: - if merge_fragmented_spans: - doc = BratDocumentWithMergedSpans(text=example["context"], id=example["file_name"]) - else: - doc = BratDocument(text=example["context"], id=example["file_name"]) - - spans: Dict[str, LabeledSpan] = dict() - span_locations: List[Tuple[Tuple[int, int]]] = [] - span_texts: List[str] = [] - for span_dict in dl2ld(example["spans"]): - starts: List[int] = span_dict["locations"]["start"] - ends: List[int] = span_dict["locations"]["end"] - slices = tuple(zip(starts, ends)) - span_locations.append(slices) - span_texts.append(span_dict["text"]) - # sanity check - span_text_parts = [doc.text[start:end] for start, end in slices] - joined_span_texts_stripped = " ".join(span_text_parts).strip() - span_text_stripped = span_dict["text"].strip() - if joined_span_texts_stripped != span_text_stripped: - logger.warning( - f"joined span parts do not match stripped span text field content. " - f'joined_span_texts_stripped: "{joined_span_texts_stripped}" != stripped "text": "{span_text_stripped}"' - ) - if merge_fragmented_spans: - if len(starts) > 1: - # check if the text in between the fragments holds only space - merged_content_texts = [ - doc.text[start:end] for start, end in zip(ends[:-1], starts[1:]) - ] - merged_content_texts_not_empty = [ - text.strip() for text in merged_content_texts if text.strip() != "" - ] - if len(merged_content_texts_not_empty) > 0: - logger.warning( - f"document '{doc.id}' contains a non-contiguous span with text content in between " - f"(will be merged into a single span): " - f"newly covered text parts: {merged_content_texts_not_empty}, " - f"merged span text: '{doc.text[starts[0]:ends[-1]]}', " - f"annotation: {span_dict}" - ) - # just take everything - start = min(starts) - end = max(ends) - span = LabeledSpan(start=start, end=end, label=span_dict["type"]) - else: - span = LabeledMultiSpan(slices=slices, label=span_dict["type"]) - spans[span_dict["id"]] = span - - doc.spans.extend(spans.values()) - doc.metadata["span_ids"] = list(spans.keys()) - doc.metadata["span_locations"] = span_locations - doc.metadata["span_texts"] = span_texts - - relations: Dict[str, BinaryRelation] = dict() - for rel_dict in dl2ld(example["relations"]): - arguments = dict(zip(rel_dict["arguments"]["type"], rel_dict["arguments"]["target"])) - assert set(arguments) == {"Arg1", "Arg2"} - head = spans[arguments["Arg1"]] - tail = spans[arguments["Arg2"]] - rel = BinaryRelation(head=head, tail=tail, label=rel_dict["type"]) - relations[rel_dict["id"]] = rel - - doc.relations.extend(relations.values()) - doc.metadata["relation_ids"] = list(relations.keys()) - - equivalence_relations = dl2ld(example["equivalence_relations"]) - if len(equivalence_relations) > 0: - raise NotImplementedError("converting equivalence_relations is not yet implemented") - - events = dl2ld(example["events"]) - if len(events) > 0: - raise NotImplementedError("converting events is not yet implemented") - - attribute_annotations: Dict[str, Dict[str, Attribute]] = defaultdict(dict) - attribute_ids = [] - for attribute_dict in dl2ld(example["attributions"]): - target_id = attribute_dict["target"] - if target_id in spans: - target_layer_name = "spans" - annotation = spans[target_id] - elif target_id in relations: - target_layer_name = "relations" - annotation = relations[target_id] - else: - raise Exception("only span and relation attributes are supported yet") - attribute = Attribute( - annotation=annotation, - label=attribute_dict["type"], - value=attribute_dict["value"], - ) - attribute_annotations[target_layer_name][attribute_dict["id"]] = attribute - attribute_ids.append((target_layer_name, attribute_dict["id"])) - - doc.span_attributes.extend(attribute_annotations["spans"].values()) - doc.relation_attributes.extend(attribute_annotations["relations"].values()) - doc.metadata["attribute_ids"] = attribute_ids - - normalizations = dl2ld(example["normalizations"]) - if len(normalizations) > 0: - raise NotImplementedError("converting normalizations is not yet implemented") - - notes = dl2ld(example["notes"]) - if len(notes) > 0: - raise NotImplementedError("converting notes is not yet implemented") - - return doc - - -def document_to_example( - document: Union[BratDocument, BratDocumentWithMergedSpans] -) -> Dict[str, Any]: - example = { - "context": document.text, - "file_name": document.id, - } - span_dicts: Dict[Union[LabeledSpan, LabeledMultiSpan], Dict[str, Any]] = dict() - assert len(document.metadata["span_locations"]) == len(document.spans) - assert len(document.metadata["span_texts"]) == len(document.spans) - assert len(document.metadata["span_ids"]) == len(document.spans) - for i, span in enumerate(document.spans): - locations = tuple((start, end) for start, end in document.metadata["span_locations"][i]) - if isinstance(span, LabeledSpan): - assert locations[0][0] == span.start - assert locations[-1][1] == span.end - elif isinstance(span, LabeledMultiSpan): - assert span.slices == locations - else: - raise TypeError(f"span has unknown type [{type(span)}]: {span}") - - starts, ends = zip(*locations) - span_dict = { - "id": document.metadata["span_ids"][i], - "locations": { - "start": list(starts), - "end": list(ends), - }, - "text": document.metadata["span_texts"][i], - "type": span.label, - } - if span in span_dicts: - prev_ann_dict = span_dicts[span] - ann_dict = span_dict - logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " - f"are identical" - ) - span_dicts[span] = span_dict - example["spans"] = ld2dl(list(span_dicts.values()), keys=["id", "type", "locations", "text"]) - - relation_dicts: Dict[BinaryRelation, Dict[str, Any]] = dict() - assert len(document.metadata["relation_ids"]) == len(document.relations) - for i, rel in enumerate(document.relations): - arg1_id = span_dicts[rel.head]["id"] - arg2_id = span_dicts[rel.tail]["id"] - relation_dict = { - "id": document.metadata["relation_ids"][i], - "type": rel.label, - "arguments": { - "type": ["Arg1", "Arg2"], - "target": [arg1_id, arg2_id], - }, - } - if rel in relation_dicts: - prev_ann_dict = relation_dicts[rel] - ann_dict = relation_dict - logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " - f"are identical" - ) - relation_dicts[rel] = relation_dict - - example["relations"] = ld2dl(list(relation_dicts.values()), keys=["id", "type", "arguments"]) - - example["equivalence_relations"] = ld2dl([], keys=["type", "targets"]) - example["events"] = ld2dl([], keys=["id", "type", "trigger", "arguments"]) - - annotation_dicts = { - "spans": span_dicts, - "relations": relation_dicts, - } - all_attribute_annotations = { - "spans": document.span_attributes, - "relations": document.relation_attributes, - } - attribute_dicts: Dict[Annotation, Dict[str, Any]] = dict() - attribute_ids_per_target = defaultdict(list) - for target_layer, attribute_id in document.metadata["attribute_ids"]: - attribute_ids_per_target[target_layer].append(attribute_id) - - for target_layer, attribute_ids in attribute_ids_per_target.items(): - attribute_annotations = all_attribute_annotations[target_layer] - assert len(attribute_ids) == len(attribute_annotations) - for i, attribute_annotation in enumerate(attribute_annotations): - target_id = annotation_dicts[target_layer][attribute_annotation.annotation]["id"] - attribute_dict = { - "id": attribute_ids_per_target[target_layer][i], - "type": attribute_annotation.label, - "target": target_id, - "value": attribute_annotation.value, - } - if attribute_annotation in attribute_dicts: - prev_ann_dict = attribute_dicts[attribute_annotation] - ann_dict = attribute_annotation - logger.warning( - f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " - f"are identical" - ) - attribute_dicts[attribute_annotation] = attribute_dict - - example["attributions"] = ld2dl( - list(attribute_dicts.values()), keys=["id", "type", "target", "value"] - ) - example["normalizations"] = ld2dl( - [], keys=["id", "type", "target", "resource_id", "entity_id"] - ) - example["notes"] = ld2dl([], keys=["id", "type", "target", "note"]) - - return example - - -class BratConfig(datasets.BuilderConfig): - """BuilderConfig for BratDatasetLoader.""" - - def __init__(self, merge_fragmented_spans: bool = False, **kwargs): - """BuilderConfig for DocRED. - - Args: - **kwargs: keyword arguments forwarded to super. - """ - super().__init__(**kwargs) - self.merge_fragmented_spans = merge_fragmented_spans - - -class BratDatasetLoader(GeneratorBasedBuilder): - # this requires https://github.com/ChristophAlt/pytorch-ie/pull/288 - DOCUMENT_TYPES = { - "default": BratDocument, - "merge_fragmented_spans": BratDocumentWithMergedSpans, - } - - DEFAULT_CONFIG_NAME = "default" - BUILDER_CONFIGS = [ - BratConfig(name="default"), - BratConfig(name="merge_fragmented_spans", merge_fragmented_spans=True), - ] - - BASE_DATASET_PATH = "DFKI-SLT/brat" - BASE_DATASET_REVISION = "70446e79e089d5e5cd5f3426061991a2fcfbf529" - - def _generate_document(self, example, **kwargs): - return example_to_document( - example, merge_fragmented_spans=self.config.merge_fragmented_spans - ) +class Brat(BratBuilder): + pass diff --git a/src/pie_datasets/builders/__init__.py b/src/pie_datasets/builders/__init__.py new file mode 100644 index 00000000..5c2c8757 --- /dev/null +++ b/src/pie_datasets/builders/__init__.py @@ -0,0 +1 @@ +from .brat import BratBuilder, BratConfig diff --git a/src/pie_datasets/builders/brat.py b/src/pie_datasets/builders/brat.py new file mode 100644 index 00000000..116b23c5 --- /dev/null +++ b/src/pie_datasets/builders/brat.py @@ -0,0 +1,305 @@ +import dataclasses +import logging +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple, Union + +import datasets +from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan +from pytorch_ie.core import Annotation, AnnotationList, annotation_field +from pytorch_ie.documents import TextBasedDocument + +from pie_datasets import GeneratorBasedBuilder + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(eq=True, frozen=True) +class BratAttribute(Annotation): + annotation: Annotation + label: str + value: Optional[str] = None + score: Optional[float] = dataclasses.field(default=None, compare=False) + + +@dataclasses.dataclass +class BratDocument(TextBasedDocument): + spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text") + relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") + span_attributes: AnnotationList[BratAttribute] = annotation_field(target="spans") + relation_attributes: AnnotationList[BratAttribute] = annotation_field(target="relations") + + +@dataclasses.dataclass +class BratDocumentWithMergedSpans(TextBasedDocument): + spans: AnnotationList[LabeledSpan] = annotation_field(target="text") + relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") + span_attributes: AnnotationList[BratAttribute] = annotation_field(target="spans") + relation_attributes: AnnotationList[BratAttribute] = annotation_field(target="relations") + + +def dl2ld(dict_of_lists: Dict[str, List[Any]]) -> List[Dict[str, Any]]: + return [dict(zip(dict_of_lists, t)) for t in zip(*dict_of_lists.values())] + + +def ld2dl( + list_fo_dicts: List[Dict[str, Any]], keys: Optional[List[str]] = None +) -> Dict[str, List[Any]]: + keys = keys or list(list_fo_dicts[0]) + return {k: [dic[k] for dic in list_fo_dicts] for k in keys} + + +def example_to_document( + example: Dict[str, Any], merge_fragmented_spans: bool = False +) -> Union[BratDocument, BratDocumentWithMergedSpans]: + if merge_fragmented_spans: + doc = BratDocumentWithMergedSpans(text=example["context"], id=example["file_name"]) + else: + doc = BratDocument(text=example["context"], id=example["file_name"]) + + spans: Dict[str, LabeledSpan] = dict() + span_locations: List[Tuple[Tuple[int, int], ...]] = [] + span_texts: List[str] = [] + for span_dict in dl2ld(example["spans"]): + starts: List[int] = span_dict["locations"]["start"] + ends: List[int] = span_dict["locations"]["end"] + slices = tuple(zip(starts, ends)) + span_locations.append(slices) + span_texts.append(span_dict["text"]) + # sanity check + span_text_parts = [doc.text[start:end] for start, end in slices] + joined_span_texts_stripped = " ".join(span_text_parts).strip() + span_text_stripped = span_dict["text"].strip() + if joined_span_texts_stripped != span_text_stripped: + logger.warning( + f"joined span parts do not match stripped span text field content. " + f'joined_span_texts_stripped: "{joined_span_texts_stripped}" != stripped "text": "{span_text_stripped}"' + ) + if merge_fragmented_spans: + if len(starts) > 1: + # check if the text in between the fragments holds only space + merged_content_texts = [ + doc.text[start:end] for start, end in zip(ends[:-1], starts[1:]) + ] + merged_content_texts_not_empty = [ + text.strip() for text in merged_content_texts if text.strip() != "" + ] + if len(merged_content_texts_not_empty) > 0: + logger.warning( + f"document '{doc.id}' contains a non-contiguous span with text content in between " + f"(will be merged into a single span): " + f"newly covered text parts: {merged_content_texts_not_empty}, " + f"merged span text: '{doc.text[starts[0]:ends[-1]]}', " + f"annotation: {span_dict}" + ) + # just take everything + start = min(starts) + end = max(ends) + span = LabeledSpan(start=start, end=end, label=span_dict["type"]) + else: + span = LabeledMultiSpan(slices=slices, label=span_dict["type"]) + spans[span_dict["id"]] = span + + doc.spans.extend(spans.values()) + doc.metadata["span_ids"] = list(spans.keys()) + doc.metadata["span_locations"] = span_locations + doc.metadata["span_texts"] = span_texts + + relations: Dict[str, BinaryRelation] = dict() + for rel_dict in dl2ld(example["relations"]): + arguments = dict(zip(rel_dict["arguments"]["type"], rel_dict["arguments"]["target"])) + assert set(arguments) == {"Arg1", "Arg2"} + head = spans[arguments["Arg1"]] + tail = spans[arguments["Arg2"]] + rel = BinaryRelation(head=head, tail=tail, label=rel_dict["type"]) + relations[rel_dict["id"]] = rel + + doc.relations.extend(relations.values()) + doc.metadata["relation_ids"] = list(relations.keys()) + + equivalence_relations = dl2ld(example["equivalence_relations"]) + if len(equivalence_relations) > 0: + raise NotImplementedError("converting equivalence_relations is not yet implemented") + + events = dl2ld(example["events"]) + if len(events) > 0: + raise NotImplementedError("converting events is not yet implemented") + + attribute_annotations: Dict[str, Dict[str, BratAttribute]] = defaultdict(dict) + attribute_ids = [] + for attribute_dict in dl2ld(example["attributions"]): + target_id = attribute_dict["target"] + if target_id in spans: + target_layer_name = "spans" + annotation = spans[target_id] + elif target_id in relations: + target_layer_name = "relations" + annotation = relations[target_id] + else: + raise Exception("only span and relation attributes are supported yet") + attribute = BratAttribute( + annotation=annotation, + label=attribute_dict["type"], + value=attribute_dict["value"], + ) + attribute_annotations[target_layer_name][attribute_dict["id"]] = attribute + attribute_ids.append((target_layer_name, attribute_dict["id"])) + + doc.span_attributes.extend(attribute_annotations["spans"].values()) + doc.relation_attributes.extend(attribute_annotations["relations"].values()) + doc.metadata["attribute_ids"] = attribute_ids + + normalizations = dl2ld(example["normalizations"]) + if len(normalizations) > 0: + raise NotImplementedError("converting normalizations is not yet implemented") + + notes = dl2ld(example["notes"]) + if len(notes) > 0: + raise NotImplementedError("converting notes is not yet implemented") + + return doc + + +def document_to_example( + document: Union[BratDocument, BratDocumentWithMergedSpans] +) -> Dict[str, Any]: + example = { + "context": document.text, + "file_name": document.id, + } + span_dicts: Dict[Union[LabeledSpan, LabeledMultiSpan], Dict[str, Any]] = dict() + assert len(document.metadata["span_locations"]) == len(document.spans) + assert len(document.metadata["span_texts"]) == len(document.spans) + assert len(document.metadata["span_ids"]) == len(document.spans) + for i, span in enumerate(document.spans): + locations = tuple((start, end) for start, end in document.metadata["span_locations"][i]) + if isinstance(span, LabeledSpan): + assert locations[0][0] == span.start + assert locations[-1][1] == span.end + elif isinstance(span, LabeledMultiSpan): + assert span.slices == locations + else: + raise TypeError(f"span has unknown type [{type(span)}]: {span}") + + starts, ends = zip(*locations) + span_dict = { + "id": document.metadata["span_ids"][i], + "locations": { + "start": list(starts), + "end": list(ends), + }, + "text": document.metadata["span_texts"][i], + "type": span.label, + } + if span in span_dicts: + prev_ann_dict = span_dicts[span] + ann_dict = span_dict + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" + ) + span_dicts[span] = span_dict + example["spans"] = ld2dl(list(span_dicts.values()), keys=["id", "type", "locations", "text"]) + + relation_dicts: Dict[BinaryRelation, Dict[str, Any]] = dict() + assert len(document.metadata["relation_ids"]) == len(document.relations) + for i, rel in enumerate(document.relations): + arg1_id = span_dicts[rel.head]["id"] + arg2_id = span_dicts[rel.tail]["id"] + relation_dict = { + "id": document.metadata["relation_ids"][i], + "type": rel.label, + "arguments": { + "type": ["Arg1", "Arg2"], + "target": [arg1_id, arg2_id], + }, + } + if rel in relation_dicts: + prev_ann_dict = relation_dicts[rel] + ann_dict = relation_dict + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" + ) + relation_dicts[rel] = relation_dict + + example["relations"] = ld2dl(list(relation_dicts.values()), keys=["id", "type", "arguments"]) + + example["equivalence_relations"] = ld2dl([], keys=["type", "targets"]) + example["events"] = ld2dl([], keys=["id", "type", "trigger", "arguments"]) + + annotation_dicts = { + "spans": span_dicts, + "relations": relation_dicts, + } + all_attribute_annotations = { + "spans": document.span_attributes, + "relations": document.relation_attributes, + } + attribute_dicts: Dict[Annotation, Dict[str, Any]] = dict() + attribute_ids_per_target = defaultdict(list) + for target_layer, attribute_id in document.metadata["attribute_ids"]: + attribute_ids_per_target[target_layer].append(attribute_id) + + for target_layer, attribute_ids in attribute_ids_per_target.items(): + attribute_annotations = all_attribute_annotations[target_layer] + assert len(attribute_ids) == len(attribute_annotations) + for i, attribute_annotation in enumerate(attribute_annotations): + target_id = annotation_dicts[target_layer][attribute_annotation.annotation]["id"] + attribute_dict = { + "id": attribute_ids_per_target[target_layer][i], + "type": attribute_annotation.label, + "target": target_id, + "value": attribute_annotation.value, + } + if attribute_annotation in attribute_dicts: + prev_ann_dict = attribute_dicts[attribute_annotation] + ann_dict = attribute_annotation + logger.warning( + f"document {document.id}: annotation exists twice: {prev_ann_dict['id']} and {ann_dict['id']} " + f"are identical" + ) + attribute_dicts[attribute_annotation] = attribute_dict + + example["attributions"] = ld2dl( + list(attribute_dicts.values()), keys=["id", "type", "target", "value"] + ) + example["normalizations"] = ld2dl( + [], keys=["id", "type", "target", "resource_id", "entity_id"] + ) + example["notes"] = ld2dl([], keys=["id", "type", "target", "note"]) + + return example + + +class BratConfig(datasets.BuilderConfig): + """BuilderConfig for BratDatasetLoader.""" + + def __init__(self, merge_fragmented_spans: bool = False, **kwargs): + """BuilderConfig for DocRED. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super().__init__(**kwargs) + self.merge_fragmented_spans = merge_fragmented_spans + + +class BratBuilder(GeneratorBasedBuilder): + DOCUMENT_TYPES = { + "default": BratDocument, + "merge_fragmented_spans": BratDocumentWithMergedSpans, + } + + DEFAULT_CONFIG_NAME = "default" + BUILDER_CONFIGS = [ + BratConfig(name="default"), + BratConfig(name="merge_fragmented_spans", merge_fragmented_spans=True), + ] + + BASE_DATASET_PATH = "DFKI-SLT/brat" + BASE_DATASET_REVISION = "70446e79e089d5e5cd5f3426061991a2fcfbf529" + + def _generate_document(self, example, **kwargs): + return example_to_document( + example, merge_fragmented_spans=self.config.merge_fragmented_spans + ) diff --git a/src/pie_datasets/document/types.py b/src/pie_datasets/document/types.py index 983d80ff..0ea32ed5 100644 --- a/src/pie_datasets/document/types.py +++ b/src/pie_datasets/document/types.py @@ -1,33 +1,8 @@ import dataclasses -from typing import Optional -from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan -from pytorch_ie.core import Annotation, AnnotationList, annotation_field -from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument - - -@dataclasses.dataclass(eq=True, frozen=True) -class Attribute(Annotation): - annotation: Annotation - label: str - value: Optional[str] = None - score: Optional[float] = dataclasses.field(default=None, compare=False) - - -@dataclasses.dataclass -class BratDocument(TextBasedDocument): - spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text") - relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") - span_attributes: AnnotationList[Attribute] = annotation_field(target="spans") - relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations") - - -@dataclasses.dataclass -class BratDocumentWithMergedSpans(TextBasedDocument): - spans: AnnotationList[LabeledSpan] = annotation_field(target="text") - relations: AnnotationList[BinaryRelation] = annotation_field(target="spans") - span_attributes: AnnotationList[Attribute] = annotation_field(target="spans") - relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations") +from pytorch_ie.annotations import BinaryRelation, LabeledSpan +from pytorch_ie.core import AnnotationList, annotation_field +from pytorch_ie.documents import TokenBasedDocument @dataclasses.dataclass diff --git a/tests/dataset_builders/pie/test_brat.py b/tests/dataset_builders/pie/test_brat.py index 13c48ea6..d4928edf 100644 --- a/tests/dataset_builders/pie/test_brat.py +++ b/tests/dataset_builders/pie/test_brat.py @@ -6,15 +6,13 @@ from pytorch_ie.core import Annotation from pytorch_ie.documents import TextBasedDocument -from dataset_builders.pie.brat.brat import ( - BratDatasetLoader, - document_to_example, - example_to_document, -) -from pie_datasets.document.types import ( - Attribute, +from dataset_builders.pie.brat.brat import BratBuilder +from pie_datasets.builders.brat import ( + BratAttribute, BratDocument, BratDocumentWithMergedSpans, + document_to_example, + example_to_document, ) from tests.dataset_builders.common import PIE_BASE_PATH, PIE_DS_FIXTURE_DATA_PATH @@ -22,7 +20,7 @@ DATASET_NAME = "brat" PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME -HF_DATASET_PATH = BratDatasetLoader.BASE_DATASET_PATH +HF_DATASET_PATH = BratBuilder.BASE_DATASET_PATH FIXTURE_DATA_PATH = PIE_DS_FIXTURE_DATA_PATH / DATASET_NAME SPLIT_SIZES = {"train": 2} @@ -43,7 +41,7 @@ def resolve_annotation(annotation: Annotation) -> Any: annotation.label, resolve_annotation(annotation.tail), ) - elif isinstance(annotation, Attribute): + elif isinstance(annotation, BratAttribute): result = (resolve_annotation(annotation.annotation), annotation.label) if annotation.value is not None: return result + (annotation.value,) @@ -135,7 +133,7 @@ def test_hf_example(hf_example, sample_idx): @pytest.fixture( - params=[config.name for config in BratDatasetLoader.BUILDER_CONFIGS], # scope="module" + params=[config.name for config in BratBuilder.BUILDER_CONFIGS], # scope="module" ) def pie_dataset_variant(request): return request.param @@ -145,7 +143,7 @@ def pie_dataset_variant(request): def generated_document( hf_example, hf_dataset, pie_dataset_variant ) -> Union[BratDocument, BratDocumentWithMergedSpans]: - builder = BratDatasetLoader(name=pie_dataset_variant) + builder = BratBuilder(name=pie_dataset_variant) kwargs = builder._generate_document_kwargs(hf_dataset["train"]) or {} document = builder._generate_document(example=hf_example, **kwargs) assert document is not None