Biorel dataset [WIP] (#134)

* Created data loading script for BioRel dataset * Created data loading script for BioRel dataset * Update dataset_builders/pie/biorel/biorel.py Co-authored-by: ArneBinder <[email protected]> * Adjustments for BioRel loading script, specifically introducing document_to_example() method and further tests * Added requirements.txt and README.md * precommit forgotten * moved biorel test to correct folder * Test Pie Dataset and Document * Created document converter and tests * Added generate_example method to BioRel class * Adjusted test method for converted documents and added metadata * Wrote README.md * Adjusted according to review * Update dataset_builders/pie/biorel/biorel.py Co-authored-by: ArneBinder <[email protected]> --------- Co-authored-by: ArneBinder <[email protected]>
ArneBinder · Jul 17, 2024 · 937a954 · 937a954
1 parent 38b9580
commit 937a954
Show file tree

Hide file tree

Showing 4 changed files with 428 additions and 0 deletions.
diff --git a/dataset_builders/pie/biorel/README.md b/dataset_builders/pie/biorel/README.md
@@ -0,0 +1,34 @@
+# PIE Dataset Card for "BioRel"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[BioRel Hugging Face dataset loading script](https://huggingface.co/datasets/DFKI-SLT/BioRel).
+
+## Data Schema
+
+The document type for this dataset is `BioRelDocument` which defines the following data fields:
+
+- `text` (str)
+
+and the following annotation layers:
+
+- `entities` (annotation type: `SpanWithIdAndName`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `entities`)
+
+`SpanWithIdAndName` is a custom annotation type that extends the standard `Span` annotation with the following data fields:
+
+- `id` (str, the identifier of the entity)
+- `name` (str, the entity name, expected to match the text between the span's start and end)
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) and
+[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the base annotation
+type definitions; `SpanWithIdAndName` itself is defined in the dataset builder script `biorel.py`.
+
+## Document Converters
+
+The dataset provides predefined document converters for the following target document types:
+
+- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) and
+[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/biorel/biorel.py b/dataset_builders/pie/biorel/biorel.py
@@ -0,0 +1,128 @@
+import dataclasses
+import logging
+from typing import Any
+
+import datasets
+from pytorch_ie import AnnotationLayer, annotation_field
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
+
+from pie_datasets import ArrowBasedBuilder, GeneratorBasedBuilder
+
+# Module-level logger, named after this module; used to report span/name mismatches.
+logger = logging.getLogger(__name__)
+# NOTE(review): this counter is not read or updated anywhere in the code shown here —
+# confirm whether it is still needed.
+warning_counter = 0
+
+
+@dataclasses.dataclass(frozen=True)
+class SpanWithIdAndName(Span):
+    id: str
+    name: str
+
+    def resolve(self) -> Any:
+        return self.id, self.name, super().resolve()
+
+
+@dataclasses.dataclass
+class BioRelDocument(TextBasedDocument):
+    """Text-based document with an entity layer and a relation layer for BioRel examples."""
+
+    # Entity spans (with id and name); this layer targets the document text.
+    entities: AnnotationLayer[SpanWithIdAndName] = annotation_field(target="text")
+    # Binary relations between entities; this layer targets the `entities` layer.
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+def example_to_document(example) -> BioRelDocument:
+    document = BioRelDocument(text=example["text"])
+    head = SpanWithIdAndName(
+        id=example["h"]["id"],
+        name=example["h"]["name"],
+        start=example["h"]["pos"][0],
+        end=example["h"]["pos"][1],
+    )
+    tail = SpanWithIdAndName(
+        id=example["t"]["id"],
+        name=example["t"]["name"],
+        start=example["t"]["pos"][0],
+        end=example["t"]["pos"][1],
+    )
+    document.entities.extend([head, tail])
+
+    relation = BinaryRelation(head=head, tail=tail, label=example["relation"])
+    document.relations.append(relation)
+    return document
+
+
+def document_to_example(document):
+    head = document.entities[0]
+    tail = document.entities[1]
+    return {
+        "text": document.text,
+        "relation": document.relations[0].label,
+        "h": {"id": head.id, "name": head.name, "pos": [head.start, head.end]},
+        "t": {"id": tail.id, "name": tail.name, "pos": [tail.start, tail.end]},
+    }
+
+
+def convert_to_text_document_with_labeled_spans_and_binary_relations(
+    document: BioRelDocument,
+) -> TextDocumentWithLabeledSpansAndBinaryRelations:
+    text_document = TextDocumentWithLabeledSpansAndBinaryRelations(text=document.text)
+    old2new_spans = {}
+    ids = []
+    names = []
+
+    for entity in document.entities:  # in our case two entities (head and tail)
+        # create LabeledSpan and append
+        labeled_span = LabeledSpan(start=entity.start, end=entity.end, label="ENTITY")
+        text_document.labeled_spans.append(labeled_span)
+
+        # check if the labeled span text is the same as the entity name
+        if str(labeled_span) != entity.name:
+            logger.warning(
+                f"Expected labeled span text to be '{entity.name}', got '{labeled_span}'"
+            )
+
+        # Map the original entity to the new labeled span
+        old2new_spans[entity] = labeled_span
+
+        ids.append(entity.id)
+        names.append(entity.name)
+
+    if len(document.relations) != 1:  # one relation between two entities
+        raise ValueError(f"Expected exactly one relation, got {len(document.relations)}")
+    old_rel = document.relations[0]
+
+    # create BinaryRelation and append
+    rel = BinaryRelation(
+        head=old2new_spans[old_rel.head],
+        tail=old2new_spans[old_rel.tail],
+        label=old_rel.label,
+    )
+    text_document.binary_relations.append(rel)
+    text_document.metadata["entity_ids"] = ids
+    text_document.metadata["entity_names"] = names
+
+    return text_document
+
+
+class BioRel(ArrowBasedBuilder):
+    """Dataset builder that exposes BioRel examples as ``BioRelDocument`` instances."""
+
+    # Document type produced by this builder.
+    DOCUMENT_TYPE = BioRelDocument
+    # Base dataset identifier and revision.
+    # NOTE(review): presumably the revision is a pinned commit hash of the base dataset — confirm.
+    BASE_DATASET_PATH = "DFKI-SLT/BioRel"
+    BASE_DATASET_REVISION = "e4869c484c582cfbc7ead10d4d421bd4b275fa4e"
+
+    # A single builder configuration.
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            version=datasets.Version("1.0.0"),
+            description="BioRel dataset",
+        )
+    ]
+
+    # Maps a target document type to the function that converts a BioRelDocument into it.
+    DOCUMENT_CONVERTERS = {
+        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
+    }
+
+    def _generate_document(self, example, **kwargs):
+        """Create a ``BioRelDocument`` from one example; extra keyword arguments are ignored."""
+        return example_to_document(example)
+
+    def _generate_example(self, document: BioRelDocument, **kwargs):
+        """Create an example dict from one document; extra keyword arguments are ignored."""
+        return document_to_example(document)
diff --git a/dataset_builders/pie/biorel/requirements.txt b/dataset_builders/pie/biorel/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.6.0,<0.11.0