-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Created data loading script for BioRel dataset * Created data loading script for BioRel dataset * Update dataset_builders/pie/biorel/biorel.py Co-authored-by: ArneBinder <[email protected]> * Adjustments for BioRel loading script, specifically introducing document_to_example() method and further tests * Added requirements.txt and README.md * precommit forgotten * moved biorel test to correct folder * Test Pie Dataset and Document * Created document converter and tests * Added generate_example method to BioRel class * Adjusted test method for converted documents and added metadata * Wrote README.md * Adjusted according to review * Update dataset_builders/pie/biorel/biorel.py Co-authored-by: ArneBinder <[email protected]> --------- Co-authored-by: ArneBinder <[email protected]>
- Loading branch information
1 parent
38b9580
commit 937a954
Showing
4 changed files
with
428 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# PIE Dataset Card for "BioRel" | ||
|
||
This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the | ||
[BioRel Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/BioRel). | ||
|
||
## Data Schema | ||
|
||
The document type for this dataset is `BioRelDocument` which defines the following data fields: | ||
|
||
- `text` (str) | ||
|
||
and the following annotation layers: | ||
|
||
- `entities` (annotation type: `SpanWithIdAndName`, target: `text`) | ||
- `relations` (annotation type: `BinaryRelation`, target: `entities`) | ||
|
||
`SpanWithIdAndName` is a custom annotation type that extends the standard `Span` annotation with the following data fields: | ||
|
||
- `id` (str, for entity identification) | ||
- `name` (str, entity string between span start and end) | ||
|
||
See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) and | ||
[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation | ||
type definitions. | ||
|
||
## Document Converters | ||
|
||
The dataset provides predefined document converters for the following target document types: | ||
|
||
- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` | ||
|
||
See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) and | ||
[here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type | ||
definitions. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import dataclasses | ||
import logging | ||
from typing import Any | ||
|
||
import datasets | ||
from pytorch_ie import AnnotationLayer, annotation_field | ||
from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span | ||
from pytorch_ie.documents import ( | ||
TextBasedDocument, | ||
TextDocumentWithLabeledSpansAndBinaryRelations, | ||
) | ||
|
||
from pie_datasets import ArrowBasedBuilder, GeneratorBasedBuilder | ||
|
||
# module-level logger, named after this module
logger = logging.getLogger(__name__)
# NOTE(review): not referenced anywhere else in the visible module — confirm it is still needed
warning_counter = 0
|
||
|
||
@dataclasses.dataclass(frozen=True)
class SpanWithIdAndName(Span):
    """Span annotation that additionally carries an entity ``id`` and ``name``."""

    # entity identifier as given in the source example
    id: str
    # entity name as given in the source example
    name: str

    def resolve(self) -> Any:
        """Return the triple ``(id, name, <resolved parent span>)``."""
        resolved_span = super().resolve()
        return self.id, self.name, resolved_span
|
||
|
||
@dataclasses.dataclass
class BioRelDocument(TextBasedDocument):
    """Text-based document with one entity layer and one relation layer."""

    # entity spans (carrying id and name) that target the document ``text``
    entities: AnnotationLayer[SpanWithIdAndName] = annotation_field(target="text")
    # binary relations that target the ``entities`` layer
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
|
||
|
||
def example_to_document(example) -> BioRelDocument:
    """Build a ``BioRelDocument`` from a single raw example.

    The example is read as a mapping with the keys ``"text"``, ``"relation"``,
    ``"h"`` and ``"t"``; ``"h"`` and ``"t"`` each provide ``"id"``, ``"name"``
    and ``"pos"`` (whose first two items are used as span start and end).

    Returns:
        A document holding the head and tail entities (in that order) and one
        relation between them labeled with ``example["relation"]``.
    """

    def _make_entity(argument) -> SpanWithIdAndName:
        # turn one relation argument ("h" or "t") into an entity span
        return SpanWithIdAndName(
            id=argument["id"],
            name=argument["name"],
            start=argument["pos"][0],
            end=argument["pos"][1],
        )

    doc = BioRelDocument(text=example["text"])
    head_entity = _make_entity(example["h"])
    tail_entity = _make_entity(example["t"])
    doc.entities.extend([head_entity, tail_entity])

    doc.relations.append(
        BinaryRelation(head=head_entity, tail=tail_entity, label=example["relation"])
    )
    return doc
|
||
|
||
def document_to_example(document):
    """Convert a BioRel document back into a raw example dict.

    The head and tail entities are taken from the arguments of the document's
    relation rather than from their position in ``document.entities``, so the
    result does not depend on the order of the entity layer.

    Args:
        document: object exposing ``text`` and ``relations``; the single
            relation provides ``label``, ``head`` and ``tail``, and both
            arguments provide ``id``, ``name``, ``start`` and ``end``.

    Returns:
        A dict with the keys ``"text"``, ``"relation"``, ``"h"`` and ``"t"``.

    Raises:
        ValueError: if the document does not contain exactly one relation.
    """
    relations = document.relations
    # an example can only represent a single relation; refuse to drop data silently
    if len(relations) != 1:
        raise ValueError(f"Expected exactly one relation, got {len(relations)}")
    relation = relations[0]

    def _argument_to_dict(entity):
        # serialize one relation argument into the raw "h"/"t" format
        return {"id": entity.id, "name": entity.name, "pos": [entity.start, entity.end]}

    return {
        "text": document.text,
        "relation": relation.label,
        "h": _argument_to_dict(relation.head),
        "t": _argument_to_dict(relation.tail),
    }
|
||
|
||
def convert_to_text_document_with_labeled_spans_and_binary_relations(
    document: BioRelDocument,
) -> TextDocumentWithLabeledSpansAndBinaryRelations:
    """Convert a ``BioRelDocument`` into a ``TextDocumentWithLabeledSpansAndBinaryRelations``.

    Every entity becomes a ``LabeledSpan`` with the fixed label ``"ENTITY"``;
    the entity ids and names are stored in the metadata of the result under
    ``"entity_ids"`` and ``"entity_names"``. A warning is logged whenever the
    string form of a new span differs from the name of its source entity.

    Raises:
        ValueError: if the document does not contain exactly one relation.
    """
    converted = TextDocumentWithLabeledSpansAndBinaryRelations(text=document.text)
    span_mapping = {}  # source entity -> labeled span in the converted document
    entity_ids = []
    entity_names = []

    for entity in document.entities:  # in our case two entities (head and tail)
        labeled_span = LabeledSpan(start=entity.start, end=entity.end, label="ENTITY")
        # append first, then compare the span's string form against the entity name
        converted.labeled_spans.append(labeled_span)

        span_matches_name = str(labeled_span) == entity.name
        if not span_matches_name:
            logger.warning(
                f"Expected labeled span text to be '{entity.name}', got '{labeled_span}'"
            )

        span_mapping[entity] = labeled_span
        entity_ids.append(entity.id)
        entity_names.append(entity.name)

    # exactly one relation between the two entities is supported
    if not len(document.relations) == 1:
        raise ValueError(f"Expected exactly one relation, got {len(document.relations)}")
    source_relation = document.relations[0]

    # re-create the relation on top of the newly created spans
    new_head = span_mapping[source_relation.head]
    new_tail = span_mapping[source_relation.tail]
    converted.binary_relations.append(
        BinaryRelation(head=new_head, tail=new_tail, label=source_relation.label)
    )

    converted.metadata["entity_ids"] = entity_ids
    converted.metadata["entity_names"] = entity_names

    return converted
|
||
|
||
class BioRel(ArrowBasedBuilder):
    """Dataset builder that exposes the ``DFKI-SLT/BioRel`` base dataset as ``BioRelDocument``s."""

    # document type produced by this builder
    DOCUMENT_TYPE = BioRelDocument
    # base dataset path and the revision of it that this builder refers to
    BASE_DATASET_PATH = "DFKI-SLT/BioRel"
    BASE_DATASET_REVISION = "e4869c484c582cfbc7ead10d4d421bd4b275fa4e"

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            version=datasets.Version("1.0.0"),
            description="BioRel dataset",
        )
    ]

    # maps a target document type to the function converting a BioRelDocument into it
    DOCUMENT_CONVERTERS = {
        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
    }

    def _generate_document(self, example, **kwargs):
        """Convert one base-dataset example into a ``BioRelDocument`` (``kwargs`` are ignored)."""
        return example_to_document(example)

    def _generate_example(self, document: BioRelDocument, **kwargs):
        """Convert a ``BioRelDocument`` back into an example dict (``kwargs`` are ignored)."""
        return document_to_example(document)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pie-datasets>=0.6.0,<0.11.0 |
Oops, something went wrong.