Skip to content

Commit

Permalink
remove utils.document.deduplicate_annotations in favor of `Document.deduplicate_annotations` (#437)
Browse files Browse the repository at this point in the history

* remove utils.document.deduplicate_annotations in favor of Document.deduplicate_annotations

* cleanup
  • Loading branch information
ArneBinder authored Dec 17, 2024
1 parent 14f6679 commit 0218abd
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 27 deletions.
26 changes: 2 additions & 24 deletions src/pytorch_ie/utils/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,27 +29,6 @@ def deduplicate_annotation_dicts(
D = TypeVar("D", bound=Document)


def deduplicate_annotations(document: D) -> D:
    """Return a copy of ``document`` without duplicate annotations.

    The document is serialized to a dict, the "annotations" and
    "predictions" entries of every annotation field are passed through
    ``deduplicate_annotation_dicts``, and a new document of the same type
    is rebuilt from the cleaned dict.

    Args:
        document: The document to remove duplicate annotations from.

    Returns:
        The document with duplicate annotations removed.
    """
    # Collect the layer names before serializing, as the original code does.
    layer_names = [layer.name for layer in document.annotation_fields()]
    serialized = document.asdict()
    for layer_name in layer_names:
        layer = serialized[layer_name]
        # Gold annotations first, then predictions.
        for key in ("annotations", "predictions"):
            layer[key] = deduplicate_annotation_dicts(layer[key])
    return type(document).fromdict(serialized)


def save_annotation_sources_to_metadata(
document: D,
annotation_id2source: Dict[int, List[str]],
Expand Down Expand Up @@ -124,8 +103,6 @@ def merge_annotations_from_documents(
f"Document IDs do not match: {document.id} and {merged_document.id}"
)

# TODO: add_all_annotations_from_other needs to be fixed! it should return a mapping from
# original annotation *IDs* to new annotations!
# Note: this does not check for duplicates!
added_annotations = merged_document.add_all_annotations_from_other(
other=document, strict=True
Expand All @@ -135,7 +112,8 @@ def merge_annotations_from_documents(
for orig_id, new_annotation in orig_id2new_annotation.items():
added_annotation_id2source_names[new_annotation._id].append(source_name)

merged_document = deduplicate_annotations(merged_document)
# Remove duplicates. If duplicates have different scores, the one with the highest score is kept.
merged_document = merged_document.deduplicate_annotations()

# save source names in metadata (at key metadata_key_source_annotations / metadata_key_source_predictions
# for each layer in the order of the annotations / predictions)
Expand Down
3 changes: 0 additions & 3 deletions tests/utils/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,9 @@ def test_document_merge_annotations():
assert result.id == "doc1"
assert set(result.labeled_spans) == set(base_doc.labeled_spans)
assert len(result.labeled_spans) == len(base_doc.labeled_spans) == 2
assert len(result.labeled_spans.predictions) == 4
assert result.labeled_spans.predictions.resolve() == [
("label1", "This"),
("label2", "is"),
("label1", "This"),
("label3", "is"),
]
annotations_with_sources = [
Expand All @@ -59,6 +57,5 @@ def test_document_merge_annotations():
assert predictions_with_scores == [
(LabeledSpan(start=0, end=4, label="label1", score=0.9), ["doc1"]),
(LabeledSpan(start=5, end=7, label="label2", score=0.7), ["doc1", "doc2"]),
(LabeledSpan(start=0, end=4, label="label1", score=0.8), ["doc2"]),
(LabeledSpan(start=5, end=7, label="label3", score=0.6), ["doc2"]),
]

0 comments on commit 0218abd

Please sign in to comment.