From af821dbc3970457d0bbbc5d5a9e764ef8ab39511 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sat, 25 Nov 2023 02:10:20 +0100 Subject: [PATCH 01/24] add abstrct dataset --- dataset_builders/pie/abstrct/README.md | 23 ++ dataset_builders/pie/abstrct/abstrct.py | 41 +++ dataset_builders/pie/abstrct/requirements.txt | 1 + tests/dataset_builders/pie/test_abstrct.py | 318 ++++++++++++++++++ 4 files changed, 383 insertions(+) create mode 100644 dataset_builders/pie/abstrct/README.md create mode 100644 dataset_builders/pie/abstrct/abstrct.py create mode 100644 dataset_builders/pie/abstrct/requirements.txt create mode 100644 tests/dataset_builders/pie/test_abstrct.py diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md new file mode 100644 index 00000000..2c10a090 --- /dev/null +++ b/dataset_builders/pie/abstrct/README.md @@ -0,0 +1,23 @@ +# PIE Dataset Card for "abstrct" + +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset. + +TODO: Since there is no respective HF dataset card, we should all respective information here. + +TODO: Shortly reference the PIE-Brat dataset card. + +## Data Schema + +TODO + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the remaining annotation type definitions. + +## Document Converters + +The dataset provides document converters for the following target document types: + +- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` + - TODO + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type +definitions. diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py new file mode 100644 index 00000000..34a00305 --- /dev/null +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -0,0 +1,41 @@ +from pytorch_ie.documents import ( + TextDocumentWithLabeledSpansAndBinaryRelations, +) + +from pie_datasets.builders import BratBuilder +from pie_datasets.core.dataset import DocumentConvertersType + +URL = "https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip" +SPLIT_PATHS = { + "neoplasm_train": "abstrct-master/AbstRCT_corpus/data/train/neoplasm_train", + "neoplasm_dev": "abstrct-master/AbstRCT_corpus/data/dev/neoplasm_dev", + "neoplasm_test": "abstrct-master/AbstRCT_corpus/data/test/neoplasm_test", + "glaucoma_test": "abstrct-master/AbstRCT_corpus/data/test/glaucoma_test", + "mixed_test": "abstrct-master/AbstRCT_corpus/data/test/mixed_test", +} + + +class AbstRCT(BratBuilder): + + BASE_DATASET_PATH = "DFKI-SLT/brat" + BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" + + # we need to add None to the list of dataset variants to support the default dataset variant + BASE_BUILDER_KWARGS_DICT = { + dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} + for dataset_variant in ["default", "merge_fragmented_spans", None] + } + + @property + def document_converters(self) -> DocumentConvertersType: + if self.config.name == "default": + return {} + elif self.config.name == "merge_fragmented_spans": + return { + TextDocumentWithLabeledSpansAndBinaryRelations: { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + } + else: + raise ValueError(f"Unknown dataset variant: {self.config.name}") diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt new file mode 100644 index 00000000..08271d87 --- /dev/null +++ 
b/dataset_builders/pie/abstrct/requirements.txt @@ -0,0 +1 @@ +pie-datasets>=0.4.0,<0.5.0 diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py new file mode 100644 index 00000000..6923c5bf --- /dev/null +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -0,0 +1,318 @@ +from typing import List, Optional, Union + +import pytest +from datasets import disable_caching +from pytorch_ie.documents import ( + TextDocumentWithLabeledSpansAndBinaryRelations, +) +from transformers import AutoTokenizer, PreTrainedTokenizer + +from dataset_builders.pie.abstrct.abstrct import AbstRCT +from pie_datasets import DatasetDict +from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans +from pie_datasets.document.processing import tokenize_document +from pie_datasets.document.types import ( + TokenDocumentWithLabeledSpansAndBinaryRelations, +) +from tests.dataset_builders.common import PIE_BASE_PATH + +disable_caching() + +DATASET_NAME = "abstrct" +PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME +SPLIT_SIZES = { + 'glaucoma_test': 100, + 'mixed_test': 100, + 'neoplasm_dev': 50, + 'neoplasm_test': 100, + 'neoplasm_train': 350, +} +SPLIT = "neoplasm_train" + + +@pytest.fixture( + scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS] +) +def dataset_variant(request) -> str: + return request.param + + +@pytest.fixture(scope="module") +def dataset(dataset_variant) -> DatasetDict: + return DatasetDict.load_dataset(str(PIE_DATASET_PATH), name=dataset_variant) + + +def test_dataset(dataset): + assert dataset is not None + assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES + + +@pytest.fixture(scope="module") +def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: + result = dataset[SPLIT][0] + if dataset_variant == "default": + assert isinstance(result, BratDocument) + elif dataset_variant == "merge_fragmented_spans": + assert isinstance(result, BratDocumentWithMergedSpans) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + return result + + +def test_document(document, dataset_variant): + assert document.text.startswith("Should students be taught to compete or to cooperate?") + if dataset_variant == "default": + # TODO + raise NotImplementedError() + elif dataset_variant == "merge_fragmented_spans": + # TODO + raise NotImplementedError() + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + + +@pytest.fixture(scope="module") +def dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset, dataset_variant +) -> Optional[DatasetDict]: + if dataset_variant == "default": + with pytest.raises(ValueError) as excinfo: + dataset.to_document_type(TextDocumentWithLabeledSpansAndBinaryRelations) + assert ( + str(excinfo.value) + == "No valid key (either subclass or superclass) was found for the document type " + "'' in the " + "document_converters of the dataset. Available keys: set(). Consider adding a respective " + "converter to the dataset with dataset.register_document_converter(my_converter_method) " + "where my_converter_method should accept " + "as input and return ''." 
+ ) + converted_dataset = None + elif dataset_variant == "merge_fragmented_spans": + converted_dataset = dataset.to_document_type( + TextDocumentWithLabeledSpansAndBinaryRelations + ) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + return converted_dataset + + +def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, +): + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + # Check that the conversion is correct and the data makes sense + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + assert isinstance(doc, TextDocumentWithLabeledSpansAndBinaryRelations) + # check the entities + assert len(doc.labeled_spans) == 183 + # sort the entities by their start position and convert them to tuples + # check the first ten entities after sorted + sorted_entity_tuples = [ + (str(ent), ent.label) + for ent in sorted(doc.labeled_spans, key=lambda ent: ent.start)[:10] + ] + # Checking the first ten entities + assert sorted_entity_tuples[0] == ( + "complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc", + "background_claim", + ) + assert sorted_entity_tuples[1] == ( + "The range of breathtaking realistic 3D models is only limited by the creativity of artists and resolution " + "of devices", + "background_claim", + ) + assert sorted_entity_tuples[2] == ( + "Driving 3D models in a natural and believable manner is not trivial", + "background_claim", + ) + assert sorted_entity_tuples[3] == ("the model is very detailed", "data") + assert sorted_entity_tuples[4] == ( + "playback of animation becomes quite heavy and time consuming", + "data", + ) + assert sorted_entity_tuples[5] == ("a frame goes wrong", "data") + assert sorted_entity_tuples[6] == ( + "a production cannot afford major revisions", + "background_claim", + ) + assert sorted_entity_tuples[7] == ("resculpting models", "data") + assert sorted_entity_tuples[8] == ("re-rigging skeletons", "data") + assert sorted_entity_tuples[9] == ( + "providing a flexible and efficient solution to animation remains an open problem", + "own_claim", + ) + + # check the relations + assert len(doc.binary_relations) == 116 + # check the first ten relations + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in doc.binary_relations[:10] + ] + assert relation_tuples[0] == ( + "a production cannot afford major revisions", + "supports", + "providing a flexible and efficient solution to animation remains an open problem", + ) + assert relation_tuples[1] == ( + "its ease of implementation", + "supports", + "SSD is widely used in games, virtual reality and other realtime applications", + ) + assert relation_tuples[2] == ( + "low cost of computing", + "supports", + "SSD is widely used in games, virtual reality and other realtime applications", + ) + assert relation_tuples[3] == ( + "editing in the rest pose will influence most other poses", + "supports", + "This approach is not commonly applied", + ) + assert relation_tuples[4] == ( + "This approach is not commonly applied", + "contradicts", + "artists will edit the geometry of characters in the rest pose to fine-tune animations", + ) + assert relation_tuples[5] == ( + "the animator specifies the PSD examples after the SSD has been performed", + "contradicts", + "the examples are best interpolated in the rest pose, before the SSD has been applied", + 
) + assert relation_tuples[6] == ( + "PSD may be used as a compensation to the underlying SSD", + "contradicts", + "the examples are best interpolated in the rest pose, before the SSD has been applied", + ) + assert relation_tuples[7] == ( + "the examples are best interpolated in the rest pose, before the SSD has been applied", + "supports", + "the action of the SSD and any other deformations must be “inverted” in order to push the example " + "compensation before these operations", + ) + assert relation_tuples[8] == ( + "this inverse strategy has a better performance than the same framework without it", + "semantically_same", + "this approach will improve the quality of deformation", + ) + assert relation_tuples[9] == ( + "the high cost of computing", + "supports", + "they are seldom applied to interactive applications", + ) + + +@pytest.fixture(scope="module") +def tokenizer() -> PreTrainedTokenizer: + return AutoTokenizer.from_pretrained("bert-base-uncased") + + +@pytest.fixture(scope="module") +def tokenized_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer +) -> Optional[List[TokenDocumentWithLabeledSpansAndBinaryRelations]]: + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: + return None + + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=False, + verbose=True, + ) + return tokenized_docs + + +def test_tokenized_documents_with_labeled_spans_and_binary_relations( + tokenized_documents_with_labeled_spans_and_binary_relations, +): + if tokenized_documents_with_labeled_spans_and_binary_relations is not None: + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.labeled_spans) == 183 + assert len(doc.tokens) == 7689 + # Check the first ten tokens + assert doc.tokens[:10] == ("[CLS]", "<", "?", "xml", "version", "=", '"', "1", ".", "0") + # Check the first ten tokenized entities after sorted by their start position + sorted_entities = sorted(doc.labeled_spans, key=lambda ent: ent.start) + assert ( + str(sorted_entities[0]) + == "('complicated', '3d', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', " + "'entertainment', ',', 'virtual', 'reality', ',', 'medicine', 'etc')" + ) + assert ( + str(sorted_entities[1]) + == "('the', 'range', 'of', 'breath', '##taking', 'realistic', '3d', 'models', 'is', 'only', 'limited', " + "'by', 'the', 'creativity', 'of', 'artists', 'and', 'resolution', 'of', 'devices')" + ) + assert ( + str(sorted_entities[2]) + == "('driving', '3d', 'models', 'in', 'a', 'natural', 'and', 'bel', '##ie', '##vable', 'manner', 'is', " + "'not', 'trivial')" + ) + assert str(sorted_entities[3]) == "('the', 'model', 'is', 'very', 'detailed')" + assert ( + str(sorted_entities[4]) + == "('playback', 'of', 'animation', 'becomes', 'quite', 'heavy', 'and', 'time', 'consuming')" + ) + assert str(sorted_entities[5]) == "('a', 'frame', 'goes', 'wrong')" + assert ( + str(sorted_entities[6]) + == "('a', 'production', 'cannot', 'afford', 'major', 'revisions')" + ) + assert 
str(sorted_entities[7]) == "('res', '##cu', '##lp', '##ting', 'models')" + assert str(sorted_entities[8]) == "('re', '-', 'rig', '##ging', 'skeletons')" + assert ( + str(sorted_entities[9]) + == "('providing', 'a', 'flexible', 'and', 'efficient', 'solution', 'to', 'animation', 'remains', 'an', " + "'open', 'problem')" + ) + + +def test_tokenized_documents_with_entities_and_relations_all( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant +): + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=False, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 + + +def test_document_converters(dataset_variant): + builder = AbstRCT(config_name=dataset_variant) + document_converters = builder.document_converters + + if dataset_variant == "default": + assert document_converters == {} + elif dataset_variant == "merge_fragmented_spans": + assert len(document_converters) == 1 + assert set(document_converters) == { + TextDocumentWithLabeledSpansAndBinaryRelations, + } + assert all(callable(v) for k, v in document_converters.items()) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 4e19b297907f63904db1db77163fee8e07384626 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 07:30:15 +0100 Subject: [PATCH 02/24] adjust for 0.5.0 --- dataset_builders/pie/abstrct/abstrct.py | 5 +--- dataset_builders/pie/abstrct/requirements.txt | 2 +- tests/dataset_builders/pie/test_abstrct.py | 24 +++++++------------ 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 34a00305..6cad6260 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -1,6 +1,4 @@ -from pytorch_ie.documents import ( - TextDocumentWithLabeledSpansAndBinaryRelations, -) +from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from pie_datasets.builders import BratBuilder from pie_datasets.core.dataset import DocumentConvertersType @@ -16,7 +14,6 @@ class AbstRCT(BratBuilder): - BASE_DATASET_PATH = "DFKI-SLT/brat" BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index 08271d87..56244c60 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.5.0 +pie-datasets>=0.4.0,<0.6.0 diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 6923c5bf..ec602f94 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -2,18 +2,14 @@ import pytest from datasets import disable_caching -from pytorch_ie.documents import ( - TextDocumentWithLabeledSpansAndBinaryRelations, -) +from pie_models.document.processing import tokenize_document 
+from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer from dataset_builders.pie.abstrct.abstrct import AbstRCT from pie_datasets import DatasetDict from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans -from pie_datasets.document.processing import tokenize_document -from pie_datasets.document.types import ( - TokenDocumentWithLabeledSpansAndBinaryRelations, -) +from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations from tests.dataset_builders.common import PIE_BASE_PATH disable_caching() @@ -21,18 +17,16 @@ DATASET_NAME = "abstrct" PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME SPLIT_SIZES = { - 'glaucoma_test': 100, - 'mixed_test': 100, - 'neoplasm_dev': 50, - 'neoplasm_test': 100, - 'neoplasm_train': 350, + "glaucoma_test": 100, + "mixed_test": 100, + "neoplasm_dev": 50, + "neoplasm_test": 100, + "neoplasm_train": 350, } SPLIT = "neoplasm_train" -@pytest.fixture( - scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS] -) +@pytest.fixture(scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS]) def dataset_variant(request) -> str: return request.param From e68b62d9ffb4cd238eed56d355d97a5e355563ae Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 07:30:25 +0100 Subject: [PATCH 03/24] fix codespell --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 696a3290..01ac619e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,8 @@ repos: args: - --skip=logs/** # arbitral: this is a legal term and used in example data (cdcp dataset) - - --ignore-words-list=arbitral + # abstrct / AbstRCT: this is a dataset name + - --ignore-words-list=arbitral,abstrct,AbstRCT # python static type checking - repo: https://github.com/pre-commit/mirrors-mypy From 94abe006d1f9b82faba0b0a0e98544f8e49a1a9c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 21:32:43 +0100 Subject: [PATCH 04/24] adjust for pie-modules --- dataset_builders/pie/abstrct/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index 56244c60..cb3deb1c 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.6.0 +pie-datasets>=0.4.0,<0.7.0 From 6768f9541372ef1d86387bee82b32457b6a0775e Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 27 Nov 2023 12:10:08 +0100 Subject: [PATCH 05/24] use test document types --- tests/dataset_builders/pie/test_abstrct.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index ec602f94..2b83bf1d 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -9,8 +9,10 @@ from dataset_builders.pie.abstrct.abstrct import AbstRCT from pie_datasets import DatasetDict from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans -from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations -from tests.dataset_builders.common import PIE_BASE_PATH +from tests.dataset_builders.common import ( + PIE_BASE_PATH, + TestTokenDocumentWithLabeledSpansAndBinaryRelations, +) 
disable_caching() @@ -205,7 +207,7 @@ def tokenizer() -> PreTrainedTokenizer: @pytest.fixture(scope="module") def tokenized_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer -) -> Optional[List[TokenDocumentWithLabeledSpansAndBinaryRelations]]: +) -> Optional[List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]]: if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: return None @@ -217,7 +219,7 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( doc, tokenizer=tokenizer, return_overflowing_tokens=True, - result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, strict_span_conversion=False, verbose=True, ) @@ -287,7 +289,7 @@ def test_tokenized_documents_with_entities_and_relations_all( doc, tokenizer=tokenizer, return_overflowing_tokens=True, - result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, strict_span_conversion=False, verbose=True, ) From 2560e93ffda9718280d12abb36505940de8bf42e Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:47:10 +0100 Subject: [PATCH 06/24] minor typo fix --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 2b83bf1d..936f0797 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -2,7 +2,7 @@ import pytest from datasets import disable_caching -from pie_models.document.processing import tokenize_document +from pie_modules.document.processing import tokenize_document from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer From c105ee0451def6076cbb0a146f9bcda2018c963f Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:39:42 +0100 Subject: [PATCH 07/24] updated `BASE_DATASET_REVISION` --- dataset_builders/pie/abstrct/abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 6cad6260..f27dc851 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -15,7 +15,7 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" - BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" + BASE_DATASET_REVISION = "844de61e8a00dc6a93fc29dc185f6e617131fbf1" # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { From 89ae9337869132b52c0dc1b77ec79668bcf2edfa Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:41:22 +0100 Subject: [PATCH 08/24] edit tests in test_abstrct.py --- tests/dataset_builders/pie/test_abstrct.py | 309 +++++++++++---------- 1 file changed, 157 insertions(+), 152 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 936f0797..3037e1a0 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -56,15 +56,10 @@ def document(dataset, dataset_variant) -> 
Union[BratDocument, BratDocumentWithMe def test_document(document, dataset_variant): - assert document.text.startswith("Should students be taught to compete or to cooperate?") - if dataset_variant == "default": - # TODO - raise NotImplementedError() - elif dataset_variant == "merge_fragmented_spans": - # TODO - raise NotImplementedError() - else: - raise ValueError(f"Unknown dataset variant: {dataset_variant}") + assert document is not None + assert document.text.startswith( + " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" + ) @pytest.fixture(scope="module") @@ -96,107 +91,100 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - # Check that the conversion is correct and the data makes sense - # get a document to check - doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] - assert isinstance(doc, TextDocumentWithLabeledSpansAndBinaryRelations) - # check the entities - assert len(doc.labeled_spans) == 183 - # sort the entities by their start position and convert them to tuples - # check the first ten entities after sorted - sorted_entity_tuples = [ - (str(ent), ent.label) - for ent in sorted(doc.labeled_spans, key=lambda ent: ent.start)[:10] - ] - # Checking the first ten entities - assert sorted_entity_tuples[0] == ( - "complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc", - "background_claim", - ) - assert sorted_entity_tuples[1] == ( - "The range of breathtaking realistic 3D models is only limited by the creativity of artists and resolution " - "of devices", - "background_claim", - ) - assert sorted_entity_tuples[2] == ( - "Driving 3D models in a natural and believable manner is not trivial", - "background_claim", - ) - assert sorted_entity_tuples[3] == ("the model is very detailed", "data") - assert sorted_entity_tuples[4] == ( - "playback of animation becomes quite heavy and time consuming", - "data", - ) - assert sorted_entity_tuples[5] == ("a frame goes wrong", "data") - assert sorted_entity_tuples[6] == ( - "a production cannot afford major revisions", - "background_claim", - ) - assert sorted_entity_tuples[7] == ("resculpting models", "data") - assert sorted_entity_tuples[8] == ("re-rigging skeletons", "data") - assert sorted_entity_tuples[9] == ( - "providing a flexible and efficient solution to animation remains an open problem", - "own_claim", - ) + assert dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ + "neoplasm_train" + ][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - # check the relations - assert len(doc.binary_relations) == 116 - # check the first ten relations - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in doc.binary_relations[:10] - ] - assert relation_tuples[0] == ( - "a production cannot afford major revisions", - "supports", - "providing a flexible and efficient solution to animation remains an open problem", - ) - assert relation_tuples[1] == ( - "its ease of implementation", - "supports", - "SSD is widely used in games, 
virtual reality and other realtime applications", - ) - assert relation_tuples[2] == ( - "low cost of computing", - "supports", - "SSD is widely used in games, virtual reality and other realtime applications", - ) - assert relation_tuples[3] == ( - "editing in the rest pose will influence most other poses", - "supports", - "This approach is not commonly applied", - ) - assert relation_tuples[4] == ( - "This approach is not commonly applied", - "contradicts", - "artists will edit the geometry of characters in the rest pose to fine-tune animations", - ) - assert relation_tuples[5] == ( - "the animator specifies the PSD examples after the SSD has been performed", - "contradicts", - "the examples are best interpolated in the rest pose, before the SSD has been applied", - ) - assert relation_tuples[6] == ( - "PSD may be used as a compensation to the underlying SSD", - "contradicts", - "the examples are best interpolated in the rest pose, before the SSD has been applied", - ) - assert relation_tuples[7] == ( - "the examples are best interpolated in the rest pose, before the SSD has been applied", - "supports", - "the action of the SSD and any other deformations must be “inverted” in order to push the example " - "compensation before these operations", - ) - assert relation_tuples[8] == ( - "this inverse strategy has a better performance than the same framework without it", - "semantically_same", - "this approach will improve the quality of deformation", - ) - assert relation_tuples[9] == ( - "the high cost of computing", - "supports", - "they are seldom applied to interactive applications", - ) + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) + + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " + "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -229,49 +217,66 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - if tokenized_documents_with_labeled_spans_and_binary_relations is not None: - docs = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.labeled_spans) == 183 - assert len(doc.tokens) == 7689 - # Check the first ten tokens - assert doc.tokens[:10] == ("[CLS]", "<", "?", "xml", "version", "=", '"', "1", ".", "0") - # Check the first ten tokenized entities after sorted by their start position - sorted_entities = sorted(doc.labeled_spans, key=lambda ent: ent.start) - assert ( - str(sorted_entities[0]) - == "('complicated', '3d', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', " - "'entertainment', ',', 'virtual', 'reality', ',', 'medicine', 'etc')" - ) - assert ( - str(sorted_entities[1]) - == "('the', 'range', 'of', 'breath', '##taking', 'realistic', '3d', 'models', 'is', 'only', 'limited', " - "'by', 'the', 'creativity', 'of', 'artists', 'and', 'resolution', 'of', 'devices')" - ) - assert ( - str(sorted_entities[2]) - == "('driving', '3d', 'models', 'in', 'a', 'natural', 'and', 'bel', '##ie', '##vable', 'manner', 'is', " - "'not', 'trivial')" - ) - assert str(sorted_entities[3]) == "('the', 'model', 'is', 'very', 'detailed')" - assert ( - str(sorted_entities[4]) - == "('playback', 'of', 'animation', 'becomes', 'quite', 'heavy', 'and', 'time', 'consuming')" - ) - assert str(sorted_entities[5]) == "('a', 'frame', 'goes', 'wrong')" - assert ( - str(sorted_entities[6]) - == "('a', 'production', 'cannot', 'afford', 'major', 'revisions')" - ) - assert str(sorted_entities[7]) == "('res', '##cu', '##lp', '##ting', 'models')" - assert str(sorted_entities[8]) == "('re', '-', 'rig', '##ging', 'skeletons')" - assert ( - str(sorted_entities[9]) - == "('providing', 'a', 'flexible', 'and', 'efficient', 'solution', 'to', 'animation', 'remains', 'an', " - "'open', 'problem')" - ) + docs: List[ + TestTokenDocumentWithLabeledSpansAndBinaryRelations + ] = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', 
'6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( From bf77b558a56d6fd4b73f715ef3c91fed82aa8143 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:17:37 +0100 Subject: [PATCH 09/24] edit more tests in test_abstrct.py --- tests/dataset_builders/pie/test_abstrct.py | 307 ++++++++++----------- 1 file changed, 153 insertions(+), 154 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 3037e1a0..64be662d 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -91,100 +91,100 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - assert dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None - # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ - "neoplasm_train" - ][0] - # check that the conversion is correct and the data makes sense - assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - - # check the entities - assert len(converted_doc.labeled_spans) 
== 7 - entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] - assert entity_tuples[0] == ( - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " - "with metastatic, hormone-resistant, prostate cancer.", - "MajorClaim", - ) - assert entity_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Premise", - ) - assert entity_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Premise", - ) - assert entity_tuples[3] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " - "four functioning domains, and nine symptoms (.001 < P <. 01),", - "Premise", - ) - assert entity_tuples[4] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Premise", - ) - assert entity_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Premise", - ) - assert entity_tuples[6] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Claim", - ) + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ + "neoplasm_train" + ][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) - # check the relations - assert len(converted_doc.binary_relations) == 6 - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations - ] - assert relation_tuples[0] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer.", - ) - assert relation_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[3] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " - "several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[4] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01),", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " - "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " - "men with metastatic, hormone-resistant, prostate cancer.", - ) + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " + "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -217,66 +217,65 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - docs: List[ - TestTokenDocumentWithLabeledSpansAndBinaryRelations - ] = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.tokens) == 465 - assert len(doc.labeled_spans) == 7 - ent = doc.labeled_spans[0] - assert ( - str(ent) - == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " - "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " - "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" - ) - ent = doc.labeled_spans[1] - assert ( - str(ent) - == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " - "'domains', ',')" - ) - ent = doc.labeled_spans[2] - assert ( - str(ent) - == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " - "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " - "'##nis', '##one', '-', 'alone', 'group', '.')" - ) - ent = doc.labeled_spans[3] - assert ( - str(ent) - == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " - "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " - "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " - "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " - "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" - ) - ent = doc.labeled_spans[4] - assert ( - str(ent) - == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " - "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " - "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" - ) - ent = doc.labeled_spans[5] - assert ( - str(ent) - == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " - "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " - "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " - "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" - ) - ent = doc.labeled_spans[6] - assert ( - str(ent) - == "('treatment', 'with', 'mit', 
'##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " - "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " - "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " - "'alone', '.')" - ) + if tokenized_documents_with_labeled_spans_and_binary_relations is not None: + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( @@ -314,6 +313,6 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - assert all(callable(v) for k, v in 
document_converters.items()) + assert all(callable(v) for k, v in document_converters.items()) #currently not callable else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 6403a240210bcfacc74c314b14b6db562d9f7434 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:20:31 +0100 Subject: [PATCH 10/24] edit pie/readme.md --- dataset_builders/pie/abstrct/README.md | 99 +++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 2c10a090..0b5899ae 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,23 +1,106 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset. +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper]() and [data repository]()). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). -TODO: Since there is no respective HF dataset card, we should all respective information here. +Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). -TODO: Shortly reference the PIE-Brat dataset card. +### Dataset Summary -## Data Schema +### Supported Tasks and Leaderboards #TODO -TODO +- **Tasks**: Argumentation Mining, Component Identification, Relation Identification +- **Leaderboard:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) -See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the remaining annotation type definitions. +### Languages #TODO -## Document Converters +The language in the dataset is English. + +### Dataset Variants + +See [PIE-Brat Data Variants](https://huggingface.co/datasets/pie/brat#data-variants). + +### Data Schema + +See [PIE-Brat Data Schema](https://huggingface.co/datasets/pie/brat#data-schema). + +### Usage + +```python +from pie_datasets import load_dataset, builders + +# load default version +datasets = load_dataset("pie/abstrct") +doc = datasets["train"][0] +assert isinstance(doc, builders.brat.BratDocument) + +# load version with merged span fragments +dataset_merged_spans = load_dataset("pie/abstrct", name="merge_fragmented_spans") +doc_merged_spans = dataset_merged_spans["train"][0] +assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) +``` + +### Document Converters #TODO The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - - TODO + - `LabeledSpans`, converted from `BratDocument`'s `spans` + - labels: + - `BinraryRelations`, converted from `BratDocument`'s `relations` + - labels: See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type definitions. 
+ +### Data Splits #TODO + + +### Label Descriptions #TODO + +#### Components + +#### Relations + +## Dataset Creation #TODO + +### Curation Rationale + +### Source Data + +#### Initial Data Collection and Normalization + +#### Who are the source language producers? + +### Annotations #TODO + +#### Annotation process + +#### Who are the annotators? + +### Personal and Sensitive Information + +\[More Information Needed\] + +## Considerations for Using the Data #TODO + +### Social Impact of Dataset + +### Discussion of Biases + +### Other Known Limitations + +## Additional Information #TODO + +### Dataset Curators + +### Licensing Information + +### Citation Information + +``` + +``` + +### Contributions + +Thanks to [@ArneBinder](https://github.com/ArneBinder) and [@idalr](https://github.com/idalr) for adding this dataset. From 85ead58687e8ca83584462e82f5f9f5420a478b5 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 16:00:59 +0100 Subject: [PATCH 11/24] edit pie/readme.md --- dataset_builders/pie/abstrct/README.md | 150 ++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 15 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 0b5899ae..37f38a70 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,19 +1,23 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper]() and [data repository]()). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](<>) and [data repository](https://gitlab.com/tomaye/abstrct)). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). ### Dataset Summary -### Supported Tasks and Leaderboards #TODO +A novel corpus of healthcare texts (i.e., RCT abstracts on various diseases) from the MEDLINE database, which +are annotated with argumentative components (i.e., `MajorClaim`, `Claim`, and `Premise`) and relations (i.e., `Support`, `Attack`, and `Partial-attack`), +in order to support clinicians' daily tasks in information finding and evidence-based reasoning for decision making. -- **Tasks**: Argumentation Mining, Component Identification, Relation Identification +### Supported Tasks and Leaderboards + +- **Tasks**: Argumentation Mining, Component Identification, Boundary Detection, Relation Identification, Link Prediction - **Leaderboard:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) -### Languages #TODO +### Languages -The language in the dataset is English. +The language in the dataset is English (in the medical/healthcare domain). 
### Dataset Variants @@ -39,66 +43,182 @@ doc_merged_spans = dataset_merged_spans["train"][0] assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) ``` -### Document Converters #TODO +### Document Converters The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - `LabeledSpans`, converted from `BratDocument`'s `spans` - - labels: + - labels: `MajorClaim`, `Claim`, `Premise` - `BinraryRelations`, converted from `BratDocument`'s `relations` - - labels: + - labels: `Support`, `Partial-Attack`, `Attack` See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type definitions. -### Data Splits #TODO +### Data Splits + +| Disease-based Split | `neoplasm` | `glaucoma` | `mixed` | +| --------------------------------------------------------- | ----------------------: | -------------------: | -------------------: | +| No. of documents <br/>
- `_train` <br/>- `_dev` <br/>- `_test` | <br/>350 <br/>50 <br/>100 | <br/> <br/> <br/>100 | <br/> <br/> <br/>
100 | + +**Important Note**: + +- `mixed_test` contains 20 abstracts on the following diseases: glaucoma, neoplasm, diabetes, hypertension, hepatitis. +- 31 out of 40 abstracts in `mixed_test` overlap with abstracts in `neoplasm_test` and `glaucoma_test`. + +### Label Descriptions +In this section, we describe labels according to [Mayer et al. (2020)](https://ebooks.iospress.nl/publication/55129), as well as our label counts on 669 abstracts. -### Label Descriptions #TODO +Unfortunately, the number we report does not correspond to what Mayer et al. reported in their paper (see Table 1, p. 2109). +Morio et al. ([2022](https://aclanthology.org/2022.tacl-1.37.pdf); p. 642, Table 1), who utilized this corpus for their AM tasks, also reported another number, claiming there were double annotation errors in the original statistic collection (see [reference](https://github.com/hitachi-nlp/graph_parser/blob/main/examples/multitask_am/README.md#qas)). #### Components +| Components | Count | Percentage | +| ------------ | ----: | ---------: | +| `MajorClaim` | 129 | 3 % | +| `Claim` | 1282 | 30.2 % | +| `Premise` | 2842 | 66.8 % | + +- `MajorClaim` are more general/concluding `claim`'s, which is supported by more specific claims +- `Claim` is a concluding statement made by the author about the outcome of the study. Claims only points to other claims. +- `Premise` (a.k.a. evidence) is an observation or measurement in the study, which supports or attacks another argument component, usually a `claim`. They are observed facts, and therefore credible without further justifications, as this is the ground truth the argumentation is based on. + +(Mayer et al. 2020, p.2110) + #### Relations -## Dataset Creation #TODO +| Relations | Count | Percentage | +| ------------------------ | ----: | ---------: | +| support: `Support` | 2289 | 87 % | +| attack: `Partial-Attack` | 275 | 10.4 % | +| attack: `Attack` | 69 | 2.6 % | + +- `Support`: All statements or observations justifying the proposition of the target component +- `Partial-Attack`: when the source component is not in full contradiction, but weakening the target component by constraining its proposition. Usually occur between two claims +- `Attack`: A component is attacking another one, if it is + - i) contradicting the proposition of the target component, or + - ii) undercutting its implicit assumption of significance constraints +- `Premise` can only be connected to either `Claim` or another `Premise` +- `Claim`'s can only point to other `Claim`'s +- There might be more than one **outgoing** and/or **incoming relation** . In rare case, there is no relation to another component at all. + +(Mayer et al. 2020, p.2110) + +## Dataset Creation ### Curation Rationale +"\[D\]espite its natural employment in healthcare applications, only few approaches have applied AM methods to this kind +of text, and their contribution is limited to the detection +of argument components, disregarding the more complex phase of +predicting the relations among them. In addition, no huge annotated +dataset for AM is available for the healthcare domain (p. 2108)...to support clinicians in decision making or in (semi)-automatically +filling evidence tables for systematic reviews in evidence-based medicine. (p. 2114)" + ### Source Data +[MEDLINE database](https://www.nlm.nih.gov/medline/medline_overview.html) + #### Initial Data Collection and Normalization +Extended from the previous dataset in [Mayer et al. 
2018](https://webusers.i3s.unice.fr/~riveill/IADB/publications/2018-COMMA.pdf), 500 medical abstract from randomized controlled trials (RCTs) were retrieved directly from [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) by searching for titles or abstracts containing the disease name. + +(See the definition of RCT in the authors' [guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf) (Section 1.2) and [US National Library of Medicine](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6235704/)) + #### Who are the source language producers? -### Annotations #TODO +\[More Information Needed\] + +### Annotations #### Annotation process +"An expert in the medical domain (a pharmacist) validated the annotation +guidelines before starting the annotation process." (p. 2110) + +"Annotation was started after a training phase, where amongst others the component boundaries were topic of discussion. Gold labels +were set after a reconciliation phase, during which the annotators +tried to reach an agreement. While the number of annotators vary for +the two annotation phases (component and relation annotation). + +On the annotation of argument components, "IAA among the three annotators has been calculated +on 30 abstracts, resulting in a Fleiss’ kappa of 0.72 for argumentative +components and 0.68 for the more fine-grained distinction between +claims and evidence." (p. 2109) + +On the annotation of argumentative relation, "IAA has been calculated on 30 abstracts annotated in parallel by three annotators, +resulting in a Fleiss’ kappa of +0.62. The annotation of the remaining abstracts was carried out by +one of the above mentioned annotators." (p. 2110) + +See the [Annotation Guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf?ref_type=heads) for more information on definitions and annotated samples. + #### Who are the annotators? +Two annotators with background in computational linguistics. No information was given on the third annotator. + ### Personal and Sensitive Information \[More Information Needed\] -## Considerations for Using the Data #TODO +## Considerations for Using the Data ### Social Impact of Dataset +"These \[*intelligent*\] systems apply to clinical trials, +clinical guidelines, and electronic health records, and their solutions range from the automated detection of PICO elements +in health records to evidence-based reasoning for decision making. These applications highlight the need of clinicians to be supplied with frameworks able to extract, from the huge +quantity of data available for the different diseases and treatments, +the exact information they necessitate and to present this information in a structured way, easy to be (possibly semi-automatically) +analyzed...Given its aptness to automatically detect in text those +argumentative structures that are at the basis of evidence-based reasoning applications, AM represents a potential valuable contribution +in the healthcare domain." (p. 2108) + +"We expect that our work will have a large impact for clinicians as it +is a crucial step towards AI supported clinical deliberation at a large +scale." (p. 
2114) + ### Discussion of Biases +\[More Information Needed\] + ### Other Known Limitations -## Additional Information #TODO +\[More Information Needed\] + +## Additional Information ### Dataset Curators +\[More Information Needed\] + ### Licensing Information +- **License**: the AbstRCT dataset is released under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) +- **Funding**: This work is partly funded by the French government labelled PIA + program under its IDEX UCA JEDI project (ANR-15-IDEX-0001). + This work has been supported by the French government, through the + 3IA Cote d’Azur Investments in the Future project managed by the + National Research Agency (ANR) with the reference number ANR19-P3IA-0002 + ### Citation Information ``` - +@inproceedings{mayer2020ecai, + author = {Tobias Mayer and + Elena Cabrio and + Serena Villata}, + title = {Transformer-Based Argument Mining for Healthcare Applications}, + booktitle = {{ECAI} 2020 - 24th European Conference on Artificial Intelligence}, + series = {Frontiers in Artificial Intelligence and Applications}, + volume = {325}, + pages = {2108--2115}, + publisher = {{IOS} Press}, + year = {2020}, +} ``` ### Contributions From 98014c63b28a56c0a6d811c3ad575dc7b1eef247 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:15:55 +0100 Subject: [PATCH 12/24] edit `test_document_converters` --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 64be662d..9b109e8c 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -313,6 +313,6 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - assert all(callable(v) for k, v in document_converters.items()) #currently not callable + assert all(dict(v) for k, v in document_converters.items()) else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 733789122cc4b7fb1a813e751bd988b89cb5ff72 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:36:04 +0100 Subject: [PATCH 13/24] minor edit --- dataset_builders/pie/abstrct/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 37f38a70..9a626c0f 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,6 +1,6 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](<>) and [data repository](https://gitlab.com/tomaye/abstrct)). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](https://ebooks.iospress.nl/publication/55129) and [data repository](https://gitlab.com/tomaye/abstrct)). 
Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). From 74cbfd9a47101a98e1995b09a377907dd67fcd36 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:26:01 +0100 Subject: [PATCH 14/24] update BASE_DATASET_REVISION --- dataset_builders/pie/abstrct/abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index f27dc851..c359ff4d 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -15,7 +15,7 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" - BASE_DATASET_REVISION = "844de61e8a00dc6a93fc29dc185f6e617131fbf1" + BASE_DATASET_REVISION = "bb8c37d84ddf2da1e691d226c55fef48fd8149b5" # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { From bf0ebcf557ed8bb93d21824bf10efc590add19fb Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:26:17 +0100 Subject: [PATCH 15/24] update requirements.txt --- dataset_builders/pie/abstrct/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index cb3deb1c..30439e3e 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.7.0 +pie-datasets>=0.4.0,<0.9.0 From 1a410e7afbb00d02b3394e0bbb6489f499aef933 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:28:13 +0100 Subject: [PATCH 16/24] set strict_span_conversion=True for tokenize_document() --- tests/dataset_builders/pie/test_abstrct.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 9b109e8c..ac8aeb65 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -208,7 +208,7 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( tokenizer=tokenizer, return_overflowing_tokens=True, result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=False, + strict_span_conversion=True, verbose=True, ) return tokenized_docs @@ -294,7 +294,7 @@ def test_tokenized_documents_with_entities_and_relations_all( tokenizer=tokenizer, return_overflowing_tokens=True, result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=False, + strict_span_conversion=True, verbose=True, ) # we just ensure that we get at least one tokenized document From 16f015255b9479e346447f413485140e45e0ace2 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Thu, 14 Dec 2023 11:29:27 +0100 Subject: [PATCH 17/24] minor changes --- tests/dataset_builders/pie/test_abstrct.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index ac8aeb65..266783f7 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ 
b/tests/dataset_builders/pie/test_abstrct.py @@ -17,6 +17,7 @@ disable_caching() DATASET_NAME = "abstrct" +BUILDER_CLASS = AbstRCT PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME SPLIT_SIZES = { "glaucoma_test": 100, @@ -28,7 +29,7 @@ SPLIT = "neoplasm_train" -@pytest.fixture(scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS]) +@pytest.fixture(scope="module", params=[config.name for config in BUILDER_CLASS.BUILDER_CONFIGS]) def dataset_variant(request) -> str: return request.param @@ -60,6 +61,11 @@ def test_document(document, dataset_variant): assert document.text.startswith( " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" ) + # TODO: test the actual content (annotation of the document) + # if dataset_variant == "default": + # assert + # elif dataset_variant == "merge_fragmented_spans": + # assert @pytest.fixture(scope="module") @@ -303,7 +309,7 @@ def test_tokenized_documents_with_entities_and_relations_all( def test_document_converters(dataset_variant): - builder = AbstRCT(config_name=dataset_variant) + builder = BUILDER_CLASS(config_name=dataset_variant) document_converters = builder.document_converters if dataset_variant == "default": @@ -313,6 +319,7 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } + # TODO: recheck this assert all(dict(v) for k, v in document_converters.items()) else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From c6fd863337ad540df9f21c3e688572b0a829a6c9 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:38:55 +0100 Subject: [PATCH 18/24] updated `test_abstrct.py` --- tests/dataset_builders/pie/test_abstrct.py | 70 +++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 266783f7..a70a0fb1 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -58,14 +58,64 @@ def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMe def test_document(document, dataset_variant): assert document is not None - assert document.text.startswith( - " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" + assert document.id == "10561201" + + # check spans + assert len(document.spans) == 7 + span_texts = document.metadata["span_texts"] + assert ( + span_texts[0] + == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer." ) - # TODO: test the actual content (annotation of the document) - # if dataset_variant == "default": - # assert - # elif dataset_variant == "merge_fragmented_spans": - # assert + assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," + assert ( + span_texts[2] + == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group." + ) + assert ( + span_texts[3] + == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01)," + ) + assert ( + span_texts[4] + == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05)." + ) + assert ( + span_texts[5] + == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + ) + assert ( + span_texts[6] + == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone." + ) + + # check relations + assert len(document.relations) == 6 + document.relations[0].label == "Support" + document.relations[0].head == document.spans[6] + document.relations[0].tail == document.spans[0] + document.relations[1].label == "Support" + document.relations[1].head == document.spans[1] + document.relations[1].tail == document.spans[6] + document.relations[2].label == "Support" + document.relations[2].head == document.spans[2] + document.relations[2].tail == document.spans[6] + document.relations[3].label == "Support" + document.relations[3].head == document.spans[5] + document.relations[3].tail == document.spans[6] + document.relations[4].label == "Support" + document.relations[4].head == document.spans[3] + document.relations[4].tail == document.spans[6] + document.relations[5].label == "Support" + document.relations[5].head == document.spans[5] + document.relations[5].tail == document.spans[0] @pytest.fixture(scope="module") @@ -319,7 +369,9 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - # TODO: recheck this - assert all(dict(v) for k, v in document_converters.items()) + assert document_converters[TextDocumentWithLabeledSpansAndBinaryRelations] == { + "spans": "labeled_spans", + "relations": "binary_relations", + } else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 2097a34d72aa528d6936cc20094dcce5f412777c Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:40:51 +0100 Subject: [PATCH 19/24] make pre-commit happy --- tests/dataset_builders/pie/test_abstrct.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index a70a0fb1..7cbc557c 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -66,34 +66,34 @@ def test_document(document, dataset_variant): assert ( span_texts[0] == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer." + "in men with metastatic, hormone-resistant, prostate cancer." ) assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," assert ( span_texts[2] == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group." + "prednisone-alone group." ) assert ( span_texts[3] == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01)," + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 01)," ) assert ( span_texts[4] == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05)." + "(.004 < P <.05)." ) assert ( span_texts[5] == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." ) assert ( span_texts[6] == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone." + "in several HQL domains and symptoms than treatment with prednisone alone." ) # check relations From ac1c097119a6c7799933ce0e0e2f42dec3470bc0 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 20:37:20 +0100 Subject: [PATCH 20/24] edit 'test_document' --- tests/dataset_builders/pie/test_abstrct.py | 63 +++++++++++++--------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 7cbc557c..67b4190a 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -60,40 +60,51 @@ def test_document(document, dataset_variant): assert document is not None assert document.id == "10561201" + # check the annotation + if dataset_variant == "default": + span_texts_labels_tuples = [ + (document.text[span.slices[0][0] : span.slices[-1][1]], span.label) + for span in document.spans + ] + elif dataset_variant == "merge_fragmented_spans": + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] + # check spans assert len(document.spans) == 7 - span_texts = document.metadata["span_texts"] - assert ( - span_texts[0] - == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer." + assert span_texts_labels_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert span_texts_labels_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", ) - assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," - assert ( - span_texts[2] - == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group." 
+ assert span_texts_labels_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", ) - assert ( - span_texts[3] - == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + assert span_texts_labels_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 01)," + "functioning domains, and nine symptoms (.001 < P <. 01),", + "Premise", ) - assert ( - span_texts[4] - == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05)." + assert span_texts_labels_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", ) - assert ( - span_texts[5] - == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + assert span_texts_labels_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", ) - assert ( - span_texts[6] - == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone." 
+ assert span_texts_labels_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", ) # check relations From 0a2b8e923a6e83bec2251ec7f8b6cdbfc8b04d41 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 10:58:09 +0100 Subject: [PATCH 21/24] checked fragments in spans --- tests/dataset_builders/pie/test_abstrct.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 67b4190a..896ec8b7 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -44,6 +44,14 @@ def test_dataset(dataset): assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES +def test_no_fragmented_spans(dataset, dataset_variant): + if dataset_variant == "default": + for split, docs in dataset.items(): + for doc in docs: + # test the number of slices of the LabeledMultiSpan annotations + assert [len(span.slices) == 1 for span in doc.spans] + + @pytest.fixture(scope="module") def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: result = dataset[SPLIT][0] @@ -63,7 +71,7 @@ def test_document(document, dataset_variant): # check the annotation if dataset_variant == "default": span_texts_labels_tuples = [ - (document.text[span.slices[0][0] : span.slices[-1][1]], span.label) + (" ".join([document.text[start:end] for start, end in span.slices]), span.label) for span in document.spans ] elif dataset_variant == "merge_fragmented_spans": From 682f872568aca71d06dd58f6ab83d7195b2989b7 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:37:39 +0100 Subject: [PATCH 22/24] minor fix --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 896ec8b7..af986623 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -49,7 +49,7 @@ def test_no_fragmented_spans(dataset, dataset_variant): for split, docs in dataset.items(): for doc in docs: # test the number of slices of the LabeledMultiSpan annotations - assert [len(span.slices) == 1 for span in doc.spans] + assert all([len(span.slices) == 1 for span in doc.spans]) @pytest.fixture(scope="module") From 612466b8e5a5a8161e559e0648d914412e258592 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 17:27:01 +0100 Subject: [PATCH 23/24] converted to single dataset_variant --- dataset_builders/pie/abstrct/README.md | 14 +++-- dataset_builders/pie/abstrct/abstrct.py | 30 +++++------ tests/dataset_builders/pie/test_abstrct.py | 61 ++++++++-------------- 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 9a626c0f..d0b752a4 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -21,7 +21,10 @@ The language in the dataset is English (in the medical/healthcare domain). ### Dataset Variants -See [PIE-Brat Data Variants](https://huggingface.co/datasets/pie/brat#data-variants). 
+The `abstrct` dataset comes in a single version (`default`) with `BratDocumentWithMergedSpans` as document type. Note, +that this in contrast to the base `brat` dataset, where the document type for the `default` variant is `BratDocument`. +The reason is that the AbstRCT dataset has already been published with only single-fragment spans. +Without any need to merge fragments, the document type `BratDocumentWithMergedSpans` is easier to handle for most of the task modules. ### Data Schema @@ -34,13 +37,8 @@ from pie_datasets import load_dataset, builders # load default version datasets = load_dataset("pie/abstrct") -doc = datasets["train"][0] -assert isinstance(doc, builders.brat.BratDocument) - -# load version with merged span fragments -dataset_merged_spans = load_dataset("pie/abstrct", name="merge_fragmented_spans") -doc_merged_spans = dataset_merged_spans["train"][0] -assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) +doc = datasets["neoplasm_train"][0] +assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) ``` ### Document Converters diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index c359ff4d..6dc12e42 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -1,7 +1,7 @@ from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations -from pie_datasets.builders import BratBuilder -from pie_datasets.core.dataset import DocumentConvertersType +from pie_datasets.builders import BratBuilder, BratConfig +from pie_datasets.builders.brat import BratDocumentWithMergedSpans URL = "https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip" SPLIT_PATHS = { @@ -17,22 +17,22 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" BASE_DATASET_REVISION = "bb8c37d84ddf2da1e691d226c55fef48fd8149b5" + BUILDER_CONFIGS = [ + BratConfig(name=BratBuilder.DEFAULT_CONFIG_NAME, merge_fragmented_spans=True), + ] + DOCUMENT_TYPES = { + BratBuilder.DEFAULT_CONFIG_NAME: BratDocumentWithMergedSpans, + } + # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} for dataset_variant in ["default", "merge_fragmented_spans", None] } - @property - def document_converters(self) -> DocumentConvertersType: - if self.config.name == "default": - return {} - elif self.config.name == "merge_fragmented_spans": - return { - TextDocumentWithLabeledSpansAndBinaryRelations: { - "spans": "labeled_spans", - "relations": "binary_relations", - }, - } - else: - raise ValueError(f"Unknown dataset variant: {self.config.name}") + DOCUMENT_CONVERTERS = { + TextDocumentWithLabeledSpansAndBinaryRelations: { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + } diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index af986623..02036b8f 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -1,14 +1,15 @@ -from typing import List, Optional, Union +from typing import List, Optional import pytest from datasets import disable_caching from pie_modules.document.processing import tokenize_document +from pytorch_ie.core import Document from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer from dataset_builders.pie.abstrct.abstrct import AbstRCT from 
pie_datasets import DatasetDict -from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans +from pie_datasets.builders.brat import BratDocumentWithMergedSpans from tests.dataset_builders.common import ( PIE_BASE_PATH, TestTokenDocumentWithLabeledSpansAndBinaryRelations, @@ -44,23 +45,25 @@ def test_dataset(dataset): assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES -def test_no_fragmented_spans(dataset, dataset_variant): - if dataset_variant == "default": - for split, docs in dataset.items(): - for doc in docs: - # test the number of slices of the LabeledMultiSpan annotations - assert all([len(span.slices) == 1 for span in doc.spans]) +@pytest.fixture(scope="module") +def builder(dataset_variant) -> BUILDER_CLASS: + return BUILDER_CLASS(config_name=dataset_variant) + + +def test_builder(builder, dataset_variant): + assert builder is not None + assert builder.config_id == dataset_variant + assert builder.dataset_name == DATASET_NAME + assert builder.document_type == BratDocumentWithMergedSpans @pytest.fixture(scope="module") -def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: +def document(dataset) -> BratDocumentWithMergedSpans: result = dataset[SPLIT][0] - if dataset_variant == "default": - assert isinstance(result, BratDocument) - elif dataset_variant == "merge_fragmented_spans": - assert isinstance(result, BratDocumentWithMergedSpans) - else: - raise ValueError(f"Unknown dataset variant: {dataset_variant}") + # we can not assert the real document type because it may come from a dataset loading script + # downloaded to a temporary directory and thus have a different type object, although it is + # semantically the same + assert isinstance(result, Document) return result @@ -69,12 +72,7 @@ def test_document(document, dataset_variant): assert document.id == "10561201" # check the annotation - if dataset_variant == "default": - span_texts_labels_tuples = [ - (" ".join([document.text[start:end] for start, end in span.slices]), span.label) - for span in document.spans - ] - elif dataset_variant == "merge_fragmented_spans": + if dataset_variant == "default" or dataset_variant is None: span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] # check spans @@ -141,20 +139,7 @@ def test_document(document, dataset_variant): def dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset, dataset_variant ) -> Optional[DatasetDict]: - if dataset_variant == "default": - with pytest.raises(ValueError) as excinfo: - dataset.to_document_type(TextDocumentWithLabeledSpansAndBinaryRelations) - assert ( - str(excinfo.value) - == "No valid key (either subclass or superclass) was found for the document type " - "'' in the " - "document_converters of the dataset. Available keys: set(). Consider adding a respective " - "converter to the dataset with dataset.register_document_converter(my_converter_method) " - "where my_converter_method should accept " - "as input and return ''." 
- ) - converted_dataset = None - elif dataset_variant == "merge_fragmented_spans": + if dataset_variant == "default" or dataset_variant is None: converted_dataset = dataset.to_document_type( TextDocumentWithLabeledSpansAndBinaryRelations ) @@ -168,9 +153,7 @@ def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( ): if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ - "neoplasm_train" - ][0] + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] # check that the conversion is correct and the data makes sense assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) @@ -382,8 +365,6 @@ def test_document_converters(dataset_variant): document_converters = builder.document_converters if dataset_variant == "default": - assert document_converters == {} - elif dataset_variant == "merge_fragmented_spans": assert len(document_converters) == 1 assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, From 172fc14a9491a7660712716474dfe2273a00d714 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 18:10:02 +0100 Subject: [PATCH 24/24] minor fixes --- dataset_builders/pie/abstrct/README.md | 4 +- dataset_builders/pie/abstrct/abstrct.py | 2 +- tests/dataset_builders/pie/test_abstrct.py | 353 ++++++++++----------- 3 files changed, 175 insertions(+), 184 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index d0b752a4..45123b8a 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -46,9 +46,9 @@ assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - - `LabeledSpans`, converted from `BratDocument`'s `spans` + - `LabeledSpans`, converted from `BratDocumentWithMergedSpans`'s `spans` - labels: `MajorClaim`, `Claim`, `Premise` - - `BinraryRelations`, converted from `BratDocument`'s `relations` + - `BinraryRelations`, converted from `BratDocumentWithMergedSpans`'s `relations` - labels: `Support`, `Partial-Attack`, `Attack` See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 6dc12e42..045c2e23 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -27,7 +27,7 @@ class AbstRCT(BratBuilder): # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} - for dataset_variant in ["default", "merge_fragmented_spans", None] + for dataset_variant in ["default", None] } DOCUMENT_CONVERTERS = { diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 02036b8f..54b99bce 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List import pytest from datasets import disable_caching @@ -71,12 +71,9 @@ def test_document(document, dataset_variant): assert document is not 
None assert document.id == "10561201" - # check the annotation - if dataset_variant == "default" or dataset_variant is None: - span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] - - # check spans + # check the spans assert len(document.spans) == 7 + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] assert span_texts_labels_tuples[0] == ( "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " "in men with metastatic, hormone-resistant, prostate cancer.", @@ -138,7 +135,7 @@ def test_document(document, dataset_variant): @pytest.fixture(scope="module") def dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset, dataset_variant -) -> Optional[DatasetDict]: +) -> DatasetDict: if dataset_variant == "default" or dataset_variant is None: converted_dataset = dataset.to_document_type( TextDocumentWithLabeledSpansAndBinaryRelations @@ -151,98 +148,97 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] - # check that the conversion is correct and the data makes sense - assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - - # check the entities - assert len(converted_doc.labeled_spans) == 7 - entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] - assert entity_tuples[0] == ( - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " - "with metastatic, hormone-resistant, prostate cancer.", - "MajorClaim", - ) - assert entity_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Premise", - ) - assert entity_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Premise", - ) - assert entity_tuples[3] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " - "four functioning domains, and nine symptoms (.001 < P <. 
01),", - "Premise", - ) - assert entity_tuples[4] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Premise", - ) - assert entity_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Premise", - ) - assert entity_tuples[6] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Claim", - ) + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) - # check the relations - assert len(converted_doc.binary_relations) == 6 - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations - ] - assert relation_tuples[0] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer.", - ) - assert relation_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[3] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " - "several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[4] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01),", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " - "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " - "men with metastatic, hormone-resistant, prostate cancer.", - ) + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -253,10 +249,7 @@ def tokenizer() -> PreTrainedTokenizer: @pytest.fixture(scope="module") def tokenized_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer -) -> Optional[List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]]: - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: - return None - +) -> List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]: # get a document to check doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] # Note, that this is a list of documents, because the document may be split into chunks @@ -275,96 +268,94 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - if tokenized_documents_with_labeled_spans_and_binary_relations is not None: - docs = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.tokens) == 465 - assert len(doc.labeled_spans) == 7 - ent = doc.labeled_spans[0] - assert ( - str(ent) - == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " - "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " - "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" - ) - ent = doc.labeled_spans[1] - assert ( - str(ent) - == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " - "'domains', ',')" - ) - ent = doc.labeled_spans[2] - assert ( - str(ent) - == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " - "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " - "'##nis', '##one', '-', 'alone', 'group', '.')" - ) - ent = doc.labeled_spans[3] - assert ( - str(ent) - == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " - "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " - "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " - "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " - "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" - ) - ent = doc.labeled_spans[4] - assert ( - str(ent) - == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " - "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', 
'##nis', '##one', '-', 'alone', 'group', '(', '.', " - "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" - ) - ent = doc.labeled_spans[5] - assert ( - str(ent) - == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " - "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " - "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " - "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" - ) - ent = doc.labeled_spans[6] - assert ( - str(ent) - == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " - "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " - "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " - "'alone', '.')" - ) + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == 
"('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - for ( - split, - docs, - ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): - for doc in docs: - # Note, that this is a list of documents, because the document may be split into chunks - # if the input text is too long. - tokenized_docs = tokenize_document( - doc, - tokenizer=tokenizer, - return_overflowing_tokens=True, - result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=True, - verbose=True, - ) - # we just ensure that we get at least one tokenized document - assert tokenized_docs is not None - assert len(tokenized_docs) > 0 + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=True, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 def test_document_converters(dataset_variant): builder = BUILDER_CLASS(config_name=dataset_variant) document_converters = builder.document_converters - if dataset_variant == "default": + if dataset_variant == "default" or dataset_variant is None: assert len(document_converters) == 1 assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations,