diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 696a3290..01ac619e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,8 @@ repos: args: - --skip=logs/** # arbitral: this is a legal term and used in example data (cdcp dataset) - - --ignore-words-list=arbitral + # abstrct / AbstRCT: this is a dataset name + - --ignore-words-list=arbitral,abstrct,AbstRCT # python static type checking - repo: https://github.com/pre-commit/mirrors-mypy diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md new file mode 100644 index 00000000..45123b8a --- /dev/null +++ b/dataset_builders/pie/abstrct/README.md @@ -0,0 +1,224 @@ +# PIE Dataset Card for "abstrct" + +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](https://ebooks.iospress.nl/publication/55129) and [data repository](https://gitlab.com/tomaye/abstrct)). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). + +Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). + +### Dataset Summary + +A novel corpus of healthcare texts (i.e., RCT abstracts on various diseases) from the MEDLINE database, which +are annotated with argumentative components (i.e., `MajorClaim`, `Claim`, and `Premise`) and relations (i.e., `Support`, `Attack`, and `Partial-Attack`), +in order to support clinicians' daily tasks in information finding and evidence-based reasoning for decision making. 
+ +### Supported Tasks and Leaderboards + +- **Tasks**: Argumentation Mining, Component Identification, Boundary Detection, Relation Identification, Link Prediction +- **Leaderboard:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +### Languages + +The language in the dataset is English (in the medical/healthcare domain). + +### Dataset Variants + +The `abstrct` dataset comes in a single version (`default`) with `BratDocumentWithMergedSpans` as document type. Note +that this is in contrast to the base `brat` dataset, where the document type for the `default` variant is `BratDocument`. +The reason is that the AbstRCT dataset has already been published with only single-fragment spans. +Without any need to merge fragments, the document type `BratDocumentWithMergedSpans` is easier to handle for most of the task modules. + +### Data Schema + +See [PIE-Brat Data Schema](https://huggingface.co/datasets/pie/brat#data-schema). + +### Usage + +```python +from pie_datasets import load_dataset, builders + +# load default version +datasets = load_dataset("pie/abstrct") +doc = datasets["neoplasm_train"][0] +assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) +``` + +### Document Converters + +The dataset provides document converters for the following target document types: + +- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` + - `LabeledSpans`, converted from `BratDocumentWithMergedSpans`'s `spans` + - labels: `MajorClaim`, `Claim`, `Premise` + - `BinaryRelations`, converted from `BratDocumentWithMergedSpans`'s `relations` + - labels: `Support`, `Partial-Attack`, `Attack` + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type +definitions. 
+ +### Data Splits + +| Disease-based Split | `neoplasm` | `glaucoma` | `mixed` | +| --------------------------------------------------------- | ----------------------: | -------------------: | -------------------: | +| No. of documents <br/>- `_train`<br/>- `_dev`<br/>- `_test` | <br/>350<br/>50<br/>100 | <br/><br/><br/>100 | <br/><br/><br/>100 | + +**Important Note**: + +- `mixed_test` contains 20 abstracts on each of the following diseases: glaucoma, neoplasm, diabetes, hypertension, hepatitis. +- 31 out of the 40 glaucoma and neoplasm abstracts in `mixed_test` overlap with abstracts in `neoplasm_test` and `glaucoma_test`. + +### Label Descriptions + +In this section, we describe labels according to [Mayer et al. (2020)](https://ebooks.iospress.nl/publication/55129), as well as our label counts on 669 abstracts. + +Unfortunately, the numbers we report do not correspond to what Mayer et al. reported in their paper (see Table 1, p. 2109). +Morio et al. ([2022](https://aclanthology.org/2022.tacl-1.37.pdf); p. 642, Table 1), who utilized this corpus for their AM tasks, also reported different numbers, claiming there were double annotation errors in the original statistic collection (see [reference](https://github.com/hitachi-nlp/graph_parser/blob/main/examples/multitask_am/README.md#qas)). + +#### Components + +| Components | Count | Percentage | +| ------------ | ----: | ---------: | +| `MajorClaim` | 129 | 3 % | +| `Claim` | 1282 | 30.2 % | +| `Premise` | 2842 | 66.8 % | + +- `MajorClaim`s are more general/concluding `claim`s, which are supported by more specific claims +- `Claim` is a concluding statement made by the author about the outcome of the study. Claims only point to other claims. +- `Premise` (a.k.a. evidence) is an observation or measurement in the study, which supports or attacks another argument component, usually a `claim`. They are observed facts, and therefore credible without further justifications, as this is the ground truth the argumentation is based on. + +(Mayer et al. 
2020, p.2110) + +#### Relations + +| Relations | Count | Percentage | +| ------------------------ | ----: | ---------: | +| support: `Support` | 2289 | 87 % | +| attack: `Partial-Attack` | 275 | 10.4 % | +| attack: `Attack` | 69 | 2.6 % | + +- `Support`: All statements or observations justifying the proposition of the target component +- `Partial-Attack`: when the source component is not in full contradiction, but weakening the target component by constraining its proposition. Usually occurs between two claims +- `Attack`: A component is attacking another one, if it is + - i) contradicting the proposition of the target component, or + - ii) undercutting its implicit assumption of significance constraints +- `Premise` can only be connected to either `Claim` or another `Premise` +- `Claim`'s can only point to other `Claim`'s +- There might be more than one **outgoing** and/or **incoming relation**. In rare cases, there is no relation to another component at all. + +(Mayer et al. 2020, p.2110) + +## Dataset Creation + +### Curation Rationale + +"\[D\]espite its natural employment in healthcare applications, only few approaches have applied AM methods to this kind +of text, and their contribution is limited to the detection +of argument components, disregarding the more complex phase of +predicting the relations among them. In addition, no huge annotated +dataset for AM is available for the healthcare domain (p. 2108)...to support clinicians in decision making or in (semi)-automatically +filling evidence tables for systematic reviews in evidence-based medicine. (p. 2114)" + +### Source Data + +[MEDLINE database](https://www.nlm.nih.gov/medline/medline_overview.html) + +#### Initial Data Collection and Normalization + +Extended from the previous dataset in [Mayer et al. 
2018](https://webusers.i3s.unice.fr/~riveill/IADB/publications/2018-COMMA.pdf), 500 medical abstracts from randomized controlled trials (RCTs) were retrieved directly from [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) by searching for titles or abstracts containing the disease name. + +(See the definition of RCT in the authors' [guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf) (Section 1.2) and [US National Library of Medicine](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6235704/)) + +#### Who are the source language producers? + +\[More Information Needed\] + +### Annotations + +#### Annotation process + +"An expert in the medical domain (a pharmacist) validated the annotation +guidelines before starting the annotation process." (p. 2110) + +"Annotation was started after a training phase, where amongst others the component boundaries were topic of discussion. Gold labels +were set after a reconciliation phase, during which the annotators +tried to reach an agreement. While the number of annotators vary for +the two annotation phases (component and relation annotation). + +On the annotation of argument components, "IAA among the three annotators has been calculated +on 30 abstracts, resulting in a Fleiss’ kappa of 0.72 for argumentative +components and 0.68 for the more fine-grained distinction between +claims and evidence." (p. 2109) + +On the annotation of argumentative relation, "IAA has been calculated on 30 abstracts annotated in parallel by three annotators, +resulting in a Fleiss’ kappa of +0.62. The annotation of the remaining abstracts was carried out by +one of the above mentioned annotators." (p. 2110) + +See the [Annotation Guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf?ref_type=heads) for more information on definitions and annotated samples. + +#### Who are the annotators? + +Two annotators with background in computational linguistics. 
No information was given on the third annotator. + +### Personal and Sensitive Information + +\[More Information Needed\] + +## Considerations for Using the Data + +### Social Impact of Dataset + +"These \[*intelligent*\] systems apply to clinical trials, +clinical guidelines, and electronic health records, and their solutions range from the automated detection of PICO elements +in health records to evidence-based reasoning for decision making. These applications highlight the need of clinicians to be supplied with frameworks able to extract, from the huge +quantity of data available for the different diseases and treatments, +the exact information they necessitate and to present this information in a structured way, easy to be (possibly semi-automatically) +analyzed...Given its aptness to automatically detect in text those +argumentative structures that are at the basis of evidence-based reasoning applications, AM represents a potential valuable contribution +in the healthcare domain." (p. 2108) + +"We expect that our work will have a large impact for clinicians as it +is a crucial step towards AI supported clinical deliberation at a large +scale." (p. 2114) + +### Discussion of Biases + +\[More Information Needed\] + +### Other Known Limitations + +\[More Information Needed\] + +## Additional Information + +### Dataset Curators + +\[More Information Needed\] + +### Licensing Information + +- **License**: the AbstRCT dataset is released under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) +- **Funding**: This work is partly funded by the French government labelled PIA + program under its IDEX UCA JEDI project (ANR-15-IDEX-0001). 
+ This work has been supported by the French government, through the + 3IA Cote d’Azur Investments in the Future project managed by the + National Research Agency (ANR) with the reference number ANR19-P3IA-0002 + +### Citation Information + +``` +@inproceedings{mayer2020ecai, + author = {Tobias Mayer and + Elena Cabrio and + Serena Villata}, + title = {Transformer-Based Argument Mining for Healthcare Applications}, + booktitle = {{ECAI} 2020 - 24th European Conference on Artificial Intelligence}, + series = {Frontiers in Artificial Intelligence and Applications}, + volume = {325}, + pages = {2108--2115}, + publisher = {{IOS} Press}, + year = {2020}, +} +``` + +### Contributions + +Thanks to [@ArneBinder](https://github.com/ArneBinder) and [@idalr](https://github.com/idalr) for adding this dataset. diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py new file mode 100644 index 00000000..045c2e23 --- /dev/null +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -0,0 +1,38 @@ +from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations + +from pie_datasets.builders import BratBuilder, BratConfig +from pie_datasets.builders.brat import BratDocumentWithMergedSpans + +URL = "https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip" +SPLIT_PATHS = { + "neoplasm_train": "abstrct-master/AbstRCT_corpus/data/train/neoplasm_train", + "neoplasm_dev": "abstrct-master/AbstRCT_corpus/data/dev/neoplasm_dev", + "neoplasm_test": "abstrct-master/AbstRCT_corpus/data/test/neoplasm_test", + "glaucoma_test": "abstrct-master/AbstRCT_corpus/data/test/glaucoma_test", + "mixed_test": "abstrct-master/AbstRCT_corpus/data/test/mixed_test", +} + + +class AbstRCT(BratBuilder): + BASE_DATASET_PATH = "DFKI-SLT/brat" + BASE_DATASET_REVISION = "bb8c37d84ddf2da1e691d226c55fef48fd8149b5" + + BUILDER_CONFIGS = [ + BratConfig(name=BratBuilder.DEFAULT_CONFIG_NAME, merge_fragmented_spans=True), + ] + DOCUMENT_TYPES = { + 
BratBuilder.DEFAULT_CONFIG_NAME: BratDocumentWithMergedSpans, + } + + # we need to add None to the list of dataset variants to support the default dataset variant + BASE_BUILDER_KWARGS_DICT = { + dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} + for dataset_variant in ["default", None] + } + + DOCUMENT_CONVERTERS = { + TextDocumentWithLabeledSpansAndBinaryRelations: { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + } diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt new file mode 100644 index 00000000..30439e3e --- /dev/null +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -0,0 +1 @@ +pie-datasets>=0.4.0,<0.9.0 diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py new file mode 100644 index 00000000..54b99bce --- /dev/null +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -0,0 +1,368 @@ +from typing import List + +import pytest +from datasets import disable_caching +from pie_modules.document.processing import tokenize_document +from pytorch_ie.core import Document +from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations +from transformers import AutoTokenizer, PreTrainedTokenizer + +from dataset_builders.pie.abstrct.abstrct import AbstRCT +from pie_datasets import DatasetDict +from pie_datasets.builders.brat import BratDocumentWithMergedSpans +from tests.dataset_builders.common import ( + PIE_BASE_PATH, + TestTokenDocumentWithLabeledSpansAndBinaryRelations, +) + +disable_caching() + +DATASET_NAME = "abstrct" +BUILDER_CLASS = AbstRCT +PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME +SPLIT_SIZES = { + "glaucoma_test": 100, + "mixed_test": 100, + "neoplasm_dev": 50, + "neoplasm_test": 100, + "neoplasm_train": 350, +} +SPLIT = "neoplasm_train" + + +@pytest.fixture(scope="module", params=[config.name for config in BUILDER_CLASS.BUILDER_CONFIGS]) +def dataset_variant(request) -> str: + return request.param 
+ + +@pytest.fixture(scope="module") +def dataset(dataset_variant) -> DatasetDict: + return DatasetDict.load_dataset(str(PIE_DATASET_PATH), name=dataset_variant) + + +def test_dataset(dataset): + assert dataset is not None + assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES + + +@pytest.fixture(scope="module") +def builder(dataset_variant) -> BUILDER_CLASS: + return BUILDER_CLASS(config_name=dataset_variant) + + +def test_builder(builder, dataset_variant): + assert builder is not None + assert builder.config_id == dataset_variant + assert builder.dataset_name == DATASET_NAME + assert builder.document_type == BratDocumentWithMergedSpans + + +@pytest.fixture(scope="module") +def document(dataset) -> BratDocumentWithMergedSpans: + result = dataset[SPLIT][0] + # we can not assert the real document type because it may come from a dataset loading script + # downloaded to a temporary directory and thus have a different type object, although it is + # semantically the same + assert isinstance(result, Document) + return result + + +def test_document(document, dataset_variant): + assert document is not None + assert document.id == "10561201" + + # check the spans + assert len(document.spans) == 7 + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] + assert span_texts_labels_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert span_texts_labels_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert span_texts_labels_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert span_texts_labels_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas 
those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 01),", + "Premise", + ) + assert span_texts_labels_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert span_texts_labels_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert span_texts_labels_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) + + # check relations: every comparison has to be asserted, a bare `a == b` expression is evaluated and discarded + assert len(document.relations) == 6 + assert document.relations[0].label == "Support" + assert document.relations[0].head == document.spans[6] + assert document.relations[0].tail == document.spans[0] + assert document.relations[1].label == "Support" + assert document.relations[1].head == document.spans[1] + assert document.relations[1].tail == document.spans[6] + assert document.relations[2].label == "Support" + assert document.relations[2].head == document.spans[2] + assert document.relations[2].tail == document.spans[6] + assert document.relations[3].label == "Support" + # the head of relation 3 is the "the improvement (> 10 units ..." premise, i.e. spans[4] (see the converted-document test) + assert document.relations[3].head == document.spans[4] + assert document.relations[3].tail == document.spans[6] + assert document.relations[4].label == "Support" + assert document.relations[4].head == document.spans[3] + assert document.relations[4].tail == document.spans[6] + assert document.relations[5].label == "Support" + assert document.relations[5].head == document.spans[5] + assert document.relations[5].tail == document.spans[0] + + +@pytest.fixture(scope="module") +def dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset, dataset_variant +) -> DatasetDict: + if dataset_variant == 
"default" or dataset_variant is None: + converted_dataset = dataset.to_document_type( + TextDocumentWithLabeledSpansAndBinaryRelations + ) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + return converted_dataset + + +def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, +): + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) + + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == 
( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) + + +@pytest.fixture(scope="module") +def tokenizer() -> PreTrainedTokenizer: + return AutoTokenizer.from_pretrained("bert-base-uncased") + + +@pytest.fixture(scope="module") +def tokenized_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer +) -> List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]: + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. 
+ tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=True, + verbose=True, + ) + return tokenized_docs + + +def test_tokenized_documents_with_labeled_spans_and_binary_relations( + tokenized_documents_with_labeled_spans_and_binary_relations, +): + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', 
" + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) + + +def test_tokenized_documents_with_entities_and_relations_all( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant +): + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. 
+ tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=True, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 + + +def test_document_converters(dataset_variant): + builder = BUILDER_CLASS(config_name=dataset_variant) + document_converters = builder.document_converters + + if dataset_variant == "default" or dataset_variant is None: + assert len(document_converters) == 1 + assert set(document_converters) == { + TextDocumentWithLabeledSpansAndBinaryRelations, + } + assert document_converters[TextDocumentWithLabeledSpansAndBinaryRelations] == { + "spans": "labeled_spans", + "relations": "binary_relations", + } + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}")