From af821dbc3970457d0bbbc5d5a9e764ef8ab39511 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sat, 25 Nov 2023 02:10:20 +0100 Subject: [PATCH 01/24] add abstrct dataset --- dataset_builders/pie/abstrct/README.md | 23 ++ dataset_builders/pie/abstrct/abstrct.py | 41 +++ dataset_builders/pie/abstrct/requirements.txt | 1 + tests/dataset_builders/pie/test_abstrct.py | 318 ++++++++++++++++++ 4 files changed, 383 insertions(+) create mode 100644 dataset_builders/pie/abstrct/README.md create mode 100644 dataset_builders/pie/abstrct/abstrct.py create mode 100644 dataset_builders/pie/abstrct/requirements.txt create mode 100644 tests/dataset_builders/pie/test_abstrct.py diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md new file mode 100644 index 00000000..2c10a090 --- /dev/null +++ b/dataset_builders/pie/abstrct/README.md @@ -0,0 +1,23 @@ +# PIE Dataset Card for "abstrct" + +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset. + +TODO: Since there is no respective HF dataset card, we should all respective information here. + +TODO: Shortly reference the PIE-Brat dataset card. + +## Data Schema + +TODO + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the remaining annotation type definitions. + +## Document Converters + +The dataset provides document converters for the following target document types: + +- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` + - TODO + +See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type +definitions. diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py new file mode 100644 index 00000000..34a00305 --- /dev/null +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -0,0 +1,41 @@ +from pytorch_ie.documents import ( + TextDocumentWithLabeledSpansAndBinaryRelations, +) + +from pie_datasets.builders import BratBuilder +from pie_datasets.core.dataset import DocumentConvertersType + +URL = "https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip" +SPLIT_PATHS = { + "neoplasm_train": "abstrct-master/AbstRCT_corpus/data/train/neoplasm_train", + "neoplasm_dev": "abstrct-master/AbstRCT_corpus/data/dev/neoplasm_dev", + "neoplasm_test": "abstrct-master/AbstRCT_corpus/data/test/neoplasm_test", + "glaucoma_test": "abstrct-master/AbstRCT_corpus/data/test/glaucoma_test", + "mixed_test": "abstrct-master/AbstRCT_corpus/data/test/mixed_test", +} + + +class AbstRCT(BratBuilder): + + BASE_DATASET_PATH = "DFKI-SLT/brat" + BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" + + # we need to add None to the list of dataset variants to support the default dataset variant + BASE_BUILDER_KWARGS_DICT = { + dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} + for dataset_variant in ["default", "merge_fragmented_spans", None] + } + + @property + def document_converters(self) -> DocumentConvertersType: + if self.config.name == "default": + return {} + elif self.config.name == "merge_fragmented_spans": + return { + TextDocumentWithLabeledSpansAndBinaryRelations: { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + } + else: + raise ValueError(f"Unknown dataset variant: {self.config.name}") diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt new file mode 100644 index 00000000..08271d87 --- /dev/null +++ 
b/dataset_builders/pie/abstrct/requirements.txt @@ -0,0 +1 @@ +pie-datasets>=0.4.0,<0.5.0 diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py new file mode 100644 index 00000000..6923c5bf --- /dev/null +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -0,0 +1,318 @@ +from typing import List, Optional, Union + +import pytest +from datasets import disable_caching +from pytorch_ie.documents import ( + TextDocumentWithLabeledSpansAndBinaryRelations, +) +from transformers import AutoTokenizer, PreTrainedTokenizer + +from dataset_builders.pie.abstrct.abstrct import AbstRCT +from pie_datasets import DatasetDict +from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans +from pie_datasets.document.processing import tokenize_document +from pie_datasets.document.types import ( + TokenDocumentWithLabeledSpansAndBinaryRelations, +) +from tests.dataset_builders.common import PIE_BASE_PATH + +disable_caching() + +DATASET_NAME = "abstrct" +PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME +SPLIT_SIZES = { + 'glaucoma_test': 100, + 'mixed_test': 100, + 'neoplasm_dev': 50, + 'neoplasm_test': 100, + 'neoplasm_train': 350, +} +SPLIT = "neoplasm_train" + + +@pytest.fixture( + scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS] +) +def dataset_variant(request) -> str: + return request.param + + +@pytest.fixture(scope="module") +def dataset(dataset_variant) -> DatasetDict: + return DatasetDict.load_dataset(str(PIE_DATASET_PATH), name=dataset_variant) + + +def test_dataset(dataset): + assert dataset is not None + assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES + + +@pytest.fixture(scope="module") +def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: + result = dataset[SPLIT][0] + if dataset_variant == "default": + assert isinstance(result, BratDocument) + elif dataset_variant == "merge_fragmented_spans": + assert isinstance(result, BratDocumentWithMergedSpans) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + return result + + +def test_document(document, dataset_variant): + assert document.text.startswith("Should students be taught to compete or to cooperate?") + if dataset_variant == "default": + # TODO + raise NotImplementedError() + elif dataset_variant == "merge_fragmented_spans": + # TODO + raise NotImplementedError() + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + + +@pytest.fixture(scope="module") +def dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset, dataset_variant +) -> Optional[DatasetDict]: + if dataset_variant == "default": + with pytest.raises(ValueError) as excinfo: + dataset.to_document_type(TextDocumentWithLabeledSpansAndBinaryRelations) + assert ( + str(excinfo.value) + == "No valid key (either subclass or superclass) was found for the document type " + "'' in the " + "document_converters of the dataset. Available keys: set(). Consider adding a respective " + "converter to the dataset with dataset.register_document_converter(my_converter_method) " + "where my_converter_method should accept " + "as input and return ''." 
+ ) + converted_dataset = None + elif dataset_variant == "merge_fragmented_spans": + converted_dataset = dataset.to_document_type( + TextDocumentWithLabeledSpansAndBinaryRelations + ) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") + return converted_dataset + + +def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, +): + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + # Check that the conversion is correct and the data makes sense + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + assert isinstance(doc, TextDocumentWithLabeledSpansAndBinaryRelations) + # check the entities + assert len(doc.labeled_spans) == 183 + # sort the entities by their start position and convert them to tuples + # check the first ten entities after sorted + sorted_entity_tuples = [ + (str(ent), ent.label) + for ent in sorted(doc.labeled_spans, key=lambda ent: ent.start)[:10] + ] + # Checking the first ten entities + assert sorted_entity_tuples[0] == ( + "complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc", + "background_claim", + ) + assert sorted_entity_tuples[1] == ( + "The range of breathtaking realistic 3D models is only limited by the creativity of artists and resolution " + "of devices", + "background_claim", + ) + assert sorted_entity_tuples[2] == ( + "Driving 3D models in a natural and believable manner is not trivial", + "background_claim", + ) + assert sorted_entity_tuples[3] == ("the model is very detailed", "data") + assert sorted_entity_tuples[4] == ( + "playback of animation becomes quite heavy and time consuming", + "data", + ) + assert sorted_entity_tuples[5] == ("a frame goes wrong", "data") + assert sorted_entity_tuples[6] == ( + "a production cannot afford major revisions", + "background_claim", + ) + assert sorted_entity_tuples[7] == ("resculpting models", "data") + assert sorted_entity_tuples[8] == ("re-rigging skeletons", "data") + assert sorted_entity_tuples[9] == ( + "providing a flexible and efficient solution to animation remains an open problem", + "own_claim", + ) + + # check the relations + assert len(doc.binary_relations) == 116 + # check the first ten relations + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in doc.binary_relations[:10] + ] + assert relation_tuples[0] == ( + "a production cannot afford major revisions", + "supports", + "providing a flexible and efficient solution to animation remains an open problem", + ) + assert relation_tuples[1] == ( + "its ease of implementation", + "supports", + "SSD is widely used in games, virtual reality and other realtime applications", + ) + assert relation_tuples[2] == ( + "low cost of computing", + "supports", + "SSD is widely used in games, virtual reality and other realtime applications", + ) + assert relation_tuples[3] == ( + "editing in the rest pose will influence most other poses", + "supports", + "This approach is not commonly applied", + ) + assert relation_tuples[4] == ( + "This approach is not commonly applied", + "contradicts", + "artists will edit the geometry of characters in the rest pose to fine-tune animations", + ) + assert relation_tuples[5] == ( + "the animator specifies the PSD examples after the SSD has been performed", + "contradicts", + "the examples are best interpolated in the rest pose, before the SSD has been applied", + 
) + assert relation_tuples[6] == ( + "PSD may be used as a compensation to the underlying SSD", + "contradicts", + "the examples are best interpolated in the rest pose, before the SSD has been applied", + ) + assert relation_tuples[7] == ( + "the examples are best interpolated in the rest pose, before the SSD has been applied", + "supports", + "the action of the SSD and any other deformations must be “inverted” in order to push the example " + "compensation before these operations", + ) + assert relation_tuples[8] == ( + "this inverse strategy has a better performance than the same framework without it", + "semantically_same", + "this approach will improve the quality of deformation", + ) + assert relation_tuples[9] == ( + "the high cost of computing", + "supports", + "they are seldom applied to interactive applications", + ) + + +@pytest.fixture(scope="module") +def tokenizer() -> PreTrainedTokenizer: + return AutoTokenizer.from_pretrained("bert-base-uncased") + + +@pytest.fixture(scope="module") +def tokenized_documents_with_labeled_spans_and_binary_relations( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer +) -> Optional[List[TokenDocumentWithLabeledSpansAndBinaryRelations]]: + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: + return None + + # get a document to check + doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=False, + verbose=True, + ) + return tokenized_docs + + +def test_tokenized_documents_with_labeled_spans_and_binary_relations( + tokenized_documents_with_labeled_spans_and_binary_relations, +): + if tokenized_documents_with_labeled_spans_and_binary_relations is not None: + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.labeled_spans) == 183 + assert len(doc.tokens) == 7689 + # Check the first ten tokens + assert doc.tokens[:10] == ("[CLS]", "<", "?", "xml", "version", "=", '"', "1", ".", "0") + # Check the first ten tokenized entities after sorted by their start position + sorted_entities = sorted(doc.labeled_spans, key=lambda ent: ent.start) + assert ( + str(sorted_entities[0]) + == "('complicated', '3d', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', " + "'entertainment', ',', 'virtual', 'reality', ',', 'medicine', 'etc')" + ) + assert ( + str(sorted_entities[1]) + == "('the', 'range', 'of', 'breath', '##taking', 'realistic', '3d', 'models', 'is', 'only', 'limited', " + "'by', 'the', 'creativity', 'of', 'artists', 'and', 'resolution', 'of', 'devices')" + ) + assert ( + str(sorted_entities[2]) + == "('driving', '3d', 'models', 'in', 'a', 'natural', 'and', 'bel', '##ie', '##vable', 'manner', 'is', " + "'not', 'trivial')" + ) + assert str(sorted_entities[3]) == "('the', 'model', 'is', 'very', 'detailed')" + assert ( + str(sorted_entities[4]) + == "('playback', 'of', 'animation', 'becomes', 'quite', 'heavy', 'and', 'time', 'consuming')" + ) + assert str(sorted_entities[5]) == "('a', 'frame', 'goes', 'wrong')" + assert ( + str(sorted_entities[6]) + == "('a', 'production', 'cannot', 'afford', 'major', 'revisions')" + ) + assert 
str(sorted_entities[7]) == "('res', '##cu', '##lp', '##ting', 'models')" + assert str(sorted_entities[8]) == "('re', '-', 'rig', '##ging', 'skeletons')" + assert ( + str(sorted_entities[9]) + == "('providing', 'a', 'flexible', 'and', 'efficient', 'solution', 'to', 'animation', 'remains', 'an', " + "'open', 'problem')" + ) + + +def test_tokenized_documents_with_entities_and_relations_all( + dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant +): + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=False, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 + + +def test_document_converters(dataset_variant): + builder = AbstRCT(config_name=dataset_variant) + document_converters = builder.document_converters + + if dataset_variant == "default": + assert document_converters == {} + elif dataset_variant == "merge_fragmented_spans": + assert len(document_converters) == 1 + assert set(document_converters) == { + TextDocumentWithLabeledSpansAndBinaryRelations, + } + assert all(callable(v) for k, v in document_converters.items()) + else: + raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 4e19b297907f63904db1db77163fee8e07384626 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 07:30:15 +0100 Subject: [PATCH 02/24] adjust for 0.5.0 --- dataset_builders/pie/abstrct/abstrct.py | 5 +--- dataset_builders/pie/abstrct/requirements.txt | 2 +- tests/dataset_builders/pie/test_abstrct.py | 24 +++++++------------ 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 34a00305..6cad6260 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -1,6 +1,4 @@ -from pytorch_ie.documents import ( - TextDocumentWithLabeledSpansAndBinaryRelations, -) +from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from pie_datasets.builders import BratBuilder from pie_datasets.core.dataset import DocumentConvertersType @@ -16,7 +14,6 @@ class AbstRCT(BratBuilder): - BASE_DATASET_PATH = "DFKI-SLT/brat" BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index 08271d87..56244c60 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.5.0 +pie-datasets>=0.4.0,<0.6.0 diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 6923c5bf..ec602f94 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -2,18 +2,14 @@ import pytest from datasets import disable_caching -from pytorch_ie.documents import ( - TextDocumentWithLabeledSpansAndBinaryRelations, -) +from pie_models.document.processing import tokenize_document 
+from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer from dataset_builders.pie.abstrct.abstrct import AbstRCT from pie_datasets import DatasetDict from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans -from pie_datasets.document.processing import tokenize_document -from pie_datasets.document.types import ( - TokenDocumentWithLabeledSpansAndBinaryRelations, -) +from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations from tests.dataset_builders.common import PIE_BASE_PATH disable_caching() @@ -21,18 +17,16 @@ DATASET_NAME = "abstrct" PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME SPLIT_SIZES = { - 'glaucoma_test': 100, - 'mixed_test': 100, - 'neoplasm_dev': 50, - 'neoplasm_test': 100, - 'neoplasm_train': 350, + "glaucoma_test": 100, + "mixed_test": 100, + "neoplasm_dev": 50, + "neoplasm_test": 100, + "neoplasm_train": 350, } SPLIT = "neoplasm_train" -@pytest.fixture( - scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS] -) +@pytest.fixture(scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS]) def dataset_variant(request) -> str: return request.param From e68b62d9ffb4cd238eed56d355d97a5e355563ae Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 07:30:25 +0100 Subject: [PATCH 03/24] fix codespell --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 696a3290..01ac619e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,8 @@ repos: args: - --skip=logs/** # arbitral: this is a legal term and used in example data (cdcp dataset) - - --ignore-words-list=arbitral + # abstrct / AbstRCT: this is a dataset name + - --ignore-words-list=arbitral,abstrct,AbstRCT # python static type checking - repo: https://github.com/pre-commit/mirrors-mypy From 94abe006d1f9b82faba0b0a0e98544f8e49a1a9c Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sun, 26 Nov 2023 21:32:43 +0100 Subject: [PATCH 04/24] adjust for pie-modules --- dataset_builders/pie/abstrct/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index 56244c60..cb3deb1c 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.6.0 +pie-datasets>=0.4.0,<0.7.0 From 6768f9541372ef1d86387bee82b32457b6a0775e Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 27 Nov 2023 12:10:08 +0100 Subject: [PATCH 05/24] use test document types --- tests/dataset_builders/pie/test_abstrct.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index ec602f94..2b83bf1d 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -9,8 +9,10 @@ from dataset_builders.pie.abstrct.abstrct import AbstRCT from pie_datasets import DatasetDict from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans -from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations -from tests.dataset_builders.common import PIE_BASE_PATH +from tests.dataset_builders.common import ( + PIE_BASE_PATH, + TestTokenDocumentWithLabeledSpansAndBinaryRelations, +) 
disable_caching() @@ -205,7 +207,7 @@ def tokenizer() -> PreTrainedTokenizer: @pytest.fixture(scope="module") def tokenized_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer -) -> Optional[List[TokenDocumentWithLabeledSpansAndBinaryRelations]]: +) -> Optional[List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]]: if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: return None @@ -217,7 +219,7 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( doc, tokenizer=tokenizer, return_overflowing_tokens=True, - result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, strict_span_conversion=False, verbose=True, ) @@ -287,7 +289,7 @@ def test_tokenized_documents_with_entities_and_relations_all( doc, tokenizer=tokenizer, return_overflowing_tokens=True, - result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, strict_span_conversion=False, verbose=True, ) From 2560e93ffda9718280d12abb36505940de8bf42e Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:47:10 +0100 Subject: [PATCH 06/24] minor typo fix --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 2b83bf1d..936f0797 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -2,7 +2,7 @@ import pytest from datasets import disable_caching -from pie_models.document.processing import tokenize_document +from pie_modules.document.processing import tokenize_document from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer From c105ee0451def6076cbb0a146f9bcda2018c963f Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:39:42 +0100 Subject: [PATCH 07/24] updated `BASE_DATASET_REVISION` --- dataset_builders/pie/abstrct/abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 6cad6260..f27dc851 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -15,7 +15,7 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" - BASE_DATASET_REVISION = "052163d34b4429d81003981bc10674cef54aa0b8" + BASE_DATASET_REVISION = "844de61e8a00dc6a93fc29dc185f6e617131fbf1" # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { From 89ae9337869132b52c0dc1b77ec79668bcf2edfa Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:41:22 +0100 Subject: [PATCH 08/24] edit tests in test_abstrct.py --- tests/dataset_builders/pie/test_abstrct.py | 309 +++++++++++---------- 1 file changed, 157 insertions(+), 152 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 936f0797..3037e1a0 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -56,15 +56,10 @@ def document(dataset, dataset_variant) -> 
Union[BratDocument, BratDocumentWithMe def test_document(document, dataset_variant): - assert document.text.startswith("Should students be taught to compete or to cooperate?") - if dataset_variant == "default": - # TODO - raise NotImplementedError() - elif dataset_variant == "merge_fragmented_spans": - # TODO - raise NotImplementedError() - else: - raise ValueError(f"Unknown dataset variant: {dataset_variant}") + assert document is not None + assert document.text.startswith( + " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" + ) @pytest.fixture(scope="module") @@ -96,107 +91,100 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - # Check that the conversion is correct and the data makes sense - # get a document to check - doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] - assert isinstance(doc, TextDocumentWithLabeledSpansAndBinaryRelations) - # check the entities - assert len(doc.labeled_spans) == 183 - # sort the entities by their start position and convert them to tuples - # check the first ten entities after sorted - sorted_entity_tuples = [ - (str(ent), ent.label) - for ent in sorted(doc.labeled_spans, key=lambda ent: ent.start)[:10] - ] - # Checking the first ten entities - assert sorted_entity_tuples[0] == ( - "complicated 3D character models are widely used in fields of entertainment, virtual reality, medicine etc", - "background_claim", - ) - assert sorted_entity_tuples[1] == ( - "The range of breathtaking realistic 3D models is only limited by the creativity of artists and resolution " - "of devices", - "background_claim", - ) - assert sorted_entity_tuples[2] == ( - "Driving 3D models in a natural and believable manner is not trivial", - "background_claim", - ) - assert sorted_entity_tuples[3] == ("the model is very detailed", "data") - assert sorted_entity_tuples[4] == ( - "playback of animation becomes quite heavy and time consuming", - "data", - ) - assert sorted_entity_tuples[5] == ("a frame goes wrong", "data") - assert sorted_entity_tuples[6] == ( - "a production cannot afford major revisions", - "background_claim", - ) - assert sorted_entity_tuples[7] == ("resculpting models", "data") - assert sorted_entity_tuples[8] == ("re-rigging skeletons", "data") - assert sorted_entity_tuples[9] == ( - "providing a flexible and efficient solution to animation remains an open problem", - "own_claim", - ) + assert dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ + "neoplasm_train" + ][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - # check the relations - assert len(doc.binary_relations) == 116 - # check the first ten relations - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in doc.binary_relations[:10] - ] - assert relation_tuples[0] == ( - "a production cannot afford major revisions", - "supports", - "providing a flexible and efficient solution to animation remains an open problem", - ) - assert relation_tuples[1] == ( - "its ease of implementation", - "supports", - "SSD is widely used in games, 
virtual reality and other realtime applications", - ) - assert relation_tuples[2] == ( - "low cost of computing", - "supports", - "SSD is widely used in games, virtual reality and other realtime applications", - ) - assert relation_tuples[3] == ( - "editing in the rest pose will influence most other poses", - "supports", - "This approach is not commonly applied", - ) - assert relation_tuples[4] == ( - "This approach is not commonly applied", - "contradicts", - "artists will edit the geometry of characters in the rest pose to fine-tune animations", - ) - assert relation_tuples[5] == ( - "the animator specifies the PSD examples after the SSD has been performed", - "contradicts", - "the examples are best interpolated in the rest pose, before the SSD has been applied", - ) - assert relation_tuples[6] == ( - "PSD may be used as a compensation to the underlying SSD", - "contradicts", - "the examples are best interpolated in the rest pose, before the SSD has been applied", - ) - assert relation_tuples[7] == ( - "the examples are best interpolated in the rest pose, before the SSD has been applied", - "supports", - "the action of the SSD and any other deformations must be “inverted” in order to push the example " - "compensation before these operations", - ) - assert relation_tuples[8] == ( - "this inverse strategy has a better performance than the same framework without it", - "semantically_same", - "this approach will improve the quality of deformation", - ) - assert relation_tuples[9] == ( - "the high cost of computing", - "supports", - "they are seldom applied to interactive applications", - ) + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) + + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " + "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -229,49 +217,66 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - if tokenized_documents_with_labeled_spans_and_binary_relations is not None: - docs = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.labeled_spans) == 183 - assert len(doc.tokens) == 7689 - # Check the first ten tokens - assert doc.tokens[:10] == ("[CLS]", "<", "?", "xml", "version", "=", '"', "1", ".", "0") - # Check the first ten tokenized entities after sorted by their start position - sorted_entities = sorted(doc.labeled_spans, key=lambda ent: ent.start) - assert ( - str(sorted_entities[0]) - == "('complicated', '3d', 'character', 'models', 'are', 'widely', 'used', 'in', 'fields', 'of', " - "'entertainment', ',', 'virtual', 'reality', ',', 'medicine', 'etc')" - ) - assert ( - str(sorted_entities[1]) - == "('the', 'range', 'of', 'breath', '##taking', 'realistic', '3d', 'models', 'is', 'only', 'limited', " - "'by', 'the', 'creativity', 'of', 'artists', 'and', 'resolution', 'of', 'devices')" - ) - assert ( - str(sorted_entities[2]) - == "('driving', '3d', 'models', 'in', 'a', 'natural', 'and', 'bel', '##ie', '##vable', 'manner', 'is', " - "'not', 'trivial')" - ) - assert str(sorted_entities[3]) == "('the', 'model', 'is', 'very', 'detailed')" - assert ( - str(sorted_entities[4]) - == "('playback', 'of', 'animation', 'becomes', 'quite', 'heavy', 'and', 'time', 'consuming')" - ) - assert str(sorted_entities[5]) == "('a', 'frame', 'goes', 'wrong')" - assert ( - str(sorted_entities[6]) - == "('a', 'production', 'cannot', 'afford', 'major', 'revisions')" - ) - assert str(sorted_entities[7]) == "('res', '##cu', '##lp', '##ting', 'models')" - assert str(sorted_entities[8]) == "('re', '-', 'rig', '##ging', 'skeletons')" - assert ( - str(sorted_entities[9]) - == "('providing', 'a', 'flexible', 'and', 'efficient', 'solution', 'to', 'animation', 'remains', 'an', " - "'open', 'problem')" - ) + docs: List[ + TestTokenDocumentWithLabeledSpansAndBinaryRelations + ] = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', 
'6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( From bf77b558a56d6fd4b73f715ef3c91fed82aa8143 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:17:37 +0100 Subject: [PATCH 09/24] edit more tests in test_abstrct.py --- tests/dataset_builders/pie/test_abstrct.py | 307 ++++++++++----------- 1 file changed, 153 insertions(+), 154 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 3037e1a0..64be662d 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -91,100 +91,100 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - assert dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None - # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ - "neoplasm_train" - ][0] - # check that the conversion is correct and the data makes sense - assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - - # check the entities - assert len(converted_doc.labeled_spans) 
== 7 - entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] - assert entity_tuples[0] == ( - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " - "with metastatic, hormone-resistant, prostate cancer.", - "MajorClaim", - ) - assert entity_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Premise", - ) - assert entity_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Premise", - ) - assert entity_tuples[3] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " - "four functioning domains, and nine symptoms (.001 < P <. 01),", - "Premise", - ) - assert entity_tuples[4] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Premise", - ) - assert entity_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Premise", - ) - assert entity_tuples[6] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Claim", - ) + if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ + "neoplasm_train" + ][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) - # check the relations - assert len(converted_doc.binary_relations) == 6 - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations - ] - assert relation_tuples[0] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer.", - ) - assert relation_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[3] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " - "several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[4] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01),", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " - "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " - "men with metastatic, hormone-resistant, prostate cancer.", - ) + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " + "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -217,66 +217,65 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - docs: List[ - TestTokenDocumentWithLabeledSpansAndBinaryRelations - ] = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.tokens) == 465 - assert len(doc.labeled_spans) == 7 - ent = doc.labeled_spans[0] - assert ( - str(ent) - == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " - "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " - "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" - ) - ent = doc.labeled_spans[1] - assert ( - str(ent) - == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " - "'domains', ',')" - ) - ent = doc.labeled_spans[2] - assert ( - str(ent) - == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " - "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " - "'##nis', '##one', '-', 'alone', 'group', '.')" - ) - ent = doc.labeled_spans[3] - assert ( - str(ent) - == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " - "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " - "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " - "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " - "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" - ) - ent = doc.labeled_spans[4] - assert ( - str(ent) - == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " - "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " - "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" - ) - ent = doc.labeled_spans[5] - assert ( - str(ent) - == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " - "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " - "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " - "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" - ) - ent = doc.labeled_spans[6] - assert ( - str(ent) - == "('treatment', 'with', 'mit', 
'##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " - "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " - "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " - "'alone', '.')" - ) + if tokenized_documents_with_labeled_spans_and_binary_relations is not None: + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( @@ -314,6 +313,6 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - assert all(callable(v) for k, v in 
document_converters.items()) + assert all(callable(v) for k, v in document_converters.items()) #currently not callable else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 6403a240210bcfacc74c314b14b6db562d9f7434 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:20:31 +0100 Subject: [PATCH 10/24] edit pie/readme.md --- dataset_builders/pie/abstrct/README.md | 99 +++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 2c10a090..0b5899ae 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,23 +1,106 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset. +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper]() and [data repository]()). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). -TODO: Since there is no respective HF dataset card, we should all respective information here. +Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). -TODO: Shortly reference the PIE-Brat dataset card. +### Dataset Summary -## Data Schema +### Supported Tasks and Leaderboards #TODO -TODO +- **Tasks**: Argumentation Mining, Component Identification, Relation Identification +- **Leaderboard:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) -See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the remaining annotation type definitions. +### Languages #TODO -## Document Converters +The language in the dataset is English. + +### Dataset Variants + +See [PIE-Brat Data Variants](https://huggingface.co/datasets/pie/brat#data-variants). + +### Data Schema + +See [PIE-Brat Data Schema](https://huggingface.co/datasets/pie/brat#data-schema). + +### Usage + +```python +from pie_datasets import load_dataset, builders + +# load default version +datasets = load_dataset("pie/abstrct") +doc = datasets["train"][0] +assert isinstance(doc, builders.brat.BratDocument) + +# load version with merged span fragments +dataset_merged_spans = load_dataset("pie/abstrct", name="merge_fragmented_spans") +doc_merged_spans = dataset_merged_spans["train"][0] +assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) +``` + +### Document Converters #TODO The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - - TODO + - `LabeledSpans`, converted from `BratDocument`'s `spans` + - labels: + - `BinraryRelations`, converted from `BratDocument`'s `relations` + - labels: See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type definitions. 
+ +### Data Splits #TODO + + +### Label Descriptions #TODO + +#### Components + +#### Relations + +## Dataset Creation #TODO + +### Curation Rationale + +### Source Data + +#### Initial Data Collection and Normalization + +#### Who are the source language producers? + +### Annotations #TODO + +#### Annotation process + +#### Who are the annotators? + +### Personal and Sensitive Information + +\[More Information Needed\] + +## Considerations for Using the Data #TODO + +### Social Impact of Dataset + +### Discussion of Biases + +### Other Known Limitations + +## Additional Information #TODO + +### Dataset Curators + +### Licensing Information + +### Citation Information + +``` + +``` + +### Contributions + +Thanks to [@ArneBinder](https://github.com/ArneBinder) and [@idalr](https://github.com/idalr) for adding this dataset. From 85ead58687e8ca83584462e82f5f9f5420a478b5 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 16:00:59 +0100 Subject: [PATCH 11/24] edit pie/readme.md --- dataset_builders/pie/abstrct/README.md | 150 ++++++++++++++++++++++--- 1 file changed, 135 insertions(+), 15 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 0b5899ae..37f38a70 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,19 +1,23 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper]() and [data repository]()). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](<>) and [data repository](https://gitlab.com/tomaye/abstrct)). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). ### Dataset Summary -### Supported Tasks and Leaderboards #TODO +A novel corpus of healthcare texts (i.e., RCT abstracts on various diseases) from the MEDLINE database, which +are annotated with argumentative components (i.e., `MajorClaim`, `Claim`, and `Premise`) and relations (i.e., `Support`, `Attack`, and `Partial-attack`), +in order to support clinicians' daily tasks in information finding and evidence-based reasoning for decision making. -- **Tasks**: Argumentation Mining, Component Identification, Relation Identification +### Supported Tasks and Leaderboards + +- **Tasks**: Argumentation Mining, Component Identification, Boundary Detection, Relation Identification, Link Prediction - **Leaderboard:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) -### Languages #TODO +### Languages -The language in the dataset is English. +The language in the dataset is English (in the medical/healthcare domain). 
### Dataset Variants @@ -39,66 +43,182 @@ doc_merged_spans = dataset_merged_spans["train"][0] assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) ``` -### Document Converters #TODO +### Document Converters The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - `LabeledSpans`, converted from `BratDocument`'s `spans` - - labels: + - labels: `MajorClaim`, `Claim`, `Premise` - `BinraryRelations`, converted from `BratDocument`'s `relations` - - labels: + - labels: `Support`, `Partial-Attack`, `Attack` See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type definitions. -### Data Splits #TODO +### Data Splits + +| Disease-based Split | `neoplasm` | `glaucoma` | `mixed` | +| --------------------------------------------------------- | ----------------------: | -------------------: | -------------------: | +| No. of documents <br/>
- `_train` <br/>- `_dev` <br/>- `_test` | <br/>350 <br/>50 <br/>100 | <br/> <br/> <br/>100 | <br/> <br/> <br/>
100 | + +**Important Note**: + +- `mixed_test` contains 20 abstracts on the following diseases: glaucoma, neoplasm, diabetes, hypertension, hepatitis. +- 31 out of 40 abstracts in `mixed_test` overlap with abstracts in `neoplasm_test` and `glaucoma_test`. + +### Label Descriptions +In this section, we describe labels according to [Mayer et al. (2020)](https://ebooks.iospress.nl/publication/55129), as well as our label counts on 669 abstracts. -### Label Descriptions #TODO +Unfortunately, the number we report does not correspond to what Mayer et al. reported in their paper (see Table 1, p. 2109). +Morio et al. ([2022](https://aclanthology.org/2022.tacl-1.37.pdf); p. 642, Table 1), who utilized this corpus for their AM tasks, also reported another number, claiming there were double annotation errors in the original statistic collection (see [reference](https://github.com/hitachi-nlp/graph_parser/blob/main/examples/multitask_am/README.md#qas)). #### Components +| Components | Count | Percentage | +| ------------ | ----: | ---------: | +| `MajorClaim` | 129 | 3 % | +| `Claim` | 1282 | 30.2 % | +| `Premise` | 2842 | 66.8 % | + +- `MajorClaim` are more general/concluding `claim`'s, which is supported by more specific claims +- `Claim` is a concluding statement made by the author about the outcome of the study. Claims only points to other claims. +- `Premise` (a.k.a. evidence) is an observation or measurement in the study, which supports or attacks another argument component, usually a `claim`. They are observed facts, and therefore credible without further justifications, as this is the ground truth the argumentation is based on. + +(Mayer et al. 2020, p.2110) + #### Relations -## Dataset Creation #TODO +| Relations | Count | Percentage | +| ------------------------ | ----: | ---------: | +| support: `Support` | 2289 | 87 % | +| attack: `Partial-Attack` | 275 | 10.4 % | +| attack: `Attack` | 69 | 2.6 % | + +- `Support`: All statements or observations justifying the proposition of the target component +- `Partial-Attack`: when the source component is not in full contradiction, but weakening the target component by constraining its proposition. Usually occur between two claims +- `Attack`: A component is attacking another one, if it is + - i) contradicting the proposition of the target component, or + - ii) undercutting its implicit assumption of significance constraints +- `Premise` can only be connected to either `Claim` or another `Premise` +- `Claim`'s can only point to other `Claim`'s +- There might be more than one **outgoing** and/or **incoming relation** . In rare case, there is no relation to another component at all. + +(Mayer et al. 2020, p.2110) + +## Dataset Creation ### Curation Rationale +"\[D\]espite its natural employment in healthcare applications, only few approaches have applied AM methods to this kind +of text, and their contribution is limited to the detection +of argument components, disregarding the more complex phase of +predicting the relations among them. In addition, no huge annotated +dataset for AM is available for the healthcare domain (p. 2108)...to support clinicians in decision making or in (semi)-automatically +filling evidence tables for systematic reviews in evidence-based medicine. (p. 2114)" + ### Source Data +[MEDLINE database](https://www.nlm.nih.gov/medline/medline_overview.html) + #### Initial Data Collection and Normalization +Extended from the previous dataset in [Mayer et al. 
2018](https://webusers.i3s.unice.fr/~riveill/IADB/publications/2018-COMMA.pdf), 500 medical abstract from randomized controlled trials (RCTs) were retrieved directly from [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) by searching for titles or abstracts containing the disease name. + +(See the definition of RCT in the authors' [guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf) (Section 1.2) and [US National Library of Medicine](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6235704/)) + #### Who are the source language producers? -### Annotations #TODO +\[More Information Needed\] + +### Annotations #### Annotation process +"An expert in the medical domain (a pharmacist) validated the annotation +guidelines before starting the annotation process." (p. 2110) + +"Annotation was started after a training phase, where amongst others the component boundaries were topic of discussion. Gold labels +were set after a reconciliation phase, during which the annotators +tried to reach an agreement. While the number of annotators vary for +the two annotation phases (component and relation annotation). + +On the annotation of argument components, "IAA among the three annotators has been calculated +on 30 abstracts, resulting in a Fleiss’ kappa of 0.72 for argumentative +components and 0.68 for the more fine-grained distinction between +claims and evidence." (p. 2109) + +On the annotation of argumentative relation, "IAA has been calculated on 30 abstracts annotated in parallel by three annotators, +resulting in a Fleiss’ kappa of +0.62. The annotation of the remaining abstracts was carried out by +one of the above mentioned annotators." (p. 2110) + +See the [Annotation Guideline](https://gitlab.com/tomaye/abstrct/-/blob/master/AbstRCT_corpus/AnnotationGuidelines.pdf?ref_type=heads) for more information on definitions and annotated samples. + #### Who are the annotators? +Two annotators with background in computational linguistics. No information was given on the third annotator. + ### Personal and Sensitive Information \[More Information Needed\] -## Considerations for Using the Data #TODO +## Considerations for Using the Data ### Social Impact of Dataset +"These \[*intelligent*\] systems apply to clinical trials, +clinical guidelines, and electronic health records, and their solutions range from the automated detection of PICO elements +in health records to evidence-based reasoning for decision making. These applications highlight the need of clinicians to be supplied with frameworks able to extract, from the huge +quantity of data available for the different diseases and treatments, +the exact information they necessitate and to present this information in a structured way, easy to be (possibly semi-automatically) +analyzed...Given its aptness to automatically detect in text those +argumentative structures that are at the basis of evidence-based reasoning applications, AM represents a potential valuable contribution +in the healthcare domain." (p. 2108) + +"We expect that our work will have a large impact for clinicians as it +is a crucial step towards AI supported clinical deliberation at a large +scale." (p. 
2114) + ### Discussion of Biases +\[More Information Needed\] + ### Other Known Limitations -## Additional Information #TODO +\[More Information Needed\] + +## Additional Information ### Dataset Curators +\[More Information Needed\] + ### Licensing Information +- **License**: the AbstRCT dataset is released under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode) +- **Funding**: This work is partly funded by the French government labelled PIA + program under its IDEX UCA JEDI project (ANR-15-IDEX-0001). + This work has been supported by the French government, through the + 3IA Cote d’Azur Investments in the Future project managed by the + National Research Agency (ANR) with the reference number ANR19-P3IA-0002 + ### Citation Information ``` - +@inproceedings{mayer2020ecai, + author = {Tobias Mayer and + Elena Cabrio and + Serena Villata}, + title = {Transformer-Based Argument Mining for Healthcare Applications}, + booktitle = {{ECAI} 2020 - 24th European Conference on Artificial Intelligence}, + series = {Frontiers in Artificial Intelligence and Applications}, + volume = {325}, + pages = {2108--2115}, + publisher = {{IOS} Press}, + year = {2020}, +} ``` ### Contributions From 98014c63b28a56c0a6d811c3ad575dc7b1eef247 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:15:55 +0100 Subject: [PATCH 12/24] edit `test_document_converters` --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 64be662d..9b109e8c 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -313,6 +313,6 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - assert all(callable(v) for k, v in document_converters.items()) #currently not callable + assert all(dict(v) for k, v in document_converters.items()) else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 733789122cc4b7fb1a813e751bd988b89cb5ff72 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 12 Dec 2023 17:36:04 +0100 Subject: [PATCH 13/24] minor edit --- dataset_builders/pie/abstrct/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 37f38a70..9a626c0f 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -1,6 +1,6 @@ # PIE Dataset Card for "abstrct" -This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](<>) and [data repository](https://gitlab.com/tomaye/abstrct)). Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). +This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the AbstRCT dataset ([paper](https://ebooks.iospress.nl/publication/55129) and [data repository](https://gitlab.com/tomaye/abstrct)). 
Since the AbstRCT dataset is published in the [BRAT standoff format](https://brat.nlplab.org/standoff.html), this dataset builder is based on the [PyTorch-IE brat dataset loading script](https://huggingface.co/datasets/pie/brat). Therefore, the `abstrct` dataset as described here follows the data structure from the [PIE brat dataset card](https://huggingface.co/datasets/pie/brat). From 74cbfd9a47101a98e1995b09a377907dd67fcd36 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:26:01 +0100 Subject: [PATCH 14/24] update BASE_DATASET_REVISION --- dataset_builders/pie/abstrct/abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index f27dc851..c359ff4d 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -15,7 +15,7 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" - BASE_DATASET_REVISION = "844de61e8a00dc6a93fc29dc185f6e617131fbf1" + BASE_DATASET_REVISION = "bb8c37d84ddf2da1e691d226c55fef48fd8149b5" # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { From bf0ebcf557ed8bb93d21824bf10efc590add19fb Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:26:17 +0100 Subject: [PATCH 15/24] update requirements.txt --- dataset_builders/pie/abstrct/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_builders/pie/abstrct/requirements.txt b/dataset_builders/pie/abstrct/requirements.txt index cb3deb1c..30439e3e 100644 --- a/dataset_builders/pie/abstrct/requirements.txt +++ b/dataset_builders/pie/abstrct/requirements.txt @@ -1 +1 @@ -pie-datasets>=0.4.0,<0.7.0 +pie-datasets>=0.4.0,<0.9.0 From 1a410e7afbb00d02b3394e0bbb6489f499aef933 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 14 Dec 2023 14:28:13 +0100 Subject: [PATCH 16/24] set strict_span_conversion=True for tokenize_document() --- tests/dataset_builders/pie/test_abstrct.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 9b109e8c..ac8aeb65 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -208,7 +208,7 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( tokenizer=tokenizer, return_overflowing_tokens=True, result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=False, + strict_span_conversion=True, verbose=True, ) return tokenized_docs @@ -294,7 +294,7 @@ def test_tokenized_documents_with_entities_and_relations_all( tokenizer=tokenizer, return_overflowing_tokens=True, result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=False, + strict_span_conversion=True, verbose=True, ) # we just ensure that we get at least one tokenized document From 16f015255b9479e346447f413485140e45e0ace2 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Thu, 14 Dec 2023 11:29:27 +0100 Subject: [PATCH 17/24] minor changes --- tests/dataset_builders/pie/test_abstrct.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index ac8aeb65..266783f7 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ 
b/tests/dataset_builders/pie/test_abstrct.py @@ -17,6 +17,7 @@ disable_caching() DATASET_NAME = "abstrct" +BUILDER_CLASS = AbstRCT PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME SPLIT_SIZES = { "glaucoma_test": 100, @@ -28,7 +29,7 @@ SPLIT = "neoplasm_train" -@pytest.fixture(scope="module", params=[config.name for config in AbstRCT.BUILDER_CONFIGS]) +@pytest.fixture(scope="module", params=[config.name for config in BUILDER_CLASS.BUILDER_CONFIGS]) def dataset_variant(request) -> str: return request.param @@ -60,6 +61,11 @@ def test_document(document, dataset_variant): assert document.text.startswith( " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" ) + # TODO: test the actual content (annotation of the document) + # if dataset_variant == "default": + # assert + # elif dataset_variant == "merge_fragmented_spans": + # assert @pytest.fixture(scope="module") @@ -303,7 +309,7 @@ def test_tokenized_documents_with_entities_and_relations_all( def test_document_converters(dataset_variant): - builder = AbstRCT(config_name=dataset_variant) + builder = BUILDER_CLASS(config_name=dataset_variant) document_converters = builder.document_converters if dataset_variant == "default": @@ -313,6 +319,7 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } + # TODO: recheck this assert all(dict(v) for k, v in document_converters.items()) else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From c6fd863337ad540df9f21c3e688572b0a829a6c9 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:38:55 +0100 Subject: [PATCH 18/24] updated `test_abstrct.py` --- tests/dataset_builders/pie/test_abstrct.py | 70 +++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 266783f7..a70a0fb1 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -58,14 +58,64 @@ def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMe def test_document(document, dataset_variant): assert document is not None - assert document.text.startswith( - " A combination of mitoxantrone plus prednisone is preferable to prednisone alone" + assert document.id == "10561201" + + # check spans + assert len(document.spans) == 7 + span_texts = document.metadata["span_texts"] + assert ( + span_texts[0] + == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer." ) - # TODO: test the actual content (annotation of the document) - # if dataset_variant == "default": - # assert - # elif dataset_variant == "merge_fragmented_spans": - # assert + assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," + assert ( + span_texts[2] + == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group." + ) + assert ( + span_texts[3] + == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01)," + ) + assert ( + span_texts[4] + == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05)." + ) + assert ( + span_texts[5] + == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + ) + assert ( + span_texts[6] + == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone." + ) + + # check relations + assert len(document.relations) == 6 + document.relations[0].label == "Support" + document.relations[0].head == document.spans[6] + document.relations[0].tail == document.spans[0] + document.relations[1].label == "Support" + document.relations[1].head == document.spans[1] + document.relations[1].tail == document.spans[6] + document.relations[2].label == "Support" + document.relations[2].head == document.spans[2] + document.relations[2].tail == document.spans[6] + document.relations[3].label == "Support" + document.relations[3].head == document.spans[5] + document.relations[3].tail == document.spans[6] + document.relations[4].label == "Support" + document.relations[4].head == document.spans[3] + document.relations[4].tail == document.spans[6] + document.relations[5].label == "Support" + document.relations[5].head == document.spans[5] + document.relations[5].tail == document.spans[0] @pytest.fixture(scope="module") @@ -319,7 +369,9 @@ def test_document_converters(dataset_variant): assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, } - # TODO: recheck this - assert all(dict(v) for k, v in document_converters.items()) + assert document_converters[TextDocumentWithLabeledSpansAndBinaryRelations] == { + "spans": "labeled_spans", + "relations": "binary_relations", + } else: raise ValueError(f"Unknown dataset variant: {dataset_variant}") From 2097a34d72aa528d6936cc20094dcce5f412777c Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:40:51 +0100 Subject: [PATCH 19/24] make pre-commit happy --- tests/dataset_builders/pie/test_abstrct.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index a70a0fb1..7cbc557c 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -66,34 +66,34 @@ def test_document(document, dataset_variant): assert ( span_texts[0] == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer." + "in men with metastatic, hormone-resistant, prostate cancer." ) assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," assert ( span_texts[2] == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group." + "prednisone-alone group." ) assert ( span_texts[3] == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01)," + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 01)," ) assert ( span_texts[4] == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05)." + "(.004 < P <.05)." ) assert ( span_texts[5] == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." ) assert ( span_texts[6] == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone." + "in several HQL domains and symptoms than treatment with prednisone alone." ) # check relations From ac1c097119a6c7799933ce0e0e2f42dec3470bc0 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Mon, 18 Dec 2023 20:37:20 +0100 Subject: [PATCH 20/24] edit 'test_document' --- tests/dataset_builders/pie/test_abstrct.py | 63 +++++++++++++--------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 7cbc557c..67b4190a 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -60,40 +60,51 @@ def test_document(document, dataset_variant): assert document is not None assert document.id == "10561201" + # check the annotation + if dataset_variant == "default": + span_texts_labels_tuples = [ + (document.text[span.slices[0][0] : span.slices[-1][1]], span.label) + for span in document.spans + ] + elif dataset_variant == "merge_fragmented_spans": + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] + # check spans assert len(document.spans) == 7 - span_texts = document.metadata["span_texts"] - assert ( - span_texts[0] - == "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer." + assert span_texts_labels_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert span_texts_labels_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", ) - assert span_texts[1] == "At 6 weeks, both groups showed improvement in several HQL domains," - assert ( - span_texts[2] - == "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group." 
+ assert span_texts_labels_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", ) - assert ( - span_texts[3] - == "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + assert span_texts_labels_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 01)," + "functioning domains, and nine symptoms (.001 < P <. 01),", + "Premise", ) - assert ( - span_texts[4] - == "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05)." + assert span_texts_labels_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", ) - assert ( - span_texts[5] - == "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003)." + assert span_texts_labels_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", ) - assert ( - span_texts[6] - == "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone." 
+ assert span_texts_labels_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", ) # check relations From 0a2b8e923a6e83bec2251ec7f8b6cdbfc8b04d41 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 10:58:09 +0100 Subject: [PATCH 21/24] checked fragments in spans --- tests/dataset_builders/pie/test_abstrct.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 67b4190a..896ec8b7 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -44,6 +44,14 @@ def test_dataset(dataset): assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES +def test_no_fragmented_spans(dataset, dataset_variant): + if dataset_variant == "default": + for split, docs in dataset.items(): + for doc in docs: + # test the number of slices of the LabeledMultiSpan annotations + assert [len(span.slices) == 1 for span in doc.spans] + + @pytest.fixture(scope="module") def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: result = dataset[SPLIT][0] @@ -63,7 +71,7 @@ def test_document(document, dataset_variant): # check the annotation if dataset_variant == "default": span_texts_labels_tuples = [ - (document.text[span.slices[0][0] : span.slices[-1][1]], span.label) + (" ".join([document.text[start:end] for start, end in span.slices]), span.label) for span in document.spans ] elif dataset_variant == "merge_fragmented_spans": From 682f872568aca71d06dd58f6ab83d7195b2989b7 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 11:37:39 +0100 Subject: [PATCH 22/24] minor fix --- tests/dataset_builders/pie/test_abstrct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 896ec8b7..af986623 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -49,7 +49,7 @@ def test_no_fragmented_spans(dataset, dataset_variant): for split, docs in dataset.items(): for doc in docs: # test the number of slices of the LabeledMultiSpan annotations - assert [len(span.slices) == 1 for span in doc.spans] + assert all([len(span.slices) == 1 for span in doc.spans]) @pytest.fixture(scope="module") From 612466b8e5a5a8161e559e0648d914412e258592 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 17:27:01 +0100 Subject: [PATCH 23/24] converted to single dataset_variant --- dataset_builders/pie/abstrct/README.md | 14 +++-- dataset_builders/pie/abstrct/abstrct.py | 30 +++++------ tests/dataset_builders/pie/test_abstrct.py | 61 ++++++++-------------- 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index 9a626c0f..d0b752a4 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -21,7 +21,10 @@ The language in the dataset is English (in the medical/healthcare domain). ### Dataset Variants -See [PIE-Brat Data Variants](https://huggingface.co/datasets/pie/brat#data-variants). 
+The `abstrct` dataset comes in a single version (`default`) with `BratDocumentWithMergedSpans` as document type. Note, +that this in contrast to the base `brat` dataset, where the document type for the `default` variant is `BratDocument`. +The reason is that the AbstRCT dataset has already been published with only single-fragment spans. +Without any need to merge fragments, the document type `BratDocumentWithMergedSpans` is easier to handle for most of the task modules. ### Data Schema @@ -34,13 +37,8 @@ from pie_datasets import load_dataset, builders # load default version datasets = load_dataset("pie/abstrct") -doc = datasets["train"][0] -assert isinstance(doc, builders.brat.BratDocument) - -# load version with merged span fragments -dataset_merged_spans = load_dataset("pie/abstrct", name="merge_fragmented_spans") -doc_merged_spans = dataset_merged_spans["train"][0] -assert isinstance(doc_merged_spans, builders.brat.BratDocumentWithMergedSpans) +doc = datasets["neoplasm_train"][0] +assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) ``` ### Document Converters diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index c359ff4d..6dc12e42 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -1,7 +1,7 @@ from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations -from pie_datasets.builders import BratBuilder -from pie_datasets.core.dataset import DocumentConvertersType +from pie_datasets.builders import BratBuilder, BratConfig +from pie_datasets.builders.brat import BratDocumentWithMergedSpans URL = "https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip" SPLIT_PATHS = { @@ -17,22 +17,22 @@ class AbstRCT(BratBuilder): BASE_DATASET_PATH = "DFKI-SLT/brat" BASE_DATASET_REVISION = "bb8c37d84ddf2da1e691d226c55fef48fd8149b5" + BUILDER_CONFIGS = [ + BratConfig(name=BratBuilder.DEFAULT_CONFIG_NAME, merge_fragmented_spans=True), + ] + DOCUMENT_TYPES = { + BratBuilder.DEFAULT_CONFIG_NAME: BratDocumentWithMergedSpans, + } + # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} for dataset_variant in ["default", "merge_fragmented_spans", None] } - @property - def document_converters(self) -> DocumentConvertersType: - if self.config.name == "default": - return {} - elif self.config.name == "merge_fragmented_spans": - return { - TextDocumentWithLabeledSpansAndBinaryRelations: { - "spans": "labeled_spans", - "relations": "binary_relations", - }, - } - else: - raise ValueError(f"Unknown dataset variant: {self.config.name}") + DOCUMENT_CONVERTERS = { + TextDocumentWithLabeledSpansAndBinaryRelations: { + "spans": "labeled_spans", + "relations": "binary_relations", + }, + } diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index af986623..02036b8f 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -1,14 +1,15 @@ -from typing import List, Optional, Union +from typing import List, Optional import pytest from datasets import disable_caching from pie_modules.document.processing import tokenize_document +from pytorch_ie.core import Document from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations from transformers import AutoTokenizer, PreTrainedTokenizer from dataset_builders.pie.abstrct.abstrct import AbstRCT from 
pie_datasets import DatasetDict -from pie_datasets.builders.brat import BratDocument, BratDocumentWithMergedSpans +from pie_datasets.builders.brat import BratDocumentWithMergedSpans from tests.dataset_builders.common import ( PIE_BASE_PATH, TestTokenDocumentWithLabeledSpansAndBinaryRelations, @@ -44,23 +45,25 @@ def test_dataset(dataset): assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES -def test_no_fragmented_spans(dataset, dataset_variant): - if dataset_variant == "default": - for split, docs in dataset.items(): - for doc in docs: - # test the number of slices of the LabeledMultiSpan annotations - assert all([len(span.slices) == 1 for span in doc.spans]) +@pytest.fixture(scope="module") +def builder(dataset_variant) -> BUILDER_CLASS: + return BUILDER_CLASS(config_name=dataset_variant) + + +def test_builder(builder, dataset_variant): + assert builder is not None + assert builder.config_id == dataset_variant + assert builder.dataset_name == DATASET_NAME + assert builder.document_type == BratDocumentWithMergedSpans @pytest.fixture(scope="module") -def document(dataset, dataset_variant) -> Union[BratDocument, BratDocumentWithMergedSpans]: +def document(dataset) -> BratDocumentWithMergedSpans: result = dataset[SPLIT][0] - if dataset_variant == "default": - assert isinstance(result, BratDocument) - elif dataset_variant == "merge_fragmented_spans": - assert isinstance(result, BratDocumentWithMergedSpans) - else: - raise ValueError(f"Unknown dataset variant: {dataset_variant}") + # we can not assert the real document type because it may come from a dataset loading script + # downloaded to a temporary directory and thus have a different type object, although it is + # semantically the same + assert isinstance(result, Document) return result @@ -69,12 +72,7 @@ def test_document(document, dataset_variant): assert document.id == "10561201" # check the annotation - if dataset_variant == "default": - span_texts_labels_tuples = [ - (" ".join([document.text[start:end] for start, end in span.slices]), span.label) - for span in document.spans - ] - elif dataset_variant == "merge_fragmented_spans": + if dataset_variant == "default" or dataset_variant is None: span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] # check spans @@ -141,20 +139,7 @@ def test_document(document, dataset_variant): def dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset, dataset_variant ) -> Optional[DatasetDict]: - if dataset_variant == "default": - with pytest.raises(ValueError) as excinfo: - dataset.to_document_type(TextDocumentWithLabeledSpansAndBinaryRelations) - assert ( - str(excinfo.value) - == "No valid key (either subclass or superclass) was found for the document type " - "'' in the " - "document_converters of the dataset. Available keys: set(). Consider adding a respective " - "converter to the dataset with dataset.register_document_converter(my_converter_method) " - "where my_converter_method should accept " - "as input and return ''." 
- ) - converted_dataset = None - elif dataset_variant == "merge_fragmented_spans": + if dataset_variant == "default" or dataset_variant is None: converted_dataset = dataset.to_document_type( TextDocumentWithLabeledSpansAndBinaryRelations ) @@ -168,9 +153,7 @@ def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( ): if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[ - "neoplasm_train" - ][0] + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] # check that the conversion is correct and the data makes sense assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) @@ -382,8 +365,6 @@ def test_document_converters(dataset_variant): document_converters = builder.document_converters if dataset_variant == "default": - assert document_converters == {} - elif dataset_variant == "merge_fragmented_spans": assert len(document_converters) == 1 assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations, From 172fc14a9491a7660712716474dfe2273a00d714 Mon Sep 17 00:00:00 2001 From: Ruangrin L <88072261+idalr@users.noreply.github.com> Date: Tue, 19 Dec 2023 18:10:02 +0100 Subject: [PATCH 24/24] minor fixes --- dataset_builders/pie/abstrct/README.md | 4 +- dataset_builders/pie/abstrct/abstrct.py | 2 +- tests/dataset_builders/pie/test_abstrct.py | 353 ++++++++++----------- 3 files changed, 175 insertions(+), 184 deletions(-) diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index d0b752a4..45123b8a 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -46,9 +46,9 @@ assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - - `LabeledSpans`, converted from `BratDocument`'s `spans` + - `LabeledSpans`, converted from `BratDocumentWithMergedSpans`'s `spans` - labels: `MajorClaim`, `Claim`, `Premise` - - `BinraryRelations`, converted from `BratDocument`'s `relations` + - `BinraryRelations`, converted from `BratDocumentWithMergedSpans`'s `relations` - labels: `Support`, `Partial-Attack`, `Attack` See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 6dc12e42..045c2e23 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -27,7 +27,7 @@ class AbstRCT(BratBuilder): # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} - for dataset_variant in ["default", "merge_fragmented_spans", None] + for dataset_variant in ["default", None] } DOCUMENT_CONVERTERS = { diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 02036b8f..54b99bce 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List import pytest from datasets import disable_caching @@ -71,12 +71,9 @@ def test_document(document, dataset_variant): assert document is not 
None assert document.id == "10561201" - # check the annotation - if dataset_variant == "default" or dataset_variant is None: - span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] - - # check spans + # check the spans assert len(document.spans) == 7 + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] assert span_texts_labels_tuples[0] == ( "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " "in men with metastatic, hormone-resistant, prostate cancer.", @@ -138,7 +135,7 @@ def test_document(document, dataset_variant): @pytest.fixture(scope="module") def dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset, dataset_variant -) -> Optional[DatasetDict]: +) -> DatasetDict: if dataset_variant == "default" or dataset_variant is None: converted_dataset = dataset.to_document_type( TextDocumentWithLabeledSpansAndBinaryRelations @@ -151,98 +148,97 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] - # check that the conversion is correct and the data makes sense - assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - - # check the entities - assert len(converted_doc.labeled_spans) == 7 - entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] - assert entity_tuples[0] == ( - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " - "with metastatic, hormone-resistant, prostate cancer.", - "MajorClaim", - ) - assert entity_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Premise", - ) - assert entity_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Premise", - ) - assert entity_tuples[3] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " - "four functioning domains, and nine symptoms (.001 < P <. 
01),", - "Premise", - ) - assert entity_tuples[4] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Premise", - ) - assert entity_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Premise", - ) - assert entity_tuples[6] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Claim", - ) + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 
01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) - # check the relations - assert len(converted_doc.binary_relations) == 6 - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations - ] - assert relation_tuples[0] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer.", - ) - assert relation_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[3] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " - "several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[4] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01),", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " - "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " - "men with metastatic, hormone-resistant, prostate cancer.", - ) + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 
01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -253,10 +249,7 @@ def tokenizer() -> PreTrainedTokenizer: @pytest.fixture(scope="module") def tokenized_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer -) -> Optional[List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]]: - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: - return None - +) -> List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]: # get a document to check doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] # Note, that this is a list of documents, because the document may be split into chunks @@ -275,96 +268,94 @@ def tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - if tokenized_documents_with_labeled_spans_and_binary_relations is not None: - docs = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.tokens) == 465 - assert len(doc.labeled_spans) == 7 - ent = doc.labeled_spans[0] - assert ( - str(ent) - == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " - "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " - "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" - ) - ent = doc.labeled_spans[1] - assert ( - str(ent) - == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " - "'domains', ',')" - ) - ent = doc.labeled_spans[2] - assert ( - str(ent) - == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " - "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " - "'##nis', '##one', '-', 'alone', 'group', '.')" - ) - ent = doc.labeled_spans[3] - assert ( - str(ent) - == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " - "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " - "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " - "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " - "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" - ) - ent = doc.labeled_spans[4] - assert ( - str(ent) - == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " - "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', 
'##nis', '##one', '-', 'alone', 'group', '(', '.', " - "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" - ) - ent = doc.labeled_spans[5] - assert ( - str(ent) - == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " - "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " - "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " - "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" - ) - ent = doc.labeled_spans[6] - assert ( - str(ent) - == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " - "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " - "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " - "'alone', '.')" - ) + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert ( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == 
"('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - for ( - split, - docs, - ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): - for doc in docs: - # Note, that this is a list of documents, because the document may be split into chunks - # if the input text is too long. - tokenized_docs = tokenize_document( - doc, - tokenizer=tokenizer, - return_overflowing_tokens=True, - result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=True, - verbose=True, - ) - # we just ensure that we get at least one tokenized document - assert tokenized_docs is not None - assert len(tokenized_docs) > 0 + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=True, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 def test_document_converters(dataset_variant): builder = BUILDER_CLASS(config_name=dataset_variant) document_converters = builder.document_converters - if dataset_variant == "default": + if dataset_variant == "default" or dataset_variant is None: assert len(document_converters) == 1 assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations,