diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3c6c8ee5..6c25d6f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -80,9 +80,8 @@ repos:
       - id: codespell
         args:
           - --skip=logs/**,data/**,tests/fixtures/**
-          # hist: required for plotext.hist()
-          # ba: denotes beginning of an encoding with label as 'a'. More details at src/pie_utils/sequence_tagging/ill_formed.py
-          - --ignore-words-list=hist,ba
+          # arbitral: a legal term that occurs in the example data (cdcp dataset)
+          - --ignore-words-list=arbitral
 
   # python static type checking
   - repo: https://github.com/pre-commit/mirrors-mypy
diff --git a/dataset_builders/pie/cdcp/README.md b/dataset_builders/pie/cdcp/README.md
new file mode 100644
index 00000000..33edaa3a
--- /dev/null
+++ b/dataset_builders/pie/cdcp/README.md
@@ -0,0 +1,29 @@
+# PIE Dataset Card for "CDCP"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[CDCP Hugging Face dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).
+
+## Data Schema
+
+The document type for this dataset is `CDCPDocument` which defines the following data fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, optional)
+
+and the following annotation layers:
+
+- `propositions` (annotation type: `LabeledSpan`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
+- `urls` (annotation type: `Attribute`, target: `propositions`)
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the `LabeledSpan` and `BinaryRelation` type definitions; `Attribute` is defined in the dataset script (`cdcp.py`) itself.
+
+## Document Converters
+
+The dataset provides document converters for the following target document types:
+
+- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/cdcp/cdcp.py b/dataset_builders/pie/cdcp/cdcp.py
new file mode 100644
index 00000000..73e9c8bd
--- /dev/null
+++ b/dataset_builders/pie/cdcp/cdcp.py
@@ -0,0 +1,153 @@
+import dataclasses
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import datasets
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan
+from pytorch_ie.core import Annotation, AnnotationList, annotation_field
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
+
+from pie_datasets import GeneratorBasedBuilder
+from pie_datasets.document.processing.text_span_trimmer import trim_text_spans
+
+log = logging.getLogger(__name__)
+
+
+def dl2ld(dict_of_lists):
+    """Convert a dict of parallel lists into a list of dicts, one per index.
+
+    ``zip`` stops at the shortest list, so surplus entries of longer lists are dropped.
+    """
+    return [dict(zip(dict_of_lists, t)) for t in zip(*dict_of_lists.values())]
+
+
+def ld2dl(list_of_dicts, keys: Optional[List[str]] = None):
+    """Convert a collection of dicts into a dict of lists, one list per key.
+
+    If ``keys`` is None, the keys of the first dict are used (``{}`` for empty input).
+    ``list_of_dicts`` is iterated more than once, so it must not be a one-shot iterator.
+    """
+    if keys is None:
+        keys = list(next(iter(list_of_dicts), {}))
+    return {k: [d[k] for d in list_of_dicts] for k in keys}
+
+
+@dataclasses.dataclass(frozen=True)
+class Attribute(Annotation):
+    value: str  # the attribute payload (in this file: the URL string of a proposition)
+    annotation: Annotation  # the annotation this attribute is attached to
+
+
+@dataclasses.dataclass
+class CDCPDocument(TextBasedDocument):
+    propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
+    relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
+    urls: AnnotationList[Attribute] = annotation_field(target="propositions")
+
+
+def example_to_document(
+    example: Dict[str, Any],
+    relation_label: datasets.ClassLabel,
+    proposition_label: datasets.ClassLabel,
+):
+    """Build a CDCPDocument from a HF example, decoding integer labels via the ClassLabels."""
+    doc = CDCPDocument(id=example["id"], text=example["text"])
+    for entry in dl2ld(example["propositions"]):
+        span = LabeledSpan(
+            start=entry["start"],
+            end=entry["end"],
+            label=proposition_label.int2str(entry["label"]),
+        )
+        doc.propositions.append(span)
+        # an empty or missing "url" entry means there is no URL attribute for this proposition
+        if entry.get("url", "") != "":
+            doc.urls.append(Attribute(annotation=span, value=entry["url"]))
+
+    for entry in dl2ld(example["relations"]):
+        # "head" and "tail" are indices into the proposition layer
+        head, tail = (doc.propositions[entry[arg]] for arg in ("head", "tail"))
+        doc.relations.append(
+            BinaryRelation(head=head, tail=tail, label=relation_label.int2str(entry["label"]))
+        )
+
+    return doc
+
+
+def document_to_example(
+    document: CDCPDocument,
+    relation_label: datasets.ClassLabel,
+    proposition_label: datasets.ClassLabel,
+) -> Dict[str, Any]:
+    """Convert a CDCPDocument back to the HF example format (inverse of example_to_document)."""
+    # position of each proposition in its layer; relations refer to propositions by index
+    prop2idx = {prop: idx for idx, prop in enumerate(document.propositions)}
+    # one entry per proposition, in layer order; "url" stays empty unless an attribute sets it
+    prop2entry = {
+        prop: {
+            "start": prop.start,
+            "end": prop.end,
+            "label": proposition_label.str2int(prop.label),
+            "url": "",
+        }
+        for prop in document.propositions
+    }
+
+    for url in document.urls:
+        prop2entry[url.annotation]["url"] = url.value
+
+    relation_entries = []
+    for rel in document.relations:
+        entry = {"head": prop2idx[rel.head], "tail": prop2idx[rel.tail]}
+        entry["label"] = relation_label.str2int(rel.label)
+        relation_entries.append(entry)
+
+    return {
+        "id": document.id,
+        "text": document.text,
+        "propositions": ld2dl(prop2entry.values(), keys=["start", "end", "label", "url"]),
+        "relations": ld2dl(relation_entries, keys=["head", "tail", "label"]),
+    }
+
+
+def convert_to_text_document_with_labeled_spans_and_binary_relations(
+    document: CDCPDocument,
+    verbose: bool = True,
+) -> TextDocumentWithLabeledSpansAndBinaryRelations:
+    """Rename the layers to the generic names and run trim_text_spans on the labeled spans."""
+    layer_renaming = {"propositions": "labeled_spans", "relations": "binary_relations"}
+    converted = document.as_type(
+        TextDocumentWithLabeledSpansAndBinaryRelations, field_mapping=layer_renaming
+    )
+    return trim_text_spans(
+        converted,
+        layer="labeled_spans",
+        verbose=verbose,
+    )
+
+
+class CDCP(GeneratorBasedBuilder):
+    """PIE dataset builder that wraps the HF dataset "DFKI-SLT/cdcp" as CDCPDocuments."""
+
+    DOCUMENT_TYPE = CDCPDocument
+
+    DOCUMENT_CONVERTERS = {
+        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
+    }
+
+    BASE_DATASET_PATH = "DFKI-SLT/cdcp"
+
+    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]
+    DEFAULT_CONFIG_NAME = "default"  # type: ignore
+
+    def _generate_document_kwargs(self, dataset):
+        # expose the label features so _generate_document can decode the integer labels
+        features = dataset.features
+        relation_label = features["relations"].feature["label"]
+        proposition_label = features["propositions"].feature["label"]
+        return {"relation_label": relation_label, "proposition_label": proposition_label}
+
+    def _generate_document(self, example, relation_label, proposition_label):
+        return example_to_document(example, relation_label, proposition_label)
diff --git a/dataset_builders/pie/cdcp/requirements.txt b/dataset_builders/pie/cdcp/requirements.txt
new file mode 100644
index 00000000..96711063
--- /dev/null
+++ b/dataset_builders/pie/cdcp/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.3.0
diff --git a/src/pie_datasets/document/types.py b/src/pie_datasets/document/types.py
index d57292f7..983d80ff 100644
--- a/src/pie_datasets/document/types.py
+++ b/src/pie_datasets/document/types.py
@@ -3,7 +3,7 @@
 
 from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
 from pytorch_ie.core import Annotation, AnnotationList, annotation_field
-from pytorch_ie.documents import TextBasedDocument
+from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
 
 
 @dataclasses.dataclass(eq=True, frozen=True)
@@ -28,3 +28,13 @@ class BratDocumentWithMergedSpans(TextBasedDocument):
     relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
     span_attributes: AnnotationList[Attribute] = annotation_field(target="spans")
     relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations")
+
+
+@dataclasses.dataclass
+class TokenDocumentWithLabeledSpans(TokenBasedDocument):
+    labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
+
+
+@dataclasses.dataclass
+class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
+    binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")
diff --git a/tests/dataset_builders/pie/__init__.py b/tests/dataset_builders/pie/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/dataset_builders/pie/test_cdcp.py b/tests/dataset_builders/pie/test_cdcp.py
new file mode 100644
index 00000000..e2cb01f1
--- /dev/null
+++ b/tests/dataset_builders/pie/test_cdcp.py
@@ -0,0 +1,441 @@
+import dataclasses
+from typing import List
+
+import pytest
+from datasets import disable_caching, load_dataset
+from pytorch_ie.annotations import LabeledSpan
+from pytorch_ie.core import AnnotationList, Document, annotation_field
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+from dataset_builders.pie.cdcp.cdcp import (
+    CDCP,
+    CDCPDocument,
+    convert_to_text_document_with_labeled_spans_and_binary_relations,
+    document_to_example,
+    example_to_document,
+)
+from pie_datasets import DatasetDict
+from pie_datasets.document.conversion import tokenize_document
+from pie_datasets.document.types import TokenDocumentWithLabeledSpansAndBinaryRelations
+from tests import FIXTURES_ROOT
+from tests.dataset_builders.common import PIE_BASE_PATH, _deep_compare
+
+disable_caching()
+
+DATASET_NAME = "cdcp"
+SPLIT_SIZES = {"train": 581, "test": 150}
+HF_DATASET_PATH = CDCP.BASE_DATASET_PATH
+PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME
+DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "cdcp_acl17.zip"
+
+HF_EXAMPLE_00195 = {
+    "id": "00195",
+    "text": "State and local court rules sometimes make default judgments much more likely. For example, "
+    "when a person who allegedly owes a debt is told to come to court on a work day, they may be "
+    "forced to choose between a default judgment and their job. I urge the CFPB to find practices "
+    "that involve scheduling hearings at inconvenient times unfair, deceptive, and abusive, or "
+    "inconsistent with 1692i.",
+    "propositions": {
+        "start": [0, 78, 242],
+        "end": [78, 242, 391],
+        "label": [4, 4, 1],
+        "url": ["", "", ""],
+    },
+    "relations": {"head": [0, 2], "tail": [1, 0], "label": [1, 1]},
+}
+
+
+HF_EXAMPLE_00194 = {
+    "id": "00194",
+    "text": "Recently, courts have held that debt collectors can escape 1692i's venue provisions entirely "
+    "by pursuing debt collection through arbitration instead. As the NAF studies reflect, arbitration "
+    "has not proven a satisfactory alternative. I urge the CFPB to include in a rule language "
+    "interpreting 1692i as requiring debt collectors to proceed in court, not through "
+    "largely-unregulated arbitral forums.",
+    "propositions": {
+        "start": [0, 149, 232],
+        "end": [149, 232, 396],
+        "label": [0, 4, 1],
+        "url": ["", "", ""],
+    },
+    "relations": {"head": [2], "tail": [1], "label": [1]},
+}
+
+
+@pytest.fixture(scope="module", params=["train", "test"])
+def split(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def hf_dataset():
+    return load_dataset(str(HF_DATASET_PATH), data_dir=DATA_PATH)
+
+
+def test_hf_dataset(hf_dataset):
+    assert hf_dataset is not None
+    assert {name: len(ds) for name, ds in hf_dataset.items()} == SPLIT_SIZES
+
+
+@pytest.fixture(scope="module")
+def hf_example(hf_dataset, split):
+    return hf_dataset[split][0]
+
+
+def test_hf_example(hf_example, split):
+    # the first example of each split is pinned to a hard-coded expectation
+    expected_by_split = {"train": HF_EXAMPLE_00195, "test": HF_EXAMPLE_00194}
+    assert hf_example is not None
+
+    if split not in expected_by_split:
+        raise ValueError(f"Unknown split: {split}")
+    assert hf_example == expected_by_split[split]
+
+
+@pytest.fixture(scope="module")
+def generate_document_kwargs(hf_dataset, split):
+    return CDCP()._generate_document_kwargs(hf_dataset[split])
+
+
+@pytest.fixture(scope="module")
+def generated_document(hf_example, generate_document_kwargs):
+    return CDCP()._generate_document(hf_example, **generate_document_kwargs)
+
+
+def test_generated_document(generated_document, split):
+    assert isinstance(generated_document, CDCPDocument)
+    if split == "train":
+        assert generated_document.text == HF_EXAMPLE_00195["text"]
+        assert len(generated_document.propositions) == 3
+        assert len(generated_document.relations) == 2
+    elif split == "test":
+        assert generated_document.text == HF_EXAMPLE_00194["text"]
+        assert len(generated_document.propositions) == 3
+        assert len(generated_document.relations) == 1
+    else:
+        raise ValueError(f"Unknown split: {split}")
+
+
+@pytest.fixture(scope="module")
+def hf_example_back(generated_document, generate_document_kwargs):
+    return document_to_example(generated_document, **generate_document_kwargs)
+
+
+def test_example_to_document_and_back(hf_example, hf_example_back):
+    _deep_compare(
+        obj=hf_example_back,
+        obj_expected=hf_example,
+    )
+
+
+def test_example_to_document_and_back_all(hf_dataset, generate_document_kwargs, split):
+    for hf_ex in hf_dataset[split]:
+        doc = example_to_document(hf_ex, **generate_document_kwargs)
+        _assert_no_span_overlap(document=doc, text_field="text", span_layer="propositions")
+        hf_example_back = document_to_example(doc, **generate_document_kwargs)
+        _deep_compare(
+            obj=hf_example_back,
+            obj_expected=hf_ex,
+        )
+
+
+@pytest.fixture(scope="module")
+def dataset() -> DatasetDict:
+    return DatasetDict.load_dataset(str(PIE_DATASET_PATH))
+
+
+def test_pie_dataset(dataset):
+    assert dataset is not None
+    assert {name: len(ds) for name, ds in dataset.items()} == SPLIT_SIZES
+
+
+@pytest.fixture(scope="module")
+def document(dataset, split) -> CDCPDocument:
+    result = dataset[split][0]
+    # we cannot assert the real document type because it may come from a dataset loading script
+    # downloaded to a temporary directory and thus have a different type object, although it is
+    # semantically the same
+    assert isinstance(result, Document)
+    return result
+
+
+def test_compare_document_and_generated_document(document, generated_document):
+    assert document.text == generated_document.text
+    assert document.relations == generated_document.relations
+    assert document.metadata == generated_document.metadata
+
+
+def _assert_no_span_overlap(document: Document, text_field: str, span_layer: str):
+    spans, text = document[span_layer], getattr(document, text_field)
+    covered = [False] * len(text)  # marks the character positions already claimed by a span
+    for span in spans:
+        window = slice(span.start, span.end)
+        assert not any(covered[window])
+        covered[window] = [True] * len(covered[window])
+
+
+def test_assert_no_span_overlap():
+    @dataclasses.dataclass
+    class TextDocumentWithEntities(TextBasedDocument):
+        entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
+
+    doc0 = TextDocumentWithEntities(text="abcdefghijklmnop")
+    doc0.entities.append(LabeledSpan(start=0, end=4, label="A"))
+    doc0.entities.append(LabeledSpan(start=4, end=6, label="B"))
+
+    # this should work
+    _assert_no_span_overlap(document=doc0, text_field="text", span_layer="entities")
+
+    doc1 = TextDocumentWithEntities(text="abcdefghijklmnop")
+    doc1.entities.append(LabeledSpan(start=0, end=4, label="A"))
+    doc1.entities.append(LabeledSpan(start=2, end=6, label="B"))
+
+    # this should fail
+    with pytest.raises(AssertionError):
+        _assert_no_span_overlap(document=doc1, text_field="text", span_layer="entities")
+
+
+@pytest.fixture(scope="module")
+def dataset_of_text_documents_with_labeled_spans_and_binary_relations(
+    dataset,
+) -> DatasetDict:
+    converted_dataset = dataset.to_document_type(TextDocumentWithLabeledSpansAndBinaryRelations)
+    return converted_dataset
+
+
+def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations(
+    dataset_of_text_documents_with_labeled_spans_and_binary_relations, split
+):
+    assert dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None
+    # get a document to check
+    document = dataset_of_text_documents_with_labeled_spans_and_binary_relations[split][0]
+    assert isinstance(document, TextDocumentWithLabeledSpansAndBinaryRelations)
+    if split == "train":
+        assert document.id == "00195"
+        # check entities
+        assert len(document.labeled_spans) == 3
+        entity_tuples = [(str(ent), ent.label) for ent in document.labeled_spans]
+        assert entity_tuples[0] == (
+            "State and local court rules sometimes make default judgments much more likely.",
+            "value",
+        )
+        assert entity_tuples[1] == (
+            "For example, when a person who allegedly owes a debt is told to come to court on a work day, "
+            "they may be forced to choose between a default judgment and their job.",
+            "value",
+        )
+        assert entity_tuples[2] == (
+            "I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, "
+            "deceptive, and abusive, or inconsistent with 1692i.",
+            "policy",
+        )
+
+        # check relations
+        assert len(document.binary_relations) == 2
+        relation_tuples = [
+            (str(rel.head), rel.label, str(rel.tail)) for rel in document.binary_relations
+        ]
+        assert relation_tuples[0] == (
+            "State and local court rules sometimes make default judgments much more likely.",
+            "reason",
+            "For example, when a person who allegedly owes a debt is told to come to court on a work day, "
+            "they may be forced to choose between a default judgment and their job.",
+        )
+        assert relation_tuples[1] == (
+            "I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, "
+            "deceptive, and abusive, or inconsistent with 1692i.",
+            "reason",
+            "State and local court rules sometimes make default judgments much more likely.",
+        )
+
+    elif split == "test":
+        assert document.id == "00194"
+        # check entities
+        assert len(document.labeled_spans) == 3
+        entity_tuples = [(str(ent), ent.label) for ent in document.labeled_spans]
+        assert entity_tuples[0] == (
+            "Recently, courts have held that debt collectors can escape 1692i's venue provisions entirely "
+            "by pursuing debt collection through arbitration instead.",
+            "fact",
+        )
+        assert entity_tuples[1] == (
+            "As the NAF studies reflect, arbitration has not proven a satisfactory alternative.",
+            "value",
+        )
+        assert entity_tuples[2] == (
+            "I urge the CFPB to include in a rule language interpreting 1692i as requiring debt collectors to proceed "
+            "in court, not through largely-unregulated arbitral forums.",
+            "policy",
+        )
+
+        # check relations
+        assert len(document.binary_relations) == 1
+        relation_tuples = [
+            (str(rel.head), rel.label, str(rel.tail)) for rel in document.binary_relations
+        ]
+        assert relation_tuples[0] == (
+            "I urge the CFPB to include in a rule language interpreting 1692i as requiring debt collectors to proceed "
+            "in court, not through largely-unregulated arbitral forums.",
+            "reason",
+            "As the NAF studies reflect, arbitration has not proven a satisfactory alternative.",
+        )
+    else:
+        raise ValueError(f"Unknown Split {split}")
+
+
+def test_convert_to_textdocument_with_entities_and_relations(
+    document, dataset_of_text_documents_with_labeled_spans_and_binary_relations, split
+):
+    # just check that we get the same as in the converted dataset when explicitly calling the conversion method
+    converted_doc = convert_to_text_document_with_labeled_spans_and_binary_relations(document)
+    doc_from_converted_dataset = dataset_of_text_documents_with_labeled_spans_and_binary_relations[
+        split
+    ][0]
+    assert converted_doc == doc_from_converted_dataset
+
+
+@pytest.fixture(scope="module")
+def tokenizer() -> PreTrainedTokenizer:
+    return AutoTokenizer.from_pretrained("bert-base-uncased")
+
+
+@pytest.fixture(scope="module")
+def tokenized_documents_with_labeled_spans_and_binary_relations(
+    dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer
+) -> List[TokenDocumentWithLabeledSpansAndBinaryRelations]:
+    # get a document to check
+    doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations["train"][0]
+    # Note that this is a list of documents, because the document may be split into chunks
+    # if the input text is too long.
+    tokenized_docs = tokenize_document(
+        doc,
+        tokenizer=tokenizer,
+        return_overflowing_tokens=True,
+        result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations,
+        verbose=True,
+    )
+    return tokenized_docs
+
+
+def test_tokenized_documents_with_labeled_spans_and_binary_relations(
+    tokenized_documents_with_labeled_spans_and_binary_relations,
+):
+    docs = tokenized_documents_with_labeled_spans_and_binary_relations
+    assert len(docs) == 1
+    doc = docs[0]
+    assert len(doc.tokens) == 84
+    assert len(doc.labeled_spans) == 3
+    ent = doc.labeled_spans[0]
+    assert ent.target[ent.start : ent.end] == (
+        "state",
+        "and",
+        "local",
+        "court",
+        "rules",
+        "sometimes",
+        "make",
+        "default",
+        "judgments",
+        "much",
+        "more",
+        "likely",
+        ".",
+    )
+    ent = doc.labeled_spans[1]
+    assert ent.target[ent.start : ent.end] == (
+        "for",
+        "example",
+        ",",
+        "when",
+        "a",
+        "person",
+        "who",
+        "allegedly",
+        "owes",
+        "a",
+        "debt",
+        "is",
+        "told",
+        "to",
+        "come",
+        "to",
+        "court",
+        "on",
+        "a",
+        "work",
+        "day",
+        ",",
+        "they",
+        "may",
+        "be",
+        "forced",
+        "to",
+        "choose",
+        "between",
+        "a",
+        "default",
+        "judgment",
+        "and",
+        "their",
+        "job",
+        ".",
+    )
+    ent = doc.labeled_spans[2]
+    assert ent.target[ent.start : ent.end] == (
+        "i",
+        "urge",
+        "the",
+        "cf",
+        "##p",
+        "##b",
+        "to",
+        "find",
+        "practices",
+        "that",
+        "involve",
+        "scheduling",
+        "hearings",
+        "at",
+        "inc",
+        "##on",
+        "##ven",
+        "##ient",
+        "times",
+        "unfair",
+        ",",
+        "dec",
+        "##eptive",
+        ",",
+        "and",
+        "abusive",
+        ",",
+        "or",
+        "inconsistent",
+        "with",
+        "1692",
+        "##i",
+        ".",
+    )
+
+
+def test_tokenized_documents_with_entities_and_relations_all(
+    dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer
+):
+    for split, docs in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items():
+        for doc in docs:
+            # Note that this is a list of documents, because the document may be split into chunks
+            # if the input text is too long.
+            tokenized_docs = tokenize_document(
+                doc,
+                tokenizer=tokenizer,
+                return_overflowing_tokens=True,
+                result_document_type=TokenDocumentWithLabeledSpansAndBinaryRelations,
+                verbose=True,
+            )
+            # we just ensure that we get at least one tokenized document
+            assert tokenized_docs is not None
+            assert len(tokenized_docs) > 0
diff --git a/tests/fixtures/dataset_builders/cdcp_acl17.zip b/tests/fixtures/dataset_builders/cdcp_acl17.zip
new file mode 100644
index 00000000..d16997cc
Binary files /dev/null and b/tests/fixtures/dataset_builders/cdcp_acl17.zip differ