From d35e983d82a4a0303cde2e8e03070d386264aea1 Mon Sep 17 00:00:00 2001 From: ArneBinder Date: Wed, 8 Nov 2023 14:09:08 +0100 Subject: [PATCH] remove Huggingface dataset scripts (moved to pie-datasets, see https://github.com/ArneBinder/pie-datasets/pull/36) (#368) --- src/pytorch_ie/data/datasets/__init__.py | 3 - .../data/datasets/hf_datasets/__init__.py | 0 .../data/datasets/hf_datasets/ace2004.py | 153 ---- .../data/datasets/hf_datasets/ace2005.py | 140 ---- .../data/datasets/hf_datasets/brat.py | 337 -------- .../data/datasets/hf_datasets/chemprot.py | 176 ----- .../data/datasets/hf_datasets/fewrel.py | 285 ------- .../data/datasets/hf_datasets/genia.py | 406 ---------- .../data/datasets/hf_datasets/ontonotes.py | 161 ---- .../data/datasets/hf_datasets/scierc.py | 132 ---- .../hf_datasets/semeval_2010_task_8.py | 185 ----- .../data/datasets/hf_datasets/tacred.py | 257 ------ .../data/datasets/hf_datasets/webred.py | 746 ------------------ .../data/datasets/hf_datasets/wiki80.py | 199 ----- .../data/datasets/hf_datasets/wikigold.py | 122 --- 15 files changed, 3302 deletions(-) delete mode 100644 src/pytorch_ie/data/datasets/__init__.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/__init__.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ace2004.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ace2005.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/brat.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/chemprot.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/fewrel.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/genia.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/scierc.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/tacred.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/webred.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/wiki80.py delete mode 100644 src/pytorch_ie/data/datasets/hf_datasets/wikigold.py diff --git a/src/pytorch_ie/data/datasets/__init__.py b/src/pytorch_ie/data/datasets/__init__.py deleted file mode 100644 index ffbe5d00..00000000 --- a/src/pytorch_ie/data/datasets/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -import pathlib - -HF_DATASETS_ROOT = pathlib.Path(__file__).parent / "hf_datasets" diff --git a/src/pytorch_ie/data/datasets/hf_datasets/__init__.py b/src/pytorch_ie/data/datasets/hf_datasets/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py b/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py deleted file mode 100644 index 646f1f34..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ace2004.py +++ /dev/null @@ -1,153 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_ACE2004 = """\ -@inproceedings{doddington-etal-2004-automatic, - title = "The Automatic Content Extraction ({ACE}) Program {--} Tasks, Data, and Evaluation", - author = "Doddington, George and - Mitchell, Alexis and - Przybocki, Mark and - Ramshaw, Lance and - Strassel, Stephanie and - Weischedel, Ralph", - booktitle = "Proceedings of the Fourth International Conference on Language Resources and Evaluation ({LREC}{'}04)", - month = may, - year = "2004", - address = "Lisbon, Portugal", - publisher = "European Language Resources Association (ELRA)", - url = 
"http://www.lrec-conf.org/proceedings/lrec2004/pdf/5.pdf", -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -ACE 2004 Multilingual Training Corpus contains the complete set of English, Arabic and Chinese -training data for the 2004 Automatic Content Extraction (ACE) technology evaluation. The corpus consists of data of -various types annotated for entities and relations and was created by Linguistic Data Consortium with support from -the ACE Program, with additional assistance from the DARPA TIDES (Translingual Information Detection, Extraction and -Summarization) Program. This data was previously distributed as an e-corpus (LDC2004E17) to participants in the 2004 -ACE evaluation. - -The objective of the ACE program is to develop automatic content extraction technology to support automatic -processing of human language in text form. In September 2004, sites were evaluated on system performance in six -areas: Entity Detection and Recognition (EDR), Entity Mention Detection (EMD), EDR Co-reference, Relation Detection -and Recognition (RDR), Relation Mention Detection (RMD), and RDR given reference entities. All tasks were evaluated -in three languages: English, Chinese and Arabic. - -The current publication consists of the official training data for these evaluation tasks. A seventh evaluation area, -Timex Detection and Recognition, is supported by the ACE Time Normalization (TERN) 2004 English Training Data Corpus -(LDC2005T07). The TERN corpus source data largely overlaps with the English source data contained in the current -release. - -For more information about linguistic resources for the ACE program, including annotation guidelines, -task definitions, free annotation tools and other documentation, please visit LDC's ACE website: -https://www.ldc.upenn.edu/collaborations/past-projects/ace -""" - -_HOMEPAGE = "https://catalog.ldc.upenn.edu/LDC2005T09" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = """https://catalog.ldc.upenn.edu/license/ldc-non-members-agreement.pdf""" - -# TODO: Add class labels -_CLASS_LABELS = ["PHYS", "EMP-ORG", "ART", "OTHER-AFF", "GPE-AFF", "PER-SOC"] - - -class ACE2004(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use ACE2004 you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2005T09" - "Preprocess the data as described in " - "https://github.com/LorrinWWW/two-are-better-than-one/tree/master/datasets and " - "extract test.ACE04_0,json, train.ACE04_0.json, valid.ACE04_0.json files from the " - "unified folder in one folder, and load the dataset with: " - "`datasets.load_dataset('ace2004', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. 
They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_ACE2004, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('ace2004', data_dir=...)` that includes the train, valid, test files. Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.ACE04_0.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.ACE04_0.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "valid.ACE04_0.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - idx = 0 - for rel in example["relations"]: - head_start, head_end, tail_start, tail_end, label = rel - - id_ = str(idx) - idx += 1 - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py b/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py deleted file mode 100644 index be86fa70..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ace2005.py +++ /dev/null @@ -1,140 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_ACE2005 = """\ -@article{walker2006ace, - title={ACE 2005 multilingual training corpus}, - author={Walker, Christopher and Strassel, Stephanie and Medero, Julie and Maeda, Kazuaki}, - journal={Linguistic Data Consortium, Philadelphia}, - volume={57}, - pages={45}, - year={2006} -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -ACE 2005 Multilingual Training Corpus contains the complete set of English, Arabic and Chinese -training data for the 2005 Automatic Content Extraction (ACE) technology evaluation. The corpus consists of data of -various types annotated for entities, relations and events by the Linguistic Data Consortium (LDC) with support from -the ACE Program and additional assistance from LDC. - -The objective of the ACE program was to develop automatic content extraction technology to support automatic -processing of human language in text form. - -In November 2005, sites were evaluated on system performance in five primary areas: the recognition of entities, -values, temporal expressions, relations, and events. Entity, relation and event mention detection were also offered -as diagnostic tasks. All tasks with the exception of event tasks were performed for three languages, English, -Chinese and Arabic. 
Events tasks were evaluated in English and Chinese only. This release comprises the official -training data for these evaluation tasks. - -For more information about linguistic resources for the ACE Program, including annotation guidelines, -task definitions and other documentation, see LDC's ACE website: -http://projects.ldc.upenn.edu/ace/ -""" - -_HOMEPAGE = "https://catalog.ldc.upenn.edu/LDC2006T06" - -_LICENSE = """https://catalog.ldc.upenn.edu/license/ldc-non-members-agreement.pdf""" - -_CLASS_LABELS = ["PHYS", "ART", "PART-WHOLE", "ORG-AFF", "GEN-AFF", "PER-SOC"] - - -class ACE2004(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use ACE2005 you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2006T06" - "Preprocess the data as described in " - "https://github.com/LorrinWWW/two-are-better-than-one/tree/master/datasets and " - "extract test.ACE05.json, train.ACE05.json, valid.ACE05.json files from the " - "unified folder in one folder, and load the dataset with: " - "`datasets.load_dataset('ace2005', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_ACE2005, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('ace2005', data_dir=...)` that includes the train, valid, test files. 
Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.ACE05.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.ACE05.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "valid.ACE05.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - idx = 0 - for rel in example["relations"]: - head_start, head_end, tail_start, tail_end, label = rel - - id_ = str(idx) - idx += 1 - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/brat.py b/src/pytorch_ie/data/datasets/hf_datasets/brat.py deleted file mode 100644 index 32ae39ca..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/brat.py +++ /dev/null @@ -1,337 +0,0 @@ -import glob -import logging -from dataclasses import dataclass -from os import listdir, path -from typing import Dict, List, Optional - -import datasets -from datasets import BuilderConfig, DatasetInfo, Features, Sequence, SplitGenerator, Value - -logger = logging.getLogger(__name__) - - -@dataclass -class BratConfig(BuilderConfig): - """BuilderConfig for BRAT.""" - - url: str = None # type: ignore - description: Optional[str] = None - citation: Optional[str] = None - homepage: Optional[str] = None - - subdirectory_mapping: Optional[Dict[str, str]] = None - file_name_blacklist: Optional[List[str]] = None - ann_file_extension: str = "ann" - txt_file_extension: str = "txt" - - -class Brat(datasets.GeneratorBasedBuilder): - BUILDER_CONFIG_CLASS = BratConfig - - def _info(self): - return DatasetInfo( - description=self.config.description, - citation=self.config.citation, - homepage=self.config.homepage, - features=Features( - { - "context": Value("string"), - "file_name": Value("string"), - "spans": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "locations": Sequence( - { - "start": Value("int32"), - "end": Value("int32"), - } - ), - "text": Value("string"), - } - ), - "relations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "equivalence_relations": Sequence( - { - "type": Value("string"), - "targets": Sequence(Value("string")), - } - ), - "events": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "trigger": Value("string"), - "arguments": Sequence( - {"type": Value("string"), "target": Value("string")} - ), - } - ), - "attributions": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "value": Value("string"), - } - ), - "normalizations": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "resource_id": Value("string"), - "entity_id": Value("string"), - } - ), - "notes": Sequence( - { - "id": Value("string"), - "type": Value("string"), - "target": Value("string"), - "note": Value("string"), - } - ), - } - ), - ) - - @staticmethod - def _get_location(location_string): - parts = location_string.split(" ") - assert ( - len(parts) == 2 - ), 
f"Wrong number of entries in location string. Expected 2, but found: {parts}" - return {"start": int(parts[0]), "end": int(parts[1])} - - @staticmethod - def _get_span_annotation(annotation_line): - """ - example input: - T1 Organization 0 4 Sony - """ - - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, locations = remaining.split(" ", maxsplit=1) - return { - "id": _id, - "text": text, - "type": _type, - "locations": [Brat._get_location(loc) for loc in locations.split(";")], - } - - @staticmethod - def _get_event_annotation(annotation_line): - """ - example input: - E1 MERGE-ORG:T2 Org1:T1 Org2:T3 - """ - _id, remaining = annotation_line.strip().split("\t") - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return { - "id": _id, - "type": args[0]["type"], - "trigger": args[0]["target"], - "arguments": args[1:], - } - - @staticmethod - def _get_relation_annotation(annotation_line): - """ - example input: - R1 Origin Arg1:T3 Arg2:T4 - """ - - _id, remaining = annotation_line.strip().split("\t") - _type, remaining = remaining.split(" ", maxsplit=1) - args = [dict(zip(["type", "target"], a.split(":"))) for a in remaining.split(" ")] - return {"id": _id, "type": _type, "arguments": args} - - @staticmethod - def _get_equivalence_relation_annotation(annotation_line): - """ - example input: - * Equiv T1 T2 T3 - """ - _, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - return {"type": parts[0], "targets": parts[1:]} - - @staticmethod - def _get_attribute_annotation(annotation_line): - """ - example input (binary: implicit value is True, if present, False otherwise): - A1 Negation E1 - example input (multi-value: explicit value) - A2 Confidence E2 L1 - """ - - _id, remaining = annotation_line.strip().split("\t") - parts = remaining.split(" ") - # if no value is present, it is implicitly "true" - if len(parts) == 2: - parts.append("true") - return { - "id": _id, - "type": parts[0], - "target": parts[1], - "value": parts[2], - } - - @staticmethod - def _get_normalization_annotation(annotation_line): - """ - example input: - N1 Reference T1 Wikipedia:534366 Barack Obama - """ - _id, remaining, text = annotation_line.split("\t", maxsplit=2) - _type, target, ref = remaining.split(" ") - res_id, ent_id = ref.split(":") - return { - "id": _id, - "type": _type, - "target": target, - "resource_id": res_id, - "entity_id": ent_id, - } - - @staticmethod - def _get_note_annotation(annotation_line): - """ - example input: - #1 AnnotatorNotes T1 this annotation is suspect - """ - _id, remaining, note = annotation_line.split("\t", maxsplit=2) - _type, target = remaining.split(" ") - return { - "id": _id, - "type": _type, - "target": target, - "note": note, - } - - @staticmethod - def _read_annotation_file(filename): - """ - reads a BRAT v1.3 annotations file (see https://brat.nlplab.org/standoff.html) - """ - - res = { - "spans": [], - "events": [], - "relations": [], - "equivalence_relations": [], - "attributions": [], - "normalizations": [], - "notes": [], - } - - with open(filename) as file: - for i, line in enumerate(file): - if len(line.strip()) == 0: - continue - ann_type = line[0] - - # strip away the new line character - if line.endswith("\n"): - line = line[:-1] - - if ann_type == "T": - res["spans"].append(Brat._get_span_annotation(line)) - elif ann_type == "E": - res["events"].append(Brat._get_event_annotation(line)) - elif ann_type == "R": - res["relations"].append(Brat._get_relation_annotation(line)) - elif 
ann_type == "*": - res["equivalence_relations"].append( - Brat._get_equivalence_relation_annotation(line) - ) - elif ann_type in ["A", "M"]: - res["attributions"].append(Brat._get_attribute_annotation(line)) - elif ann_type == "N": - res["normalizations"].append(Brat._get_normalization_annotation(line)) - elif ann_type == "#": - res["notes"].append(Brat._get_note_annotation(line)) - else: - raise ValueError( - f'unknown BRAT annotation id type: "{line}" (from file {filename} @line {i}). ' - f"Annotation ids have to start with T (spans), E (events), R (relations), " - f"A (attributions), or N (normalizations). See " - f"https://brat.nlplab.org/standoff.html for the BRAT annotation file " - f"specification." - ) - return res - - def _generate_examples(self, files=None, directory=None): - """Read context (.txt) and annotation (.ann) files.""" - if files is None: - assert ( - directory is not None - ), "If files is None, directory has to be provided, but it is also None." - _files = glob.glob(f"{directory}/*.{self.config.ann_file_extension}") - files = sorted(path.splitext(fn)[0] for fn in _files) - - for filename in files: - basename = path.basename(filename) - if ( - self.config.file_name_blacklist is not None - and basename in self.config.file_name_blacklist - ): - logger.info(f"skip annotation file: {basename} (blacklisted)") - continue - - ann_fn = f"{filename}.{self.config.ann_file_extension}" - brat_annotations = Brat._read_annotation_file(ann_fn) - - txt_fn = f"{filename}.{self.config.txt_file_extension}" - txt_content = open(txt_fn).read() - brat_annotations["context"] = txt_content - brat_annotations["file_name"] = basename - - yield basename, brat_annotations - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - subdirectory_mapping = self.config.subdirectory_mapping - - # since subclasses of BuilderConfig are not allowed to define - # attributes without defaults, check here - assert self.config.url is not None, "data url not specified" - - # if url points to a local directory, just point to that - if path.exists(self.config.url) and path.isdir(self.config.url): - data_dir = self.config.url - # otherwise, download and extract - else: - data_dir = dl_manager.download_and_extract(self.config.url) - logging.info(f"load from data dir: {data_dir}") - - # if no subdirectory mapping is provided, ... - if subdirectory_mapping is None: - # ... use available subdirectories as split names ... - subdirs = [f for f in listdir(data_dir) if path.isdir(path.join(data_dir, f))] - if len(subdirs) > 0: - subdirectory_mapping = {subdir: subdir for subdir in subdirs} - else: - # ... 
otherwise, default to a single train split with the base directory - subdirectory_mapping = {"": "train"} - - return [ - SplitGenerator( - name=split, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "directory": path.join(data_dir, subdir), - }, - ) - for subdir, split in subdirectory_mapping.items() - ] diff --git a/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py b/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py deleted file mode 100644 index d09aa2c2..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/chemprot.py +++ /dev/null @@ -1,176 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import re - -import datasets - -_CITATION_CHEMPROT = """\ -@article{article, -author = {Kringelum, Jens and Kjaerulff, Sonny and Brunak, Søren and Lund, Ole and Oprea, Tudor and Taboureau, Olivier}, -year = {2016}, -month = {02}, -pages = {bav123}, -title = {ChemProt-3.0: A global chemical biology diseases mapping}, -volume = {2016}, -journal = {Database}, -doi = {10.1093/database/bav123} -}""" - -# You can copy an official description -_DESCRIPTION = """\ -ChemProt is a publicly available compilation of chemical-protein-disease annotation resources that enables the study -of systems pharmacology for a small molecule across multiple layers of complexity from molecular to clinical levels. -In this third version, ChemProt has been updated to more than 1.7 million compounds with 7.8 million bioactivity -measurements for 19 504 proteins. -""" - -_HOMEPAGE = "http://potentia.cbs.dtu.dk/ChemProt/" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here, currently pointing to preprocessed scibert files -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/train.txt", - "dev": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/dev.txt", - "test": "https://raw.githubusercontent.com/allenai/scibert/master/data/text_classification/chemprot/test.txt", -} - -_CLASS_LABELS = [ - "ACTIVATOR", - "AGONIST", - "AGONIST-ACTIVATOR", - "AGONIST-INHIBITOR", - "ANTAGONIST", - "DOWNREGULATOR", - "INDIRECT-DOWNREGULATOR", - "INDIRECT-UPREGULATOR", - "INHIBITOR", - "PRODUCT-OF", - "SUBSTRATE", - "SUBSTRATE_PRODUCT-OF", - "UPREGULATOR", -] - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class ChemProt(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("3.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_CHEMPROT, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("dev")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": data_files.get("test")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - for idx, line in enumerate(f.readlines()): - example = json.loads(line) - raw_text = example["text"] - label = example["label"] - id_ = str(idx) - - # handle special case with square brackets surrounding entities in raw text - raw_text = re.sub(r"\[\[\[", "[ [[", raw_text) - raw_text = re.sub(r"\]\]\]", "]] ]", raw_text) - # handle unicode remnants - raw_text = re.sub(r"(\u2002|\xa0)", " ", raw_text) - - # TODO check whether adding whitespace before and after symbols may be too aggressive - raw_text = re.sub(r"([.,!?()])(\S)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\S)([.,!?()])", r"\1 \2", raw_text) - - # add whitespace before start marker and after end marker - raw_text = re.sub(r"(\S)(\[\[)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\S)(<<)", r"\1 \2", raw_text) - raw_text = re.sub(r"(\]\])(\S)", r"\1 \2", raw_text) - raw_text = re.sub(r"(>>)(\S)", r"\1 \2", raw_text) - - tokens = raw_text.split(" ") - - assert any(e in tokens for e in ["[[", "]]", "<<", ">>"]), ( - f"Missing head/tail markers in " f"{example}\n Tokens: {tokens}" - ) - - # Get head/tail order before determining head/tail indices and popping markers - head_start = tokens.index("[[") - tail_start = tokens.index("<<") - if head_start < tail_start: - tokens.pop(head_start) - head_end = tokens.index("]]") - tokens.pop(head_end) - tail_start = tokens.index("<<") - tokens.pop(tail_start) - tail_end = tokens.index(">>") - tokens.pop(tail_end) - else: - tokens.pop(tail_start) - tail_end = tokens.index(">>") - tokens.pop(tail_end) - head_start = tokens.index("[[") - tokens.pop(head_start) - head_end = tokens.index("]]") - tokens.pop(head_end) - - yield id_, { - "tokens": tokens, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py b/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py deleted file mode 100644 index 5eb73609..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/fewrel.py +++ /dev/null @@ -1,285 +0,0 @@ -"""TODO: Add a description here.""" - - -import 
json - -import datasets - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) - -_CITATION_FEWREL_1 = """\ -@inproceedings{han-etal-2018-fewrel, - title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation Classification Dataset with State-of-the-Art Evaluation", - author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao, Yuan and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D18-1514", - doi = "10.18653/v1/D18-1514", - pages = "4803--4809" -}""" - -_CITATION_FEWREL_2 = """\ -@inproceedings{han-etal-2018-fewrel, - title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation Classification Dataset with State-of-the-Art Evaluation", - author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao, Yuan and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", - month = oct # "-" # nov, - year = "2018", - address = "Brussels, Belgium", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D18-1514", - doi = "10.18653/v1/D18-1514", - pages = "4803--4809" -} - -@inproceedings{gao-etal-2019-fewrel, - title = "{F}ew{R}el 2.0: Towards More Challenging Few-Shot Relation Classification", - author = "Gao, Tianyu and Han, Xu and Zhu, Hao and Liu, Zhiyuan and Li, Peng and Sun, Maosong and Zhou, Jie", - booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", - month = nov, - year = "2019", - address = "Hong Kong, China", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/D19-1649", - doi = "10.18653/v1/D19-1649", - pages = "6251--6256" -} -""" - - -class FewRelConfig(datasets.BuilderConfig): - """BuilderConfig for FewRel.""" - - def __init__( - self, - data_url, - citation, - url, - class_labels, - description, - **kwargs, - ): - """BuilderConfig for FewRel. - Args: - data_url: `string`, url to download the zip file from - citation: `string`, citation for the data set - url: `string`, url for information about the data set - class_labels: `list[string]`, the list of classes if the label is - categorical. If not provided, then the label will be of type - `datasets.Value('float32')`. - **kwargs: keyword arguments forwarded to super. 
- """ - super().__init__(version=datasets.Version("1.0.0", ""), **kwargs) - self.class_labels = class_labels - self.data_url = data_url - self.citation = citation - self.url = url - self.description = description - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class FewRel(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - BUILDER_CONFIGS = [ - FewRelConfig( - name="fewrel_train", - data_url="https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json", - citation=_CITATION_FEWREL_1, - url="https://thunlp.github.io/1/fewrel1.html", - class_labels=[ - "P931", - "P4552", - "P140", - "P1923", - "P150", - "P6", - "P27", - "P449", - "P1435", - "P175", - "P1344", - "P39", - "P527", - "P740", - "P706", - "P84", - "P495", - "P123", - "P57", - "P22", - "P178", - "P241", - "P403", - "P1411", - "P135", - "P991", - "P156", - "P176", - "P31", - "P1877", - "P102", - "P1408", - "P159", - "P3373", - "P1303", - "P17", - "P106", - "P551", - "P937", - "P355", - "P710", - "P137", - "P674", - "P466", - "P136", - "P306", - "P127", - "P400", - "P974", - "P1346", - "P460", - "P86", - "P118", - "P264", - "P750", - "P58", - "P3450", - "P105", - "P276", - "P101", - "P407", - "P1001", - "P800", - "P131", - ], - description="", - ), - FewRelConfig( - name="fewrel_validation", - data_url="https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json", - citation=_CITATION_FEWREL_1, - url="https://thunlp.github.io/1/fewrel1.html", - class_labels=[ - "P177", - "P364", - "P2094", - "P361", - "P641", - "P59", - "P413", - "P206", - "P412", - "P155", - "P26", - "P410", - "P25", - "P463", - "P40", - "P921", - ], - description="", - ), - FewRelConfig( - name="fewrel2_validation", - data_url="https://github.com/thunlp/FewRel/raw/master/data/val_pubmed.json", - citation=_CITATION_FEWREL_2, - url="https://thunlp.github.io/2/fewrel2_da.html", - class_labels=[ - "biological_process_involves_gene_product", - "inheritance_type_of", - "is_normal_tissue_origin_of_disease", - "ingredient_of", - "is_primary_anatomic_site_of_disease", - "gene_found_in_organism", - "occurs_in", - "causative_agent_of", - "classified_as", - "gene_plays_role_in_process", - ], - description="", - ), - ] - - def _info(self): - features = datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=self.config.class_labels), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=self.config.description, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=self.config.url, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=self.config.citation, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - filepath = dl_manager.download_and_extract(self.config.data_url) - - split = ( - datasets.Split.VALIDATION if "validation" in self.config.name else datasets.Split.TRAIN - ) - - return [ - datasets.SplitGenerator( - name=split, - gen_kwargs={"filepath": filepath}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for label, examples in data.items(): - for idx, example in enumerate(examples): - id_ = label + "_" + str(idx) - - head_token_positions = example["h"][2][0] - tail_token_positions = example["t"][2][0] - - head_start = head_token_positions[0] - head_end = head_token_positions[-1] - tail_start = tail_token_positions[0] - tail_end = tail_token_positions[-1] - - yield id_, { - "tokens": example["tokens"], - "head_start": head_start, - "head_end": head_end + 1, # make end offset exclusive - "tail_start": tail_start, - "tail_end": tail_end + 1, # make end offset exclusive - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/genia.py b/src/pytorch_ie/data/datasets/hf_datasets/genia.py deleted file mode 100644 index 8fd88503..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/genia.py +++ /dev/null @@ -1,406 +0,0 @@ -"""TODO: Add a description here.""" - - -import os - -import datasets -import spacy -from spacy.lang.en import English -from spacy.symbols import ORTH - -_CITATION_GENIA = """\ -@article{article, - author = {Kim, Jin-Dong and Ohta, Tomoko and Tateisi, Yuka and Tsujii, Jun'ichi}, - year = {2003}, - month = {02}, - pages = {i180-2}, - title = {GENIA corpus—A semantically annotated corpus for bio-textmining}, - volume = {19 Suppl 1}, - journal = {Bioinformatics (Oxford, England)}, - doi = {10.1093/bioinformatics/btg1023} -}""" - -# You can copy an official description -_DESCRIPTION = """ -The GENIA corpus is the primary collection of biomedical literature compiled and annotated within the scope -of the GENIA project. The corpus was created to support the development and evaluation of information -extraction and text mining systems for the domain of molecular biology. -""" - -_HOMEPAGE = "http://www.geniaproject.org/genia-corpus/relation-corpus" - -# TODO: Add the license for the dataset here if you can find it -_LICENSE = """\ -GENIA Project License for Annotated Corpora - -1. Copyright of abstracts - -Any abstracts contained in this corpus are from PubMed(R), a database -of the U.S. National Library of Medicine (NLM). - -NLM data are produced by a U.S. 
Government agency and include works of -the United States Government that are not protected by U.S. copyright -law but may be protected by non-US copyright law, as well as abstracts -originating from publications that may be protected by U.S. copyright -law. - -NLM assumes no responsibility or liability associated with use of -copyrighted material, including transmitting, reproducing, -redistributing, or making commercial use of the data. NLM does not -provide legal advice regarding copyright, fair use, or other aspects -of intellectual property rights. Persons contemplating any type of -transmission or reproduction of copyrighted material such as abstracts -are advised to consult legal counsel. - -2. Copyright of full texts - -Any full texts contained in this corpus are from the PMC Open Access -Subset of PubMed Central (PMC), the U.S. National Institutes of Health -(NIH) free digital archive of biomedical and life sciences journal -literature. - -Articles in the PMC Open Access Subset are protected by copyright, but -are made available under a Creative Commons or similar license that -generally allows more liberal redistribution and reuse than a -traditional copyrighted work. Please refer to the license of each -article for specific license terms. - -3. Copyright of annotations - -The copyrights of annotations created in the GENIA Project of Tsujii -Laboratory, University of Tokyo, belong in their entirety to the GENIA -Project. - -4. Licence terms - -Use and distribution of abstracts drawn from PubMed is subject to the -PubMed(R) license terms as stated in Clause 1. - -Use and distribution of full texts is subject to the license terms -applying to each publication. - -Annotations created by the GENIA Project are licensed under the -Creative Commons Attribution 3.0 Unported License. To view a copy of -this license, visit http://creativecommons.org/licenses/by/3.0/ or -send a letter to Creative Commons, 444 Castro Street, Suite 900, -Mountain View, California, 94041, USA. - -Annotations created by the GENIA Project must be attributed as -detailed in Clause 5. - -5. Attribution - -The GENIA Project was founded and led by prof. Jun'ichi Tsujii and -the project and its annotation efforts have been coordinated in part -by Nigel Collier, Yuka Tateisi, Sang-Zoo Lee, Tomoko Ohta, Jin-Dong -Kim, and Sampo Pyysalo. - -For a complete list of the GENIA Project members and contributors, -please refer to http://www.geniaproject.org. - -The GENIA Project has been supported by Grant-in-Aid for Scientific -Research on Priority Area "Genome Information Science" (MEXT, Japan), -Grant-in-Aid for Scientific Research on Priority Area "Systems -Genomics" (MEXT, Japan), Core Research for Evolutional Science & -Technology (CREST) "Information Mobility Project" (JST, Japan), -Solution Oriented Research for Science and Technology (SORST) (JST, -Japan), Genome Network Project (MEXT, Japan) and Grant-in-Aid for -Specially Promoted Research (MEXT, Japan). - -Annotations covered by this license must be attributed as follows: - - Corpus annotations (c) GENIA Project - -Distributions including annotations covered by this licence must -include this license text and Attribution section. - -6. 
References - -- GENIA Project : http://www.geniaproject.org -- PubMed : http://www.pubmed.gov/ -- NLM (United States National Library of Medicine) : http://www.nlm.nih.gov/ -- MEXT (Ministry of Education, Culture, Sports, Science and Technology) : http://www.mext.go.jp/ -- JST (Japan Science and Technology Agency) : http://www.jst.go.jp -""" - -# TODO: Add link to the official dataset URLs here, currently test points to blind test file -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_training_data.tar.gz", - "dev": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_development_data.tar.gz", - # "test": "http://www.nactem.ac.uk/GENIA/current/GENIA-corpus/Relation/GENIA_relation_annotation_test_data.tar.gz" -} -# TODO: Add class labels -_CLASS_LABELS = ["Subunit-Complex", "Protein-Component"] - - -class Genia(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_GENIA, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. 
- # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("dev")}, - ), - # datasets.SplitGenerator( - # name=datasets.Split.TEST, - # gen_kwargs={"filepath": data_files.get("test")}, - # ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - doc_ids, list_of_files = self._get_doc_ids_and_file_paths(filepath) - processed_docs = self._get_processed_docs(doc_ids, list_of_files) - - idx = 0 - for doc in processed_docs: - if "sentences" in doc and "sent_rels" in doc: - sent_start_index = 0 - for sent, rels in zip(doc["sentences"], doc["sent_rels"]): - for rel in rels: - label = rel["label"] - head_start = rel["head_start"] - sent_start_index - head_end = rel["head_end"] - sent_start_index - tail_start = rel["tail_start"] - sent_start_index - tail_end = rel["tail_end"] - sent_start_index - - id_ = str(idx) + "_" + doc["doc_id"] - idx += 1 - - yield id_, { - "tokens": sent["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } - - sent_start_index += len(sent) - else: - for rel in doc["relations"]: - label = rel["label"] - head_start = rel["head_start"] - head_end = rel["head_end"] - tail_start = rel["tail_start"] - tail_end = rel["tail_end"] - - id_ = str(idx) + "_" + doc["doc_id"] - idx += 1 - - yield id_, { - "tokens": doc["tokens"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } - - def _get_doc_ids_and_file_paths(self, path): - list_of_files = {} - for root, dirs, files in os.walk(path): - for file in files: - if file not in ["LICENSE", "README"]: - list_of_files[file] = os.path.join(root, file) - doc_ids = list({file_name.split(".")[0] for file_name in list_of_files.keys()}) - doc_ids.sort() - doc_ids.sort(key=len) - return doc_ids, list_of_files - - def _get_processed_docs(self, doc_ids, list_of_files): - ssplit = False - try: - nlp = spacy.load("en_core_web_sm") - special_case = [{ORTH: "ca."}] - nlp.tokenizer.add_special_case("ca.", special_case) - ssplit = True - except OSError as e: - print(e) - print( - "You have to download the model first to enable sentence splitting: " - "\tpython -m spacy download en_core_web_sm" - ) - print("Resorting to tokenization only") - nlp = English() - processed_docs = [] - for doc_id in doc_ids: - try: - txt_file = list_of_files[doc_id + ".txt"] - a1_file = list_of_files[doc_id + ".a1"] - rel_file = list_of_files[doc_id + ".rel"] - except KeyError: - print(f"Missing annotation file for doc {doc_id}") - continue - - relations = [] - entities = {} - with open(txt_file, encoding="utf-8") as txt: - text = txt.read() - doc = nlp(text) - with open(a1_file, encoding="utf-8") as a1: - for line in a1.readlines(): - if line.startswith("T"): - entity_id, entity = self._retrieve_entity(line, doc, doc_id) - entities[entity_id] = entity - with open(rel_file, encoding="utf-8") as rel: - for line in rel.readlines(): - if line.startswith("T"): - entity_id, entity = self._retrieve_entity(line, doc, doc_id) - entities[entity_id] = entity - elif line.startswith("R"): - relations.append(self._retrieve_relation(line, entities)) - tokens 
= [token.text for token in doc] - processed_doc = { - "doc_id": doc_id, - "text": text, - "tokens": tokens, - "entities": entities, - "relations": relations, - } - if ssplit: - sentences = self._convert_sentences(doc.sents) - sentences = self._fix_ssplit(doc_id, sentences) - sentence_tokens = [] - sentence_relations = [] - left_over_rels_indices = [True for _ in relations] - for sent in sentences: - sent_rels = [] - for idx, relation in enumerate(relations): - if ( - min(relation["head_start"], relation["tail_start"]) >= sent["start"] - and max(relation["head_end"], relation["tail_end"]) <= sent["end"] - ): - sent_rels.append(relation) - left_over_rels_indices[idx] = False - sentence_tokens.append(sent["tokens"]) - sentence_relations.append(sent_rels) - left_over_rels = [] - for indicator, relation in zip(left_over_rels_indices, relations): - if indicator: - left_over_rels.append(relation) - if left_over_rels: - print( - f"Examples in doc {doc_id} where spaCy ssplit were not compatible with relation annotation:" - ) - print([list(sent) for sent in doc.sents]) - print(sentences) - print(left_over_rels) - processed_doc["sentences"] = sentences - processed_doc["sent_rels"] = sentence_relations - processed_docs.append(processed_doc) - return processed_docs - - def _retrieve_entity(self, line, doc, doc_id=""): - cols = line.strip().split() - entity_id, _, start_char, end_char = cols[0:4] - start_char, end_char = int(start_char), int(end_char) - entity_type = " ".join(cols[4:]) - # default alignment mode is strict, but charOffset in annotation sometimes does not translate to token offsets - # well, e.g. charOffsets only cover "LMP1" in "LMP1+" - span = doc.char_span(start_char, end_char, alignment_mode="expand") - if span: - start, end = span.start, span.end - else: - snippet_start = max(0, start_char - 10) - snippet_end = min(len(doc.text), end_char + 10) - raise ValueError( - f"{doc_id} Could not retrieve span for character offsets: " - f"text[{start_char},{end_char}] = {doc.text[start_char:end_char]}\n" - f"{doc.text[snippet_start:snippet_end]}\n" - f"{list(doc)}" - ) - return (entity_id, {"start": start, "end": end, "entity_type": entity_type}) - - def _retrieve_relation(self, line, entities): - cols = line.strip().split() - relation_id, rel_type, arg1, arg2 = cols - arg1 = arg1.split(":")[-1] - head_start, head_end = entities[arg1]["start"], entities[arg1]["end"] - arg2 = arg2.split(":")[-1] - tail_start, tail_end = entities[arg2]["start"], entities[arg2]["end"] - return { - "rel_id": relation_id, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": rel_type, - } - - def _convert_sentences(self, sentences): - sentence_dicts = [] - for sent in sentences: - start, end = sent.start, sent.end - tokens = [token.text for token in sent] - sentence_dicts.append({"tokens": tokens, "start": start, "end": end}) - return sentence_dicts - - def _fix_ssplit(self, doc_id, sentences): - if doc_id == "PMID-8164652": - sentences[2]["tokens"] += sentences[3]["tokens"] - sentences[2]["end"] = sentences[3]["end"] - del sentences[3] - elif doc_id == "PMID-9442380": - sentences[4]["tokens"].append(sentences[5]["tokens"].pop(0)) - sentences[4]["end"] += 1 - sentences[5]["start"] += 1 - elif doc_id == "PMID-10201929": - sentences[4]["tokens"] += sentences[5]["tokens"] - sentences[4]["end"] = sentences[5]["end"] - del sentences[5] - elif doc_id == "PMID-10428853": - sentences[3]["tokens"] += sentences[4]["tokens"] - sentences[3]["end"] = 
sentences[4]["end"] - del sentences[4] - elif doc_id == "PMID-1675604": - sentences[2]["tokens"] += sentences[3]["tokens"] - sentences[2]["end"] = sentences[3]["end"] - del sentences[3] - sentences = [sent for sent in sentences if sent["tokens"]] - return sentences diff --git a/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py b/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py deleted file mode 100644 index e1307e09..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/ontonotes.py +++ /dev/null @@ -1,161 +0,0 @@ -import os - -import datasets -from tqdm import tqdm - -_CITATION = """ -""" - -_DESCRIPTION = """ -OntoNotes 5.0 -""" - -_URL = ( - "https://cloud.dfki.de/owncloud/index.php/s/S8pB4xTBZ3zQEic/download/OntoNotes-5.0-NER-BIO.zip" -) - -_LICENCE = "LDC User Agreement for Non-Members" - -# the label ids for ner_tags -NER_TAGS_DICT = { - "O": 0, - "CARDINAL": 1, - "DATE": 2, - "EVENT": 3, - "FAC": 4, - "GPE": 5, - "LANGUAGE": 6, - "LAW": 7, - "LOC": 8, - "MONEY": 9, - "NORP": 10, - "ORDINAL": 11, - "ORG": 12, - "PERCENT": 13, - "PERSON": 14, - "PRODUCT": 15, - "QUANTITY": 16, - "TIME": 17, - "WORK_OF_ART": 18, -} - - -class OntoNotesConfig(datasets.BuilderConfig): - """BuilderConfig for OntoNotes""" - - def __init__(self, **kwargs): - """BuilderConfig for OntoNotes. - Args: - **kwargs: keyword arguments forwarded to super. - """ - super().__init__(**kwargs) - - -class OntoNotes(datasets.GeneratorBasedBuilder): - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "tokens": datasets.features.Sequence(datasets.Value("string")), - "pos_tags": datasets.features.Sequence(datasets.Value("string")), - "parsing": datasets.features.Sequence(datasets.Value("string")), - "ner_tags": datasets.features.Sequence( - datasets.features.ClassLabel( - names=[ - "O", - "CARDINAL", - "DATE", - "EVENT", - "FAC", - "GPE", - "LANGUAGE", - "LAW", - "LOC", - "MONEY", - "NORP", - "ORDINAL", - "ORG", - "PERCENT", - "PERSON", - "PRODUCT", - "QUANTITY", - "TIME", - "WORK_OF_ART", - ] - ) - ), - } - ), - supervised_keys=None, - homepage="https://catalog.ldc.upenn.edu/LDC2013T19", - citation=_CITATION, - license=_LICENCE, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - urls_to_download = dl_manager.download_and_extract(_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": os.path.join( - urls_to_download, - "onto.train.ner", - ) - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(urls_to_download, "onto.development.ner")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(urls_to_download, "onto.test.ner")}, - ), - ] - - def _generate_examples(self, filepath=None): - num_lines = sum(1 for _ in open(filepath)) - id = 0 - - with open(filepath) as f: - tokens, pos_tags, dependencies, ner_tags = [], [], [], [] - for line in tqdm(f, total=num_lines): - line = line.strip().split() - - if line: - assert len(line) == 4 - token, pos_tag, dependency, ner_tag = line - if ner_tag != "O": - ner_tag = ner_tag.split("-")[1] - tokens.append(token) - pos_tags.append(pos_tag) - dependencies.append(dependency) - ner_tags.append(NER_TAGS_DICT[ner_tag]) - - elif tokens: - # organize a record to be written into json - record = { - "tokens": tokens, - "id": str(id), - "pos_tags": pos_tags, - "parsing": dependencies, - "ner_tags": ner_tags, - } - 
tokens, pos_tags, dependencies, ner_tags = [], [], [], [] - id += 1 - yield record["id"], record - - # take the last sentence - if tokens: - record = { - "tokens": tokens, - "id": str(id), - "pos_tags": pos_tags, - "parsing": dependencies, - "ner_tags": ner_tags, - } - yield record["id"], record diff --git a/src/pytorch_ie/data/datasets/hf_datasets/scierc.py b/src/pytorch_ie/data/datasets/hf_datasets/scierc.py deleted file mode 100644 index 4b3d9b7b..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/scierc.py +++ /dev/null @@ -1,132 +0,0 @@ -"""TODO: Add a description here.""" - - -import json -import os - -import datasets - -_CITATION_SCIERC = """\ -@InProceedings{luan2018multitask, - author = {Luan, Yi and He, Luheng and Ostendorf, Mari and Hajishirzi, Hannaneh}, - title = {Multi-Task Identification of Entities, Relations, and Coreferencefor Scientific Knowledge Graph Construction}, - booktitle = {Proc.\\ Conf. Empirical Methods Natural Language Process. (EMNLP)}, - year = {2018}, -}""" - -# You can copy an official description -_DESCRIPTION = """\ -SCIERC includes annotations for scientific entities, their relations, and coreference clusters -for 500 scientific abstracts. These abstracts are taken from 12 AI conference/workshop proceedings -in four AI communities, from the Semantic Scholar Corpus. SCI-ERC extends previous datasets in scientific -articles SemEval 2017 Task 10 and SemEval 2018 Task 7 by extending entity types, relation types, relation coverage, -and adding cross-sentence relations using coreference links. -""" - -_HOMEPAGE = "http://nlp.cs.washington.edu/sciIE/" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URL = "http://nlp.cs.washington.edu/sciIE/data/sciERC_processed.tar.gz" - -_CLASS_LABELS = [ - "USED-FOR", - "FEATURE-OF", - "HYPONYM-OF", - "PART-OF", - "COMPARE", - "CONJUNCTION", - "EVALUATE-FOR", # label in the data is not documented in annotation guidelines -] - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class SCIERC(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_SCIERC, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - dl_dir = dl_manager.download_and_extract(_DATA_URL) - data_dir = os.path.join(dl_dir, "processed_data/json") - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": os.path.join(data_dir, "train.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": os.path.join(data_dir, "dev.json")}, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"filepath": os.path.join(data_dir, "test.json")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - idx = 0 - for line in f.readlines(): - example = json.loads(line) - sent_start_index = 0 - for sent, rels in zip(example["sentences"], example["relations"]): - for rel in rels: - head_start, head_end, tail_start, tail_end, label = rel - head_start -= sent_start_index - head_end -= sent_start_index - tail_start -= sent_start_index - tail_end -= sent_start_index - - id_ = str(idx) + "_" + example["doc_key"] - idx += 1 - - yield id_, { - "tokens": sent, - "head_start": head_start, - "head_end": head_end + 1, # make end offset exclusive - "tail_start": tail_start, - "tail_end": tail_end + 1, # make end offset exclusive - "label": label, - } - - sent_start_index += len(sent) diff --git a/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py b/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py deleted file mode 100644 index 65879d8f..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/semeval_2010_task_8.py +++ /dev/null @@ -1,185 +0,0 @@ -"""The SemEval-2010 Task 8 on Multi-way classification of semantic relations between pairs of nominals""" - - -import os -import re - -import datasets - -_CITATION = """\ -@inproceedings{hendrickx-etal-2010-semeval, - title = "{S}em{E}val-2010 Task 8: Multi-Way Classification of Semantic Relations between Pairs of Nominals", - author = "Hendrickx, Iris and - Kim, Su Nam and - Kozareva, Zornitsa and - Nakov, Preslav and - {\'O} S{\'e}aghdha, Diarmuid and - Pad{\'o}, Sebastian and - Pennacchiotti, Marco and - Romano, Lorenza and - Szpakowicz, Stan", - booktitle = "Proceedings of the 5th International Workshop on Semantic Evaluation", - month = jul, - year = "2010", - address = "Uppsala, Sweden", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/S10-1006", - pages = "33--38", -} -""" - -_DESCRIPTION = """\ -The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals. 
-The task was designed to compare different approaches to semantic relation classification -and to provide a standard testbed for future research. -""" - -_URL = "https://drive.google.com/uc?export=download&id=0B_jQiLugGTAkMDQ5ZjZiMTUtMzQ1Yy00YWNmLWJlZDYtOWY1ZDMwY2U4YjFk" - -_CLASS_LABELS = [ - "Cause-Effect(e1,e2)", - "Cause-Effect(e2,e1)", - "Component-Whole(e1,e2)", - "Component-Whole(e2,e1)", - "Content-Container(e1,e2)", - "Content-Container(e2,e1)", - "Entity-Destination(e1,e2)", - "Entity-Destination(e2,e1)", - "Entity-Origin(e1,e2)", - "Entity-Origin(e2,e1)", - "Instrument-Agency(e1,e2)", - "Instrument-Agency(e2,e1)", - "Member-Collection(e1,e2)", - "Member-Collection(e2,e1)", - "Message-Topic(e1,e2)", - "Message-Topic(e2,e1)", - "Product-Producer(e1,e2)", - "Product-Producer(e2,e1)", - "Other", -] - - -class SemEval2010Task8(datasets.GeneratorBasedBuilder): - """The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals. - The task was designed to compare different approaches to semantic relation classification - and to provide a standard testbed for future research.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
-            supervised_keys=None,
-            # Homepage of the dataset for documentation
-            homepage="https://semeval2.fbk.eu/semeval2.php?location=tasks&taskid=11",
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        # dl_manager is a datasets.download.DownloadManager that can be used to
-        # download and extract URLs
-        dl_dir = dl_manager.download_and_extract(_URL)
-        data_dir = os.path.join(dl_dir, "SemEval2010_task8_all_data")
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "SemEval2010_task8_training/TRAIN_FILE.TXT"
-                    ),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT"
-                    ),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        with open(filepath, encoding="utf-8") as f:
-            raw_lines = []
-            for line in f:
-                line = line.strip()
-
-                if not line:
-                    idx, example = self._raw_lines_to_example(raw_lines)
-                    yield idx, example
-                    raw_lines = []
-                    continue
-
-                raw_lines.append(line)
-
-    def _raw_lines_to_example(self, raw_lines):
-        raw_id, raw_text = raw_lines[0].split("\t")
-        label = raw_lines[1]
-        id_ = int(raw_id)
-        raw_text = raw_text.strip('"')
-
-        # Some special cases (e.g., missing spaces before entity marker)
-        if id_ in [213, 4612, 6373, 8411, 9867]:
-            raw_text = raw_text.replace("<e2>", " <e2>")
-        if id_ in [2740, 4219, 4784]:
-            raw_text = raw_text.replace("<e1>", " <e1>")
-        if id_ == 9256:
-            raw_text = raw_text.replace("log- jam", "log-jam")
-
-        # necessary if text should be whitespace tokenizeable
-        if id_ in [2609, 7589]:
-            raw_text = raw_text.replace("1 1/2", "1-1/2")
-        if id_ == 10591:
-            raw_text = raw_text.replace("1 1/4", "1-1/4")
-        if id_ == 10665:
-            raw_text = raw_text.replace("6 1/2", "6-1/2")
-
-        raw_text = re.sub(r"([.,!?()])$", r" \1", raw_text)
-        raw_text = re.sub(r"(e[12]>)([',;:\"\(\)])", r"\1 \2", raw_text)
-        raw_text = re.sub(r"([',;:\"\(\)])(<e[12]>)", r"\1 \2", raw_text)
-        raw_text = raw_text.replace("<e1>", " <e1> ")
-        raw_text = raw_text.replace("</e1>", " </e1> ")
-        raw_text = raw_text.replace("<e2>", " <e2> ")
-        raw_text = raw_text.replace("</e2>", " </e2> ")
-
-        tokens = raw_text.split(" ")
-
-        head_start = tokens.index("<e1>")
-        tokens.pop(head_start)
-
-        head_end = tokens.index("</e1>")
-        tokens.pop(head_end)
-
-        tail_start = tokens.index("<e2>")
-        tokens.pop(tail_start)
-
-        tail_end = tokens.index("</e2>")
-        tokens.pop(tail_end)
-
-        return id_, {
-            "tokens": tokens,
-            "head_start": head_start,
-            "head_end": head_end,
-            "tail_start": tail_start,
-            "tail_end": tail_end,
-            "label": label,
-        }
diff --git a/src/pytorch_ie/data/datasets/hf_datasets/tacred.py b/src/pytorch_ie/data/datasets/hf_datasets/tacred.py
deleted file mode 100644
index 3525fb1b..00000000
--- a/src/pytorch_ie/data/datasets/hf_datasets/tacred.py
+++ /dev/null
@@ -1,257 +0,0 @@
-"""TODO: Add a description here."""
-
-
-import json
-import os
-
-import datasets
-
-_CITATION = """\
-@inproceedings{zhang-etal-2017-position,
-    title = "Position-aware Attention and Supervised Data Improve Slot Filling",
-    author = "Zhang, Yuhao  and
-      Zhong, Victor  and
-      Chen, Danqi  and
-      Angeli, Gabor  and
-      Manning, Christopher D.",
-    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
-    month = sep,
-    year = "2017",
-    address = "Copenhagen, Denmark",
-    publisher = "Association for Computational Linguistics",
-    url = "https://www.aclweb.org/anthology/D17-1004",
-    doi = "10.18653/v1/D17-1004",
-    
pages = "35--45", -} - -@inproceedings{alt-etal-2020-tacred, - title = "{TACRED} Revisited: A Thorough Evaluation of the {TACRED} Relation Extraction Task", - author = "Alt, Christoph and - Gabryszak, Aleksandra and - Hennig, Leonhard", - booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", - month = jul, - year = "2020", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/2020.acl-main.142", - doi = "10.18653/v1/2020.acl-main.142", - pages = "1558--1569", -} -""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -This new dataset is designed to solve this great NLP task and is crafted with a lot of care. -""" - -# TODO: Add a link to an official homepage for the dataset here -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# TODO: Add link to the official dataset URLs here -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_PATCH_URLs = { - "dev": "https://raw.githubusercontent.com/DFKI-NLP/tacrev/master/patch/dev_patch.json", - "test": "https://raw.githubusercontent.com/DFKI-NLP/tacrev/master/patch/test_patch.json", -} - -_CLASS_LABELS = [ - "no_relation", - "org:alternate_names", - "org:city_of_headquarters", - "org:country_of_headquarters", - "org:dissolved", - "org:founded", - "org:founded_by", - "org:member_of", - "org:members", - "org:number_of_employees/members", - "org:parents", - "org:political/religious_affiliation", - "org:shareholders", - "org:stateorprovince_of_headquarters", - "org:subsidiaries", - "org:top_members/employees", - "org:website", - "per:age", - "per:alternate_names", - "per:cause_of_death", - "per:charges", - "per:children", - "per:cities_of_residence", - "per:city_of_birth", - "per:city_of_death", - "per:countries_of_residence", - "per:country_of_birth", - "per:country_of_death", - "per:date_of_birth", - "per:date_of_death", - "per:employee_of", - "per:origin", - "per:other_family", - "per:parents", - "per:religion", - "per:schools_attended", - "per:siblings", - "per:spouse", - "per:stateorprovince_of_birth", - "per:stateorprovince_of_death", - "per:stateorprovinces_of_residence", - "per:title", -] - - -def convert_ptb_token(token: str) -> str: - """Convert PTB tokens to normal tokens""" - return { - "-lrb-": "(", - "-rrb-": ")", - "-lsb-": "[", - "-rsb-": "]", - "-lcb-": "{", - "-rcb-": "}", - }.get(token.lower(), token) - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class TACRED(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - # This is an example of a dataset with multiple configurations. - # If you don't want/need to define several sub-sets in your dataset, - # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. 
- - # If you need to make complex sub-parts in the datasets with configurable options - # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig - # BUILDER_CONFIG_CLASS = MyBuilderConfig - - # You will be able to load one or the other configurations in the following list with - # data = datasets.load_dataset('my_dataset', 'first_domain') - # data = datasets.load_dataset('my_dataset', 'second_domain') - BUILDER_CONFIGS = [ - datasets.BuilderConfig( - name="original", version=datasets.Version("1.0.0"), description="The original TACRED." - ), - datasets.BuilderConfig( - name="revised", - version=datasets.Version("1.0.0"), - description="The revised TACRED (corrected labels in dev and test split).", - ), - ] - - DEFAULT_CONFIG_NAME = "original" # type: ignore - - @property - def manual_download_instructions(self): - return ( - "To use TACRED you have to download it manually. " - "It is available via the LDC at https://catalog.ldc.upenn.edu/LDC2018T24" - "Please extract all files in one folder and load the dataset with: " - "`datasets.load_dataset('tacred', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - features = datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ) - - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=features, # Here we define them above because they are different between the two configurations - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - patch_files = {} - if self.config.name == "revised": - patch_files = dl_manager.download_and_extract(_PATCH_URLs) - - data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - - if not os.path.exists(data_dir): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('tacred', data_dir=...)` that includes the unzipped files from the TACRED_LDC zip. 
Manual download instructions: {}".format( - data_dir, self.manual_download_instructions - ) - ) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": os.path.join(data_dir, "train.json"), - "patch_filepath": None, - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": os.path.join(data_dir, "test.json"), - "patch_filepath": patch_files.get("test"), - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": os.path.join(data_dir, "dev.json"), - "patch_filepath": patch_files.get("dev"), - }, - ), - ] - - def _generate_examples(self, filepath, patch_filepath): - """Yields examples.""" - # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method. - # It is in charge of opening the given file and yielding (key, example) tuples from the dataset - # The key is not important, it's more here for legacy reason (legacy from tfds) - patch_examples = {} - if patch_filepath is not None: - with open(patch_filepath, encoding="utf-8") as f: - patch_examples = {example["id"]: example for example in json.load(f)} - - with open(filepath, encoding="utf-8") as f: - data = json.load(f) - for example in data: - id_ = example["id"] - - if id_ in patch_examples: - example.update(patch_examples[id_]) - - yield id_, { - "tokens": [convert_ptb_token(token) for token in example["token"]], - "head_start": example["subj_start"], - "head_end": example["subj_end"] + 1, # make end offset exclusive - "tail_start": example["obj_start"], - "tail_end": example["obj_end"] + 1, # make end offset exclusive - "label": example["relation"], - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/webred.py b/src/pytorch_ie/data/datasets/hf_datasets/webred.py deleted file mode 100644 index 8ae785a3..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/webred.py +++ /dev/null @@ -1,746 +0,0 @@ -"""TODO: Add a description here.""" - - -import os -import re - -import datasets -import tensorflow as tf -from spacy.lang.en import English - -_CITATION_WEBRED = """\ -@misc{ormandi2021webred, - title={WebRED: Effective Pretraining And Finetuning For Relation Extraction On The Web}, - author={Robert Ormandi and Mohammad Saleh and Erin Winter and Vinay Rao}, - year={2021}, - eprint={2102.09681}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2102.09681}, -} -""" - -# You can copy an official description -_DESCRIPTION = """\ -A dataset for extracting relationships from a variety of text found on the World Wide Web. Text -on the web has diverse surface forms including writing styles, complexity and grammar. This dataset collects -sentences from a variety of webpages and documents that represent a variety of those categories. In each sentence, -there will be a subject and object entities tagged with subject SUBJ{...} and object OBJ{...}, respectively. The two -entities are either related by a relation from a set of pre-defined ones or has no relation. - -More information about the dataset can be found in our paper: https://arxiv.org/abs/2102.09681 -""" - -_HOMEPAGE = "https://github.com/google-research-datasets/WebRED" - -_LICENSE = """\ -This data is licensed by Google LLC under a Creative Commons Attribution 4.0 International License ( -http://creativecommons.org/licenses/by/4.0/) Users will be allowed to modify and repost it, and we encourage them to -analyze and publish research based on the data. 
-""" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URL = "https://github.com/google-research-datasets/WebRED" - -_CLASS_LABELS = [ - "based on", - "part of the series", - "drug used for treatment", - "architectural style", - "writable file format", - "work location", - "position held", - "followed by", - "flash point", - "indigenous to", - "Mohs' hardness", - "political alignment", - "located in protected area", - "translator", - "director", - "highest judicial authority", - "producer", - "compressive modulus of elasticity", - "series spin-off", - "quantity", - "lyrics by", - "cell component", - "medical condition treated", - "place of death", - "number of seats", - "record label", - "league level above", - "military branch", - "origin of the watercourse", - "diameter", - "conversion to SI unit", - "works in collection", - "presenter", - "chairperson", - "temperature", - "currency", - "frequency", - "standards body", - "manufacturer", - "location of final assembly", - "coat of arms", - "astronaut mission", - "length", - "publication date", - "place of publication", - "country of citizenship", - "minimal lethal dose", - "game mechanics", - "afflicts", - "used by", - "oxidation state", - "mother", - "affiliation", - "head of state", - "creator", - "defendant", - "head coach of sports team", - "country", - "developer", - "approved by", - "cover artist", - "lake inflows", - "separated from", - "operating area", - "water as percent of area", - "head coach", - "update method", - "floruit", - "party chief representative", - "commander of", - "gestation period", - "religious order", - "school district", - "depicted by", - "publisher", - "excavation director", - "airline alliance", - "librettist", - "executive producer", - "donated by", - "mushroom ecological type", - "iconographic symbol", - "speed limit", - "number of representatives in an organization/legislature", - "subsidiary", - "educated at", - "number of participants", - "founded by", - "country of origin", - "family", - "package management system", - "subject has role", - "sibling", - "interchange station", - "facet of", - "decays to", - "repeals", - "legislative body", - "occupant", - "atomic number", - "CPU", - "GUI toolkit or framework", - "has parts of the class", - "director of photography", - "shares border with", - "parent organization", - "population", - "upper flammable limit", - "performer", - "isospin z-component", - "number of injured", - "number of seasons", - "choreographer", - "replaces", - "doctoral advisor", - "official residence", - "top-level Internet domain", - "VAT-rate", - "point in time", - "distance from Earth", - "public holiday", - "languages spoken, written or signed", - "located on astronomical location", - "solved by", - "designed by", - "twinned administrative body", - "encoded by", - "located in time zone", - "canonization status", - "date of official opening", - "student", - "brand", - "refractive index", - "inflation rate", - "home venue", - "neutron number", - "chief operating officer", - "lowest point", - "signatory", - "consecrator", - "model item", - "time of earliest written record", - "area", - "terminus location", - "significant event", - "inspired by", - "backup or reserve team or crew", - "maximum number of players", - "talk show guest", - "number of deaths", - "exclave of", - "maximal incubation period in humans", - "league", - "film crew 
member", - "electric charge", - "symptoms", - "replaced by", - "nominated for", - "religion", - "wavelength", - "total produced", - "time of discovery or invention", - "invasive to", - "use", - "negative therapeutic predictor", - "item operated", - "participating team", - "political ideology", - "compulsory education (maximum age)", - "applies to jurisdiction", - "history of topic", - "author", - "mass", - "heart rate", - "killed by", - "characters", - "diocese", - "Erdős number", - "time period", - "has part", - "age of candidacy", - "semi-major axis", - "dual to", - "official language", - "production company", - "replaced synonym (for nom. nov.)", - "main regulatory text", - "participant of", - "head of government", - "age of majority", - "heritage designation", - "drafted by", - "family relationship degree", - "discontinued date", - "operator", - "term length of office", - "spin quantum number", - "vehicles per capita (1000)", - "enclave within", - "embodied energy", - "represents", - "partner", - "stepparent", - "taxon synonym", - "time of spacecraft launch", - "conversion to standard unit", - "nominal GDP", - "lower flammable limit", - "readable file format", - "minimal incubation period in humans", - "connecting line", - "located in the administrative territorial entity", - "place of burial", - "contains administrative territorial entity", - "statistical leader", - "sports discipline competed in", - "tensile modulus of elasticity", - "research site", - "connects with", - "has cause", - "date of birth", - "location", - "age of consent", - "mains voltage", - "industry", - "basionym", - "marriageable age", - "visitors per year", - "Poisson's ratio", - "suicide rate", - "carries scientific instrument", - "connecting service", - "place of detention", - "crew member", - "place served by transport hub", - "organisation directed from the office or person", - "memory capacity", - "primary destinations", - "relative permeability", - "parent club", - "organizer", - "space launch vehicle", - "encodes", - "architect", - "notable work", - "commissioned by", - "depicts", - "individual tax rate", - "website account on", - "central bank", - "software engine", - "numeric value", - "official religion", - "wingspan", - "occupation", - "member count", - "ceiling exposure limit", - "date of first performance", - "discoverer or inventor", - "described by source", - "executive body", - "parent taxon", - "pole position", - "sports league level", - "pKa", - "genetic association", - "mountain range", - "part of", - "legal form", - "regulates (molecular biology)", - "end time", - "month of the year", - "employer", - "from fictional universe", - "spouse", - "copyright holder", - "lake outflow", - "solubility", - "located in or next to body of water", - "IDLH", - "office held by head of the organisation", - "office held by head of government", - "territory claimed by", - "tracklist", - "takes place in fictional universe", - "mount", - "season of club or team", - "this taxon is source of", - "theme music", - "Alexa rank", - "film editor", - "derivative work", - "territory overlaps", - "perimeter", - "price", - "secretary general", - "frequency of event", - "mascot", - "maintained by", - "duration", - "screenwriter", - "life expectancy", - "minimum number of players", - "winner", - "native language", - "start time", - "highest point", - "legislated by", - "parity", - "melting point", - "location of formation", - "ultimate tensile strength", - "defined daily dose", - "chief executive officer", - "number of parts of 
this work of art", - "endemic to", - "subclass of", - "dissolved, abolished or demolished", - "service entry", - "follows", - "number of constituencies", - "structural engineer", - "writing system", - "capital of", - "taxonomic type", - "next higher rank", - "commemorates", - "continent", - "relative", - "residence time of water", - "number of speakers", - "conferred by", - "Gram staining", - "work period (start)", - "sport", - "has effect", - "tributary", - "place of birth", - "member of sports team", - "relative permittivity", - "instrument", - "interested in", - "academic degree", - "location of discovery", - "electronegativity", - "located on terrain feature", - "conflict", - "height", - "short-term exposure limit", - "start point", - "original language of film or TV show", - "publication interval", - "amended by", - "material used", - "located in present-day administrative territorial entity", - "drainage basin", - "lakes on river", - "league level below", - "licensed to broadcast to", - "residence", - "after a work by", - "present in work", - "basin country", - "product certification", - "mouth of the watercourse", - "for work", - "has quality", - "uses", - "time-weighted average exposure limit", - "license", - "significant person", - "archives at", - "natural product of taxon", - "anthem", - "adjacent station", - "real gross domestic product growth rate", - "carries", - "member of political party", - "professional or sports partner", - "ethnic group", - "member of", - "platform", - "destination point", - "sports season of league or competition", - "country for sport", - "account charge / subscription fee", - "patron saint", - "compulsory education (minimum age)", - "route of administration", - "antiparticle", - "sponsor", - "floors above ground", - "timezone offset", - "programming language", - "stock exchange", - "opposite of", - "mouthpiece", - "unemployment rate", - "watershed area", - "editor", - "collection", - "award received", - "designated as terrorist by", - "illustrator", - "student of", - "dedicated to", - "youth wing", - "total fertility rate", - "elevation above sea level", - "repealed by", - "practiced by", - "named after", - "movement", - "flattening", - "position played on team / speciality", - "median lethal dose", - "employees", - "physically interacts with", - "highway system", - "parent peak", - "participant", - "number of cases", - "editor-in-chief", - "instance of", - "sidekick of", - "width", - "cites", - "child", - "has edition", - "doctoral student", - "original network", - "board member", - "service retirement", - "anatomical location", - "biological variant of", - "Euler characteristic", - "diplomatic relation", - "number of children", - "narrative location", - "incidence", - "allegiance", - "airline hub", - "vapor pressure", - "constellation", - "voice actor", - "number of platform tracks", - "work period (end)", - "military rank", - "vertical depth", - "vessel class", - "parent astronomical body", - "director/manager", - "owner of", - "distribution", - "court", - "angular resolution", - "located on street", - "owned by", - "retirement age", - "said to be the same as", - "language used", - "applies to part", - "business division", - "contains settlement", - "main subject", - "operating system", - "authority", - "number of representations", - "ancestral home", - "radius", - "binding energy", - "general manager", - "measured by", - "next lower rank", - "cast member", - "thermal conductivity", - "health specialty", - "father", - "worshipped by", - 
"headquarters location", - "child astronomical body", - "distributor", - "noble title", - "studied by", - "officeholder", - "genre", - "vaccine for", - "inception", - "produced by", - "narrator", - "different from", - "volcano observatory", - "art director", - "objective of project or action", - "composer", - "hardness", - "edition or translation of", - "isospin quantum number", - "foundational text", - "broadcast by", - "office held by head of state", - "boiling point", - "minimum wavelength of sensitivity", - "speaker", - "studies", - "capital", - "terminus", - "pressure", - "number of episodes", - "decomposition point", - "filming location", - "product or material produced", - "gene inversion association with", - "found in taxon", - "field of work", - "language of work or name", - "ranking", - "crosses", - "culture", - "location of first performance", - "dialect of", - "date of death", - "influenced by", -] - - -class WebRedConfig(datasets.BuilderConfig): - """BuilderConfig for WebRed.""" - - def __init__( - self, - data_url, - citation, - url, - class_labels, - description, - **kwargs, - ): - """BuilderConfig for WebRed. - Args: - data_url: `string`, url to download the zip file from - citation: `string`, citation for the data set - url: `string`, url for information about the data set - class_labels: `list[string]`, the list of classes if the label is - categorical. If not provided, then the label will be of type - `datasets.Value('float32')`. - **kwargs: keyword arguments forwarded to super. - """ - super().__init__(version=datasets.Version("1.0.0", ""), **kwargs) - self.class_labels = class_labels - self.data_url = data_url - self.citation = citation - self.url = url - self.description = description - - -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case -class WebRed(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - BUILDER_CONFIGS = [ - WebRedConfig( - name="webred_5", - data_url="https://github.com/google-research-datasets/WebRED/raw/main/webred_5.tfrecord", - citation=_CITATION_WEBRED, - url=_HOMEPAGE, - class_labels=_CLASS_LABELS, - description=_DESCRIPTION - + "\nEach example in WebRED 5 was annotated by exactly 5 independent human annotators.", - ), - WebRedConfig( - name="webred_21", - data_url="https://github.com/google-research-datasets/WebRED/raw/main/webred_21.tfrecord", - citation=_CITATION_WEBRED, - url=_HOMEPAGE, - class_labels=_CLASS_LABELS, - description=_DESCRIPTION - + "\nIn WebRED 2+1, each example was annotated by 2 independent annotators. If they " - "disagreed, an additional annotator (+1) was assigned to the example who also " - "provided a disambiguating annotation.", - ), - ] - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=self.config.description, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. 
- supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_WEBRED, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - file_path = dl_manager.download_and_extract(self.config.data_url) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": file_path}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - dataset = tf.data.TFRecordDataset(filepath) - idx = 0 - nlp = English() - - def get_feature_value(feature, key): - return feature[key].bytes_list.value[0].decode("utf-8") - - for raw_sentence in dataset: - example = tf.train.Example() - example.ParseFromString(raw_sentence.numpy()) - - rel_id = get_feature_value(example.features.feature, "relation_id") - sentence = get_feature_value(example.features.feature, "sentence") - label = get_feature_value(example.features.feature, "relation_name") - - # 1. Find OBJ{} and SUBJ{} marker indices - subj = re.search("SUBJ{.+?}", sentence) - obj = re.search("OBJ{.+?}", sentence) - if not subj or not obj: - print(f"Did not find OBJ or SUBJ marker in sentence: {sentence}") - continue - else: - subj_start, subj_end = subj.span() - obj_start, obj_end = obj.span() - # 2. OPTIONAL: Replace with source and target strings (they contain special characters while the sentence - # contains standard writing?) - # source = get_feature_value(sentence.features.feature, "source_name") - # target = get_feature_value(sentence.features.feature, "target_name") - - # 3. Remove markers and adjust indices: divide sentence at marker indices, remove marker, merge - # what if subj or obj is at the start or end of the sentence? - cleaned_sentence = "" - if subj_start < obj_start: - cleaned_sentence += sentence[:subj_start] - cleaned_sentence += sentence[subj_start + 5 : subj_end - 1] - cleaned_sentence += sentence[subj_end:obj_start] - cleaned_sentence += sentence[obj_start + 4 : obj_end - 1] - cleaned_sentence += sentence[obj_end:] - subj_end -= 6 - obj_start -= 6 - obj_end -= 11 - else: - cleaned_sentence += sentence[:obj_start] - cleaned_sentence += sentence[obj_start + 4 : obj_end - 1] - cleaned_sentence += sentence[obj_end:subj_start] - cleaned_sentence += sentence[subj_start + 5 : subj_end - 1] - cleaned_sentence += sentence[subj_end:] - obj_end -= 5 - subj_start -= 5 - subj_end -= 11 - # 4. 
Tokenize and calculate token indices from char offsets - doc = nlp(cleaned_sentence) - tokens = [token.text for token in doc] - subj_span = doc.char_span(subj_start, subj_end, alignment_mode="expand") - head_start = subj_span.start - head_end = subj_span.end - obj_span = doc.char_span(obj_start, obj_end, alignment_mode="expand") - tail_start = obj_span.start - tail_end = obj_span.end - - id_ = str(idx) + "_" + rel_id - idx += 1 - - yield id_, { - "tokens": tokens, - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py b/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py deleted file mode 100644 index 02aa2a7d..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/wiki80.py +++ /dev/null @@ -1,199 +0,0 @@ -"""TODO: Add a description here.""" - - -import json - -import datasets - -_CITATION_WIKI80 = """\ -@inproceedings{han-etal-2019-opennre, - title = "{O}pen{NRE}: An Open and Extensible Toolkit for Neural Relation Extraction", - author = "Han, Xu and Gao, Tianyu and Yao, Yuan and Ye, Deming and Liu, Zhiyuan and Sun, Maosong", - booktitle = "Proceedings of EMNLP-IJCNLP: System Demonstrations", - year = "2019", - url = "https://www.aclweb.org/anthology/D19-3029", - doi = "10.18653/v1/D19-3029", - pages = "169--174" -}""" - -# TODO: Add description of the dataset here -# You can copy an official description -_DESCRIPTION = """\ -Wiki80 is derived from FewRel, a large -scale few-shot dataset. It contains 80 relations and -56,000 instances from Wikipedia and Wikidata.""" - -# TODO: Add a link to an official homepage for the dataset here -_HOMEPAGE = "" - -# TODO: Add the licence for the dataset here if you can find it -_LICENSE = "" - -# The HuggingFace dataset library don't host the datasets but only point to the original files -# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_DATA_URLs = { - "train": "https://thunlp.oss-cn-qingdao.aliyuncs.com/opennre/benchmark/wiki80/wiki80_train.txt", - "validation": "https://thunlp.oss-cn-qingdao.aliyuncs.com/opennre/benchmark/wiki80/wiki80_val.txt", -} - -_CLASS_LABELS = [ - "place served by transport hub", - "mountain range", - "religion", - "participating team", - "contains administrative territorial entity", - "head of government", - "country of citizenship", - "original network", - "heritage designation", - "performer", - "participant of", - "position held", - "has part", - "location of formation", - "located on terrain feature", - "architect", - "country of origin", - "publisher", - "director", - "father", - "developer", - "military branch", - "mouth of the watercourse", - "nominated for", - "movement", - "successful candidate", - "followed by", - "manufacturer", - "instance of", - "after a work by", - "member of political party", - "licensed to broadcast to", - "headquarters location", - "sibling", - "instrument", - "country", - "occupation", - "residence", - "work location", - "subsidiary", - "participant", - "operator", - "characters", - "occupant", - "genre", - "operating system", - "owned by", - "platform", - "tributary", - "winner", - "said to be the same as", - "composer", - "league", - "record label", - "distributor", - "screenwriter", - "sports season of league or competition", - "taxon rank", - "location", - "field of work", - "language of work or name", - "applies to jurisdiction", - "notable work", - "located in the administrative territorial entity", - 
"crosses", - "original language of film or TV show", - "competition class", - "part of", - "sport", - "constellation", - "position played on team / speciality", - "located in or next to body of water", - "voice type", - "follows", - "spouse", - "military rank", - "mother", - "member of", - "child", - "main subject", -] - - -class Wiki80(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - VERSION = datasets.Version("1.0.0") # type: ignore - - def _info(self): - return datasets.DatasetInfo( - # This is the description that will appear on the datasets page. - description=_DESCRIPTION, - # This defines the different columns of the dataset and their types - features=datasets.Features( - { - "tokens": datasets.Sequence(datasets.Value("string")), - "head_start": datasets.Value("int32"), - "head_end": datasets.Value("int32"), - "tail_start": datasets.Value("int32"), - "tail_end": datasets.Value("int32"), - "label": datasets.ClassLabel(names=_CLASS_LABELS), - } - ), - # If there's a common (input, target) tuple from the features, - # specify them here. They'll be used if as_supervised=True in - # builder.as_dataset. - supervised_keys=None, - # Homepage of the dataset for documentation - homepage=_HOMEPAGE, - # License for the dataset if available - license=_LICENSE, - # Citation for the dataset - citation=_CITATION_WIKI80, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. 
- # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - data_files = dl_manager.download_and_extract(_DATA_URLs) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": data_files.get("train")}, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"filepath": data_files.get("validation")}, - ), - ] - - def _generate_examples(self, filepath): - """Yields examples.""" - with open(filepath, encoding="utf-8") as f: - for idx, line in enumerate(f.readlines()): - example = json.loads(line) - label = example["relation"] - id_ = str(idx) - - head_token_positions = example["h"]["pos"] - tail_token_positions = example["t"]["pos"] - - head_start = head_token_positions[0] - head_end = head_token_positions[-1] - tail_start = tail_token_positions[0] - tail_end = tail_token_positions[-1] - - yield id_, { - "tokens": example["token"], - "head_start": head_start, - "head_end": head_end, - "tail_start": tail_start, - "tail_end": tail_end, - "label": label, - } diff --git a/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py b/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py deleted file mode 100644 index ffd3a729..00000000 --- a/src/pytorch_ie/data/datasets/hf_datasets/wikigold.py +++ /dev/null @@ -1,122 +0,0 @@ -import datasets -from tqdm import tqdm - -_CITATION = """ -@inproceedings{balasuriya-etal-2009-named, - title = "Named Entity Recognition in Wikipedia", - author = "Balasuriya, Dominic and - Ringland, Nicky and - Nothman, Joel and - Murphy, Tara and - Curran, James R.", - booktitle = "Proceedings of the 2009 Workshop on The People{'}s Web Meets {NLP}: - Collaboratively Constructed Semantic Resources (People{'}s Web)", - month = aug, - year = "2009", - address = "Suntec, Singapore", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/W09-3302", - pages = "10--18", -} -""" - -_LICENCE = "CC-BY 4.0" - -_DESCRIPTION = """ -WikiGold dataset. -""" - -_URL = ( - "https://github.com/juand-r/entity-recognition-datasets/raw/master/" - "data/wikigold/CONLL-format/data/wikigold.conll.txt" -) - -# the label ids -NER_TAGS_DICT = { - "O": 0, - "PER": 1, - "LOC": 2, - "ORG": 3, - "MISC": 4, -} - - -class WikiGoldConfig(datasets.BuilderConfig): - """BuilderConfig for WikiGold""" - - def __init__(self, **kwargs): - """BuilderConfig for WikiGold. - Args: - **kwargs: keyword arguments forwarded to super. 
- """ - super().__init__(**kwargs) - - -class WikiGold(datasets.GeneratorBasedBuilder): - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "tokens": datasets.features.Sequence(datasets.Value("string")), - "ner_tags": datasets.features.Sequence( - datasets.features.ClassLabel(names=["O", "PER", "LOC", "ORG", "MISC"]) - ), - } - ), - supervised_keys=None, - citation=_CITATION, - license=_LICENCE, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - urls_to_download = dl_manager.download_and_extract(_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={"filepath": urls_to_download}, - ), - ] - - def _generate_examples(self, filepath=None): - num_lines = sum(1 for _ in open(filepath)) - id = 0 - - with open(filepath) as f: - tokens, ner_tags = [], [] - for line in tqdm(f, total=num_lines): - line = line.strip().split() - - if line: - assert len(line) == 2 - token, ner_tag = line - - if token == "-DOCSTART-": - continue - - tokens.append(token) - if ner_tag != "O": - ner_tag = ner_tag.split("-")[1] - ner_tags.append(NER_TAGS_DICT[ner_tag]) - - elif tokens: - # organize a record to be written into json - record = { - "tokens": tokens, - "id": str(id), - "ner_tags": ner_tags, - } - tokens, ner_tags = [], [] - id += 1 - yield record["id"], record - - # take the last sentence - if tokens: - record = { - "tokens": tokens, - "id": str(id), - "ner_tags": ner_tags, - } - yield record["id"], record