From ee15ddfb2587bd1d25359de7be477d15df387e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= <155443293+NiklasKoehneckeTNG@users.noreply.github.com> Date: Thu, 15 Feb 2024 12:21:51 +0100 Subject: [PATCH] Il 258 Split up EvaluationRepository (#503) --- src/examples/classification.ipynb | 10 +- src/examples/document_index.ipynb | 10 +- src/examples/evaluation.ipynb | 20 +- src/examples/human_evaluation.ipynb | 22 +- src/examples/qa.ipynb | 10 +- src/examples/quickstart_task.ipynb | 21 +- src/examples/summarize.ipynb | 10 +- src/intelligence_layer/core/__init__.py | 6 + src/intelligence_layer/evaluation/__init__.py | 31 +- .../evaluation/data_storage/__init__.py | 0 .../{ => data_storage}/dataset_repository.py | 40 +- .../evaluation_repository.py | 298 +++++++++------ .../evaluation/data_storage/run_repository.py | 267 +++++++++++++ .../evaluation/data_storage/utils.py | 15 + src/intelligence_layer/evaluation/domain.py | 2 +- .../evaluation/evaluator.py | 358 ++---------------- .../evaluation/hugging_face.py | 4 +- .../instruct_comparison_argilla_evaluator.py | 15 +- src/intelligence_layer/evaluation/run.py | 22 +- src/intelligence_layer/evaluation/runner.py | 24 +- .../use_cases/classify/classify.py | 16 +- .../use_cases/summarize/summarize.py | 15 +- tests/conftest.py | 6 + tests/evaluation/conftest.py | 14 +- tests/evaluation/test_argilla_evaluator.py | 13 +- .../evaluation/test_evaluation_repository.py | 81 +--- tests/evaluation/test_evaluator.py | 31 +- ...t_instruct_comparison_argilla_evaluator.py | 25 +- tests/evaluation/test_run.py | 10 - tests/evaluation/test_run_repository.py | 89 +++++ tests/evaluation/test_runner.py | 18 +- tests/use_cases/classify/test_classify.py | 13 +- .../classify/test_prompt_based_classify.py | 12 +- tests/use_cases/summarize/test_summarize.py | 22 +- 34 files changed, 860 insertions(+), 690 deletions(-) create mode 100644 src/intelligence_layer/evaluation/data_storage/__init__.py rename src/intelligence_layer/evaluation/{ => data_storage}/dataset_repository.py (85%) rename src/intelligence_layer/evaluation/{ => data_storage}/evaluation_repository.py (60%) create mode 100644 src/intelligence_layer/evaluation/data_storage/run_repository.py create mode 100644 src/intelligence_layer/evaluation/data_storage/utils.py create mode 100644 tests/evaluation/test_run_repository.py diff --git a/src/examples/classification.ipynb b/src/examples/classification.ipynb index 6c6568a77..db9b0c178 100644 --- a/src/examples/classification.ipynb +++ b/src/examples/classification.ipynb @@ -48,6 +48,10 @@ "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", "from intelligence_layer.core import Chunk, InMemoryTracer\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "text_to_classify = Chunk(\"In the distant future, a space exploration party embarked on a thrilling journey to the uncharted regions of the galaxy. \\n\\\n", "With excitement in their hearts and the cosmos as their canvas, they ventured into the unknown, discovering breathtaking celestial wonders. 
\\n\\\n", "As they gazed upon distant stars and nebulas, they forged unforgettable memories that would forever bind them as pioneers of the cosmos.\")\n", @@ -425,7 +429,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -439,9 +443,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/document_index.ipynb b/src/examples/document_index.ipynb index 7a32bfc20..454d219ed 100644 --- a/src/examples/document_index.ipynb +++ b/src/examples/document_index.ipynb @@ -49,6 +49,10 @@ "\n", "from intelligence_layer.connectors import DocumentIndexClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "\n", "document_index = DocumentIndexClient(token=getenv(\"AA_TOKEN\"), base_document_index_url = \"https://document-index.aleph-alpha.com\")\n", "?document_index" @@ -282,7 +286,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-jSYEeheU-py3.10", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -296,9 +300,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/evaluation.ipynb b/src/examples/evaluation.ipynb index 55295fa6a..2dd5f388e 100644 --- a/src/examples/evaluation.ipynb +++ b/src/examples/evaluation.ipynb @@ -44,19 +44,21 @@ "from dotenv import load_dotenv\n", "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", - "from intelligence_layer.evaluation import InMemoryEvaluationRepository, InMemoryDatasetRepository, Runner\n", + "from intelligence_layer.evaluation import InMemoryEvaluationRepository, InMemoryRunRepository, InMemoryDatasetRepository, Runner\n", "from intelligence_layer.use_cases import SingleLabelClassifyEvaluator, PromptBasedClassify\n", "\n", "load_dotenv()\n", "\n", "client = LimitedConcurrencyClient.from_token(os.getenv(\"AA_TOKEN\"))\n", "task = PromptBasedClassify(client)\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "dataset_repository = InMemoryDatasetRepository()\n", + "run_repository = InMemoryRunRepository()\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "\n", "\n", "\n", - "evaluator = SingleLabelClassifyEvaluator(evaluation_repository, dataset_repository, \"singel-label-classify\")\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"prompt-based-classify\")\n" + "evaluator = SingleLabelClassifyEvaluator(dataset_repository, run_repository, evaluation_repository, \"singel-label-classify\")\n", + "runner = Runner(task, dataset_repository, run_repository, \"prompt-based-classify\")\n" ] }, { @@ -257,8 +259,8 @@ " ]\n", ")\n", "\n", - "embedding_based_classify_evaluator = MultiLabelClassifyEvaluator(evaluation_repository, dataset_repository, \"multi-label-classify\", threshold=0.6)\n", - "embedding_based_classify_runner = Runner(embedding_based_classify, evaluation_repository, dataset_repository, \"embedding-based-classify\")\n" + "embedding_based_classify_evaluator = MultiLabelClassifyEvaluator(dataset_repository, run_repository, evaluation_repository, \"multi-label-classify\", threshold=0.6)\n", + "embedding_based_classify_runner = 
Runner(embedding_based_classify,dataset_repository, run_repository, \"embedding-based-classify\")\n" ] }, { @@ -305,7 +307,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-tfT-HG2V-py3.11", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -319,9 +321,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb index 13be43884..26e76e211 100644 --- a/src/examples/human_evaluation.ipynb +++ b/src/examples/human_evaluation.ipynb @@ -62,6 +62,7 @@ " Example,\n", " InMemoryDatasetRepository,\n", " InMemoryEvaluationRepository,\n", + " InMemoryRunRepository,\n", " Runner,\n", " SuccessfulExampleOutput\n", ")\n", @@ -171,8 +172,8 @@ "outputs": [], "source": [ "task = Instruct(client, model=\"luminous-base-control\")\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"Instruct\")\n", + "run_repository = InMemoryRunRepository()\n", + "runner = Runner(task,dataset_repository, run_repository, \"Instruct\")\n", "run_overview = runner.run_dataset(dataset_id)" ] }, @@ -321,10 +322,12 @@ " \n", "argilla_client = DefaultArgillaClient()\n", "workspace_id = argilla_client.create_workspace(\"test\")\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", "\n", "evaluator = InstructArgillaEvaluator(\n", - " ArgillaEvaluationRepository(evaluation_repository, argilla_client),\n", " dataset_repository,\n", + " run_repository,\n", + " ArgillaEvaluationRepository(evaluation_repository, argilla_client),\n", " \"instruct\",\n", " workspace_id,\n", " fields,\n", @@ -371,11 +374,18 @@ " output = evaluator.aggregate_evaluation(eval_overview.id)\n", " print(output.statistics)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-WXd7Z3vu-py3.11", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -389,9 +399,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/qa.ipynb b/src/examples/qa.ipynb index ace60726a..fa9e15bae 100644 --- a/src/examples/qa.ipynb +++ b/src/examples/qa.ipynb @@ -25,6 +25,10 @@ "load_dotenv()\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "client = LimitedConcurrencyClient.from_token(getenv(\"AA_TOKEN\"))" ] }, @@ -338,7 +342,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -352,9 +356,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb index ab9e44fdb..e5669b47f 100644 --- a/src/examples/quickstart_task.ipynb +++ b/src/examples/quickstart_task.ipynb @@ -289,13 +289,6 @@ " KeywordExtractionAggregatedEvaluation,\n", " ]\n", "):\n", - " def 
__init__(\n", - " self, evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, description: str\n", - " ) -> None:\n", - " \"\"\"We recommend adding the task to the init method of the evaluator\n", - "\n", - " This allows for easy comparing of different implementations of the same task.\"\"\"\n", - " super().__init__(evaluation_repository, dataset_repository, description)\n", "\n", " def do_evaluate(\n", " self,\n", @@ -342,12 +335,14 @@ "outputs": [], "source": [ "from intelligence_layer.core import NoOpTracer\n", - "from intelligence_layer.evaluation import InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, Example\n", + "from intelligence_layer.evaluation import InMemoryDatasetRepository, InMemoryEvaluationRepository, InMemoryRunRepository, Runner, Example\n", "\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "dataset_repository = InMemoryDatasetRepository()\n", - "evaluator = KeywordExtractionEvaluator(evaluation_repository, dataset_repository, \"keyword-extraction\")\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"keyword-extraction\")\n", + "run_repository = InMemoryRunRepository()\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "\n", + "evaluator = KeywordExtractionEvaluator(dataset_repository, run_repository, evaluation_repository, \"keyword-extraction\")\n", + "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", "\n", "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", @@ -428,7 +423,7 @@ "outputs": [], "source": [ "examples = list(dataset_repository.examples_by_id(dataset_id, evaluator.input_type(), evaluator.expected_output_type()))\n", - "last_example_result = evaluation_repository.example_trace(\n", + "last_example_result = run_repository.example_trace(\n", " next(iter(evaluation.run_overviews)).id, examples[-1].id\n", ")\n", "last_example_result.trace\n" @@ -498,7 +493,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/examples/summarize.ipynb b/src/examples/summarize.ipynb index ba1d32950..aa2313a1b 100644 --- a/src/examples/summarize.ipynb +++ b/src/examples/summarize.ipynb @@ -26,6 +26,10 @@ "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "client = LimitedConcurrencyClient.from_token(getenv(\"AA_TOKEN\"))\n" ] }, @@ -194,7 +198,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -208,9 +212,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/intelligence_layer/core/__init__.py b/src/intelligence_layer/core/__init__.py index 90ad8ede1..64484b59d 100644 --- a/src/intelligence_layer/core/__init__.py +++ b/src/intelligence_layer/core/__init__.py @@ -1,3 +1,9 @@ +from intelligence_layer.core.intelligence_app import ( + AuthenticatedIntelligenceApp as AuthenticatedIntelligenceApp, +) +from intelligence_layer.core.intelligence_app import AuthService as AuthService +from intelligence_layer.core.intelligence_app import 
IntelligenceApp as IntelligenceApp + from .chunk import Chunk as Chunk from .chunk import ChunkInput as ChunkInput from .chunk import ChunkOutput as ChunkOutput diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index 68f592d23..6ed399603 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -1,6 +1,26 @@ from .accumulator import MeanAccumulator as MeanAccumulator -from .dataset_repository import FileDatasetRepository as FileDatasetRepository -from .dataset_repository import InMemoryDatasetRepository as InMemoryDatasetRepository +from .data_storage.dataset_repository import DatasetRepository as DatasetRepository +from .data_storage.dataset_repository import ( + FileDatasetRepository as FileDatasetRepository, +) +from .data_storage.dataset_repository import ( + InMemoryDatasetRepository as InMemoryDatasetRepository, +) +from .data_storage.evaluation_repository import ( + ArgillaEvaluationRepository as ArgillaEvaluationRepository, +) +from .data_storage.evaluation_repository import ( + EvaluationRepository as EvaluationRepository, +) +from .data_storage.evaluation_repository import ( + FileEvaluationRepository as FileEvaluationRepository, +) +from .data_storage.evaluation_repository import ( + InMemoryEvaluationRepository as InMemoryEvaluationRepository, +) +from .data_storage.run_repository import FileRunRepository as FileRunRepository +from .data_storage.run_repository import InMemoryRunRepository as InMemoryRunRepository +from .data_storage.run_repository import RunRepository as RunRepository from .domain import Evaluation as Evaluation from .domain import EvaluationFailed as EvaluationFailed from .domain import EvaluationOverview as EvaluationOverview @@ -20,15 +40,8 @@ from .elo import PayoffMatrix as PayoffMatrix from .elo import PlayerScore as PlayerScore from .elo import WinRateCalculator as WinRateCalculator -from .evaluation_repository import FileEvaluationRepository as FileEvaluationRepository -from .evaluation_repository import ( - InMemoryEvaluationRepository as InMemoryEvaluationRepository, -) -from .evaluator import ArgillaEvaluationRepository as ArgillaEvaluationRepository from .evaluator import ArgillaEvaluator as ArgillaEvaluator from .evaluator import BaseEvaluator as BaseEvaluator -from .evaluator import DatasetRepository as DatasetRepository -from .evaluator import EvaluationRepository as EvaluationRepository from .evaluator import Evaluator as Evaluator from .graders import BleuGrader as BleuGrader from .graders import RougeGrader as RougeGrader diff --git a/src/intelligence_layer/evaluation/data_storage/__init__.py b/src/intelligence_layer/evaluation/data_storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/intelligence_layer/evaluation/dataset_repository.py b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py similarity index 85% rename from src/intelligence_layer/evaluation/dataset_repository.py rename to src/intelligence_layer/evaluation/data_storage/dataset_repository.py index b1e08c780..2857e14b0 100644 --- a/src/intelligence_layer/evaluation/dataset_repository.py +++ b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, Iterable, Optional, Sequence, cast from uuid import uuid4 @@ -5,10 +6,45 @@ from fsspec import AbstractFileSystem # type: ignore from fsspec.implementations.local 
import LocalFileSystem # type: ignore -from intelligence_layer.core.task import Input +from intelligence_layer.core import Input from intelligence_layer.core.tracer import JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.domain import Example, ExpectedOutput -from intelligence_layer.evaluation.evaluator import DatasetRepository + + +class DatasetRepository(ABC): + @abstractmethod + def create_dataset( + self, + examples: Iterable[Example[Input, ExpectedOutput]], + ) -> str: + ... + + @abstractmethod + def examples_by_id( + self, + dataset_id: str, + input_type: type[Input], + expected_output_type: type[ExpectedOutput], + ) -> Optional[Iterable[Example[Input, ExpectedOutput]]]: + ... + + @abstractmethod + def example( + self, + dataset_id: str, + example_id: str, + input_type: type[Input], + expected_output_type: type[ExpectedOutput], + ) -> Optional[Example[Input, ExpectedOutput]]: + ... + + @abstractmethod + def delete_dataset(self, dataset_id: str) -> None: + ... + + @abstractmethod + def list_datasets(self) -> Iterable[str]: + ... class FileSystemDatasetRepository(DatasetRepository): diff --git a/src/intelligence_layer/evaluation/evaluation_repository.py b/src/intelligence_layer/evaluation/data_storage/evaluation_repository.py similarity index 60% rename from src/intelligence_layer/evaluation/evaluation_repository.py rename to src/intelligence_layer/evaluation/data_storage/evaluation_repository.py index cc59c3ead..d07d681a6 100644 --- a/src/intelligence_layer/evaluation/evaluation_repository.py +++ b/src/intelligence_layer/evaluation/data_storage/evaluation_repository.py @@ -1,40 +1,29 @@ +from abc import ABC, abstractmethod from collections import defaultdict from pathlib import Path -from typing import Iterable, Optional, Sequence, cast +from typing import Optional, Sequence, TypeVar, cast from pydantic import BaseModel, ValidationError -from intelligence_layer.core.task import Output -from intelligence_layer.core.tracer import ( - FileTracer, - InMemoryTaskSpan, - InMemoryTracer, - JsonSerializer, - PydanticSerializable, - Tracer, +from intelligence_layer.connectors.argilla.argilla_client import ( + ArgillaClient, + ArgillaEvaluation, ) +from intelligence_layer.core import Output +from intelligence_layer.core.tracer import FileTracer, JsonSerializer, Tracer +from intelligence_layer.evaluation.data_storage.utils import read_utf8, write_utf8 from intelligence_layer.evaluation.domain import ( Evaluation, ExampleEvaluation, ExampleOutput, - ExampleTrace, FailedExampleEvaluation, IndividualEvaluationOverview, RunOverview, - TaskSpanTrace, ) -from intelligence_layer.evaluation.evaluator import ( - EvaluationOverviewType, - EvaluationRepository, -) - - -def write_utf8(path: Path, content: str) -> None: - path.write_text(content, encoding="utf-8") - -def read_utf8(path: Path) -> str: - return path.read_text(encoding="utf-8") +EvaluationOverviewType = TypeVar( + "EvaluationOverviewType", bound=IndividualEvaluationOverview +) class SerializedExampleEvaluation(BaseModel): @@ -80,6 +69,110 @@ def to_example_result( ) +class EvaluationRepository(ABC): + """Base evaluation repository interface. + + Provides methods to store and load evaluation results for individual examples + of a run and the aggregated evaluation of said run. + """ + + @abstractmethod + def eval_ids(self) -> Sequence[str]: + """Returns the ids of all stored evaluation runs. + + Having the id of an evaluation run, its overview can be retrieved with + :meth:`EvaluationRepository.evaluation_run_overview`. 
+ + Returns: + The ids of all stored evaluation runs. + """ + ... + + @abstractmethod + def example_evaluation( + self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] + ) -> Optional[ExampleEvaluation[Evaluation]]: + """Returns an :class:`ExampleEvaluation` of a given run by its id. + + Args: + eval_id: Identifier of the run to obtain the results for. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + evaluation_type: Type of evaluations that the `Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + :class:`ExampleEvaluation` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_example_evaluation(self, result: ExampleEvaluation[Evaluation]) -> None: + """Stores an :class:`ExampleEvaluation` for a run in the repository. + + Args: + eval_id: Identifier of the eval run. + result: The result to be persisted. + """ + ... + + @abstractmethod + def example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + """Returns all :class:`ExampleResult` instances of a given run + + Args: + eval_id: Identifier of the eval run to obtain the results for. + evaluation_type: Type of evaluations that the :class:`Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + All :class:`ExampleResult` of the run. Will return an empty list if there's none. + """ + ... + + @abstractmethod + def failed_example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + """Returns all failed :class:`ExampleResult` instances of a given run + + Args: + eval_id: Identifier of the eval run to obtain the results for. + evaluation_type: Type of evaluations that the :class:`Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + All failed :class:`ExampleResult` of the run. Will return an empty list if there's none. + """ + ... + + @abstractmethod + def evaluation_overview( + self, eval_id: str, overview_type: type[EvaluationOverviewType] + ) -> EvaluationOverviewType | None: + """Returns an :class:`EvaluationOverview` of a given run by its id. + + Args: + eval_id: Identifier of the eval run to obtain the overview for. + aggregation_type: Type of aggregations that the :class:`Evaluator` returned + in :func:`Evaluator.aggregate` + + Returns: + :class:`EvaluationOverview` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: + """Stores an :class:`EvaluationRunOverview` in the repository. + + Args: + overview: The overview to be persisted. + """ + ... + + class FileEvaluationRepository(EvaluationRepository): """An :class:`EvaluationRepository` that stores evaluation results in json-files. 
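A minimal wiring sketch of the split, mirroring the updated evaluation.ipynb cells in this patch: each concern now gets its own repository, and evaluators and runners take them in (dataset, run, evaluation) order. The dataset id is assumed to come from dataset_repository.create_dataset(...) as in the notebook.

import os

from intelligence_layer.connectors import LimitedConcurrencyClient
from intelligence_layer.evaluation import (
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)
from intelligence_layer.use_cases import PromptBasedClassify, SingleLabelClassifyEvaluator

client = LimitedConcurrencyClient.from_token(os.getenv("AA_TOKEN"))
task = PromptBasedClassify(client)

# One repository per concern after the split.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

# New argument order: dataset, run, evaluation.
evaluator = SingleLabelClassifyEvaluator(
    dataset_repository, run_repository, evaluation_repository, "single-label-classify"
)
runner = Runner(task, dataset_repository, run_repository, "prompt-based-classify")

# dataset_id is assumed to have been created via dataset_repository.create_dataset(...).
run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_dataset(run_overview.id)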
@@ -144,38 +237,6 @@ def store_example_output(self, example_output: ExampleOutput[Output]) -> None: serialized_result.model_dump_json(indent=2), ) - def example_output( - self, run_id: str, example_id: str, output_type: type[Output] - ) -> Optional[ExampleOutput[Output]]: - file_path = self._example_output_path(run_id, example_id) - if not file_path.exists(): - return None - content = read_utf8(file_path) - # Mypy does not accept dynamic types - return ExampleOutput[output_type].model_validate_json(json_data=content) # type: ignore - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - def load_example_output( - path: Path, - ) -> Optional[ExampleOutput[Output]]: - id = path.with_suffix("").name - return self.example_output(run_id, id, output_type) - - path = self._output_directory(run_id) - output_files = path.glob("*.json") - return ( - example_output - for example_output in sorted( - (load_example_output(file) for file in output_files), - key=lambda example_output: example_output.example_id - if example_output - else "", - ) - if example_output - ) - def example_evaluations( self, eval_id: str, evaluation_type: type[Evaluation] ) -> Sequence[ExampleEvaluation[Evaluation]]: @@ -209,16 +270,6 @@ def example_evaluation( serialized_example = SerializedExampleEvaluation.model_validate_json(content) return serialized_example.to_example_result(evaluation_type) - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - file_path = self._example_trace_path(run_id, example_id) - if not file_path.exists(): - return None - in_memory_tracer = _parse_log(file_path) - trace = TaskSpanTrace.from_task_span( - cast(InMemoryTaskSpan, in_memory_tracer.entries[0]) - ) - return ExampleTrace(run_id=run_id, example_id=example_id, trace=trace) - def example_tracer(self, run_id: str, example_id: str) -> Tracer: file_path = self._example_trace_path(run_id, example_id) return FileTracer(file_path) @@ -245,23 +296,11 @@ def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> N overview.model_dump_json(indent=2), ) - def run_overview(self, run_id: str) -> RunOverview | None: - file_path = self._run_overview_path(run_id) - if not file_path.exists(): - return None - content = read_utf8(file_path) - return RunOverview.model_validate_json(content) - def store_run_overview(self, overview: RunOverview) -> None: write_utf8( self._run_overview_path(overview.id), overview.model_dump_json(indent=2) ) - def run_ids(self) -> Sequence[str]: - return [ - path.parent.name for path in self._run_root_directory().glob("*/output") - ] - def eval_ids( self, overview_type: type[EvaluationOverviewType] | None = None ) -> Sequence[str]: @@ -282,10 +321,6 @@ def evaluation_overview( return [overview.id for overview in overviews if overview is not None] -def _parse_log(log_path: Path) -> InMemoryTracer: - return FileTracer(log_path).trace() - - class InMemoryEvaluationRepository(EvaluationRepository): """An :class:`EvaluationRepository` that stores evaluation results in memory. 
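After the split, this repository answers only evaluation-side questions; run outputs and traces move to the RunRepository added below. A short sketch of reading stored results back, assuming a file-based repository path and with MyEvaluation standing in for whatever the evaluator's do_evaluate returns.

from pathlib import Path

from intelligence_layer.evaluation import FileEvaluationRepository
from intelligence_layer.evaluation.domain import IndividualEvaluationOverview

evaluation_repository = FileEvaluationRepository(Path("./evaluations"))

for eval_id in evaluation_repository.eval_ids():
    overview = evaluation_repository.evaluation_overview(eval_id, IndividualEvaluationOverview)
    # MyEvaluation is a placeholder for the evaluator's concrete Evaluation type.
    failed = evaluation_repository.failed_example_evaluations(eval_id, MyEvaluation)
    print(eval_id, overview, len(failed))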
@@ -293,56 +328,14 @@ class InMemoryEvaluationRepository(EvaluationRepository): """ def __init__(self) -> None: - self._example_outputs: dict[ - str, list[ExampleOutput[PydanticSerializable]] - ] = defaultdict(list) self._example_evaluations: dict[ str, list[ExampleEvaluation[BaseModel]] ] = defaultdict(list) - self._example_traces: dict[str, InMemoryTracer] = dict() self._evaluation_run_overviews: dict[str, IndividualEvaluationOverview] = dict() - self._run_overviews: dict[str, RunOverview] = dict() - - def run_ids(self) -> Sequence[str]: - return list(self._example_outputs.keys()) def eval_ids(self) -> Sequence[str]: return list(self._evaluation_run_overviews.keys()) - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - self._example_outputs[example_output.run_id].append( - cast(ExampleOutput[PydanticSerializable], example_output) - ) - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - return ( - cast(ExampleOutput[Output], example_output) - for example_output in sorted( - self._example_outputs[run_id], - key=lambda example_output: example_output.example_id, - ) - ) - - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - tracer = self._example_traces.get(f"{run_id}/{example_id}") - if tracer is None: - return None - assert tracer - return ExampleTrace( - run_id=run_id, - example_id=example_id, - trace=TaskSpanTrace.from_task_span( - cast(InMemoryTaskSpan, tracer.entries[0]) - ), - ) - - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - tracer = InMemoryTracer() - self._example_traces[f"{run_id}/{example_id}"] = tracer - return tracer - def example_evaluation( self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] ) -> ExampleEvaluation[Evaluation] | None: @@ -382,8 +375,61 @@ def evaluation_overview( def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: self._evaluation_run_overviews[overview.id] = overview - def run_overview(self, run_id: str) -> RunOverview | None: - return self._run_overviews.get(run_id) - def store_run_overview(self, overview: RunOverview) -> None: - self._run_overviews[overview.id] = overview +class ArgillaEvaluationRepository(EvaluationRepository): + """Evaluation repository used for the :class:`ArgillaEvaluator`. + + Wraps an :class:`Evaluator`. + Does not support storing evaluations, since the ArgillaEvaluator does not do automated evaluations. + + Args: + evaluation_repository: repository to wrap. + argilla_client: client used to connect to Argilla. + """ + + def __init__( + self, evaluation_repository: EvaluationRepository, argilla_client: ArgillaClient + ) -> None: + super().__init__() + self._evaluation_repository = evaluation_repository + self._client = argilla_client + + def eval_ids(self) -> Sequence[str]: + return self._evaluation_repository.eval_ids() + + def example_evaluation( + self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] + ) -> Optional[ExampleEvaluation[Evaluation]]: + return self._evaluation_repository.example_evaluation( + eval_id, example_id, evaluation_type + ) + + def store_example_evaluation(self, _: ExampleEvaluation[Evaluation]) -> None: + raise TypeError( + "ArgillaEvaluationRepository does not support storing evaluations." 
+ ) + + def example_evaluations( + self, eval_id: str, eval_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + assert eval_type == ArgillaEvaluation + # Mypy does not derive that the return type is always ExampleEvaluation with ArgillaEvaluation + return [ + ExampleEvaluation(eval_id=eval_id, example_id=e.example_id, result=e) # type: ignore + for e in self._client.evaluations(eval_id) + ] + + def failed_example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + return self._evaluation_repository.failed_example_evaluations( + eval_id, evaluation_type + ) + + def evaluation_overview( + self, eval_id: str, overview_type: type[EvaluationOverviewType] + ) -> EvaluationOverviewType | None: + return self._evaluation_repository.evaluation_overview(eval_id, overview_type) + + def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: + return self._evaluation_repository.store_evaluation_overview(overview) diff --git a/src/intelligence_layer/evaluation/data_storage/run_repository.py b/src/intelligence_layer/evaluation/data_storage/run_repository.py new file mode 100644 index 000000000..47ff27529 --- /dev/null +++ b/src/intelligence_layer/evaluation/data_storage/run_repository.py @@ -0,0 +1,267 @@ +from abc import ABC, abstractmethod +from collections import defaultdict +from pathlib import Path +from typing import Iterable, Optional, Sequence, cast + +from intelligence_layer.core.task import Output +from intelligence_layer.core.tracer import ( + FileTracer, + InMemoryTaskSpan, + InMemoryTracer, + JsonSerializer, + PydanticSerializable, + Tracer, +) +from intelligence_layer.evaluation.data_storage.utils import ( + _parse_log, + read_utf8, + write_utf8, +) +from intelligence_layer.evaluation.domain import ( + ExampleOutput, + ExampleTrace, + RunOverview, + TaskSpanTrace, +) + + +class RunRepository(ABC): + @abstractmethod + def run_ids(self) -> Sequence[str]: + """Returns the ids of all stored runs. + + Having the id of a run, its outputs can be retrieved with + :meth:`EvaluationRepository.example_outputs`. + + Returns: + The ids of all stored runs. + """ + ... + + @abstractmethod + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + """Returns all :class:`ExampleOutput` for a given run. + + Args: + run_id: The unique identifier of the run. + output_type: Type of output that the `Task` returned + in :func:`Task.do_run` + + Returns: + Iterable over all outputs. + """ + ... + + @abstractmethod + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + """Stores an individual :class:`ExampleOutput`. + + Args: + example_output: The actual output. + """ + ... + + @abstractmethod + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + """Returns an :class:`ExampleTrace` for an example in a run. + + Args: + run_id: The unique identifier of the run. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + example_output: The actual output. + """ + ... + + @abstractmethod + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + """Returns a :class:`Tracer` to trace an individual example run. + + Args: + run_id: The unique identifier of the run. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + """ + ... 
+ + @abstractmethod + def run_overview(self, run_id: str) -> RunOverview | None: + """Returns an :class:`RunOverview` of a given run by its id. + + Args: + run_id: Identifier of the eval run to obtain the overview for. + + Returns: + :class:`RunOverview` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_run_overview(self, overview: RunOverview) -> None: + """Stores an :class:`RunOverview` in the repository. + + Args: + overview: The overview to be persisted. + """ + ... + + +class FileRunRepository(RunRepository): + def __init__(self, root_directory: Path) -> None: + root_directory.mkdir(parents=True, exist_ok=True) + self._root_directory = root_directory + + def _example_trace_path(self, run_id: str, example_id: str) -> Path: + return (self._trace_directory(run_id) / example_id).with_suffix(".jsonl") + + def _run_root_directory(self) -> Path: + path = self._root_directory / "runs" + path.mkdir(exist_ok=True) + return path + + def _run_directory(self, run_id: str) -> Path: + path = self._run_root_directory() / run_id + path.mkdir(exist_ok=True) + return path + + def _trace_directory(self, run_id: str) -> Path: + path = self._run_directory(run_id) / "trace" + + path.mkdir(exist_ok=True) + return path + + def _run_overview_path(self, run_id: str) -> Path: + return self._run_directory(run_id).with_suffix(".json") + + def _output_directory(self, run_id: str) -> Path: + path = self._run_directory(run_id) / "output" + + path.mkdir(exist_ok=True) + return path + + def _example_output_path(self, run_id: str, example_id: str) -> Path: + return (self._output_directory(run_id) / example_id).with_suffix(".json") + + def run_overview(self, run_id: str) -> RunOverview | None: + file_path = self._run_overview_path(run_id) + if not file_path.exists(): + return None + content = read_utf8(file_path) + return RunOverview.model_validate_json(content) + + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + file_path = self._example_trace_path(run_id, example_id) + if not file_path.exists(): + return None + in_memory_tracer = _parse_log(file_path) + trace = TaskSpanTrace.from_task_span( + cast(InMemoryTaskSpan, in_memory_tracer.entries[0]) + ) + return ExampleTrace(run_id=run_id, example_id=example_id, trace=trace) + + def example_output( + self, run_id: str, example_id: str, output_type: type[Output] + ) -> Optional[ExampleOutput[Output]]: + file_path = self._example_output_path(run_id, example_id) + if not file_path.exists(): + return None + content = read_utf8(file_path) + # Mypy does not accept dynamic types + return ExampleOutput[output_type].model_validate_json( # type: ignore + json_data=content + ) + + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + def load_example_output( + path: Path, + ) -> Optional[ExampleOutput[Output]]: + id = path.with_suffix("").name + return self.example_output(run_id, id, output_type) + + path = self._output_directory(run_id) + output_files = path.glob("*.json") + return ( + example_output + for example_output in sorted( + (load_example_output(file) for file in output_files), + key=lambda example_output: example_output.example_id + if example_output + else "", + ) + if example_output + ) + + def run_ids(self) -> Sequence[str]: + return [ + path.parent.name for path in self._run_root_directory().glob("*/output") + ] + + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + file_path = self._example_trace_path(run_id, example_id) + return 
FileTracer(file_path) + + def store_run_overview(self, overview: RunOverview) -> None: + write_utf8( + self._run_overview_path(overview.id), overview.model_dump_json(indent=2) + ) + + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + serialized_result = JsonSerializer(root=example_output) + write_utf8( + self._example_output_path(example_output.run_id, example_output.example_id), + serialized_result.model_dump_json(indent=2), + ) + + +class InMemoryRunRepository(RunRepository): + def __init__(self) -> None: + self._example_outputs: dict[ + str, list[ExampleOutput[PydanticSerializable]] + ] = defaultdict(list) + self._example_traces: dict[str, InMemoryTracer] = dict() + self._run_overviews: dict[str, RunOverview] = dict() + + def run_ids(self) -> Sequence[str]: + return list(self._example_outputs.keys()) + + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + self._example_outputs[example_output.run_id].append( + cast(ExampleOutput[PydanticSerializable], example_output) + ) + + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + return ( + cast(ExampleOutput[Output], example_output) + for example_output in sorted( + self._example_outputs[run_id], + key=lambda example_output: example_output.example_id, + ) + ) + + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + tracer = self._example_traces.get(f"{run_id}/{example_id}") + if tracer is None: + return None + assert tracer + return ExampleTrace( + run_id=run_id, + example_id=example_id, + trace=TaskSpanTrace.from_task_span( + cast(InMemoryTaskSpan, tracer.entries[0]) + ), + ) + + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + tracer = InMemoryTracer() + self._example_traces[f"{run_id}/{example_id}"] = tracer + return tracer + + def run_overview(self, run_id: str) -> RunOverview | None: + return self._run_overviews.get(run_id) + + def store_run_overview(self, overview: RunOverview) -> None: + self._run_overviews[overview.id] = overview diff --git a/src/intelligence_layer/evaluation/data_storage/utils.py b/src/intelligence_layer/evaluation/data_storage/utils.py new file mode 100644 index 000000000..6dfc8328c --- /dev/null +++ b/src/intelligence_layer/evaluation/data_storage/utils.py @@ -0,0 +1,15 @@ +from pathlib import Path + +from intelligence_layer.core.tracer import FileTracer, InMemoryTracer + + +def write_utf8(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + + +def read_utf8(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def _parse_log(log_path: Path) -> InMemoryTracer: + return FileTracer(log_path).trace() diff --git a/src/intelligence_layer/evaluation/domain.py b/src/intelligence_layer/evaluation/domain.py index b18fff17e..8a6c3752a 100644 --- a/src/intelligence_layer/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/domain.py @@ -274,7 +274,7 @@ class IndividualEvaluationOverview(BaseModel, frozen=True): """Overview of the unaggregated results of evaluating a :class:`Task` on a dataset. Attributes: - run_overview: Overview of the run that was evaluated. + run_overviews: Overviews of the runs that were evaluated. id: The unique identifier of this evaluation. 
start: The time when the evaluation run was started description: human-readable for the evaluator that created the evaluation diff --git a/src/intelligence_layer/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluator.py index 52a26623f..90804bbf0 100644 --- a/src/intelligence_layer/evaluation/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluator.py @@ -20,14 +20,22 @@ from tqdm import tqdm -from intelligence_layer.connectors import ArgillaClient, Field +from intelligence_layer.connectors import Field from intelligence_layer.connectors.argilla.argilla_client import ( ArgillaEvaluation, Question, RecordData, ) from intelligence_layer.core.task import Input, Output -from intelligence_layer.core.tracer import Tracer, utc_now +from intelligence_layer.core.tracer import utc_now +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + DatasetRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + ArgillaEvaluationRepository, + EvaluationRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( AggregatedEvaluation, Evaluation, @@ -35,7 +43,6 @@ Example, ExampleEvaluation, ExampleOutput, - ExampleTrace, ExpectedOutput, FailedExampleEvaluation, FailedExampleRun, @@ -44,230 +51,6 @@ SuccessfulExampleOutput, ) -EvaluationOverviewType = TypeVar( - "EvaluationOverviewType", bound=IndividualEvaluationOverview -) - - -class EvaluationRepository(ABC): - """Base evaluation repository interface. - - Provides methods to store and load evaluation results for individual examples - of a run and the aggregated evaluation of said run. - """ - - @abstractmethod - def run_ids(self) -> Sequence[str]: - """Returns the ids of all stored runs. - - Having the id of a run, its outputs can be retrieved with - :meth:`EvaluationRepository.example_outputs`. - - Returns: - The ids of all stored runs. - """ - ... - - @abstractmethod - def eval_ids(self) -> Sequence[str]: - """Returns the ids of all stored evaluation runs. - - Having the id of an evaluation run, its overview can be retrieved with - :meth:`EvaluationRepository.evaluation_run_overview`. - - Returns: - The ids of all stored evaluation runs. - """ - ... - - @abstractmethod - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - """Returns all :class:`ExampleOutput` for a given run. - - Args: - run_id: The unique identifier of the run. - output_type: Type of output that the `Task` returned - in :func:`Task.do_run` - - Returns: - Iterable over all outputs. - """ - ... - - @abstractmethod - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - """Stores an individual :class:`ExampleOutput`. - - Args: - example_output: The actual output. - """ - ... - - @abstractmethod - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - """Returns an :class:`ExampleTrace` for an example in a run. - - Args: - run_id: The unique identifier of the run. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - example_output: The actual output. - """ - ... - - @abstractmethod - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - """Returns a :class:`Tracer` to trace an individual example run. - - Args: - run_id: The unique identifier of the run. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - """ - ... 
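The run-related methods removed from this interface are not gone; run artifacts (outputs, traces, run overviews) are now read from the RunRepository, as the updated quickstart notebook does. A sketch of the consumer side, with MyOutput standing in for the evaluated task's output type.

from intelligence_layer.evaluation import InMemoryRunRepository

run_repository = InMemoryRunRepository()
# A Runner configured with this run_repository stores outputs via run_dataset().

for run_id in run_repository.run_ids():
    overview = run_repository.run_overview(run_id)
    # MyOutput is a placeholder for the task's Output type.
    outputs = list(run_repository.example_outputs(run_id, MyOutput))
    if outputs:
        trace = run_repository.example_trace(run_id, outputs[0].example_id)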
- - @abstractmethod - def example_evaluation( - self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] - ) -> Optional[ExampleEvaluation[Evaluation]]: - """Returns an :class:`ExampleEvaluation` of a given run by its id. - - Args: - eval_id: Identifier of the run to obtain the results for. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - evaluation_type: Type of evaluations that the `Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - :class:`ExampleEvaluation` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_example_evaluation(self, result: ExampleEvaluation[Evaluation]) -> None: - """Stores an :class:`ExampleEvaluation` for a run in the repository. - - Args: - eval_id: Identifier of the eval run. - result: The result to be persisted. - """ - ... - - @abstractmethod - def example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - """Returns all :class:`ExampleResult` instances of a given run - - Args: - eval_id: Identifier of the eval run to obtain the results for. - evaluation_type: Type of evaluations that the :class:`Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - All :class:`ExampleResult` of the run. Will return an empty list if there's none. - """ - ... - - @abstractmethod - def failed_example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - """Returns all failed :class:`ExampleResult` instances of a given run - - Args: - eval_id: Identifier of the eval run to obtain the results for. - evaluation_type: Type of evaluations that the :class:`Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - All failed :class:`ExampleResult` of the run. Will return an empty list if there's none. - """ - ... - - @abstractmethod - def evaluation_overview( - self, eval_id: str, overview_type: type[EvaluationOverviewType] - ) -> EvaluationOverviewType | None: - """Returns an :class:`EvaluationOverview` of a given run by its id. - - Args: - eval_id: Identifier of the eval run to obtain the overview for. - aggregation_type: Type of aggregations that the :class:`Evaluator` returned - in :func:`Evaluator.aggregate` - - Returns: - :class:`EvaluationOverview` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: - """Stores an :class:`EvaluationRunOverview` in the repository. - - Args: - overview: The overview to be persisted. - """ - ... - - @abstractmethod - def run_overview(self, run_id: str) -> RunOverview | None: - """Returns an :class:`RunOverview` of a given run by its id. - - Args: - run_id: Identifier of the eval run to obtain the overview for. - - Returns: - :class:`RunOverview` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_run_overview(self, overview: RunOverview) -> None: - """Stores an :class:`RunOverview` in the repository. - - Args: - overview: The overview to be persisted. - """ - ... - - -class DatasetRepository(ABC): - @abstractmethod - def create_dataset( - self, - examples: Iterable[Example[Input, ExpectedOutput]], - ) -> str: - ... - - @abstractmethod - def examples_by_id( - self, - dataset_id: str, - input_type: type[Input], - expected_output_type: type[ExpectedOutput], - ) -> Optional[Iterable[Example[Input, ExpectedOutput]]]: - ... 
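The DatasetRepository interface being removed here now lives in data_storage/dataset_repository.py (added earlier in this patch). A sketch of creating and reading a dataset, where my_input, my_expected_output, MyInput, and MyExpectedOutput are placeholders for the evaluated task's types.

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository

dataset_repository = InMemoryDatasetRepository()
# my_input / my_expected_output stand in for the task's Input / ExpectedOutput values.
dataset_id = dataset_repository.create_dataset(
    examples=[Example(input=my_input, expected_output=my_expected_output)]
)

# examples_by_id returns None for unknown dataset ids, hence the fallback.
examples = dataset_repository.examples_by_id(dataset_id, MyInput, MyExpectedOutput)
for example in examples or []:
    print(example.id, example.input)

print(list(dataset_repository.list_datasets()))
dataset_repository.delete_dataset(dataset_id)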
- - @abstractmethod - def example( - self, - dataset_id: str, - example_id: str, - input_type: type[Input], - expected_output_type: type[ExpectedOutput], - ) -> Optional[Example[Input, ExpectedOutput]]: - ... - - @abstractmethod - def delete_dataset(self, dataset_id: str) -> None: - ... - - @abstractmethod - def list_datasets(self) -> Iterable[str]: - ... - - T = TypeVar("T") @@ -304,8 +87,9 @@ class BaseEvaluator( """Base evaluator interface. Arguments: - evaluation_repository: The repository that will be used to store evaluation results. dataset_repository: The repository with the examples that will be taken for the evaluation + run_repository: The repository with the run output that will be taken for the evaluation + evaluation_repository: The repository that will be used to store evaluation results. description: human-readable description for the evaluator Generics: @@ -318,12 +102,14 @@ class BaseEvaluator( def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - self._evaluation_repository = evaluation_repository self._dataset_repository = dataset_repository + self._run_repository = run_repository + self._evaluation_repository = evaluation_repository self.description = description @lru_cache(maxsize=1) @@ -441,7 +227,7 @@ def aggregate(self, evaluations: Iterable[Evaluation]) -> AggregatedEvaluation: It should create an `AggregatedEvaluation` class and return it at the end. Args: - evalautions: The results from running `evaluate_dataset` with a :class:`Task`. + evaluations: The results from running `evaluate_dataset` with a :class:`Task`. Returns: The aggregated results of an evaluation run with a :class:`Dataset`. @@ -477,10 +263,7 @@ def evaluate_runs( and their tasks have the same output-type. For each example in the dataset referenced by the runs the outputs of all runs are collected and if all of them were successful they are passed on to the implementation - specific evaluation. For a simple evaluation only a single run_id is provided. - If the output of multiple runs are to be compared (for example to compare - the performance of different model on the same task), multiple run_ids are - passed accordingly. + specific evaluation. The method compares all run of the provided ids to each other. num_examples: The number of examples which should be evaluated from the given runs. Always the first n runs stored in the evaluation repository @@ -490,15 +273,15 @@ def evaluate_runs( __init__. 
""" - def load_overview(run_id: str) -> RunOverview: - run_overview = self._evaluation_repository.run_overview(run_id) + def load_run_overview(run_id: str) -> RunOverview: + run_overview = self._run_repository.run_overview(run_id) if not run_overview: raise ValueError(f"No RunOverview found for run-id: {run_id}") return run_overview if not run_ids: raise ValueError("At least one run-id needs to be provided") - run_overviews = frozenset(load_overview(run_id) for run_id in run_ids) + run_overviews = frozenset(load_run_overview(run_id) for run_id in run_ids) if not all( next(iter(run_overviews)).dataset_id == run_overview.dataset_id for run_overview in run_overviews @@ -519,7 +302,7 @@ def load_overview(run_id: str) -> RunOverview: examples_zipped: Iterable[tuple[ExampleOutput[Output], ...]] = zip( *( - self._evaluation_repository.example_outputs( + self._run_repository.example_outputs( run_overview.id, self.output_type() ) for run_overview in run_overviews @@ -616,7 +399,7 @@ def aggregate_evaluation( An overview of the aggregated evaluation. """ - def load_overview(eval_id: str) -> IndividualEvaluationOverview: + def load_eval_overview(eval_id: str) -> IndividualEvaluationOverview: evaluation_overview = self._evaluation_repository.evaluation_overview( eval_id, IndividualEvaluationOverview ) @@ -626,7 +409,7 @@ def load_overview(eval_id: str) -> IndividualEvaluationOverview: ) return evaluation_overview - evaluation_overviews = frozenset(load_overview(id) for id in set(eval_ids)) + evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) nested_evaluations = [ self._evaluation_repository.example_evaluations( @@ -688,11 +471,14 @@ class Evaluator( def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) @abstractmethod def do_evaluate( @@ -757,85 +543,6 @@ def evaluate_dataset( return self.aggregate_evaluation(partial_evaluation_overview.id) -class ArgillaEvaluationRepository(EvaluationRepository): - """Evaluation repository used for the :class:`ArgillaEvaluator`. - - Wraps an :class:`Evaluator`. - Does not support storing evaluations, since the ArgillaEvaluator does not do automated evaluations. - - Args: - evaluation_repository: repository to wrap. - argilla_client: client used to connect to Argilla. 
- """ - - def __init__( - self, evaluation_repository: EvaluationRepository, argilla_client: ArgillaClient - ) -> None: - super().__init__() - self._evaluation_repository = evaluation_repository - self._client = argilla_client - - def run_ids(self) -> Sequence[str]: - return self._evaluation_repository.run_ids() - - def eval_ids(self) -> Sequence[str]: - return self._evaluation_repository.eval_ids() - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - return self._evaluation_repository.example_outputs(run_id, output_type) - - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - return self._evaluation_repository.store_example_output(example_output) - - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - return self._evaluation_repository.example_trace(run_id, example_id) - - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - return self._evaluation_repository.example_tracer(run_id, example_id) - - def example_evaluation( - self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] - ) -> Optional[ExampleEvaluation[Evaluation]]: - return self._evaluation_repository.example_evaluation( - eval_id, example_id, evaluation_type - ) - - def store_example_evaluation(self, _: ExampleEvaluation[Evaluation]) -> None: - raise TypeError( - "ArgillaEvaluationRepository does not support storing evaluations." - ) - - def example_evaluations( - self, eval_id: str, eval_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - assert eval_type == ArgillaEvaluation - # Mypy does not derive that the return type is always ExampleEvaluation with ArgillaEvaluation - return [ExampleEvaluation(eval_id=eval_id, example_id=e.example_id, result=e) for e in self._client.evaluations(eval_id)] # type: ignore - - def failed_example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - return self._evaluation_repository.failed_example_evaluations( - eval_id, evaluation_type - ) - - def evaluation_overview( - self, eval_id: str, overview_type: type[EvaluationOverviewType] - ) -> EvaluationOverviewType | None: - return self._evaluation_repository.evaluation_overview(eval_id, overview_type) - - def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: - return self._evaluation_repository.store_evaluation_overview(overview) - - def run_overview(self, run_id: str) -> RunOverview | None: - return self._evaluation_repository.run_overview(run_id) - - def store_run_overview(self, overview: RunOverview) -> None: - return self._evaluation_repository.store_run_overview(overview) - - class ArgillaEvaluator( BaseEvaluator[ Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation @@ -859,14 +566,17 @@ class ArgillaEvaluator( def __init__( self, - evaluation_repository: ArgillaEvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: ArgillaEvaluationRepository, description: str, workspace_id: str, fields: Sequence[Field], questions: Sequence[Question], ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self._workspace_id = workspace_id self._fields = fields self._questions = questions diff --git a/src/intelligence_layer/evaluation/hugging_face.py 
b/src/intelligence_layer/evaluation/hugging_face.py index ff81f9469..c6573586d 100644 --- a/src/intelligence_layer/evaluation/hugging_face.py +++ b/src/intelligence_layer/evaluation/hugging_face.py @@ -1,7 +1,9 @@ import huggingface_hub # type: ignore from huggingface_hub import HfFileSystem, create_repo -from intelligence_layer.evaluation.dataset_repository import FileSystemDatasetRepository +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + FileSystemDatasetRepository, +) class HuggingFaceDatasetRepository(FileSystemDatasetRepository): diff --git a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py b/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py index 972941e65..91677b693 100644 --- a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py @@ -12,8 +12,11 @@ ) from intelligence_layer.core.complete import InstructInput, PromptOutput from intelligence_layer.evaluation import ( + ArgillaEvaluationRepository, + DatasetRepository, Example, MeanAccumulator, + RunRepository, SuccessfulExampleOutput, ) from intelligence_layer.evaluation.elo import ( @@ -24,11 +27,7 @@ WinRateCalculator, build_tournaments, ) -from intelligence_layer.evaluation.evaluator import ( - ArgillaEvaluationRepository, - ArgillaEvaluator, - DatasetRepository, -) +from intelligence_layer.evaluation.evaluator import ArgillaEvaluator class AggregatedInstructComparison(BaseModel): @@ -52,8 +51,9 @@ class InstructComparisonArgillaEvaluator( def __init__( self, - evaluation_repository: ArgillaEvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: ArgillaEvaluationRepository, description: str, workspace_id: str, high_priority_runs: Optional[frozenset[str]] = None, @@ -74,8 +74,9 @@ def __init__( ] super().__init__( - evaluation_repository, dataset_repository, + run_repository, + evaluation_repository, description, workspace_id, fields, diff --git a/src/intelligence_layer/evaluation/run.py b/src/intelligence_layer/evaluation/run.py index e9c97ae3e..02db57b7d 100644 --- a/src/intelligence_layer/evaluation/run.py +++ b/src/intelligence_layer/evaluation/run.py @@ -9,8 +9,13 @@ from intelligence_layer.connectors.limited_concurrency_client import ( LimitedConcurrencyClient, ) -from intelligence_layer.evaluation.dataset_repository import FileDatasetRepository -from intelligence_layer.evaluation.evaluation_repository import FileEvaluationRepository +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + FileDatasetRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + FileEvaluationRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import FileRunRepository from intelligence_layer.evaluation.runner import Runner @@ -80,15 +85,18 @@ def parse_args(cli_args: Sequence[str]) -> Namespace: def main(cli_args: Sequence[str]) -> None: args = parse_args(cli_args) - evaluation_repository = FileEvaluationRepository(args.target_dir) dataset_repository = FileDatasetRepository(args.dataset_repository_path) + runner_repository = FileRunRepository(args.target_dir) + evaluation_repository = FileEvaluationRepository(args.target_dir) description = args.description task = create_task(args.task) - runner = Runner(task, evaluation_repository, dataset_repository, args.task.__name__) + runner = Runner(task, dataset_repository, 
runner_repository, args.task.__name__) dataset_id = args.dataset_id - run_overview = runner.run_dataset(dataset_id) - evaluator = args.evaluator(evaluation_repository, dataset_repository, description) - evaluator.evaluate_dataset(run_overview.id) + run_overview_id = runner.run_dataset(dataset_id).id + evaluator = args.evaluator( + dataset_repository, runner_repository, evaluation_repository, description + ) + evaluator.evaluate_dataset(run_overview_id) if __name__ == "__main__": diff --git a/src/intelligence_layer/evaluation/runner.py b/src/intelligence_layer/evaluation/runner.py index c4bde68fd..204aa07a4 100644 --- a/src/intelligence_layer/evaluation/runner.py +++ b/src/intelligence_layer/evaluation/runner.py @@ -9,6 +9,10 @@ from intelligence_layer.core.task import Input, Output, Task from intelligence_layer.core.tracer import CompositeTracer, Tracer, utc_now +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + DatasetRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( Example, ExampleOutput, @@ -16,22 +20,18 @@ FailedExampleRun, RunOverview, ) -from intelligence_layer.evaluation.evaluator import ( - DatasetRepository, - EvaluationRepository, -) class Runner(Generic[Input, Output]): def __init__( self, task: Task[Input, Output], - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, description: str, ) -> None: self._task = task - self._evaluation_repository = evaluation_repository + self._run_repository = run_repository self._dataset_repository = dataset_repository self.description = description @@ -39,7 +39,7 @@ def output_type(self) -> type[Output]: """Returns the type of the evaluated task's output. This can be used to retrieve properly typed outputs of an evaluation run - from a :class:`EvaluationRepository` + from a :class:`RunRepository` Returns: the type of the evaluated task's output. @@ -82,15 +82,13 @@ def run_dataset( Returns: An overview of the run. Outputs will not be returned but instead stored in the - :class:`EvaluationRepository` provided in the __init__. + :class:`RunRepository` provided in the __init__. 
""" def run( example: Example[Input, ExpectedOutput] ) -> tuple[str, Output | FailedExampleRun]: - evaluate_tracer = self._evaluation_repository.example_tracer( - run_id, example.id - ) + evaluate_tracer = self._run_repository.example_tracer(run_id, example.id) if tracer: evaluate_tracer = CompositeTracer([evaluate_tracer, tracer]) try: @@ -119,7 +117,7 @@ def run( failed_count += 1 else: successful_count += 1 - self._evaluation_repository.store_example_output( + self._run_repository.store_example_output( ExampleOutput[Output]( run_id=run_id, example_id=example_id, output=output ), @@ -133,5 +131,5 @@ def run( successful_example_count=successful_count, description=self.description, ) - self._evaluation_repository.store_run_overview(run_overview) + self._run_repository.store_run_overview(run_overview) return run_overview diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py index 2398d1f98..fa582b586 100644 --- a/src/intelligence_layer/use_cases/classify/classify.py +++ b/src/intelligence_layer/use_cases/classify/classify.py @@ -9,6 +9,7 @@ EvaluationRepository, Evaluator, MeanAccumulator, + RunRepository, ) Probability = NewType("Probability", float) @@ -81,14 +82,6 @@ class SingleLabelClassifyEvaluator( AggregatedSingleLabelClassifyEvaluation, ] ): - def __init__( - self, - evaluation_repository: EvaluationRepository, - dataset_respository: DatasetRepository, - description: str, - ): - super().__init__(evaluation_repository, dataset_respository, description) - # mypy expects *args where this method only uses one output def do_evaluate( # type: ignore self, @@ -170,12 +163,15 @@ class MultiLabelClassifyEvaluator( ): def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, threshold: float = 0.55, ): - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self.threshold = threshold # mypy expects *args where this method only uses one output diff --git a/src/intelligence_layer/use_cases/summarize/summarize.py b/src/intelligence_layer/use_cases/summarize/summarize.py index 4660eb2c8..0c73742e1 100644 --- a/src/intelligence_layer/use_cases/summarize/summarize.py +++ b/src/intelligence_layer/use_cases/summarize/summarize.py @@ -11,6 +11,7 @@ MeanAccumulator, RougeGrader, ) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository class LongContextSummarizeInput(BaseModel): @@ -111,11 +112,14 @@ class SingleChunkSummarizeEvaluator( ): def __init__( self, - repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self.bleu_grader = BleuGrader() self.rouge_grader = RougeGrader() @@ -150,11 +154,14 @@ class LongContextSummarizeEvaluator( ): def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, 
run_repository, evaluation_repository, description + ) self.bleu_grader = BleuGrader() self.rouge_grader = RougeGrader() diff --git a/tests/conftest.py b/tests/conftest.py index d0e4dea10..48743d975 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,6 +30,7 @@ from intelligence_layer.evaluation import ( InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, ) @@ -135,6 +136,11 @@ def in_memory_dataset_repository() -> InMemoryDatasetRepository: return InMemoryDatasetRepository() +@fixture +def in_memory_run_repository() -> InMemoryRunRepository: + return InMemoryRunRepository() + + @fixture def in_memory_evaluation_repository() -> InMemoryEvaluationRepository: return InMemoryEvaluationRepository() diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py index 68359946c..471c83113 100644 --- a/tests/evaluation/conftest.py +++ b/tests/evaluation/conftest.py @@ -14,11 +14,12 @@ ExampleEvaluation, FailedExampleEvaluation, FileEvaluationRepository, - InMemoryEvaluationRepository, + FileRunRepository, + InMemoryDatasetRepository, + InMemoryRunRepository, Runner, RunOverview, ) -from intelligence_layer.evaluation.dataset_repository import InMemoryDatasetRepository from tests.conftest import DummyStringInput, DummyStringOutput FAIL_IN_EVAL_INPUT = "fail in eval" @@ -76,6 +77,11 @@ def file_evaluation_repository(tmp_path: Path) -> FileEvaluationRepository: return FileEvaluationRepository(tmp_path) +@fixture +def file_run_repository(tmp_path: Path) -> FileRunRepository: + return FileRunRepository(tmp_path) + + @fixture def string_dataset_id( dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], @@ -135,12 +141,12 @@ def dummy_string_examples( @fixture def dummy_runner( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[str, str]: return Runner( DummyTask(), - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "dummy-runner", ) diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/test_argilla_evaluator.py index df762dfdb..81968a272 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/test_argilla_evaluator.py @@ -19,6 +19,9 @@ Runner, SuccessfulExampleOutput, ) +from intelligence_layer.evaluation.data_storage.run_repository import ( + InMemoryRunRepository, +) from tests.conftest import DummyStringInput, DummyStringOutput, DummyStringTask from tests.evaluation.conftest import DummyAggregatedEvaluation @@ -110,8 +113,9 @@ def stub_argilla_client() -> StubArgillaClient: @fixture def string_argilla_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, # noqa: w0404 in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, stub_argilla_client: StubArgillaClient, ) -> DummyStringTaskArgillaEvaluator: stub_argilla_client._expected_workspace_id = "workspace-id" @@ -128,10 +132,11 @@ def string_argilla_evaluator( Field(name="input", title="Input"), ] evaluator = DummyStringTaskArgillaEvaluator( + in_memory_dataset_repository, + in_memory_run_repository, ArgillaEvaluationRepository( in_memory_evaluation_repository, stub_argilla_client ), - in_memory_dataset_repository, "dummy-string-task", stub_argilla_client._expected_workspace_id, fields, @@ -145,13 +150,13 @@ def 
string_argilla_evaluator( @fixture def string_argilla_runner( dummy_string_task: DummyStringTask, - in_memory_evaluation_repository: InMemoryEvaluationRepository, # noqa: w0404 in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[DummyStringInput, DummyStringOutput]: return Runner( dummy_string_task, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "dummy-task", ) diff --git a/tests/evaluation/test_evaluation_repository.py b/tests/evaluation/test_evaluation_repository.py index c1eb7de19..4f6829d04 100644 --- a/tests/evaluation/test_evaluation_repository.py +++ b/tests/evaluation/test_evaluation_repository.py @@ -1,23 +1,18 @@ from datetime import datetime -from typing import Sequence, cast +from typing import Sequence from pydantic import BaseModel from pytest import fixture -from intelligence_layer.core import InMemoryTaskSpan -from intelligence_layer.core.tracer import CompositeTracer, InMemoryTracer from intelligence_layer.evaluation import ( EvaluationOverview, - EvaluationRepository, ExampleEvaluation, - ExampleOutput, ExampleTrace, FailedExampleEvaluation, FileEvaluationRepository, InMemoryEvaluationRepository, TaskSpanTrace, ) -from tests.conftest import DummyStringInput from tests.evaluation.conftest import DummyAggregatedEvaluation, DummyEvaluation @@ -44,26 +39,6 @@ def example_trace( ) -def test_can_store_example_evaluation_traces_in_file( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - run_id = "run_id" - example_id = "example_id" - now = datetime.now() - - tracer = file_evaluation_repository.example_tracer(run_id, example_id) - expected = InMemoryTracer() - CompositeTracer([tracer, expected]).task_span( - "task", DummyStringInput(input="input"), now - ) - - assert file_evaluation_repository.example_trace(run_id, example_id) == ExampleTrace( - run_id=run_id, - example_id=example_id, - trace=TaskSpanTrace.from_task_span(cast(InMemoryTaskSpan, expected.entries[0])), - ) - - def test_can_store_example_results_in_file( file_evaluation_repository: FileEvaluationRepository, successful_example_result: ExampleEvaluation[DummyEvaluation], @@ -203,57 +178,3 @@ def test_file_repository_returns_none_for_nonexisting_overview( ) is None ) - - -def test_file_repository_run_id_returns_run_ids( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - run_id = "id" - - file_evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id, example_id="example_id", output=None) - ) - - assert file_evaluation_repository.run_ids() == [run_id] - - -def evaluation_repository_returns_examples_in_same_order_for_two_runs( - evaluation_repository: EvaluationRepository, -) -> None: - run_id_1 = "id_1" - run_id_2 = "id_2" - num_examples = 20 - - for example_id in range(num_examples): - evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id_1, example_id=str(example_id), output=None), - ) - - for example_id in reversed(range(num_examples)): - evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id_2, example_id=str(example_id), output=None), - ) - - assert list( - (output.example_id, output.output) - for output in evaluation_repository.example_outputs(run_id_1, type(None)) - ) == list( - (output.example_id, output.output) - for output in evaluation_repository.example_outputs(run_id_2, type(None)) - ) - - -def test_in_memory_evaluation_repository_returns_examples_in_same_order_for_two_runs( - 
in_memory_evaluation_repository: InMemoryEvaluationRepository, -) -> None: - evaluation_repository_returns_examples_in_same_order_for_two_runs( - in_memory_evaluation_repository - ) - - -def test_file_evaluation_repository_returns_examples_in_same_order_for_two_runs( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - evaluation_repository_returns_examples_in_same_order_for_two_runs( - file_evaluation_repository - ) diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py index ec9fead9e..07f3b49d2 100644 --- a/tests/evaluation/test_evaluator.py +++ b/tests/evaluation/test_evaluator.py @@ -15,6 +15,7 @@ FailedExampleEvaluation, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, MeanAccumulator, Runner, SuccessfulExampleOutput, @@ -118,11 +119,15 @@ def sequence_good_examples() -> Iterable[Example[str, None]]: @fixture def dummy_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> DummyEvaluator: return DummyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, "dummy-evaluator" + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy-evaluator", ) @@ -144,12 +149,14 @@ def good_dataset_id( @fixture def comparing_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> ComparingEvaluator: return ComparingEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "comparing-evaluator", ) @@ -234,7 +241,7 @@ def test_evaluate_dataset_stores_example_traces( dataset_id: str, dummy_runner: Runner[str, str], ) -> None: - evaluation_repository = dummy_evaluator._evaluation_repository + run_repository = dummy_evaluator._run_repository dataset_repository = dummy_evaluator._dataset_repository dataset: Optional[Iterable[Example[str, None]]] = dataset_repository.examples_by_id( dataset_id, str, type(None) @@ -244,13 +251,13 @@ def test_evaluate_dataset_stores_example_traces( run_overview = dummy_runner.run_dataset(dataset_id) evaluation_run_overview = dummy_evaluator.evaluate_dataset(run_overview.id) examples = list(dataset) - success_result = evaluation_repository.example_trace( + success_result = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[0].id ) - failure_result_task = evaluation_repository.example_trace( + failure_result_task = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[1].id ) - failure_result_eval = evaluation_repository.example_trace( + failure_result_eval = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[2].id ) @@ -317,8 +324,9 @@ def test_aggregate_evaluation_can_aggregate_multiple_evals( def test_base_evaluator_type_magic_works( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> None: class EvaluationType(BaseModel): pass @@ -364,7 +372,10 @@ class GreatGrandChildEvaluator( pass timmy = 
GreatGrandChildEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, "dummy" + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy", ) who_is_timmy = timmy._get_types() diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index 66b14cf47..b06cf16f5 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -25,6 +25,7 @@ ExampleOutput, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, InstructComparisonArgillaEvaluator, Payoff, PayoffMatrix, @@ -75,6 +76,7 @@ def argilla_fake() -> ArgillaClient: @fixture def evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, argilla_fake: ArgillaClient, ) -> InstructComparisonArgillaEvaluator: @@ -82,7 +84,11 @@ def evaluator( in_memory_evaluation_repository, argilla_fake ) return InstructComparisonArgillaEvaluator( - eval_repository, in_memory_dataset_repository, "instruct-evaluator", "workspace" + in_memory_dataset_repository, + in_memory_run_repository, + eval_repository, + "instruct-evaluator", + "workspace", ) @@ -102,8 +108,9 @@ def any_instruct_output() -> PromptOutput: def test_evaluate_run_submits_pairwise_comparison_records( evaluator: InstructComparisonArgillaEvaluator, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, any_instruct_output: PromptOutput, argilla_fake: ArgillaFake, ) -> None: @@ -122,12 +129,12 @@ def test_evaluate_run_submits_pairwise_comparison_records( ] ) for run_id in run_ids: - in_memory_evaluation_repository.store_example_output( + in_memory_run_repository.store_example_output( example_output=ExampleOutput( run_id=run_id, example_id="example_id", output=any_instruct_output ) ) - in_memory_evaluation_repository.store_run_overview( + in_memory_run_repository.store_run_overview( RunOverview( dataset_id=dataset_id, id=run_id, @@ -156,8 +163,9 @@ def test_evaluate_run_submits_pairwise_comparison_records( def test_evaluate_run_only_evaluates_high_priority( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, any_instruct_output: PromptOutput, argilla_fake: ArgillaFake, ) -> None: @@ -166,8 +174,9 @@ def test_evaluate_run_only_evaluates_high_priority( ) relevant_ids = frozenset({"1", "2"}) evaluator = InstructComparisonArgillaEvaluator( - eval_repository, in_memory_dataset_repository, + in_memory_run_repository, + eval_repository, "instruct-evaluator", "workspace", relevant_ids, @@ -188,12 +197,12 @@ def test_evaluate_run_only_evaluates_high_priority( ] ) for run_id in run_ids: - in_memory_evaluation_repository.store_example_output( + in_memory_run_repository.store_example_output( example_output=ExampleOutput( run_id=run_id, example_id="example_id", output=any_instruct_output ) ) - in_memory_evaluation_repository.store_run_overview( + in_memory_run_repository.store_run_overview( RunOverview( dataset_id=dataset_id, id=run_id, diff --git 
a/tests/evaluation/test_run.py b/tests/evaluation/test_run.py index 49f1af711..362dabef0 100644 --- a/tests/evaluation/test_run.py +++ b/tests/evaluation/test_run.py @@ -8,9 +8,7 @@ from intelligence_layer.connectors import AlephAlphaClientProtocol from intelligence_layer.core import Task, TaskSpan from intelligence_layer.evaluation import ( - DatasetRepository, EvaluationOverview, - EvaluationRepository, Evaluator, Example, FileDatasetRepository, @@ -48,14 +46,6 @@ def __init__(self, client: AlephAlphaClientProtocol) -> None: class DummyEvaluator(Evaluator[None, None, None, DummyEvaluation, DummyAggregation]): - def __init__( - self, - evaluation_repository: EvaluationRepository, - dataset_repository: DatasetRepository, - description: str, - ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) - # mypy expects *args where this method only uses one output def do_evaluate( # type: ignore self, input: None, expected_output: None, output: None diff --git a/tests/evaluation/test_run_repository.py b/tests/evaluation/test_run_repository.py new file mode 100644 index 000000000..b58d9084e --- /dev/null +++ b/tests/evaluation/test_run_repository.py @@ -0,0 +1,89 @@ +from datetime import datetime +from typing import cast + +from intelligence_layer.core.tracer import ( + CompositeTracer, + InMemoryTaskSpan, + InMemoryTracer, +) +from intelligence_layer.evaluation import ExampleTrace, TaskSpanTrace +from intelligence_layer.evaluation.data_storage.run_repository import ( + FileRunRepository, + RunRepository, +) +from intelligence_layer.evaluation.domain import ExampleOutput +from tests.conftest import DummyStringInput + + +def test_can_store_example_evaluation_traces_in_file( + file_run_repository: FileRunRepository, +) -> None: + run_id = "run_id" + example_id = "example_id" + now = datetime.now() + + tracer = file_run_repository.example_tracer(run_id, example_id) + expected = InMemoryTracer() + CompositeTracer([tracer, expected]).task_span( + "task", DummyStringInput(input="input"), now + ) + + assert file_run_repository.example_trace(run_id, example_id) == ExampleTrace( + run_id=run_id, + example_id=example_id, + trace=TaskSpanTrace.from_task_span(cast(InMemoryTaskSpan, expected.entries[0])), + ) + + +def test_file_repository_run_id_returns_run_ids( + file_run_repository: FileRunRepository, +) -> None: + run_id = "id" + + file_run_repository.store_example_output( + ExampleOutput(run_id=run_id, example_id="example_id", output=None) + ) + + assert file_run_repository.run_ids() == [run_id] + + +# def test_in_memory_evaluation_repository_returns_examples_in_same_order_for_two_runs( +# in_memory_evaluation_repository: InMemoryEvaluationRepository, +# ) -> None: +# evaluation_repository_returns_examples_in_same_order_for_two_runs( +# in_memory_evaluation_repository +# ) + + +def test_file_evaluation_repository_returns_examples_in_same_order_for_two_runs( + file_run_repository: FileRunRepository, +) -> None: + evaluation_repository_returns_examples_in_same_order_for_two_runs( + file_run_repository + ) + + +def evaluation_repository_returns_examples_in_same_order_for_two_runs( + run_repository: RunRepository, +) -> None: + run_id_1 = "id_1" + run_id_2 = "id_2" + num_examples = 20 + + for example_id in range(num_examples): + run_repository.store_example_output( + ExampleOutput(run_id=run_id_1, example_id=str(example_id), output=None), + ) + + for example_id in reversed(range(num_examples)): + run_repository.store_example_output( + ExampleOutput(run_id=run_id_2, 
example_id=str(example_id), output=None), + ) + + assert list( + (output.example_id, output.output) + for output in run_repository.example_outputs(run_id_1, type(None)) + ) == list( + (output.example_id, output.output) + for output in run_repository.example_outputs(run_id_2, type(None)) + ) diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py index d37138d8d..737008348 100644 --- a/tests/evaluation/test_runner.py +++ b/tests/evaluation/test_runner.py @@ -2,22 +2,19 @@ from intelligence_layer.evaluation import ( Example, InMemoryDatasetRepository, - InMemoryEvaluationRepository, + InMemoryRunRepository, Runner, ) from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask def test_runner_runs_dataset( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> None: task = DummyTask() runner = Runner( - task, - in_memory_evaluation_repository, - in_memory_dataset_repository, - "dummy-runner", + task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner" ) examples = [ Example(input="success", expected_output=None), @@ -28,7 +25,7 @@ def test_runner_runs_dataset( dataset_id = in_memory_dataset_repository.create_dataset(examples=examples) overview = runner.run_dataset(dataset_id) outputs = list( - in_memory_evaluation_repository.example_outputs( + in_memory_run_repository.example_outputs( overview.id, output_type=runner.output_type() ) ) @@ -39,16 +36,13 @@ def test_runner_runs_dataset( def test_runner_runs_n_examples( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> None: task = DummyTask() tracer = InMemoryTracer() runner = Runner( - task, - in_memory_evaluation_repository, - in_memory_dataset_repository, - "dummy-runner", + task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner" ) examples = [ Example(input="success", expected_output=None), diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py index 85fd9586c..12d02478f 100644 --- a/tests/use_cases/classify/test_classify.py +++ b/tests/use_cases/classify/test_classify.py @@ -10,6 +10,7 @@ InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.classify.classify import ( ClassifyInput, @@ -126,12 +127,14 @@ def multiple_entries_dataset_name( @fixture def classify_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> MultiLabelClassifyEvaluator: return MultiLabelClassifyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "multi-label-classify", ) @@ -139,13 +142,13 @@ def classify_evaluator( @fixture def classify_runner( embedding_based_classify: Task[ClassifyInput, MultiLabelClassifyOutput], - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, ) -> Runner[ClassifyInput, MultiLabelClassifyOutput]: return Runner( embedding_based_classify, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, 
"multi-label-classify", ) @@ -158,7 +161,7 @@ def test_multi_label_classify_evaluator_single_example( run_overview = classify_runner.run_dataset(single_entry_dataset_name) evaluation_overview = classify_evaluator.evaluate_dataset(run_overview.id) - evaluation = classify_runner._evaluation_repository.example_evaluations( + evaluation = classify_evaluator._evaluation_repository.example_evaluations( evaluation_overview.individual_evaluation_overviews[0].id, MultiLabelClassifyEvaluation, )[0].result diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py index b00b1e5ca..571525526 100644 --- a/tests/use_cases/classify/test_prompt_based_classify.py +++ b/tests/use_cases/classify/test_prompt_based_classify.py @@ -8,11 +8,11 @@ from intelligence_layer.core import Chunk, InMemoryTracer, NoOpTracer from intelligence_layer.evaluation import ( DatasetRepository, - EvaluationRepository, Example, InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.classify.classify import ( ClassifyInput, @@ -32,12 +32,14 @@ def prompt_based_classify(client: AlephAlphaClientProtocol) -> PromptBasedClassi @fixture def classify_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> SingleLabelClassifyEvaluator: return SingleLabelClassifyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "single-label-classify", ) @@ -45,13 +47,13 @@ def classify_evaluator( @fixture def classify_runner( prompt_based_classify: PromptBasedClassify, - in_memory_evaluation_repository: EvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, ) -> Runner[ClassifyInput, SingleLabelClassifyOutput]: return Runner( prompt_based_classify, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "prompt-based-classify", ) diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index 9e1888fc9..ccae8abcf 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -7,7 +7,9 @@ Example, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.summarize.long_context_high_compression_summarize import ( LongContextHighCompressionSummarize, @@ -28,12 +30,14 @@ @fixture def single_chunk_summarize_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> SingleChunkSummarizeEvaluator: return SingleChunkSummarizeEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "single-chunk-summarize", ) @@ -41,25 +45,27 @@ def single_chunk_summarize_evaluator( @fixture def single_chunk_summarize_runner( single_chunk_few_shot_summarize: SingleChunkFewShotSummarize, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: 
InMemoryRunRepository, ) -> Runner[SingleChunkSummarizeInput, SummarizeOutput]: return Runner( single_chunk_few_shot_summarize, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "single-chunk-summarize", ) @fixture def long_context_summarize_evaluator( - in_memory_evaluation_repository: EvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: EvaluationRepository, ) -> LongContextSummarizeEvaluator: return LongContextSummarizeEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "long-context-summarize", ) @@ -67,13 +73,13 @@ def long_context_summarize_evaluator( @fixture def long_context_summarize_runner( long_context_high_compression_summarize: LongContextHighCompressionSummarize, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[LongContextSummarizeInput, LongContextSummarizeOutput]: return Runner( long_context_high_compression_summarize, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "long-context-summarize", )
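
For orientation, a minimal sketch of how the pieces fit together after this split: the DatasetRepository holds examples, the new RunRepository holds task outputs and traces, and the EvaluationRepository now holds only evaluation results. The sketch uses the in-memory implementations and the new constructor order (dataset, run, evaluation repository) introduced by this patch; UppercaseTask and MatchEvaluator are illustrative stand-ins rather than code from this repository, and the Task.do_run / Evaluator.aggregate hook names are assumed from the core API, not shown in this diff.

from typing import Iterable

from pydantic import BaseModel

from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.evaluation import (
    Evaluator,
    Example,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)


class UppercaseTask(Task[str, str]):
    # Illustrative task; `do_run` is the assumed Task hook in the core package.
    def do_run(self, input: str, task_span: TaskSpan) -> str:
        return input.upper()


class MatchEvaluation(BaseModel):
    correct: bool


class MatchAggregation(BaseModel):
    percentage_correct: float


class MatchEvaluator(Evaluator[str, str, str, MatchEvaluation, MatchAggregation]):
    # mypy expects *args where this method only uses one output
    def do_evaluate(  # type: ignore
        self, input: str, expected_output: str, output: str
    ) -> MatchEvaluation:
        return MatchEvaluation(correct=output == expected_output)

    def aggregate(self, evaluations: Iterable[MatchEvaluation]) -> MatchAggregation:
        evals = list(evaluations)
        return MatchAggregation(
            percentage_correct=sum(e.correct for e in evals) / len(evals)
        )


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="hello", expected_output="HELLO")]
)

# New argument order: dataset repository, run repository, evaluation repository.
runner = Runner(UppercaseTask(), dataset_repository, run_repository, "uppercase")
evaluator = MatchEvaluator(
    dataset_repository, run_repository, evaluation_repository, "uppercase-eval"
)

run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_dataset(run_overview.id)

For callers, the visible change is the extra repository argument and the new ordering; output storage (Runner) and evaluation storage (Evaluator) are now served by separate repositories instead of one combined EvaluationRepository.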
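
As the updated runner.py docstrings note, run_dataset does not return outputs; it stores them in the RunRepository under the run id. A short continuation of the sketch above (same assumed names) reads them back as properly typed outputs:

# Outputs are stored, not returned: read them back from the RunRepository.
stored_outputs = list(
    run_repository.example_outputs(run_overview.id, output_type=runner.output_type())
)
print(run_repository.run_ids())            # e.g. [run_overview.id]
print([o.output for o in stored_outputs])  # e.g. ["HELLO"]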