From 47fec3d4594384393d31076b5fe4f5ff5fedef95 Mon Sep 17 00:00:00 2001 From: Valentina Galata Date: Wed, 21 Feb 2024 15:31:57 +0100 Subject: [PATCH] refactor: split evaluation and aggregation Task: IL-259 --- src/examples/classification.ipynb | 8 +- src/examples/document_index.ipynb | 4 +- src/examples/evaluation.ipynb | 37 +-- src/examples/human_evaluation.ipynb | 54 +++-- src/examples/performance_tips.ipynb | 6 +- src/examples/qa.ipynb | 2 +- src/examples/quickstart_task.ipynb | 163 ++++++++----- src/examples/summarize.ipynb | 1 + src/intelligence_layer/evaluation/__init__.py | 1 + .../evaluation/aggregator.py | 223 ++++++++++++++++++ src/intelligence_layer/evaluation/argilla.py | 48 +++- .../evaluation/evaluator.py | 141 +---------- src/intelligence_layer/evaluation/run.py | 9 +- tests/evaluation/test_argilla_evaluator.py | 88 +++++-- tests/evaluation/test_evaluator.py | 187 ++++++++++----- ...t_instruct_comparison_argilla_evaluator.py | 46 +++- tests/use_cases/classify/test_classify.py | 32 ++- .../classify/test_prompt_based_classify.py | 46 +++- tests/use_cases/summarize/test_summarize.py | 58 +++-- 19 files changed, 775 insertions(+), 379 deletions(-) create mode 100644 src/intelligence_layer/evaluation/aggregator.py diff --git a/src/examples/classification.ipynb b/src/examples/classification.ipynb index 0883b9a3d..bb512fffb 100644 --- a/src/examples/classification.ipynb +++ b/src/examples/classification.ipynb @@ -43,13 +43,13 @@ "outputs": [], "source": [ "from os import getenv\n", - "from intelligence_layer.connectors import LimitedConcurrencyClient\n", - "\n", - "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", - "from intelligence_layer.core import Chunk, InMemoryTracer\n", "\n", "from dotenv import load_dotenv\n", "\n", + "from intelligence_layer.connectors import LimitedConcurrencyClient\n", + "from intelligence_layer.core import Chunk, InMemoryTracer\n", + "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", + "\n", "load_dotenv()\n", "\n", "text_to_classify = Chunk(\"In the distant future, a space exploration party embarked on a thrilling journey to the uncharted regions of the galaxy. 
\\n\\\n", diff --git a/src/examples/document_index.ipynb b/src/examples/document_index.ipynb index 454d219ed..48173b5e1 100644 --- a/src/examples/document_index.ipynb +++ b/src/examples/document_index.ipynb @@ -47,10 +47,10 @@ "source": [ "from os import getenv\n", "\n", - "from intelligence_layer.connectors import DocumentIndexClient\n", - "\n", "from dotenv import load_dotenv\n", "\n", + "from intelligence_layer.connectors import DocumentIndexClient\n", + "\n", "load_dotenv()\n", "\n", "\n", diff --git a/src/examples/evaluation.ipynb b/src/examples/evaluation.ipynb index 1c1276c49..15e3aefd9 100644 --- a/src/examples/evaluation.ipynb +++ b/src/examples/evaluation.ipynb @@ -50,7 +50,7 @@ " InMemoryRunRepository,\n", " InMemoryDatasetRepository,\n", " InMemoryAggregationRepository,\n", - " Runner,\n", + " Runner, Aggregator,\n", ")\n", "from intelligence_layer.use_cases import (\n", " PromptBasedClassify,\n", @@ -58,7 +58,6 @@ " SingleLabelClassifyAggregationLogic,\n", ")\n", "\n", - "\n", "load_dotenv()\n", "\n", "client = LimitedConcurrencyClient.from_token(os.getenv(\"AA_TOKEN\"))\n", @@ -75,9 +74,13 @@ " dataset_repository,\n", " run_repository,\n", " evaluation_repository,\n", - " aggregation_repository,\n", - " \"singel-label-classify\",\n", + " \"single-label-classify\",\n", " evaluation_logic,\n", + ")\n", + "aggregator = Aggregator(\n", + " evaluation_repository,\n", + " aggregation_repository,\n", + " \"single-label-classify\",\n", " aggregation_logic,\n", ")\n", "runner = Runner(task, dataset_repository, run_repository, \"prompt-based-classify\")" @@ -114,10 +117,10 @@ "])\n", "\n", "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", - "print(\"Statistics: \", aggregation_overview.statistics)\n", - "\n" + "print(\"Statistics: \", aggregation_overview.statistics)" ] }, { @@ -140,7 +143,7 @@ "dataset = load_dataset(\"cardiffnlp/tweet_topic_multi\")\n", "test_set_name = \"validation_random\"\n", "all_data = list(dataset[test_set_name])\n", - "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now\n" + "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now" ] }, { @@ -157,7 +160,7 @@ "metadata": {}, "outputs": [], "source": [ - "data[1]\n" + "data[1]" ] }, { @@ -215,7 +218,8 @@ "outputs": [], "source": [ "run_overview = runner.run_dataset(dataset_id)\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "aggregation_overview.raise_on_evaluation_failure()" ] }, @@ -288,9 +292,13 @@ " dataset_repository,\n", " run_repository,\n", " evaluation_repository,\n", - " aggregation_repository,\n", " \"multi-label-classify\",\n", " eval_logic,\n", + ")\n", + "embedding_based_classify_aggregator = Aggregator(\n", + " evaluation_repository,\n", + " aggregation_repository,\n", + " \"multi-label-classify\",\n", " aggregation_logic,\n", ")\n", "embedding_based_classify_runner = Runner(\n", @@ -308,8 +316,9 @@ "outputs": [], "source": [ "embedding_based_classify_run_result = embedding_based_classify_runner.run_dataset(dataset_id)\n", - "embedding_based_classify_evaluation_result = 
embedding_based_classify_evaluator.eval_and_aggregate_runs(embedding_based_classify_run_result.id)\n", - "embedding_based_classify_evaluation_result.raise_on_evaluation_failure()" + "embedding_based_classify_evaluation_result = embedding_based_classify_evaluator.evaluate_runs(embedding_based_classify_run_result.id)\n", + "embedding_based_classify_aggregation_result = embedding_based_classify_aggregator.aggregate_evaluation(embedding_based_classify_evaluation_result.id)\n", + "embedding_based_classify_aggregation_result.raise_on_evaluation_failure()" ] }, { @@ -318,7 +327,7 @@ "metadata": {}, "outputs": [], "source": [ - "embedding_based_classify_evaluation_result.statistics.macro_avg" + "embedding_based_classify_aggregation_result.statistics.macro_avg" ] }, { diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb index 33e19ba82..841c6df4a 100644 --- a/src/examples/human_evaluation.ipynb +++ b/src/examples/human_evaluation.ipynb @@ -42,20 +42,26 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "from typing import Iterable, cast\n", + "\n", + "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", - "from intelligence_layer.core import (\n", - " InstructInput, \n", - " Instruct, \n", - " PromptOutput\n", - ")\n", + "from pydantic import BaseModel\n", + "\n", "from intelligence_layer.connectors import (\n", - " LimitedConcurrencyClient, \n", - " Question, \n", - " ArgillaEvaluation, \n", - " DefaultArgillaClient, \n", - " Field, \n", + " LimitedConcurrencyClient,\n", + " Question,\n", + " ArgillaEvaluation,\n", + " DefaultArgillaClient,\n", + " Field,\n", " RecordData\n", ")\n", + "from intelligence_layer.core import (\n", + " InstructInput,\n", + " Instruct,\n", + " PromptOutput\n", + ")\n", "from intelligence_layer.evaluation import (\n", " ArgillaEvaluator,\n", " AggregationLogic,\n", @@ -70,10 +76,7 @@ " Runner,\n", " SuccessfulExampleOutput\n", ")\n", - "from typing import Iterable, cast, Sequence\n", - "from datasets import load_dataset\n", - "import os\n", - "from pydantic import BaseModel\n", + "from intelligence_layer.evaluation.argilla import ArgillaAggregator\n", "\n", "load_dotenv()\n", "\n", @@ -318,14 +321,14 @@ " def _to_record(\n", " self,\n", " example: Example[InstructInput, None],\n", - " example_outputs: SuccessfulExampleOutput[PromptOutput],\n", + " *example_outputs: SuccessfulExampleOutput[PromptOutput],\n", " ) -> RecordDataSequence:\n", " return RecordDataSequence(\n", " records=[\n", " RecordData(\n", " content={\n", " \"input\": example.input.instruction,\n", - " \"output\": example_outputs.output.completion,\n", + " \"output\": example_outputs[0].output.completion,\n", " },\n", " example_id=example.id,\n", " )\n", @@ -340,16 +343,16 @@ "eval_logic = InstructArgillaEvaluationLogic()\n", "aggregation_logic = InstructArgillaAggregationLogic()\n", "\n", + "argilla_evaluation_repository = ArgillaEvaluationRepository(\n", + " evaluation_repository, argilla_client, workspace_id, fields, questions\n", + ")\n", + "\n", "evaluator = ArgillaEvaluator(\n", " dataset_repository,\n", " run_repository,\n", - " ArgillaEvaluationRepository(\n", - " evaluation_repository, argilla_client, workspace_id, fields, questions\n", - " ),\n", - " aggregation_repository,\n", + " argilla_evaluation_repository,\n", " \"instruct\",\n", " eval_logic,\n", - " aggregation_logic,\n", ")" ] }, @@ -388,8 +391,15 @@ "metadata": {}, "outputs": [], "source": [ + "aggregator = ArgillaAggregator(\n", + " argilla_evaluation_repository,\n", + 
" aggregation_repository,\n", + " \"instruct\",\n", + " aggregation_logic,\n", + ")\n", + "\n", "if eval_overview:\n", - " output = evaluator.aggregate_evaluation(eval_overview.id)\n", + " output = aggregator.aggregate_evaluation(eval_overview.id)\n", " print(output.statistics)" ] } diff --git a/src/examples/performance_tips.ipynb b/src/examples/performance_tips.ipynb index 69227c04e..80300fd77 100644 --- a/src/examples/performance_tips.ipynb +++ b/src/examples/performance_tips.ipynb @@ -44,11 +44,13 @@ "metadata": {}, "outputs": [], "source": [ - "from intelligence_layer.core.task import Task\n", - "from intelligence_layer.core.tracer import TaskSpan, NoOpTracer\n", "import time\n", "from typing import Any\n", "\n", + "from intelligence_layer.core.task import Task\n", + "from intelligence_layer.core.tracer import TaskSpan, NoOpTracer\n", + "\n", + "\n", "class DummyTask(Task):\n", " def do_run(self, input: Any, task_span: TaskSpan) -> Any:\n", " time.sleep(2)\n", diff --git a/src/examples/qa.ipynb b/src/examples/qa.ipynb index fa9e15bae..b69c37599 100644 --- a/src/examples/qa.ipynb +++ b/src/examples/qa.ipynb @@ -19,8 +19,8 @@ "outputs": [], "source": [ "from os import getenv\n", - "from dotenv import load_dotenv\n", "\n", + "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb index 0b873ccf5..010259124 100644 --- a/src/examples/quickstart_task.ipynb +++ b/src/examples/quickstart_task.ipynb @@ -21,10 +21,10 @@ "Output = TypeVar(\"Output\", bound=PydanticSerializable)\n", "\n", "class Task(ABC, Generic[Input, Output]):\n", - "    @abstractmethod\n", - "    def do_run(self, input: Input, task_span: TaskSpan) -> Output:\n", - "        \"\"\"Executes the process for this use-case.\"\"\"\n", - "        ...\n", + " @abstractmethod\n", + " def do_run(self, input: Input, task_span: TaskSpan) -> Output:\n", + " \"\"\"Executes the process for this use-case.\"\"\"\n", + " ...\n", "```\n", "\n", "For every task, we have to define an `Input`, an `Output` and how we would like to run it. Since these can vary so much, we make no assumptions about a `Task`'s implementation. \n", @@ -261,45 +261,130 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have all parts in place, let's implement our `KeywordExtractionEvaluator`." + "Now that we have all parts in place, let's run our task which will produce the results for evaluation." 
] }, { "cell_type": "code", - "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "from intelligence_layer.core import NoOpTracer\n", + "from intelligence_layer.evaluation import (\n", + " InMemoryDatasetRepository,\n", + " InMemoryRunRepository,\n", + " Runner,\n", + " Example\n", + ")\n", "from statistics import mean\n", "from typing import Iterable\n", - "from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic\n", "\n", + "dataset_repository = InMemoryDatasetRepository()\n", + "run_repository = InMemoryRunRepository()\n", + "\n", + "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", + "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", + "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", + "\n", + "single_example_dataset = dataset_repository.create_dataset(\n", + " examples=[Example(input=input, expected_output=expected_output)]\n", + ")\n", + "\n", + "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())" + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's build an evaluator.\n", + "For this, we need to implement a method doing the actual evaluation in a `EvaluationLogic` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation import (\n", + " Evaluator,\n", + " InMemoryEvaluationRepository,\n", + " Example\n", + ")\n", + "from intelligence_layer.evaluation.base_logic import SingleOutputEvaluationLogic\n", "\n", "class KeywordExtractionEvaluationLogic(\n", - " EvaluationLogic[\n", + " SingleOutputEvaluationLogic[\n", " KeywordExtractionInput,\n", " KeywordExtractionOutput,\n", " KeywordExtractionExpectedOutput,\n", " KeywordExtractionEvaluation,\n", " ]\n", "):\n", - " def do_evaluate(\n", + " def do_evaluate_single_output(\n", " self,\n", - " input: KeywordExtractionInput,\n", - " output: KeywordExtractionOutput,\n", - " expected_output: KeywordExtractionExpectedOutput,\n", + " example: Example[KeywordExtractionInput, KeywordExtractionOutput],\n", + " output: KeywordExtractionExpectedOutput,\n", " ) -> KeywordExtractionEvaluation:\n", - " true_positives = output.keywords & expected_output.keywords\n", - " false_positives = output.keywords - expected_output.keywords\n", - " false_negatives = expected_output.keywords - output.keywords\n", + " true_positives = output.keywords & output.keywords\n", + " false_positives = output.keywords - output.keywords\n", + " false_negatives = output.keywords - output.keywords\n", " return KeywordExtractionEvaluation(\n", - " true_positive_rate=len(true_positives) / len(expected_output.keywords),\n", + " true_positive_rate=len(true_positives) / len(output.keywords),\n", " true_positives=true_positives,\n", " false_positives=false_positives,\n", " false_negatives=false_negatives,\n", - " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, we can create an evaluator and run it on our data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_repository = InMemoryEvaluationRepository()\n", + "evaluation_logic = KeywordExtractionEvaluationLogic()\n", + "evaluator = Evaluator(\n", + " dataset_repository,\n", + " run_repository,\n", + " evaluation_repository,\n", + " \"keyword-extraction\",\n", + " evaluation_logic,\n", + ")\n", + "\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To aggregate the evaluation results, we have to implement a method doing this in a `AggregationLogic` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation import (\n", + " InMemoryAggregationRepository,\n", + " Example, Aggregator,\n", + ")\n", + "from intelligence_layer.evaluation.base_logic import AggregationLogic\n", + "\n", "\n", - "# this is needed later for the aggregation\n", "class KeywordExtractionAggregationLogic(\n", " AggregationLogic[\n", " KeywordExtractionEvaluation,\n", @@ -322,7 +407,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's run this on a single example." + "Let's create now an aggregator and generate evaluation statistics from the previously generated evaluation results." ] }, { @@ -331,45 +416,16 @@ "metadata": {}, "outputs": [], "source": [ - "from intelligence_layer.core import NoOpTracer\n", - "from intelligence_layer.evaluation import (\n", - " Evaluator,\n", - " InMemoryDatasetRepository,\n", - " InMemoryEvaluationRepository,\n", - " InMemoryRunRepository,\n", - " InMemoryAggregationRepository,\n", - " Runner,\n", - " Example,\n", - ")\n", - "\n", - "dataset_repository = InMemoryDatasetRepository()\n", - "run_repository = InMemoryRunRepository()\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "aggregation_repository = InMemoryAggregationRepository()\n", - "evaluation_logic = KeywordExtractionEvaluationLogic()\n", "aggregation_logic = KeywordExtractionAggregationLogic()\n", - "\n", - "\n", - "evaluator = Evaluator(\n", - " dataset_repository,\n", - " run_repository,\n", + "aggregator = Aggregator(\n", " evaluation_repository,\n", " aggregation_repository,\n", " \"keyword-extraction\",\n", - " evaluation_logic,\n", " aggregation_logic,\n", ")\n", - "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", - "\n", - "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", - "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", - "\n", - "single_example_dataset = dataset_repository.create_dataset(\n", - " examples=[Example(input=input, expected_output=expected_output)]\n", - ")\n", "\n", - "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", "print(\"Statistics: \", aggregation_overview.statistics)" ] @@ -378,7 +434,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have implemented our aggregate method, let's run a dataset with some example data." + "Now that we have implemented all required methods, let's run a dataset with some more examples." 
] }, { @@ -417,7 +473,8 @@ ")\n", "\n", "run = runner.run_dataset(dataset_id)\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", "pprint(aggregation_overview)" ] @@ -440,7 +497,7 @@ "last_example_result = run_repository.example_trace(\n", " next(iter(aggregation_overview.run_overviews())).id, examples[-1].id\n", ")\n", - "last_example_result.trace\n" + "last_example_result.trace" ] }, { diff --git a/src/examples/summarize.ipynb b/src/examples/summarize.ipynb index aa2313a1b..17f1d9164 100644 --- a/src/examples/summarize.ipynb +++ b/src/examples/summarize.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "from os import getenv\n", + "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index e4248fe91..615b1ea3e 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -1,4 +1,5 @@ from .accumulator import MeanAccumulator as MeanAccumulator +from .aggregator import Aggregator as Aggregator from .argilla import ArgillaEvaluationLogic as ArgillaEvaluationLogic from .argilla import ArgillaEvaluator as ArgillaEvaluator from .argilla import ( diff --git a/src/intelligence_layer/evaluation/aggregator.py b/src/intelligence_layer/evaluation/aggregator.py new file mode 100644 index 000000000..94d0bf0e9 --- /dev/null +++ b/src/intelligence_layer/evaluation/aggregator.py @@ -0,0 +1,223 @@ +from functools import lru_cache +from typing import ( + Callable, + Generic, + Iterable, + Iterator, + Mapping, + TypeVar, + cast, + final, + get_args, + get_origin, +) +from uuid import uuid4 + +from intelligence_layer.core.tracer import utc_now +from intelligence_layer.evaluation.base_logic import AggregationLogic +from intelligence_layer.evaluation.data_storage.aggregation_repository import ( + AggregationRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + EvaluationRepository, +) +from intelligence_layer.evaluation.domain import ( + AggregatedEvaluation, + AggregationOverview, + Evaluation, + EvaluationOverview, + FailedExampleEvaluation, +) + +T = TypeVar("T") + + +class CountingFilterIterable(Iterable[T]): + def __init__( + self, wrapped_iterable: Iterable[T], filter: Callable[[T], bool] + ) -> None: + self._wrapped_iterator = iter(wrapped_iterable) + self._filter = filter + self._included_count = 0 + self._excluded_count = 0 + + def __next__(self) -> T: + e = next(self._wrapped_iterator) + while not self._filter(e): + self._excluded_count += 1 + e = next(self._wrapped_iterator) + self._included_count += 1 + return e + + def __iter__(self) -> Iterator[T]: + return self + + def included_count(self) -> int: + return self._included_count + + def excluded_count(self) -> int: + return self._excluded_count + + +class Aggregator(Generic[Evaluation, AggregatedEvaluation]): + """Aggregator that can handle automatic aggregation of evaluation scenarios. + + This aggregator should be used for automatic eval. A user still has to implement + :class: `AggregationLogic`. + + + Arguments: + evaluation_repository: The repository that will be used to store evaluation results. + aggregation_repository: The repository that will be used to store aggregation results. + description: Human-readable description for the evaluator. 
+ aggregation_logic: The logic to aggregate the evaluations. + + Generics: + Evaluation: Interface of the metrics that come from the evaluated :class:`Task`. + AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. + """ + + def __init__( + self, + evaluation_repository: EvaluationRepository, + aggregation_repository: AggregationRepository, + description: str, + aggregation_logic: AggregationLogic[Evaluation, AggregatedEvaluation], + ) -> None: + self._evaluation_repository = evaluation_repository + self._aggregation_repository = aggregation_repository + self._aggregation_logic = aggregation_logic + self.description = description + + @lru_cache(maxsize=1) + def _get_types(self) -> Mapping[str, type]: + """Type magic function that gets the actual types of the generic parameters. + + Traverses the inheritance history of `BaseEvaluator`-subclass to find an actual type every time a TypeVar is replaced. + + Returns: + Name of generic parameter to the type found. + """ + + def is_eligible_subclass(parent: type) -> bool: + return hasattr(parent, "__orig_bases__") and issubclass( + parent, AggregationLogic + ) + + def update_types() -> None: + num_types_set = 0 + for current_index, current_type in enumerate(current_types): + if type(current_type) is not TypeVar: + type_var_count = num_types_set - 1 + for element_index, element in enumerate(type_list): + if type(element) is TypeVar: + type_var_count += 1 + if type_var_count == current_index: + break + assert type_var_count == current_index + type_list[element_index] = current_type + num_types_set += 1 + + # mypy does not know __orig_bases__ + base_types = AggregationLogic.__orig_bases__[1] # type: ignore + type_list: list[type | TypeVar] = list(get_args(base_types)) + + possible_parent_classes = [ + p + for p in reversed(type(self._aggregation_logic).__mro__) + if is_eligible_subclass(p) + ] + for parent in possible_parent_classes: + # mypy does not know __orig_bases__ + for base in parent.__orig_bases__: # type: ignore + origin = get_origin(base) + if origin is None or not issubclass(origin, AggregationLogic): + continue + current_types = list(get_args(base)) + update_types() + + return { + name: param_type + for name, param_type in zip( + (a.__name__ for a in get_args(base_types)), type_list + ) + if type(param_type) is not TypeVar + } + + def evaluation_type(self) -> type[Evaluation]: + """Returns the type of the evaluation result of an example. + + This can be used to retrieve properly typed evaluations of an evaluation run + from a :class:`EvaluationRepository` + + Returns: + Returns the type of the evaluation result of an example. + """ + try: + evaluation_type = self._get_types()["Evaluation"] + except KeyError: + raise TypeError( + f"Alternatively overwrite evaluation_type() in {type(self)}" + ) + return cast(type[Evaluation], evaluation_type) + + @final + def aggregate_evaluation( + self, *eval_ids: str + ) -> AggregationOverview[AggregatedEvaluation]: + """Aggregates all evaluations into an overview that includes high-level statistics. + + Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`. + + Args: + evaluation_overview: An overview of the evaluation to be aggregated. Does not include + actual evaluations as these will be retrieved from the repository. + + Returns: + An overview of the aggregated evaluation. 
+ """ + + def load_eval_overview(eval_id: str) -> EvaluationOverview: + evaluation_overview = self._evaluation_repository.evaluation_overview( + eval_id + ) + if not evaluation_overview: + raise ValueError( + f"No PartialEvaluationOverview found for eval-id: {eval_id}" + ) + return evaluation_overview + + evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) + + nested_evaluations = [ + self._evaluation_repository.example_evaluations( + overview.id, self.evaluation_type() + ) + for overview in evaluation_overviews + ] + example_evaluations = [ + eval for sublist in nested_evaluations for eval in sublist + ] + + successful_evaluations = CountingFilterIterable( + (example_eval.result for example_eval in example_evaluations), + lambda evaluation: not isinstance(evaluation, FailedExampleEvaluation), + ) + id = str(uuid4()) + start = utc_now() + statistics = self._aggregation_logic.aggregate( + cast(Iterable[Evaluation], successful_evaluations) + ) + + aggregation_overview = AggregationOverview( + evaluation_overviews=frozenset(evaluation_overviews), + id=id, + start=start, + end=utc_now(), + successful_evaluation_count=successful_evaluations.included_count(), + crashed_during_eval_count=successful_evaluations.excluded_count(), + description=self.description, + statistics=statistics, + ) + self._aggregation_repository.store_aggregation_overview(aggregation_overview) + return aggregation_overview diff --git a/src/intelligence_layer/evaluation/argilla.py b/src/intelligence_layer/evaluation/argilla.py index f4e781f84..41e288ea1 100644 --- a/src/intelligence_layer/evaluation/argilla.py +++ b/src/intelligence_layer/evaluation/argilla.py @@ -13,6 +13,7 @@ RecordData, ) from intelligence_layer.core import Input, InstructInput, Output, PromptOutput +from intelligence_layer.evaluation import Aggregator from intelligence_layer.evaluation.accumulator import MeanAccumulator from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic from intelligence_layer.evaluation.data_storage.aggregation_repository import ( @@ -71,30 +72,26 @@ def _to_record( class ArgillaEvaluator( - Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation], + Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation], ABC, ): """Evaluator used to integrate with Argilla (https://github.com/argilla-io/argilla). Use this evaluator if you would like to easily do human eval. This evaluator runs a dataset and sends the input and output to Argilla to be evaluated. - After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method. Arguments: dataset_repository: The repository with the examples that will be taken for the evaluation. run_repository: The repository of the runs to evaluate. evaluation_repository: The repository that will be used to store evaluation results. - aggregation_repository: The repository that will be used to store aggregation results. description: Human-readable description for the evaluator. evaluation_logic: The logic to use for evaluation. - aggregation_logic: The logic to aggregate the evaluations. Generics: Input: Interface to be passed to the :class:`Task` that shall be evaluated. Output: Type of the output of the :class:`Task` to be evaluated. ExpectedOutput: Output that is expected from the run with the supplied input. ArgillaEvaluation: Interface of the metrics that come from the Argilla task`. - AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. 
""" def __init__( @@ -102,25 +99,58 @@ def __init__( dataset_repository: DatasetRepository, run_repository: RunRepository, evaluation_repository: ArgillaEvaluationRepository, - aggregation_repository: AggregationRepository, description: str, evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput], - aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation], ) -> None: super().__init__( dataset_repository, run_repository, evaluation_repository, - aggregation_repository, description, evaluation_logic, # type: ignore - aggregation_logic, # TODO: check if the non-matching types of the evaluation logic and aggregation logic (in the line above) are a problem ) def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore return ArgillaEvaluation +class ArgillaAggregator( + Aggregator[ArgillaEvaluation, AggregatedEvaluation], + ABC, +): + """Aggregator used to aggregate Argilla (https://github.com/argilla-io/argilla) evaluations. + + You can fetch the results by using the `aggregate_evaluation` method. + + Arguments: + evaluation_repository: The repository that will be used to store evaluation results. + aggregation_repository: The repository that will be used to store aggregation results. + description: Human-readable description for the evaluator. + aggregation_logic: The logic to aggregate the evaluations. + + Generics: + ArgillaEvaluation: Interface of the metrics that come from the Argilla task`. + AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. + """ + + def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore + return ArgillaEvaluation + + def __init__( + self, + evaluation_repository: ArgillaEvaluationRepository, + aggregation_repository: AggregationRepository, + description: str, + aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation], + ) -> None: + super().__init__( + evaluation_repository, + aggregation_repository, + description, + aggregation_logic, + ) + + class AggregatedInstructComparison(BaseModel): scores: Mapping[str, PlayerScore] diff --git a/src/intelligence_layer/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluator.py index 36b34da95..6d5b9788e 100644 --- a/src/intelligence_layer/evaluation/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluator.py @@ -1,10 +1,8 @@ from concurrent.futures import ThreadPoolExecutor from functools import lru_cache from typing import ( - Callable, Generic, Iterable, - Iterator, Mapping, Optional, Sequence, @@ -15,16 +13,12 @@ get_args, get_origin, ) -from uuid import uuid4 from tqdm import tqdm from intelligence_layer.core.task import Input, Output from intelligence_layer.core.tracer import utc_now -from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic -from intelligence_layer.evaluation.data_storage.aggregation_repository import ( - AggregationRepository, -) +from intelligence_layer.evaluation.base_logic import EvaluationLogic from intelligence_layer.evaluation.data_storage.dataset_repository import ( DatasetRepository, ) @@ -33,8 +27,6 @@ ) from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( - AggregatedEvaluation, - AggregationOverview, Evaluation, EvaluationOverview, Example, @@ -47,60 +39,26 @@ SuccessfulExampleOutput, ) -T = TypeVar("T") - - -class CountingFilterIterable(Iterable[T]): - def __init__( - self, wrapped_iterable: Iterable[T], filter: Callable[[T], bool] - ) -> None: - 
self._wrapped_iterator = iter(wrapped_iterable) - self._filter = filter - self._included_count = 0 - self._excluded_count = 0 - - def __next__(self) -> T: - e = next(self._wrapped_iterator) - while not self._filter(e): - self._excluded_count += 1 - e = next(self._wrapped_iterator) - self._included_count += 1 - return e - def __iter__(self) -> Iterator[T]: - return self - - def included_count(self) -> int: - return self._included_count - - def excluded_count(self) -> int: - return self._excluded_count - - -class Evaluator( - Generic[Input, Output, ExpectedOutput, Evaluation, AggregatedEvaluation] -): +class Evaluator(Generic[Input, Output, ExpectedOutput, Evaluation]): """Evaluator that can handle automatic evaluation scenarios. This evaluator should be used for automatic eval. A user still has to implement - :class:`EvaluationLogic` and :class: `AggregationLogic`. + :class:`EvaluationLogic`. Arguments: dataset_repository: The repository with the examples that will be taken for the evaluation. run_repository: The repository of the runs to evaluate. evaluation_repository: The repository that will be used to store evaluation results. - aggregation_repository: The repository that will be used to store aggregation results. description: Human-readable description for the evaluator. evaluation_logic: The logic to use for evaluation. - aggregation_logic: The logic to aggregate the evaluations. Generics: Input: Interface to be passed to the :class:`Task` that shall be evaluated. Output: Type of the output of the :class:`Task` to be evaluated. ExpectedOutput: Output that is expected from the run with the supplied input. Evaluation: Interface of the metrics that come from the evaluated :class:`Task`. - AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. """ def __init__( @@ -108,18 +66,13 @@ def __init__( dataset_repository: DatasetRepository, run_repository: RunRepository, evaluation_repository: EvaluationRepository, - aggregation_repository: AggregationRepository, description: str, evaluation_logic: EvaluationLogic[Input, Output, ExpectedOutput, Evaluation], - aggregation_logic: AggregationLogic[Evaluation, AggregatedEvaluation], ) -> None: self._dataset_repository = dataset_repository self._run_repository = run_repository self._evaluation_repository = evaluation_repository - self._aggregation_repository = aggregation_repository - self._evaluation_logic = evaluation_logic - self._aggregation_logic = aggregation_logic self.description = description @lru_cache(maxsize=1) @@ -152,8 +105,8 @@ def update_types() -> None: num_types_set += 1 # mypy does not know __orig_bases__ - base_evaluator_bases = EvaluationLogic.__orig_bases__[1] # type: ignore - type_list: list[type | TypeVar] = list(get_args(base_evaluator_bases)) + base_types = EvaluationLogic.__orig_bases__[1] # type: ignore + type_list: list[type | TypeVar] = list(get_args(base_types)) possible_parent_classes = [ p for p in reversed(type(self._evaluation_logic).__mro__) @@ -171,7 +124,7 @@ def update_types() -> None: return { name: param_type for name, param_type in zip( - (a.__name__ for a in get_args(base_evaluator_bases)), type_list + (a.__name__ for a in get_args(base_types)), type_list ) if type(param_type) is not TypeVar } @@ -360,67 +313,6 @@ def evaluate( return partial_overview - @final - def aggregate_evaluation( - self, *eval_ids: str - ) -> AggregationOverview[AggregatedEvaluation]: - """Aggregates all evaluations into an overview that includes high-level statistics. 
- - Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`. - - Args: - evaluation_overview: An overview of the evaluation to be aggregated. Does not include - actual evaluations as these will be retrieved from the repository. - - Returns: - An overview of the aggregated evaluation. - """ - - def load_eval_overview(eval_id: str) -> EvaluationOverview: - evaluation_overview = self._evaluation_repository.evaluation_overview( - eval_id - ) - if not evaluation_overview: - raise ValueError( - f"No PartialEvaluationOverview found for eval-id: {eval_id}" - ) - return evaluation_overview - - evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) - - nested_evaluations = [ - self._evaluation_repository.example_evaluations( - overview.id, self.evaluation_type() - ) - for overview in evaluation_overviews - ] - example_evaluations = [ - eval for sublist in nested_evaluations for eval in sublist - ] - - successful_evaluations = CountingFilterIterable( - (example_eval.result for example_eval in example_evaluations), - lambda evaluation: not isinstance(evaluation, FailedExampleEvaluation), - ) - id = str(uuid4()) - start = utc_now() - statistics = self._aggregation_logic.aggregate( - cast(Iterable[Evaluation], successful_evaluations) - ) - - aggregation_overview = AggregationOverview( - evaluation_overviews=frozenset(evaluation_overviews), - id=id, - start=start, - end=utc_now(), - successful_evaluation_count=successful_evaluations.included_count(), - crashed_during_eval_count=successful_evaluations.excluded_count(), - description=self.description, - statistics=statistics, - ) - self._aggregation_repository.store_aggregation_overview(aggregation_overview) - return aggregation_overview - @final def evaluate( self, @@ -440,24 +332,3 @@ def evaluate( self._evaluation_repository.store_example_evaluation( ExampleEvaluation(eval_id=eval_id, example_id=example.id, result=result) ) - - @final - def eval_and_aggregate_runs( - self, *run_ids: str - ) -> AggregationOverview[AggregatedEvaluation]: - """Evaluates an entire dataset in a threaded manner and aggregates the results into an `AggregatedEvaluation`. - - This will call the `run` method for each example in the dataset. - Finally, it will call the `aggregate` method and return the aggregated results. - - Args: - dataset_id: id of the dataset that will be used to evaluate a :class:`Task`. - The actual data is loaded from the :class:`DatasetRepository` passed to `__init__` - tracer: Optional tracer used for extra tracing. - Traces are always saved in the evaluation repository. - - Returns: - The aggregated results of an evaluation run with a dataset. 
- """ - partial_evaluation_overview = self.evaluate_runs(*run_ids) - return self.aggregate_evaluation(partial_evaluation_overview.id) diff --git a/src/intelligence_layer/evaluation/run.py b/src/intelligence_layer/evaluation/run.py index 12200d89d..525bab45b 100644 --- a/src/intelligence_layer/evaluation/run.py +++ b/src/intelligence_layer/evaluation/run.py @@ -9,6 +9,7 @@ from intelligence_layer.connectors.limited_concurrency_client import ( LimitedConcurrencyClient, ) +from intelligence_layer.evaluation import Aggregator from intelligence_layer.evaluation.data_storage.aggregation_repository import ( FileAggregationRepository, ) @@ -107,12 +108,14 @@ def main(cli_args: Sequence[str]) -> None: dataset_repository, runner_repository, evaluation_repository, - aggregation_repository, description, eval_logic, - aggregation_logic, ) - evaluator.eval_and_aggregate_runs(run_overview_id) + aggregator = Aggregator( + evaluation_repository, aggregation_repository, description, aggregation_logic + ) + evaluation_overview = evaluator.evaluate_runs(run_overview_id) + aggregator.aggregate_evaluation(evaluation_overview.id) if __name__ == "__main__": diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/test_argilla_evaluator.py index 9aa19deae..fd6e4aafe 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/test_argilla_evaluator.py @@ -21,6 +21,7 @@ Runner, SuccessfulExampleOutput, ) +from intelligence_layer.evaluation.argilla import ArgillaAggregator from intelligence_layer.evaluation.base_logic import AggregationLogic from intelligence_layer.evaluation.data_storage.aggregation_repository import ( InMemoryAggregationRepository, @@ -131,21 +132,9 @@ def arg() -> StubArgillaClient: return StubArgillaClient() -@fixture -def string_argilla_evaluator( - in_memory_dataset_repository: InMemoryDatasetRepository, - in_memory_run_repository: InMemoryRunRepository, - in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, - stub_argilla_client: StubArgillaClient, -) -> ArgillaEvaluator[ - DummyStringInput, - DummyStringOutput, - DummyStringOutput, - DummyAggregatedEvaluation, -]: - stub_argilla_client._expected_workspace_id = "workspace-id" - questions = [ +@fixture() +def argilla_questions() -> Sequence[Question]: + return [ Question( name="question", title="title", @@ -153,31 +142,81 @@ def string_argilla_evaluator( options=[1], ) ] - fields = [ + + +@fixture() +def argilla_fields() -> Sequence[Field]: + return [ Field(name="output", title="Output"), Field(name="input", title="Input"), ] + + +@fixture +def argilla_evaluation_repository( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaEvaluationRepository: + stub_argilla_client._expected_workspace_id = "workspace-id" + stub_argilla_client._expected_questions = argilla_questions + stub_argilla_client._expected_fields = argilla_fields + workspace_id = stub_argilla_client._expected_workspace_id - eval_repository = ArgillaEvaluationRepository( + return ArgillaEvaluationRepository( in_memory_evaluation_repository, stub_argilla_client, workspace_id, - fields, - questions, + argilla_fields, + argilla_questions, ) + +@fixture +def string_argilla_evaluator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + argilla_evaluation_repository: 
ArgillaEvaluationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaEvaluator[DummyStringInput, DummyStringOutput, DummyStringOutput,]: evaluator = ArgillaEvaluator( in_memory_dataset_repository, in_memory_run_repository, + argilla_evaluation_repository, + "dummy-string-task", + DummyStringTaskArgillaEvaluationLogic(), + ) + return evaluator + + +@fixture +def string_argilla_aggregator( + argilla_evaluation_repository: ArgillaEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaAggregator[DummyAggregatedEvaluation,]: + workspace_id = stub_argilla_client._expected_workspace_id + + eval_repository = ArgillaEvaluationRepository( + argilla_evaluation_repository, + stub_argilla_client, + workspace_id, + argilla_fields, + argilla_questions, + ) + + evaluator = ArgillaAggregator( eval_repository, in_memory_aggregation_repository, "dummy-string-task", - DummyStringTaskArgillaEvaluationLogic(), DummyStringTaskArgillaAggregationLogic(), ) - stub_argilla_client._expected_questions = questions - stub_argilla_client._expected_fields = fields return evaluator @@ -200,7 +239,6 @@ def test_argilla_evaluator_can_do_sync_evaluation( DummyStringInput, DummyStringOutput, DummyStringOutput, - DummyAggregatedEvaluation, ], string_argilla_runner: Runner[DummyStringInput, DummyStringOutput], string_dataset_id: str, @@ -230,8 +268,8 @@ def test_argilla_evaluator_can_aggregate_evaluation( DummyStringInput, DummyStringOutput, DummyStringOutput, - DummyAggregatedEvaluation, ], + string_argilla_aggregator: ArgillaAggregator[DummyAggregatedEvaluation], string_argilla_runner: Runner[DummyStringInput, DummyStringOutput], string_dataset_id: str, ) -> None: @@ -240,7 +278,7 @@ def test_argilla_evaluator_can_aggregate_evaluation( ) run_overview = string_argilla_runner.run_dataset(string_dataset_id) eval_overview = string_argilla_evaluator.evaluate_runs(run_overview.id) - aggregated_eval_overview = string_argilla_evaluator.aggregate_evaluation( + aggregated_eval_overview = string_argilla_aggregator.aggregate_evaluation( eval_overview.id ) assert aggregated_eval_overview.statistics.score == argilla_client._score diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py index 214c60b90..bab2fc5a0 100644 --- a/tests/evaluation/test_evaluator.py +++ b/tests/evaluation/test_evaluator.py @@ -6,6 +6,7 @@ from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer, NoOpTracer, Tracer from intelligence_layer.core.task import Input, Output, Task from intelligence_layer.evaluation import ( + Aggregator, Evaluation, Evaluator, Example, @@ -146,19 +147,30 @@ def dummy_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, dummy_eval_logic: DummyEvaluationLogic, - dummy_aggregate_logic: DummyAggregationLogic, -) -> Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList -]: +) -> Evaluator[str, str, None, DummyEvaluation]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "dummy-evaluator", dummy_eval_logic, + ) + + +@fixture +def 
dummy_aggregator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + dummy_eval_logic: DummyEvaluationLogic, + dummy_aggregate_logic: DummyAggregationLogic, +) -> Aggregator[DummyEvaluation, DummyAggregatedEvaluationWithResultList]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "dummy-evaluator", dummy_aggregate_logic, ) @@ -194,30 +206,42 @@ def comparing_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, comparing_eval_logic: ComparingEvaluationLogic, - comparing_aggregation_logic: ComparingAggregationLogic, -) -> Evaluator[str, str, None, ComparisonEvaluation, ComparisonAggregation]: +) -> Evaluator[str, str, None, ComparisonEvaluation]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "comparing-evaluator", comparing_eval_logic, + ) + + +@fixture +def comparing_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + comparing_aggregation_logic: ComparingAggregationLogic, +) -> Aggregator[ComparisonEvaluation, ComparisonAggregation]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "comparing-evaluator", comparing_aggregation_logic, ) def test_eval_and_aggregate_runs_returns_generic_statistics( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], dataset_id: str, ) -> None: run_overview = dummy_runner.run_dataset(dataset_id) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) assert next(iter(aggregation_overview.run_overviews())).dataset_id == dataset_id assert aggregation_overview.successful_evaluation_count == 1 @@ -225,15 +249,17 @@ def test_eval_and_aggregate_runs_returns_generic_statistics( def test_eval_and_aggregate_runs_uses_passed_tracer( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], ) -> None: in_memory_tracer = InMemoryTracer() run_overview = dummy_runner.run_dataset(dataset_id, in_memory_tracer) - dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + dummy_aggregator.aggregate_evaluation(evaluation_overview.id) entries = in_memory_tracer.entries assert len(entries) == 3 @@ -241,8 +267,9 @@ def test_eval_and_aggregate_runs_uses_passed_tracer( def test_eval_and_aggregate_runs_stores_example_evaluations( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, 
DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], @@ -256,7 +283,9 @@ def test_eval_and_aggregate_runs_stores_example_evaluations( assert dataset is not None run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer()) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + examples = list(dataset) eval_overview = next(iter(aggregation_overview.evaluation_overviews)) success_result = evaluation_repository.example_evaluation( @@ -283,8 +312,9 @@ def test_eval_and_aggregate_runs_stores_example_evaluations( def test_eval_and_aggregate_runs_stores_example_traces( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], @@ -297,16 +327,18 @@ def test_eval_and_aggregate_runs_stores_example_traces( assert dataset is not None run_overview = dummy_runner.run_dataset(dataset_id) - evaluation_run_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + examples = list(dataset) success_result = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[0].id + aggregation_overview.run_ids[0], examples[0].id ) failure_result_task = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[1].id + aggregation_overview.run_ids[0], examples[1].id ) failure_result_eval = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[2].id + aggregation_overview.run_ids[0], examples[2].id ) assert success_result @@ -318,16 +350,18 @@ def test_eval_and_aggregate_runs_stores_example_traces( def test_eval_and_aggregate_runs_stores_aggregated_results( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], dataset_id: str, ) -> None: - aggregation_repository = dummy_evaluator._aggregation_repository + aggregation_repository = dummy_aggregator._aggregation_repository run_overview = dummy_runner.run_dataset(dataset_id) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) loaded_evaluation_run_overview = aggregation_repository.aggregation_overview( aggregation_overview.id, DummyAggregatedEvaluationWithResultList ) @@ -336,40 +370,39 @@ def test_eval_and_aggregate_runs_stores_aggregated_results( def test_evaluate_can_evaluate_multiple_runs( - comparing_evaluator: Evaluator[ - str, str, None, ComparisonEvaluation, ComparisonAggregation - ], + comparing_evaluator: Evaluator[str, str, None, ComparisonEvaluation], + comparing_aggregator: 
Aggregator[ComparisonEvaluation, ComparisonAggregation], string_dataset_id: str, dummy_runner: Runner[str, str], ) -> None: run_overview1 = dummy_runner.run_dataset(string_dataset_id) run_overview2 = dummy_runner.run_dataset(string_dataset_id) - partial_overview = comparing_evaluator.evaluate_runs( + evaluation_overview = comparing_evaluator.evaluate_runs( run_overview1.id, run_overview2.id ) - - eval_overview = comparing_evaluator.aggregate_evaluation(partial_overview.id) - assert eval_overview.statistics.equal_ratio == 1 + aggregation_overview = comparing_aggregator.aggregate_evaluation( + evaluation_overview.id + ) + assert aggregation_overview.statistics.equal_ratio == 1 def test_aggregate_evaluation_can_aggregate_multiple_evals( - comparing_evaluator: Evaluator[ - str, str, None, ComparisonEvaluation, ComparisonAggregation - ], + comparing_evaluator: Evaluator[str, str, None, ComparisonEvaluation], + comparing_aggregator: Aggregator[ComparisonEvaluation, ComparisonAggregation], string_dataset_id: str, dummy_runner: Runner[str, str], ) -> None: run_overview_1 = dummy_runner.run_dataset(string_dataset_id) run_overview_2 = dummy_runner.run_dataset(string_dataset_id) - partial_overview_1 = comparing_evaluator.evaluate_runs(run_overview_1.id) - partial_overview_2 = comparing_evaluator.evaluate_runs( + evaluation_overview_1 = comparing_evaluator.evaluate_runs(run_overview_1.id) + evaluation_overview_2 = comparing_evaluator.evaluate_runs( run_overview_1.id, run_overview_2.id ) - aggregation_overview = comparing_evaluator.aggregate_evaluation( - partial_overview_1.id, partial_overview_1.id, partial_overview_2.id + aggregation_overview = comparing_aggregator.aggregate_evaluation( + evaluation_overview_1.id, evaluation_overview_1.id, evaluation_overview_2.id ) assert len(list(aggregation_overview.run_overviews())) == 2 @@ -393,7 +426,6 @@ class AggregatedEvaluationType(BaseModel): "Output": str, "ExpectedOutput": type(None), "Evaluation": EvaluationType, - # "AggregatedEvaluation": AggregatedEvaluationType, # TODO: fix after Evaluation and Aggregation have been split } A = TypeVar("A", bound=BaseModel) @@ -422,15 +454,51 @@ class GreatGrandChildEvaluationLogic( pass timmy: Evaluator[ - str, str, None, EvaluationType, AggregatedEvaluationType + str, + str, + None, + EvaluationType, ] = Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "dummy", evaluation_logic=GreatGrandChildEvaluationLogic(), - aggregation_logic=ChildAggregationLogic(), + ) + who_is_timmy = timmy._get_types() + + assert who_is_timmy == types + + +def test_aggregator_type_magic_works( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, +) -> None: + class EvaluationType(BaseModel): + pass + + class AggregatedEvaluationType(BaseModel): + pass + + types = { + "Evaluation": EvaluationType, + "AggregatedEvaluation": AggregatedEvaluationType, + } + + class ChildAggregationLogic(AggregationLogic[Evaluation, AggregatedEvaluationType]): + def aggregate( + self, evaluations: Iterable[Evaluation] + ) -> AggregatedEvaluationType: + return None # type: ignore + + class GrandChildAggregationLogic(ChildAggregationLogic[EvaluationType]): + pass + + timmy: Aggregator[EvaluationType, AggregatedEvaluationType] = Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "dummy", + aggregation_logic=GrandChildAggregationLogic(), ) who_is_timmy = 
timmy._get_types() @@ -438,28 +506,29 @@ class GreatGrandChildEvaluationLogic( def test_eval_and_aggregate_runs_only_runs_n_examples( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], good_dataset_id: str, ) -> None: run_overview = dummy_runner.run_dataset(good_dataset_id) - evaluation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) - partial_evaluation_overview = dummy_evaluator.evaluate_runs( - run_overview.id, num_examples=2 - ) - evaluation_overview_n = dummy_evaluator.aggregate_evaluation( - partial_evaluation_overview.id + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id, num_examples=2) + aggregation_overview_n = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id ) assert ( - evaluation_overview.successful_evaluation_count - + evaluation_overview.crashed_during_eval_count + aggregation_overview.successful_evaluation_count + + aggregation_overview.crashed_during_eval_count == 3 ) assert ( - evaluation_overview_n.successful_evaluation_count - + evaluation_overview_n.crashed_during_eval_count + aggregation_overview_n.successful_evaluation_count + + aggregation_overview_n.crashed_during_eval_count == 2 ) diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index abb37ad95..cd15d725e 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -19,6 +19,7 @@ from intelligence_layer.core.prompt_template import RichPrompt from intelligence_layer.core.tracer import utc_now from intelligence_layer.evaluation import ( + ArgillaEvaluationRepository, ArgillaEvaluator, EloCalculator, Example, @@ -33,6 +34,7 @@ ) from intelligence_layer.evaluation.argilla import ( AggregatedInstructComparison, + ArgillaAggregator, create_instruct_comparison_argilla_evaluation_classes, ) from intelligence_layer.evaluation.data_storage.aggregation_repository import ( @@ -80,15 +82,27 @@ def argilla_fake() -> ArgillaClient: return ArgillaFake() +@fixture +def argilla_repository( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + argilla_fake: ArgillaClient, +) -> ArgillaEvaluationRepository: + ( + evaluation_logic, + evaluation_repository, + ) = create_instruct_comparison_argilla_evaluation_classes( + "workspace", in_memory_evaluation_repository, argilla_fake, None + ) + return evaluation_repository + + @fixture def evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, argilla_fake: ArgillaClient, - argilla_aggregation_logic: InstructComparisonArgillaAggregationLogic, -) -> ArgillaEvaluator[InstructInput, PromptOutput, None, AggregatedInstructComparison]: +) -> ArgillaEvaluator[InstructInput, PromptOutput, None]: ( evaluation_logic, evaluation_repository, @@ -99,9 +113,21 @@ def evaluator( in_memory_dataset_repository, in_memory_run_repository, evaluation_repository, - 
in_memory_aggregation_repository, "instruct-evaluator", evaluation_logic, + ) + + +@fixture +def aggregator( + argilla_repository: ArgillaEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + argilla_aggregation_logic: InstructComparisonArgillaAggregationLogic, +) -> ArgillaAggregator[AggregatedInstructComparison]: + return ArgillaAggregator( + argilla_repository, + in_memory_aggregation_repository, + "instruct-evaluator", argilla_aggregation_logic, ) @@ -121,12 +147,8 @@ def any_instruct_output() -> PromptOutput: def test_evaluate_run_submits_pairwise_comparison_records( - evaluator: ArgillaEvaluator[ - InstructInput, - PromptOutput, - None, - AggregatedInstructComparison, - ], + evaluator: ArgillaEvaluator[InstructInput, PromptOutput, None], + aggregator: ArgillaAggregator[AggregatedInstructComparison], in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, any_instruct_output: PromptOutput, @@ -172,7 +194,7 @@ def test_evaluate_run_submits_pairwise_comparison_records( for record_data in argilla_fake.record_data(evaluation_overview.id) ) == sorted(pairs) - elo_score = evaluator.aggregate_evaluation(evaluation_overview.id) + elo_score = aggregator.aggregate_evaluation(evaluation_overview.id) scores = elo_score.statistics.scores # lower id always wins, should be sorted for i in range(run_count - 1): @@ -197,10 +219,8 @@ def test_evaluate_run_only_evaluates_high_priority( in_memory_dataset_repository, in_memory_run_repository, eval_repository, - in_memory_aggregation_repository, "instruct-evaluator", eval_logic, - argilla_aggregation_logic, ) run_count = 10 diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py index b97b34c25..0c6bf5b2e 100644 --- a/tests/use_cases/classify/test_classify.py +++ b/tests/use_cases/classify/test_classify.py @@ -5,6 +5,7 @@ from intelligence_layer.connectors import AlephAlphaClientProtocol from intelligence_layer.core import Chunk, Task from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, Example, InMemoryDatasetRepository, @@ -146,23 +147,32 @@ def classify_evaluator( in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, multi_label_classify_evaluation_logic: MultiLabelClassifyEvaluationLogic, - multi_label_classify_aggregation_logic: MultiLabelClassifyAggregationLogic, ) -> Evaluator[ ClassifyInput, MultiLabelClassifyOutput, Sequence[str], MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "multi-label-classify", multi_label_classify_evaluation_logic, + ) + + +@fixture +def classify_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + multi_label_classify_aggregation_logic: MultiLabelClassifyAggregationLogic, +) -> Aggregator[MultiLabelClassifyEvaluation, AggregatedMultiLabelClassifyEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "multi-label-classify", multi_label_classify_aggregation_logic, ) @@ -188,7 +198,6 @@ def test_multi_label_classify_evaluator_single_example( MultiLabelClassifyOutput, Sequence[str], 
MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, ], classify_runner: Runner[ClassifyInput, MultiLabelClassifyOutput], ) -> None: @@ -214,14 +223,19 @@ def test_multi_label_classify_evaluator_full_dataset( MultiLabelClassifyOutput, Sequence[str], MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + MultiLabelClassifyEvaluation, AggregatedMultiLabelClassifyEvaluation ], classify_runner: Runner[ClassifyInput, MultiLabelClassifyOutput], ) -> None: run_overview = classify_runner.run_dataset(multiple_entries_dataset_name) - evaluation = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert set(["positive", "negative", "finance", "school"]) == set( - evaluation.statistics.class_metrics.keys() + assert {"positive", "negative", "finance", "school"} == set( + aggregation_overview.statistics.class_metrics.keys() ) diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py index f8f5e9066..5b349061d 100644 --- a/tests/use_cases/classify/test_prompt_based_classify.py +++ b/tests/use_cases/classify/test_prompt_based_classify.py @@ -7,16 +7,15 @@ ) from intelligence_layer.core import Chunk, InMemoryTracer, NoOpTracer from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, Example, + InMemoryAggregationRepository, InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, RunRepository, ) -from intelligence_layer.evaluation.data_storage.aggregation_repository import ( - InMemoryAggregationRepository, -) from intelligence_layer.evaluation.evaluator import Evaluator from intelligence_layer.use_cases.classify.classify import ( AggregatedSingleLabelClassifyEvaluation, @@ -51,23 +50,35 @@ def classify_evaluator( in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, single_label_classify_eval_logic: SingleLabelClassifyEvaluationLogic, - single_label_classify_aggregation_logic: SingleLabelClassifyAggregationLogic, ) -> Evaluator[ ClassifyInput, SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, - AggregatedSingleLabelClassifyEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "single-label-classify", single_label_classify_eval_logic, + ) + + +@fixture +def classify_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + single_label_classify_aggregation_logic: SingleLabelClassifyAggregationLogic, +) -> Aggregator[ + SingleLabelClassifyEvaluation, + AggregatedSingleLabelClassifyEvaluation, +]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "single-label-classify", single_label_classify_aggregation_logic, ) @@ -178,7 +189,6 @@ def test_can_evaluate_classify( SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, - AggregatedSingleLabelClassifyEvaluation, ], prompt_based_classify: PromptBasedClassify, ) -> None: @@ -210,6 +220,9 @@ def test_can_aggregate_evaluations( SingleLabelClassifyOutput, Sequence[str], 
SingleLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + SingleLabelClassifyEvaluation, AggregatedSingleLabelClassifyEvaluation, ], in_memory_dataset_repository: InMemoryDatasetRepository, @@ -235,9 +248,12 @@ def test_can_aggregate_evaluations( ) run_overview = classify_runner.run_dataset(dataset_name) - evaluation_overview = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert evaluation_overview.statistics.percentage_correct == 0.5 + assert aggregation_overview.statistics.percentage_correct == 0.5 def test_aggregating_evaluations_works_with_empty_list( @@ -246,6 +262,9 @@ def test_aggregating_evaluations_works_with_empty_list( SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + SingleLabelClassifyEvaluation, AggregatedSingleLabelClassifyEvaluation, ], classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput], @@ -253,6 +272,9 @@ def test_aggregating_evaluations_works_with_empty_list( ) -> None: dataset_id = in_memory_dataset_repository.create_dataset([]) run_overview = classify_runner.run_dataset(dataset_id) - evaluation_overview = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert evaluation_overview.statistics.percentage_correct == 0 + assert aggregation_overview.statistics.percentage_correct == 0 diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index fbee3847f..e43702cbe 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -2,6 +2,7 @@ from intelligence_layer.core import Chunk, Language, NoOpTracer from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, EvaluationRepository, Example, @@ -50,23 +51,27 @@ def single_chunk_summarize_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, single_chunk_summarize_eval_logic: SingleChunkSummarizeEvaluationLogic, - single_chunk_summarize_aggregation_logic: SingleChunkSummarizeAggregationLogic, -) -> Evaluator[ - SingleChunkSummarizeInput, - SummarizeOutput, - str, - SummarizeEvaluation, - AggregatedSummarizeEvaluation, -]: +) -> Evaluator[SingleChunkSummarizeInput, SummarizeOutput, str, SummarizeEvaluation,]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "single-chunk-summarize", single_chunk_summarize_eval_logic, + ) + + +@fixture +def single_chunk_summarize_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + single_chunk_summarize_aggregation_logic: SingleChunkSummarizeAggregationLogic, +) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "single-chunk-summarize", single_chunk_summarize_aggregation_logic, ) @@ -100,23 +105,32 @@ def long_context_summarize_evaluator( 
in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: EvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, long_context_summarize_evaluation_logic: LongContextSummarizeEvaluationLogic, - long_context_summarize_aggregation_logic: LongContextSummarizeAggregationLogic, ) -> Evaluator[ LongContextSummarizeInput, LongContextSummarizeOutput, str, SummarizeEvaluation, - AggregatedSummarizeEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "long-context-summarize", long_context_summarize_evaluation_logic, + ) + + +@fixture +def long_context_summarize_aggregator( + in_memory_evaluation_repository: EvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + long_context_summarize_aggregation_logic: LongContextSummarizeAggregationLogic, +) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "long-context-summarize", long_context_summarize_aggregation_logic, ) @@ -141,6 +155,9 @@ def test_single_chunk_summarize_evaluator( SummarizeOutput, str, SummarizeEvaluation, + ], + single_chunk_summarize_aggregator: Aggregator[ + SummarizeEvaluation, AggregatedSummarizeEvaluation, ], single_chunk_summarize_runner: Runner[str, str], @@ -162,9 +179,12 @@ def test_single_chunk_summarize_evaluator( ) run_overview = single_chunk_summarize_runner.run_dataset(dataset_name) - aggregation_overview = single_chunk_summarize_evaluator.eval_and_aggregate_runs( + evaluation_overview = single_chunk_summarize_evaluator.evaluate_runs( run_overview.id ) + aggregation_overview = single_chunk_summarize_aggregator.aggregate_evaluation( + evaluation_overview.id + ) assert aggregation_overview.successful_evaluation_count == 2 individual_evaluation_id = next(iter(aggregation_overview.evaluation_overviews)).id @@ -194,6 +214,9 @@ def test_long_context_summarize_evaluator( LongContextSummarizeOutput, str, SummarizeEvaluation, + ], + long_context_summarize_aggregator: Aggregator[ + SummarizeEvaluation, AggregatedSummarizeEvaluation, ], long_context_summarize_runner: Runner[str, str], @@ -214,9 +237,12 @@ def test_long_context_summarize_evaluator( ) run_overview = long_context_summarize_runner.run_dataset(dataset_name) - aggregation_overview = long_context_summarize_evaluator.eval_and_aggregate_runs( + evaluation_overview = long_context_summarize_evaluator.evaluate_runs( run_overview.id ) + aggregation_overview = long_context_summarize_aggregator.aggregate_evaluation( + evaluation_overview.id + ) assert aggregation_overview.successful_evaluation_count == 2 individual_evaluation_id = next(iter(aggregation_overview.evaluation_overviews)).id
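Reviewer note (not part of the patch): the test changes above all follow the same two-step wiring introduced by this refactor, so a condensed sketch may help while reading the hunks. Everything below is an illustration only, not code from this change set: my_task, my_evaluation_logic, my_aggregation_logic, examples, and the "my-task" description string are hypothetical placeholders for whichever concrete Task / EvaluationLogic / AggregationLogic pair a given test uses, and the import paths are assumed to mirror the test modules touched here.

from intelligence_layer.evaluation import (
    Aggregator,
    InMemoryAggregationRepository,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)
from intelligence_layer.evaluation.evaluator import Evaluator

# Independent repositories; the evaluation repository is the hand-off point
# shared by the Evaluator (which writes evaluations) and the Aggregator
# (which reads them back for aggregation).
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

runner = Runner(my_task, dataset_repository, run_repository, "my-task")  # my_task is a placeholder Task
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "my-task",
    my_evaluation_logic,  # placeholder EvaluationLogic instance
)
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    "my-task",
    my_aggregation_logic,  # placeholder AggregationLogic instance
)

dataset_id = dataset_repository.create_dataset(examples)  # examples is a placeholder sequence
run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_runs(run_overview.id)
aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)
print(aggregation_overview.statistics)

The only coupling left between the two stages is the shared evaluation repository and the overview id handed from evaluate_runs to aggregate_evaluation, which is why the fixtures above now construct the Evaluator without an aggregation repository and the Aggregator without dataset or run repositories.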