From db817ab4a5dc587d352c1b582d08ef9c3437cb30 Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Fri, 17 May 2024 14:31:47 +0200 Subject: [PATCH] refactor: Remove non-incremental ELO logics and instead use only incremental ELO classes and remove `Incremental` prefix TASK: IL-394 --- src/documentation/elo_qa_eval.ipynb | 319 ++++++++++++++---- .../evaluation/evaluator/elo_evaluator.py | 74 +++- .../evaluator/incremental_evaluator.py | 12 +- src/intelligence_layer/examples/__init__.py | 6 +- .../examples/qa/elo_qa_evaluation_logic.py | 36 +- .../qa/incremental_elo_qa_evaluation_logic.py | 183 ---------- ...luator.py => test_elo_evaluation_logic.py} | 23 +- 7 files changed, 366 insertions(+), 287 deletions(-) delete mode 100644 src/intelligence_layer/examples/qa/incremental_elo_qa_evaluation_logic.py rename tests/evaluation/{test_elo_evaluator.py => test_elo_evaluation_logic.py} (92%) diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb index 3a261fd52..bd8b79e45 100644 --- a/src/documentation/elo_qa_eval.ipynb +++ b/src/documentation/elo_qa_eval.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ " Runner,\n", ")\n", "from intelligence_layer.examples import (\n", - " IncrementalEloQaEvaluationLogic,\n", + " EloQaEvaluationLogic,\n", " SingleChunkQa,\n", " SingleChunkQaInput,\n", " SingleChunkQaOutput,\n", @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -174,9 +174,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example ID = 18a61dbe-d2d1-4652-8bfa-2940a38ec74b\n", + "Input = chunk=\"Surface micromachining\\n\\nSurface micromachining builds microstructures by deposition and etching structural layers over a substrate.[1] This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\\n\\nLayers\\n\\nGenerally, polysilicon is used as one of the substrate layers while silicon dioxide is used as a sacrificial layer. The sacrificial layer is removed or etched out to create any necessary void in the thickness direction. Added layers tend to vary in size from 2-5 micrometres. The main advantage of this machining process is the ability to build electronic and mechanical components (functions) on the same substrate. Surface micro-machined components are smaller compared to their bulk micro-machined counterparts.\\n\\nAs the structures are built on top of the substrate and not inside it, the substrate's properties are not as important as in bulk micro-machining. Expensive silicon wafers can be replaced by cheaper substrates, such as glass or plastic. 
The size of the substrates may be larger than a silicon wafer, and surface micro-machining is used to produce thin-film transistors on large area glass substrates for flat panel displays. This technology can also be used for the manufacture of thin film solar cells, which can be deposited on glass, polyethylene terepthalate substrates or other non-rigid materials.\\n\\nFabrication process\\n\\nMicro-machining starts with a silicon wafer or other substrate upon which new layers are grown. These layers are selectively etched by photo-lithography; either a wet etch involving an acid, or a dry etch involving an ionized gas (or plasma). Dry etching can combine chemical etching with physical etching or ion bombardment. Surface micro-machining involves as many layers as are needed with a different mask (producing a different pattern) on each layer. Modern integrated circuit fabrication uses this technique and can use as many as 100 layers. Micro-machining is a younger technology and usually uses no more than 5 or 6 layers. Surface micro-machining uses developed technology (although sometimes not enough for demanding applications) which is easily repeatable for volume production.\" question='What is micromachining?' language=Language(iso_639_1='en')\n", + "Expected output = \"Surface micromachining builds microstructures by deposition and etching structural layers over a substrate. This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\"\n", + "\n", + "Example ID = f5cdcb1f-38e4-4bfd-b170-9744237bd0d9\n", + "Input = chunk=\"\\nSilicon is a chemical element; it has symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor. It is a member of group 14 in the periodic table: carbon is above it; and germanium, tin, lead, and flerovium are below it. It is relatively unreactive.\\n\\nBecause of its high chemical affinity for oxygen, it was not until 1823 that Jöns Jakob Berzelius was first able to prepare it and characterize it in pure form. Its oxides form a family of anions known as silicates. Its melting and boiling points of 1414 °C and 3265 °C, respectively, are the second highest among all the metalloids and nonmetals, being surpassed only by boron.[a]\\n\\nSilicon is the eighth most common element in the universe by mass, but very rarely occurs as the pure element in the Earth's crust. It is widely distributed in space in cosmic dusts, planetoids, and planets as various forms of silicon dioxide (silica) or silicates. More than 90% of the Earth's crust is composed of silicate minerals, making silicon the second most abundant element in the Earth's crust (about 28% by mass), after oxygen. \\n\" question='What is silicon?' 
language=Language(iso_639_1='en')\n", + "Expected output = \"Silicon is a chemical element.\"\n", + "\n" + ] + } + ], "source": [ "for example in dataset_repository.examples(dataset_id, SingleChunkQaInput, str):\n", " print(example)" @@ -200,9 +215,18 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Running: 2it [00:09, 4.79s/it]\n", + "Running: 2it [00:16, 8.17s/it]\n" + ] + } + ], "source": [ "models = [\n", " LuminousControlModel(name=\"luminous-base-control-20240215\", client=aa_client),\n", @@ -221,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -241,9 +265,50 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Run overview IDs saved in the run repository: ['40acbcce-d908-4749-a454-172286e47532', '59339760-1fd7-4606-af14-332eeed32950']\n", + "\n", + "Run Overview ID = 40acbcce-d908-4749-a454-172286e47532\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:06:39.347886+00:00\n", + "End time = 2024-05-17 12:06:55.694959+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"QA with model luminous-supreme-control-20240215\"\n", + "\n", + "Example ID=18a61dbe-d2d1-4652-8bfa-2940a38ec74b\n", + "Related Run ID=40acbcce-d908-4749-a454-172286e47532\n", + "Output=\"answer='Surface micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n", + "\n", + "Example ID=f5cdcb1f-38e4-4bfd-b170-9744237bd0d9\n", + "Related Run ID=40acbcce-d908-4749-a454-172286e47532\n", + "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n", + "\n", + "Run Overview ID = 59339760-1fd7-4606-af14-332eeed32950\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:06:29.753340+00:00\n", + "End time = 2024-05-17 12:06:39.347388+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"QA with model luminous-base-control-20240215\"\n", + "\n", + "Example ID=18a61dbe-d2d1-4652-8bfa-2940a38ec74b\n", + "Related Run ID=59339760-1fd7-4606-af14-332eeed32950\n", + "Output=\"answer='Micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n", + "\n", + "Example ID=f5cdcb1f-38e4-4bfd-b170-9744237bd0d9\n", + "Related Run ID=59339760-1fd7-4606-af14-332eeed32950\n", + "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' 
highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n", + "\n" + ] + } + ], "source": [ "print(\n", " f\"Run overview IDs saved in the run repository: {run_repository.run_overview_ids()}\\n\"\n", @@ -269,9 +334,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IDs of stored evaluations: []\n" + ] + } + ], "source": [ "# this should demonstrate that there are no stored evaluations yet in our repository\n", "print(f\"IDs of stored evaluations: {evaluation_repository.evaluation_overview_ids()}\")" @@ -279,11 +352,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "elo_qa_evaluation_logic = IncrementalEloQaEvaluationLogic(\n", + "elo_qa_evaluation_logic = EloQaEvaluationLogic(\n", " model=Llama3InstructModel(name=\"llama-3-8b-instruct\")\n", ")\n", "\n", @@ -298,16 +371,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating: 2it [00:00, 3.29it/s]\n" + ] + } + ], "source": [ "evaluation_overview = evaluator.evaluate_runs(*run_repository.run_overview_ids())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -330,9 +411,39 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation Overview ID = ef2da6c0-3c84-4414-9aa8-be878afd18f6\n", + "Start time = 2024-05-17 12:06:55.727219+00:00\n", + "End time = 2024-05-17 12:06:56.337819+00:00\n", + "Successful examples = 2\n", + "Failed examples = 0\n", + "Description = \"ELO QA evaluation\"\n", + "Run Overviews={\n", + "Run Overview ID = 40acbcce-d908-4749-a454-172286e47532\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:06:39.347886+00:00\n", + "End time = 2024-05-17 12:06:55.694959+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"QA with model luminous-supreme-control-20240215\"\n", + ", Run Overview ID = 59339760-1fd7-4606-af14-332eeed32950\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:06:29.753340+00:00\n", + "End time = 2024-05-17 12:06:39.347388+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"QA with model luminous-base-control-20240215\"\n", + "}\n", + "\n" + ] + } + ], "source": [ "for evaluation_overview in evaluation_repository.evaluation_overviews():\n", " print(evaluation_overview)" @@ -350,9 +461,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IDs of stored aggregated evaluations: []\n" + ] + } + ], "source": [ "# this should demonstrate that there are no stored aggregated evaluations yet in our repository\n", "print(\n", @@ -362,7 +481,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -378,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": null, 
+ "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -404,9 +523,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregation Overview ID = 5bf4c3bf-de54-4a90-bede-ce1235c9aff7\n", + "Start time = 2024-05-17 12:06:56.366455+00:00\n", + "End time = 2024-05-17 12:06:56.367808+00:00\n", + "Successful example count = 2\n", + "Count of examples crashed during evaluation = 0\n", + "Description = \"ELO QA aggregation\"\n", + "IDs of aggregated Evaluation Overviews = ['ef2da6c0-3c84-4414-9aa8-be878afd18f6']\n", + "IDs of aggregated Run Overviews = ['40acbcce-d908-4749-a454-172286e47532', '59339760-1fd7-4606-af14-332eeed32950']\n", + "Statistics = {\n", + "scores={'40acbcce-d908-4749-a454-172286e47532': PlayerScore(elo=1509.7170402138256, elo_standard_error=0.028612685604210077, win_rate=0.75, num_matches=2), '59339760-1fd7-4606-af14-332eeed32950': PlayerScore(elo=1490.2829597861742, elo_standard_error=0.02861268397674558, win_rate=0.25, num_matches=2)}\n", + "}\n", + "\n" + ] + } + ], "source": [ "for aggregation_overview in aggregation_repository.aggregation_overviews(\n", " AggregatedEvaluation\n", @@ -421,16 +559,25 @@ "# Step 4 - Addition of New Models\n", "\n", "Now let us consider the case where we want to add new models to our existing evaluation.\n", - "Since the comparison of answers is rather time-consuming, we want to avoid recalculating the evaluation for the previous models, and just compare the new models to the old ones. This is why we used the `IncrementalEloQaEvaluator` to begin with.\n", + "Since the comparison of answers is rather time-consuming, we want to avoid recalculating the evaluation for the previous models, and just compare the new models to the old ones. This is why we used the `IncrementalEvaluator` to begin with.\n", "\n", "For this example, we first define the new models and generate their answers." 
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Running: 2it [00:20, 10.12s/it]\n", + "Running: 2it [00:14, 7.11s/it]\n" + ] + } + ], "source": [ "newly_added_models = [\n", " LuminousControlModel(name=\"luminous-base-control-20230501\", client=aa_client),\n", @@ -451,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -464,9 +611,32 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Run Overview ID = 7df4764c-e873-4aff-9ec5-52cb001a9382\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:06:56.394123+00:00\n", + "End time = 2024-05-17 12:07:16.635122+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"New QA with model luminous-base-control-20230501\"\n", + "\n", + "Run Overview ID = 9be1b687-0e07-4214-8967-49baa351587f\n", + "Dataset ID = e30b678e-fa7f-45ea-8e48-21860a8aa23c\n", + "Start time = 2024-05-17 12:07:16.635290+00:00\n", + "End time = 2024-05-17 12:07:30.865449+00:00\n", + "Failed example count = 0\n", + "Successful example count = 2\n", + "Description = \"New QA with model luminous-supreme-control-20230501\"\n", + "\n" + ] + } + ], "source": [ "for run_overview in run_repository.run_overviews():\n", " # skip runs done for previous models\n", @@ -480,14 +650,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Thanks to the `IncrementalEloQaEvaluator`, we can now easily extend our existing evaluation with the comparisons of new model runs against the previous runs, without re-running the previous comparisons. To this end, we use the same evaluator instance as for our first evaluation, but use the `evaluate_additional_runs` method, which takes a list of previous evaluation_overview IDs and uses them to filter the resulting comparisons. In this case, only comparisons of new pairings will be performed." + "Thanks to the `IncrementalEvaluator`, we can now easily extend our existing evaluation with the comparisons of new model runs against the previous runs, without re-running the previous comparisons. To this end, we use the same evaluator instance as for our first evaluation, but use the `evaluate_additional_runs` method, which takes a list of previous evaluation_overview IDs and uses them to filter the resulting comparisons. In this case, only comparisons of new pairings will be performed." 
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating: 2it [00:03, 1.93s/it]\n" + ] + } + ], "source": [ "new_evaluation_overview = evaluator.evaluate_additional_runs(\n", " *run_repository.run_overview_ids(), previous_evaluation_ids=[evaluation_overview.id]\n", @@ -496,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -520,9 +698,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation overviews to aggregate: ['842bc7cb-d898-4023-afba-0b390b705ff4', 'ef2da6c0-3c84-4414-9aa8-be878afd18f6']\n" + ] + } + ], "source": [ "# get the IDs of all the evaluation overviews which we created for the QA task\n", "evaluation_overview_ids = [\n", @@ -535,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -547,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -573,9 +759,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aggregation Overview ID = 71c31442-8e5b-4ce0-89f6-5df5bba259dc\n", + "Start time = 2024-05-17 12:07:34.778145+00:00\n", + "End time = 2024-05-17 12:07:34.780775+00:00\n", + "Successful example count = 4\n", + "Count of examples crashed during evaluation = 0\n", + "Description = \"ELO QA aggregation\"\n", + "IDs of aggregated Evaluation Overviews = ['842bc7cb-d898-4023-afba-0b390b705ff4', 'ef2da6c0-3c84-4414-9aa8-be878afd18f6']\n", + "IDs of aggregated Run Overviews = ['7df4764c-e873-4aff-9ec5-52cb001a9382', '40acbcce-d908-4749-a454-172286e47532', '9be1b687-0e07-4214-8967-49baa351587f', '59339760-1fd7-4606-af14-332eeed32950']\n", + "Statistics = {\n", + "scores={'40acbcce-d908-4749-a454-172286e47532': PlayerScore(elo=1509.189099417823, elo_standard_error=0.1344337229545878, win_rate=0.5833333333333334, num_matches=6), '9be1b687-0e07-4214-8967-49baa351587f': PlayerScore(elo=1499.8609033022822, elo_standard_error=0.18459654247004761, win_rate=0.5, num_matches=6), '7df4764c-e873-4aff-9ec5-52cb001a9382': PlayerScore(elo=1518.2546091514266, elo_standard_error=0.15278799424990525, win_rate=0.6666666666666666, num_matches=6), '59339760-1fd7-4606-af14-332eeed32950': PlayerScore(elo=1472.6955463281204, elo_standard_error=0.12521054058850456, win_rate=0.25, num_matches=6)}\n", + "}\n", + "\n" + ] + } + ], "source": [ "print(aggregated_evaluation_with_new_model)" ] diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py index e79ae14ef..e7271288d 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py @@ -54,13 +54,63 @@ class EloGradingInput(BaseModel): class EloEvaluationLogic(EvaluationLogic[Input, Output, ExpectedOutput, Matches]): + def __init__(self) -> None: + super().__init__() + self._previous_run_output_ids: list[set[str]] = [] + + 
def set_previous_run_output_ids(
+        self, previous_run_output_ids: list[set[str]]
+    ) -> None:
+        self._previous_run_output_ids = previous_run_output_ids
+
     @final
     def do_evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         *output: SuccessfulExampleOutput[Output],
     ) -> Matches:
-        pairs = combinations(output, 2)
+        """Executes the evaluation for this specific example.
+
+        Responsible for comparing the input & expected output of a task to the
+        actually generated output. The difference from the standard :class:`EvaluationLogic`'s `do_evaluate` is that
+        this method will additionally pass the collection of already evaluated outputs to `do_incremental_evaluate`.
+
+        Args:
+            example: The :class:`Example` whose input was used to produce the outputs.
+            *output: Outputs of the :class:`Task`.
+
+        Returns:
+            :class:`Matches`: The summary of the pairwise comparisons between the provided outputs.
+        """
+
+        already_evaluated_outputs = []
+        for run_output_ids in self._previous_run_output_ids:
+            already_evaluated_outputs.append(
+                [
+                    current_output
+                    for current_output in output
+                    if current_output.run_id in run_output_ids
+                ]
+            )
+
+        return self.do_incremental_evaluate(
+            example, list(output), already_evaluated_outputs
+        )
+
+    @final
+    def do_incremental_evaluate(
+        self,
+        example: Example[Input, ExpectedOutput],
+        outputs: list[SuccessfulExampleOutput[Output]],
+        already_evaluated_outputs: list[list[SuccessfulExampleOutput[Output]]],
+    ) -> Matches:
+        pairs = combinations(outputs, 2)
+        unique_pre_evaluated_runs: set[str] = set()
+
+        for pre_run_output in already_evaluated_outputs:
+            for current_output in pre_run_output:
+                unique_pre_evaluated_runs.add(current_output.run_id)
+
         return Matches(
             comparison_evaluations=[
                 ComparisonEvaluation(
@@ -69,14 +119,32 @@ def do_evaluate(
                     outcome=self.grade(player_a, player_b, example),
                 )
                 for [player_a, player_b] in pairs
+                # only grade pairs that include at least one not-yet-evaluated run
+                if unique_pre_evaluated_runs is None
+                or len(unique_pre_evaluated_runs) == 0
+                or not (
+                    player_a.run_id in unique_pre_evaluated_runs
+                    and player_b.run_id in unique_pre_evaluated_runs
+                )
             ]
         )
 
     @abstractmethod
     def grade(
         self,
-        output_a: SuccessfulExampleOutput[Output],
-        output_b: SuccessfulExampleOutput[Output],
+        first: SuccessfulExampleOutput[Output],
+        second: SuccessfulExampleOutput[Output],
         example: Example[Input, ExpectedOutput],
     ) -> MatchOutcome:
+        """Returns a :class:`MatchOutcome` for the two provided contestants on the given example.
+        Defines the use-case-specific logic for determining the winner among the two provided outputs.
+
+        Args:
+            first: Instance of :class:`SuccessfulExampleOutput[Output]` of the first contestant in the comparison.
+            second: Instance of :class:`SuccessfulExampleOutput[Output]` of the second contestant in the comparison.
+            example: Datapoint of :class:`Example` on which the two outputs were generated.
+
+        Returns:
+            Instance of :class:`MatchOutcome`.
+        """
         pass
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
index 284e6ee5c..aaba0f9d1 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
@@ -34,7 +34,7 @@ def set_previous_run_output_ids(
     def do_evaluate(
         self,
         example: Example[Input, ExpectedOutput],
-        *outputs: SuccessfulExampleOutput[Output],
+        *output: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
         """Executes the evaluation for this specific example.
@@ -45,7 +45,7 @@ def do_evaluate( Args: example: Input data of :class:`Task` to produce the output. - outputs: Outputs of the :class:`Task`. + *output: Outputs of the :class:`Task`. Returns: :class:`Evaluation`: The metrics that come from the evaluated :class:`Task`. @@ -54,11 +54,15 @@ def do_evaluate( already_evaluated_outputs = [] for run_output_ids in self._previous_run_output_ids: already_evaluated_outputs.append( - [output for output in outputs if output.run_id in run_output_ids] + [ + current_output + for current_output in output + if current_output.run_id in run_output_ids + ] ) return self.do_incremental_evaluate( - example, list(outputs), already_evaluated_outputs + example, list(output), already_evaluated_outputs ) @abstractmethod diff --git a/src/intelligence_layer/examples/__init__.py b/src/intelligence_layer/examples/__init__.py index c0a75ba40..3c4391876 100644 --- a/src/intelligence_layer/examples/__init__.py +++ b/src/intelligence_layer/examples/__init__.py @@ -45,9 +45,6 @@ PromptBasedClassifyWithDefinitions as PromptBasedClassifyWithDefinitions, ) from .qa.elo_qa_evaluation_logic import EloQaEvaluationLogic as EloQaEvaluationLogic -from .qa.incremental_elo_qa_evaluation_logic import ( - IncrementalEloQaEvaluationLogic as IncrementalEloQaEvaluationLogic, -) from .qa.long_context_qa import LongContextQa as LongContextQa from .qa.long_context_qa import LongContextQaInput as LongContextQaInput from .qa.multiple_chunk_qa import MultipleChunkQa as MultipleChunkQa @@ -60,6 +57,9 @@ from .qa.multiple_chunk_retriever_qa import ( MultipleChunkRetrieverQaOutput as MultipleChunkRetrieverQaOutput, ) +from .qa.elo_qa_evaluation_logic import ( + EloQaEvaluationLogic as EloQaEvaluationLogic, +) from .qa.retriever_based_qa import EnrichedSubanswer as EnrichedSubanswer from .qa.retriever_based_qa import RetrieverBasedQa as RetrieverBasedQa from .qa.retriever_based_qa import RetrieverBasedQaInput as RetrieverBasedQaInput diff --git a/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py b/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py index 002c07e58..081ab2561 100644 --- a/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py +++ b/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py @@ -7,11 +7,11 @@ from intelligence_layer.core.detect_language import Language from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer -from intelligence_layer.evaluation import MatchOutcome from intelligence_layer.evaluation.dataset.domain import Example from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import ( EloEvaluationLogic, EloGradingInput, + MatchOutcome, ) from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput from intelligence_layer.examples.qa.single_chunk_qa import ( @@ -57,6 +57,23 @@ def __init__( self._model = model self.tracer = tracer + def grade( + self, + first: SuccessfulExampleOutput[SingleChunkQaOutput], + second: SuccessfulExampleOutput[SingleChunkQaOutput], + example: Example[SingleChunkQaInput, SingleChunkQaOutput], + ) -> MatchOutcome: + grading_input = self._create_grading_input(first, second, example) + + return MatchOutcome( + self.do_run( + grading_input, + self.tracer.task_span( + task_name="elo_qa_run_grader", input=grading_input + ), + ) + ) + @staticmethod def _create_grading_input( first: SuccessfulExampleOutput[SingleChunkQaOutput], @@ -95,23 +112,6 @@ def do_run(self, input: 
EloGradingInput, task_span: TaskSpan) -> MatchOutcome: return self.calculate_winners(complete_output) - def grade( - self, - first: SuccessfulExampleOutput[SingleChunkQaOutput], - second: SuccessfulExampleOutput[SingleChunkQaOutput], - example: Example[SingleChunkQaInput, SingleChunkQaOutput], - ) -> MatchOutcome: - grading_input = self._create_grading_input(first, second, example) - - return MatchOutcome( - self.do_run( - grading_input, - self.tracer.task_span( - task_name="elo_qa_run_grader", input=grading_input - ), - ) - ) - def calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome: default_log_prob = float("-inf") diff --git a/src/intelligence_layer/examples/qa/incremental_elo_qa_evaluation_logic.py b/src/intelligence_layer/examples/qa/incremental_elo_qa_evaluation_logic.py deleted file mode 100644 index 3cc52ec5d..000000000 --- a/src/intelligence_layer/examples/qa/incremental_elo_qa_evaluation_logic.py +++ /dev/null @@ -1,183 +0,0 @@ -import math -from itertools import combinations -from typing import Mapping, Sequence - -from aleph_alpha_client import Prompt -from liquid import Template - -from intelligence_layer.core.detect_language import Language -from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel -from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer -from intelligence_layer.evaluation.dataset.domain import Example -from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import ( - ComparisonEvaluation, - EloGradingInput, - Matches, - MatchOutcome, -) -from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import ( - IncrementalEvaluationLogic, -) -from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput -from intelligence_layer.examples.qa.single_chunk_qa import ( - QA_INSTRUCTIONS, - SingleChunkQaInput, - SingleChunkQaOutput, -) - - -class IncrementalEloQaEvaluationLogic( - IncrementalEvaluationLogic[ - SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches - ] -): - INPUT_TEMPLATE = """ -Your task is to compare two answers to an instruction on one metric. - -Please make sure you read and understand these instruction carefully. Please keep this document open while reviewing, and refer to it as needed. - -The Instruction for the answers was:{instruction} - -Evaluation Procedure: -1. Read both answers carefully and identify the main facts and details they present. -2. Check if the answers contain any factual errors that are not supported by the instruction. -3. Evaluate which answer is more correct. - -Answer A:{first_completion} - -Answer B:{second_completion} - -Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B? 
- -Response: Answer """ - VALUES = [ - " A", - " B", - ] # The space before the A and B is important due to tokenization - - def __init__( - self, - model: ControlModel, - tracer: Tracer = NoOpTracer(), - ): - super().__init__() - self._model = model - self.tracer = tracer - - def do_incremental_evaluate( - self, - example: Example[SingleChunkQaInput, SingleChunkQaOutput], - outputs: list[SuccessfulExampleOutput[SingleChunkQaOutput]], - already_evaluated_outputs: list[ - list[SuccessfulExampleOutput[SingleChunkQaOutput]] - ], - ) -> Matches: - pairs = combinations(outputs, 2) - unique_pre_evaluated_runs: set[str] = set() - - for pre_run_output in already_evaluated_outputs: - for current_output in pre_run_output: - unique_pre_evaluated_runs.add(current_output.run_id) - - return Matches( - comparison_evaluations=[ - ComparisonEvaluation( - first_player=player_a.run_id, - second_player=player_b.run_id, - outcome=self.grade(player_a, player_b, example), - ) - for [player_a, player_b] in pairs - if unique_pre_evaluated_runs is None - or len(unique_pre_evaluated_runs) == 0 - or not ( - player_a.run_id in unique_pre_evaluated_runs - and player_b.run_id in unique_pre_evaluated_runs - ) - ] - ) - - def grade( - self, - first: SuccessfulExampleOutput[SingleChunkQaOutput], - second: SuccessfulExampleOutput[SingleChunkQaOutput], - example: Example[SingleChunkQaInput, SingleChunkQaOutput], - ) -> MatchOutcome: - grading_input = self._create_grading_input(first, second, example) - - return MatchOutcome( - self.do_run( - grading_input, - self.tracer.task_span( - task_name="elo_qa_run_grader", input=grading_input - ), - ) - ) - - @staticmethod - def _create_grading_input( - first: SuccessfulExampleOutput[SingleChunkQaOutput], - second: SuccessfulExampleOutput[SingleChunkQaOutput], - example: Example[SingleChunkQaInput, SingleChunkQaOutput], - ) -> EloGradingInput: - qa_instruction = Template( - QA_INSTRUCTIONS[Language("en")].unformatted_instruction - ).render(question=example.input.question) - - no_answer = "There is no answer." 
- return EloGradingInput( - instruction=f"{example.input.chunk} {qa_instruction}", - first_completion=( - first.output.answer if first.output.answer is not None else no_answer - ), - second_completion=( - second.output.answer if second.output.answer is not None else no_answer - ), - ) - - def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome: - text = self.INPUT_TEMPLATE.format( - instruction=input.instruction, - first_completion=input.first_completion, - second_completion=input.second_completion, - ) - - complete_input = CompleteInput( - prompt=Prompt.from_text(text), - maximum_tokens=1, - log_probs=3, - disable_optimizations=True, - ) - complete_output = self._model.complete_task().run(complete_input, task_span) - - return self.calculate_winners(complete_output) - - def calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome: - default_log_prob = float("-inf") - - def get_normalized_prob( - log_prob_list: Sequence[Mapping[str, float | None]] | None, - ) -> float: - assert log_prob_list is not None - log_probs = log_prob_list[0] - values = [ - math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob) - for key in self.VALUES - ] - if all(v == 0 for v in values): - raise ValueError( - f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}" - ) - return values[0] / sum(values) - - def categorize_value(value: float) -> MatchOutcome: - if value > 0.7: - return MatchOutcome.A_WINS - elif 0.3 > value: - return MatchOutcome.B_WINS - else: - return MatchOutcome.DRAW - - normalized_probability = get_normalized_prob( - complete_output.completions[0].log_probs - ) - return categorize_value(normalized_probability) diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluation_logic.py similarity index 92% rename from tests/evaluation/test_elo_evaluator.py rename to tests/evaluation/test_elo_evaluation_logic.py index 8698dbe1c..74e35ee1b 100644 --- a/tests/evaluation/test_elo_evaluator.py +++ b/tests/evaluation/test_elo_evaluation_logic.py @@ -64,21 +64,6 @@ def model(client: AlephAlphaClientProtocol) -> ControlModel: return LuminousControlModel(client=client, name="luminous-base-control") -@fixture -def in_memory_dataset_repository() -> InMemoryDatasetRepository: - return InMemoryDatasetRepository() - - -@fixture -def in_memory_run_repository() -> InMemoryRunRepository: - return InMemoryRunRepository() - - -@fixture -def in_memory_evaluation_repository() -> InMemoryEvaluationRepository: - return InMemoryEvaluationRepository() - - @fixture def dummy_eval_logic(model: ControlModel) -> DummyEloQaEvalLogic: return DummyEloQaEvalLogic(model=model) @@ -129,7 +114,7 @@ def qa_setup( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, qa_outputs: Sequence[SingleChunkQaOutput], -) -> Tuple[Sequence[str], str]: +) -> Sequence[str]: qa_input_text = TextChunk( """Surface micromachining builds microstructures by deposition and etching structural layers over a substrate.[1] This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.""" ) @@ -167,16 +152,16 @@ def qa_setup( description="runner", ) ) - return run_ids, dataset_id + return run_ids def test_evaluate_runs_creates_correct_matches_for_elo_qa_eval( - qa_setup: Tuple[Sequence[str], str], + qa_setup: Sequence[str], elo_evaluator: Evaluator[ SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches 
], ) -> None: - run_ids, _ = qa_setup + run_ids = qa_setup evaluation_overview = elo_evaluator.evaluate_runs(*run_ids) eval_result = list(elo_evaluator.evaluation_lineages(evaluation_overview.id))[