From d0c4e58f32d67436f6e58a76a72282c5418b4e11 Mon Sep 17 00:00:00 2001
From: Merlin Kallenborn <Merlin.Kallenborn@ext.aleph-alpha.com>
Date: Thu, 16 May 2024 10:45:02 +0200
Subject: [PATCH] WIP: refactor: Remove filtering of previous run ids from
 basic incremental evaluator

TASK: IL-394
---
 src/documentation/elo_qa_eval.ipynb           | 309 +++++++++++-------
 .../evaluator/incremental_elo_evaluator.py    |   6 +-
 .../evaluator/incremental_evaluator.py        |  15 +-
 3 files changed, 204 insertions(+), 126 deletions(-)

diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb
index d0ba3f7b1..acf6fcb03 100644
--- a/src/documentation/elo_qa_eval.ipynb
+++ b/src/documentation/elo_qa_eval.ipynb
@@ -173,11 +173,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Example ID = 44375baa-ff38-48e8-b9c9-e17d4bcc5ee6\n",
+      "Example ID = 87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
       "Input = chunk=\"Surface micromachining\\n\\nSurface micromachining builds microstructures by deposition and etching structural layers over a substrate.[1] This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\\n\\nLayers\\n\\nGenerally, polysilicon is used as one of the substrate layers while silicon dioxide is used as a sacrificial layer. The sacrificial layer is removed or etched out to create any necessary void in the thickness direction. Added layers tend to vary in size from 2-5 micrometres. The main advantage of this machining process is the ability to build electronic and mechanical components (functions) on the same substrate. Surface micro-machined components are smaller compared to their bulk micro-machined counterparts.\\n\\nAs the structures are built on top of the substrate and not inside it, the substrate's properties are not as important as in bulk micro-machining. Expensive silicon wafers can be replaced by cheaper substrates, such as glass or plastic. The size of the substrates may be larger than a silicon wafer, and surface micro-machining is used to produce thin-film transistors on large area glass substrates for flat panel displays. This technology can also be used for the manufacture of thin film solar cells, which can be deposited on glass, polyethylene terepthalate substrates or other non-rigid materials.\\n\\nFabrication process\\n\\nMicro-machining starts with a silicon wafer or other substrate upon which new layers are grown. These layers are selectively etched by photo-lithography; either a wet etch involving an acid, or a dry etch involving an ionized gas (or plasma). Dry etching can combine chemical etching with physical etching or ion bombardment. Surface micro-machining involves as many layers as are needed with a different mask (producing a different pattern) on each layer. Modern integrated circuit fabrication uses this technique and can use as many as 100 layers. Micro-machining is a younger technology and usually uses no more than 5 or 6 layers. Surface micro-machining uses developed technology (although sometimes not enough for demanding applications) which is easily repeatable for volume production.\" question='What is micromachining?' language=Language(iso_639_1='en')\n",
       "Expected output = \"Surface micromachining builds microstructures by deposition and etching structural layers over a substrate. This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\"\n",
       "\n",
-      "Example ID = 95e5b7e4-925b-40bd-951e-5ad74b3c9b58\n",
+      "Example ID = d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
       "Input = chunk=\"\\nSilicon is a chemical element; it has symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor. It is a member of group 14 in the periodic table: carbon is above it; and germanium, tin, lead, and flerovium are below it. It is relatively unreactive.\\n\\nBecause of its high chemical affinity for oxygen, it was not until 1823 that Jöns Jakob Berzelius was first able to prepare it and characterize it in pure form. Its oxides form a family of anions known as silicates. Its melting and boiling points of 1414 °C and 3265 °C, respectively, are the second highest among all the metalloids and nonmetals, being surpassed only by boron.[a]\\n\\nSilicon is the eighth most common element in the universe by mass, but very rarely occurs as the pure element in the Earth's crust. It is widely distributed in space in cosmic dusts, planetoids, and planets as various forms of silicon dioxide (silica) or silicates. More than 90% of the Earth's crust is composed of silicate minerals, making silicon the second most abundant element in the Earth's crust (about 28% by mass), after oxygen. \\n\" question='What is silicon?' language=Language(iso_639_1='en')\n",
       "Expected output = \"Silicon is a chemical element.\"\n",
       "\n"
@@ -221,8 +221,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Running: 2it [00:10,  5.38s/it]\n",
-      "Running: 2it [00:14,  7.15s/it]\n"
+      "Running: 2it [00:21, 10.62s/it]\n",
+      "Running: 2it [00:17,  8.68s/it]\n"
      ]
     }
    ],
@@ -278,38 +278,38 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Run overview IDs saved in the run repository: ['07332585-5833-4ee0-9fec-dd95d632055d', 'f84bc567-68f4-498b-b3ef-4b9cb268eb86']\n",
+      "Run overview IDs saved in the run repository: ['74d288d3-7536-4a40-bfe4-ccc9b9e107b0', 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257']\n",
       "\n",
-      "Run Overview ID = 07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:10.386274+00:00\n",
-      "End time = 2024-05-16 07:15:21.158450+00:00\n",
+      "Run Overview ID = 74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:30:42.168784+00:00\n",
+      "End time = 2024-05-16 08:31:03.416546+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-base-control-20240215\"\n",
       "\n",
-      "Example ID=44375baa-ff38-48e8-b9c9-e17d4bcc5ee6\n",
-      "Related Run ID=07332585-5833-4ee0-9fec-dd95d632055d\n",
+      "Example ID=87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "Related Run ID=74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
       "Output=\"answer='Micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n",
       "\n",
-      "Example ID=95e5b7e4-925b-40bd-951e-5ad74b3c9b58\n",
-      "Related Run ID=07332585-5833-4ee0-9fec-dd95d632055d\n",
+      "Example ID=d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "Related Run ID=74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
       "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n",
       "\n",
-      "Run Overview ID = f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:21.158746+00:00\n",
-      "End time = 2024-05-16 07:15:35.473366+00:00\n",
+      "Run Overview ID = a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:03.417036+00:00\n",
+      "End time = 2024-05-16 08:31:20.775926+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-supreme-control-20240215\"\n",
       "\n",
-      "Example ID=44375baa-ff38-48e8-b9c9-e17d4bcc5ee6\n",
-      "Related Run ID=f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
+      "Example ID=87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "Related Run ID=a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
       "Output=\"answer='Surface micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n",
       "\n",
-      "Example ID=95e5b7e4-925b-40bd-951e-5ad74b3c9b58\n",
-      "Related Run ID=f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
+      "Example ID=d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "Related Run ID=a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
       "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n",
       "\n"
      ]
@@ -388,13 +388,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PLAYER A:  07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "PLAYER B:  f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "example_id:  44375baa-ff38-48e8-b9c9-e17d4bcc5ee6\n",
+      "PAIRS <itertools.combinations object at 0x33ff72890>\n",
+      "PLAYER A:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
       "______________________\n",
-      "PLAYER A:  07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "PLAYER B:  f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "example_id:  95e5b7e4-925b-40bd-951e-5ad74b3c9b58\n",
+      "PAIRS <itertools.combinations object at 0x33ff729d0>\n",
+      "PLAYER A:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
       "______________________\n"
      ]
     },
@@ -402,7 +404,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Evaluating: 2it [00:00,  3.02it/s]\n"
+      "Evaluating: 2it [00:00,  2.98it/s]\n"
      ]
     }
    ],
@@ -444,24 +446,24 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation Overview ID = 58fe6bf1-c9d8-455d-b83a-b4b21e456e73\n",
-      "Start time = 2024-05-16 07:15:35.502240+00:00\n",
-      "End time = 2024-05-16 07:15:36.168871+00:00\n",
+      "Evaluation Overview ID = 12e1b780-3360-4801-990c-512b9fb524a7\n",
+      "Start time = 2024-05-16 08:31:20.812654+00:00\n",
+      "End time = 2024-05-16 08:31:21.501250+00:00\n",
       "Successful examples = 2\n",
       "Failed examples = 0\n",
       "Description = \"ELO QA evaluation\"\n",
       "Run Overviews={\n",
-      "Run Overview ID = 07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:10.386274+00:00\n",
-      "End time = 2024-05-16 07:15:21.158450+00:00\n",
+      "Run Overview ID = 74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:30:42.168784+00:00\n",
+      "End time = 2024-05-16 08:31:03.416546+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-base-control-20240215\"\n",
-      ", Run Overview ID = f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:21.158746+00:00\n",
-      "End time = 2024-05-16 07:15:35.473366+00:00\n",
+      ", Run Overview ID = a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:03.417036+00:00\n",
+      "End time = 2024-05-16 08:31:20.775926+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-supreme-control-20240215\"\n",
@@ -561,16 +563,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Aggregation Overview ID = ec093c0e-7d20-4e5d-bf92-6646ae3ec9f5\n",
-      "Start time = 2024-05-16 07:15:36.210254+00:00\n",
-      "End time = 2024-05-16 07:15:36.211322+00:00\n",
+      "Aggregation Overview ID = 8374c433-e1b8-4601-a5db-109b76226da6\n",
+      "Start time = 2024-05-16 08:31:21.530772+00:00\n",
+      "End time = 2024-05-16 08:31:21.531775+00:00\n",
       "Successful example count = 2\n",
       "Count of examples crashed during evaluation = 0\n",
       "Description = \"ELO QA aggregation\"\n",
-      "IDs of aggregated Evaluation Overviews = ['58fe6bf1-c9d8-455d-b83a-b4b21e456e73']\n",
-      "IDs of aggregated Run Overviews = ['07332585-5833-4ee0-9fec-dd95d632055d', 'f84bc567-68f4-498b-b3ef-4b9cb268eb86']\n",
+      "IDs of aggregated Evaluation Overviews = ['12e1b780-3360-4801-990c-512b9fb524a7']\n",
+      "IDs of aggregated Run Overviews = ['74d288d3-7536-4a40-bfe4-ccc9b9e107b0', 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257']\n",
       "Statistics = {\n",
-      "scores={'07332585-5833-4ee0-9fec-dd95d632055d': PlayerScore(elo=1499.9313158176976, elo_standard_error=0.05682321943633073, win_rate=0.5, num_matches=2), 'f84bc567-68f4-498b-b3ef-4b9cb268eb86': PlayerScore(elo=1500.0686841823024, elo_standard_error=0.056823219518279844, win_rate=0.5, num_matches=2)}\n",
+      "scores={'74d288d3-7536-4a40-bfe4-ccc9b9e107b0': PlayerScore(elo=1500.034342091151, elo_standard_error=0.05713369977028134, win_rate=0.5, num_matches=2), 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257': PlayerScore(elo=1499.965657908849, elo_standard_error=0.05713369977028134, win_rate=0.5, num_matches=2)}\n",
       "}\n",
       "\n"
      ]
@@ -606,14 +608,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Running: 2it [00:09,  4.52s/it]\n"
+      "Running: 2it [00:10,  5.06s/it]\n",
+      "Running: 2it [00:17,  8.70s/it]\n"
      ]
     }
    ],
    "source": [
     "newly_added_models = [\n",
     "    LuminousControlModel(name=\"luminous-base-control-20230501\", client=aa_client),\n",
-    "    #LuminousControlModel(name=\"luminous-supreme-control-20230501\", client=aa_client),\n",
+    "    LuminousControlModel(name=\"luminous-supreme-control-20230501\", client=aa_client),\n",
     "]\n",
     "\n",
     "for model in newly_added_models:\n",
@@ -650,13 +653,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Run Overview ID = 03438077-f06d-4ce3-a60b-d8de2ddbfdb2\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:36.228255+00:00\n",
-      "End time = 2024-05-16 07:15:45.270250+00:00\n",
+      "Run Overview ID = 1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:21.542825+00:00\n",
+      "End time = 2024-05-16 08:31:31.664710+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"New QA with model luminous-base-control-20230501\"\n",
+      "\n",
+      "Run Overview ID = 9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:31.664870+00:00\n",
+      "End time = 2024-05-16 08:31:49.058983+00:00\n",
+      "Failed example count = 0\n",
+      "Successful example count = 2\n",
+      "Description = \"New QA with model luminous-supreme-control-20230501\"\n",
       "\n"
      ]
     }
@@ -681,18 +692,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['03438077-f06d-4ce3-a60b-d8de2ddbfdb2',\n",
-       " '07332585-5833-4ee0-9fec-dd95d632055d',\n",
-       " 'f84bc567-68f4-498b-b3ef-4b9cb268eb86']"
+       "['1fe9f33e-4e07-4e4b-a258-891aae660991',\n",
+       " '74d288d3-7536-4a40-bfe4-ccc9b9e107b0',\n",
+       " '9470d56c-e6f0-4e9c-af25-cbd381c36a3a',\n",
+       " 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257']"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -703,16 +715,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'58fe6bf1-c9d8-455d-b83a-b4b21e456e73'"
+       "'12e1b780-3360-4801-990c-512b9fb524a7'"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -723,9 +735,78 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 24,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PAIRSPAIRS <itertools.combinations object at 0x33ff9aac0>\n",
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "______________________\n",
+      " <itertools.combinations object at 0x33ff9a340>\n",
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "______________________\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating: 0it [00:00, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "______________________\n",
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "______________________\n",
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "______________________\n",
+      "PLAYER A:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "PLAYER B:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "______________________\n",
+      "PLAYER A:  9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "______________________\n",
+      "PLAYER A:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  d91ae806-31bf-4e92-a40f-6a560dc68a95\n",
+      "______________________\n",
+      "PLAYER A:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "PLAYER B:  74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "______________________\n",
+      "PLAYER A:  1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "PLAYER B:  a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "example_id:  87012cf8-1b98-4c9f-8a5f-815aa0e76edb\n",
+      "______________________\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating: 2it [00:04,  2.44s/it]\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "\n",
@@ -745,58 +826,65 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation Overview ID = 58fe6bf1-c9d8-455d-b83a-b4b21e456e73\n",
-      "Start time = 2024-05-16 07:15:35.502240+00:00\n",
-      "End time = 2024-05-16 07:15:36.168871+00:00\n",
+      "Evaluation Overview ID = 12e1b780-3360-4801-990c-512b9fb524a7\n",
+      "Start time = 2024-05-16 08:31:20.812654+00:00\n",
+      "End time = 2024-05-16 08:31:21.501250+00:00\n",
       "Successful examples = 2\n",
       "Failed examples = 0\n",
       "Description = \"ELO QA evaluation\"\n",
       "Run Overviews={\n",
-      "Run Overview ID = 07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:10.386274+00:00\n",
-      "End time = 2024-05-16 07:15:21.158450+00:00\n",
+      "Run Overview ID = 74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:30:42.168784+00:00\n",
+      "End time = 2024-05-16 08:31:03.416546+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-base-control-20240215\"\n",
-      ", Run Overview ID = f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:21.158746+00:00\n",
-      "End time = 2024-05-16 07:15:35.473366+00:00\n",
+      ", Run Overview ID = a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:03.417036+00:00\n",
+      "End time = 2024-05-16 08:31:20.775926+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-supreme-control-20240215\"\n",
       "}\n",
       "\n",
       "_____________________\n",
-      "Evaluation Overview ID = e31eef55-05b0-46e7-88cd-f40c44fce73c\n",
-      "Start time = 2024-05-16 07:15:45.290401+00:00\n",
-      "End time = 2024-05-16 07:15:45.291573+00:00\n",
+      "Evaluation Overview ID = 063b3257-9712-4b95-b3ca-68e6dc8034e0\n",
+      "Start time = 2024-05-16 08:31:49.091557+00:00\n",
+      "End time = 2024-05-16 08:31:53.978176+00:00\n",
       "Successful examples = 2\n",
       "Failed examples = 0\n",
       "Description = \"ELO QA evaluation\"\n",
       "Run Overviews={\n",
-      "Run Overview ID = 03438077-f06d-4ce3-a60b-d8de2ddbfdb2\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:36.228255+00:00\n",
-      "End time = 2024-05-16 07:15:45.270250+00:00\n",
+      "Run Overview ID = 1fe9f33e-4e07-4e4b-a258-891aae660991\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:21.542825+00:00\n",
+      "End time = 2024-05-16 08:31:31.664710+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"New QA with model luminous-base-control-20230501\"\n",
-      ", Run Overview ID = f84bc567-68f4-498b-b3ef-4b9cb268eb86\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:21.158746+00:00\n",
-      "End time = 2024-05-16 07:15:35.473366+00:00\n",
+      ", Run Overview ID = 9470d56c-e6f0-4e9c-af25-cbd381c36a3a\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:31.664870+00:00\n",
+      "End time = 2024-05-16 08:31:49.058983+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
-      "Description = \"QA with model luminous-supreme-control-20240215\"\n",
-      ", Run Overview ID = 07332585-5833-4ee0-9fec-dd95d632055d\n",
-      "Dataset ID = 0d363fad-708d-4667-ba21-d934aebd641b\n",
-      "Start time = 2024-05-16 07:15:10.386274+00:00\n",
-      "End time = 2024-05-16 07:15:21.158450+00:00\n",
+      "Description = \"New QA with model luminous-supreme-control-20230501\"\n",
+      ", Run Overview ID = 74d288d3-7536-4a40-bfe4-ccc9b9e107b0\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:30:42.168784+00:00\n",
+      "End time = 2024-05-16 08:31:03.416546+00:00\n",
       "Failed example count = 0\n",
       "Successful example count = 2\n",
       "Description = \"QA with model luminous-base-control-20240215\"\n",
+      ", Run Overview ID = a63e12f6-ebf5-4c4d-a9b6-d58a45f07257\n",
+      "Dataset ID = d99a3aff-619d-422b-8d69-daf5513b3942\n",
+      "Start time = 2024-05-16 08:31:03.417036+00:00\n",
+      "End time = 2024-05-16 08:31:20.775926+00:00\n",
+      "Failed example count = 0\n",
+      "Successful example count = 2\n",
+      "Description = \"QA with model luminous-supreme-control-20240215\"\n",
       "}\n",
       "\n"
      ]
@@ -810,21 +898,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "AssertionError",
-     "evalue": "There are no matches (comparisons) for example ID 44375baa-ff38-48e8-b9c9-e17d4bcc5ee6",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[24], line 7\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m example_evaluation \u001b[38;5;129;01min\u001b[39;00m evaluation_repository\u001b[38;5;241m.\u001b[39mexample_evaluations(\n\u001b[1;32m      3\u001b[0m     new_evaluation_overview\u001b[38;5;241m.\u001b[39mid, Matches\n\u001b[1;32m      4\u001b[0m ):\n\u001b[1;32m      5\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(example_evaluation\u001b[38;5;241m.\u001b[39mresult, Matches)\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m (\n\u001b[0;32m----> 7\u001b[0m         \u001b[38;5;28mlen\u001b[39m(example_evaluation\u001b[38;5;241m.\u001b[39mresult\u001b[38;5;241m.\u001b[39mcomparison_evaluations) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m      8\u001b[0m     ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere are no matches (comparisons) for example ID \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexample_evaluation\u001b[38;5;241m.\u001b[39mexample_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
-      "\u001b[0;31mAssertionError\u001b[0m: There are no matches (comparisons) for example ID 44375baa-ff38-48e8-b9c9-e17d4bcc5ee6"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# ensure that for each example there are evaluated comparisons\n",
     "for example_evaluation in evaluation_repository.example_evaluations(\n",
@@ -846,14 +922,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation overviews to aggregate: ['0bef2a2a-31f6-4857-82ee-f570e66e5a9e', '96bb8184-6786-420d-b82a-6b39dabd6a15']\n"
+      "Evaluation overviews to aggregate: ['063b3257-9712-4b95-b3ca-68e6dc8034e0', '12e1b780-3360-4801-990c-512b9fb524a7']\n"
      ]
     }
    ],
@@ -869,7 +945,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -881,7 +957,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -907,23 +983,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Aggregation Overview ID = 0e43a608-ad93-40ef-87cc-ccc40422306c\n",
-      "Start time = 2024-05-16 07:06:20.146145+00:00\n",
-      "End time = 2024-05-16 07:06:20.148150+00:00\n",
+      "Aggregation Overview ID = 29a2316d-47b3-4aa8-a8c2-5983f1b072de\n",
+      "Start time = 2024-05-16 08:31:54.005689+00:00\n",
+      "End time = 2024-05-16 08:31:54.008595+00:00\n",
       "Successful example count = 4\n",
       "Count of examples crashed during evaluation = 0\n",
       "Description = \"ELO QA aggregation\"\n",
-      "IDs of aggregated Evaluation Overviews = ['0bef2a2a-31f6-4857-82ee-f570e66e5a9e', '96bb8184-6786-420d-b82a-6b39dabd6a15']\n",
-      "IDs of aggregated Run Overviews = ['d66bda94-80b9-4f4b-a31d-ab35691680df', '83dfe27b-726d-4154-a1ee-78fa22b97a58', 'fbb0e396-58fc-4d7e-933f-5b5f398560f2']\n",
+      "IDs of aggregated Evaluation Overviews = ['063b3257-9712-4b95-b3ca-68e6dc8034e0', '12e1b780-3360-4801-990c-512b9fb524a7']\n",
+      "IDs of aggregated Run Overviews = ['1fe9f33e-4e07-4e4b-a258-891aae660991', '9470d56c-e6f0-4e9c-af25-cbd381c36a3a', '74d288d3-7536-4a40-bfe4-ccc9b9e107b0', 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257']\n",
       "Statistics = {\n",
-      "scores={'d66bda94-80b9-4f4b-a31d-ab35691680df': PlayerScore(elo=1517.4294696919358, elo_standard_error=0.13944839396385506, win_rate=0.6666666666666666, num_matches=6), 'fbb0e396-58fc-4d7e-933f-5b5f398560f2': PlayerScore(elo=1463.6588612724133, elo_standard_error=0.10161514291105345, win_rate=0.16666666666666666, num_matches=6), '83dfe27b-726d-4154-a1ee-78fa22b97a58': PlayerScore(elo=1518.9155718486318, elo_standard_error=0.07301431927430814, win_rate=0.75, num_matches=4)}\n",
+      "scores={'9470d56c-e6f0-4e9c-af25-cbd381c36a3a': PlayerScore(elo=1499.9627526168383, elo_standard_error=0.17735549496891068, win_rate=0.5, num_matches=6), '1fe9f33e-4e07-4e4b-a258-891aae660991': PlayerScore(elo=1536.6164246453945, elo_standard_error=0.11725579512845279, win_rate=0.8333333333333334, num_matches=6), '74d288d3-7536-4a40-bfe4-ccc9b9e107b0': PlayerScore(elo=1481.9272029297929, elo_standard_error=0.16086469782787777, win_rate=0.3333333333333333, num_matches=6), 'a63e12f6-ebf5-4c4d-a9b6-d58a45f07257': PlayerScore(elo=1481.493191730659, elo_standard_error=0.15735532722704, win_rate=0.3333333333333333, num_matches=6)}\n",
       "}\n",
       "\n"
      ]
@@ -932,6 +1008,13 @@
    "source": [
     "print(aggregated_evaluation_with_new_model)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py
index 2dd26e09b..d1bf1a64c 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py
@@ -58,16 +58,16 @@ def do_incremental_evaluate(
         outputs: list[SuccessfulExampleOutput[SingleChunkQaOutput]],
         already_evaluated_outputs: list[list[SuccessfulExampleOutput[SingleChunkQaOutput]]],
     ) -> Matches:
-        
+
         pairs = combinations(outputs, 2)
-        print('PAIRS', pairs)
+
         unique_pre_evaluated_runs: set[str] = set()
        
         for pre_run_output in already_evaluated_outputs:
             for current_output in pre_run_output:
                 unique_pre_evaluated_runs.add(current_output.run_id)
 
-        
+
         return Matches(
             comparison_evaluations=[
                 ComparisonEvaluation(
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
index bcab4124a..237386340 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py
@@ -50,20 +50,15 @@ def do_evaluate(
         Returns:
             :class:`Evaluation`: The metrics that come from the evaluated :class:`Task`.
         """
-        flattened_run_output_ids: set[str] = set()
-        evaluated_outputs = []
+
+        already_evaluated_outputs = []
         for run_output_ids in self._previous_run_output_ids:
-            flattened_run_output_ids = flattened_run_output_ids.union(run_output_ids)
-            evaluated_outputs.append(
+
+            already_evaluated_outputs.append(
                 [output for output in outputs if output.run_id in run_output_ids]
             )
 
-        new_outputs = [
-            output
-            for output in outputs
-            if output.run_id not in flattened_run_output_ids
-        ]
-        return self.do_incremental_evaluate(example, new_outputs, evaluated_outputs)
+        return self.do_incremental_evaluate(example, outputs, already_evaluated_outputs)
 
     @abstractmethod
     def do_incremental_evaluate(