diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb
index 30b610e33..467281ee6 100644
--- a/src/documentation/elo_qa_eval.ipynb
+++ b/src/documentation/elo_qa_eval.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,9 +31,9 @@
     "\n",
     "from aleph_alpha_client import Client\n",
     "from dotenv import load_dotenv\n",
-    "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
-    "\n",
     "\n",
+    "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
+    "from intelligence_layer.evaluation.evaluation.elo_evaluator import Match\n",
     "\n",
     "load_dotenv()\n",
     "\n",
@@ -60,7 +60,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -87,13 +87,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from intelligence_layer.evaluation import Example\n",
-    "\n",
     "from intelligence_layer.core import Language\n",
+    "from intelligence_layer.evaluation import Example\n",
     "from intelligence_layer.examples.qa.single_chunk_qa import SingleChunkQaInput\n",
     "\n",
     "qa_input_text_1 = \"\"\"Surface micromachining\n",
@@ -139,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -150,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -167,24 +166,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Example ID = 3d7a7c26-5e01-4d2b-ab06-618a35e035a2\n",
-      "Input = chunk=\"\\nSilicon is a chemical element; it has symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor. It is a member of group 14 in the periodic table: carbon is above it; and germanium, tin, lead, and flerovium are below it. It is relatively unreactive.\\n\\nBecause of its high chemical affinity for oxygen, it was not until 1823 that Jöns Jakob Berzelius was first able to prepare it and characterize it in pure form. Its oxides form a family of anions known as silicates. Its melting and boiling points of 1414 °C and 3265 °C, respectively, are the second highest among all the metalloids and nonmetals, being surpassed only by boron.[a]\\n\\nSilicon is the eighth most common element in the universe by mass, but very rarely occurs as the pure element in the Earth's crust. It is widely distributed in space in cosmic dusts, planetoids, and planets as various forms of silicon dioxide (silica) or silicates. More than 90% of the Earth's crust is composed of silicate minerals, making silicon the second most abundant element in the Earth's crust (about 28% by mass), after oxygen. \\n\" question='What is silicon?' language=Language(iso_639_1='en')\n",
-      "Expected output = \"Silicon is a chemical element.\"\n",
-      "\n",
-      "Example ID = e47d60ca-60f0-441b-9127-5ea12434b90f\n",
-      "Input = chunk=\"Surface micromachining\\n\\nSurface micromachining builds microstructures by deposition and etching structural layers over a substrate.[1] This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\\n\\nLayers\\n\\nGenerally, polysilicon is used as one of the substrate layers while silicon dioxide is used as a sacrificial layer. The sacrificial layer is removed or etched out to create any necessary void in the thickness direction. Added layers tend to vary in size from 2-5 micrometres. The main advantage of this machining process is the ability to build electronic and mechanical components (functions) on the same substrate. Surface micro-machined components are smaller compared to their bulk micro-machined counterparts.\\n\\nAs the structures are built on top of the substrate and not inside it, the substrate's properties are not as important as in bulk micro-machining. Expensive silicon wafers can be replaced by cheaper substrates, such as glass or plastic. The size of the substrates may be larger than a silicon wafer, and surface micro-machining is used to produce thin-film transistors on large area glass substrates for flat panel displays. This technology can also be used for the manufacture of thin film solar cells, which can be deposited on glass, polyethylene terepthalate substrates or other non-rigid materials.\\n\\nFabrication process\\n\\nMicro-machining starts with a silicon wafer or other substrate upon which new layers are grown. These layers are selectively etched by photo-lithography; either a wet etch involving an acid, or a dry etch involving an ionized gas (or plasma). Dry etching can combine chemical etching with physical etching or ion bombardment. Surface micro-machining involves as many layers as are needed with a different mask (producing a different pattern) on each layer. Modern integrated circuit fabrication uses this technique and can use as many as 100 layers. Micro-machining is a younger technology and usually uses no more than 5 or 6 layers. Surface micro-machining uses developed technology (although sometimes not enough for demanding applications) which is easily repeatable for volume production.\" question='What is micromachining?' language=Language(iso_639_1='en')\n",
-      "Expected output = \"Surface micromachining builds microstructures by deposition and etching structural layers over a substrate. This is different from Bulk micromachining, in which a silicon substrate wafer is selectively etched to produce structures.\"\n",
-      "\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "for example in dataset_repository.examples(dataset_id, SingleChunkQaInput, str):\n",
     "    print(example)"
@@ -208,23 +192,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Running: 2it [00:17, 8.51s/it]\n",
-      "Running: 2it [00:22, 11.31s/it]\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "from intelligence_layer.evaluation.run.runner import Runner\n",
     "from intelligence_layer.core import LuminousControlModel\n",
-    "from intelligence_layer.examples.qa.single_chunk_qa import SingleChunkQa, SingleChunkQaOutput\n",
-    "\n",
+    "from intelligence_layer.evaluation.run.runner import Runner\n",
+    "from intelligence_layer.examples.qa.single_chunk_qa import (\n",
+    "    SingleChunkQa,\n",
+    "    SingleChunkQaOutput,\n",
+    ")\n",
     "\n",
     "models = [\n",
     "    LuminousControlModel(name=\"luminous-base-control-20240215\", client=aa_client),\n",
@@ -243,7 +220,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -263,50 +240,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Run overview IDs saved in the run repository: ['6a45e898-5516-4e7a-84cc-a010580e335c', 'b66fc08b-068a-4219-bcb5-51637cfaa47e']\n",
-      "\n",
-      "Run Overview ID = 6a45e898-5516-4e7a-84cc-a010580e335c\n",
-      "Dataset ID = 2bbd6c69-dbde-4739-a0f1-e82214deeb2d\n",
-      "Start time = 2024-05-14 14:26:55.937724+00:00\n",
-      "End time = 2024-05-14 14:27:12.977689+00:00\n",
-      "Failed example count = 0\n",
-      "Successful example count = 2\n",
-      "Description = \"QA with model luminous-base-control-20240215\"\n",
-      "\n",
-      "Example ID=3d7a7c26-5e01-4d2b-ab06-618a35e035a2\n",
-      "Related Run ID=6a45e898-5516-4e7a-84cc-a010580e335c\n",
-      "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n",
-      "\n",
-      "Example ID=e47d60ca-60f0-441b-9127-5ea12434b90f\n",
-      "Related Run ID=6a45e898-5516-4e7a-84cc-a010580e335c\n",
-      "Output=\"answer='Micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n",
-      "\n",
-      "Run Overview ID = b66fc08b-068a-4219-bcb5-51637cfaa47e\n",
-      "Dataset ID = 2bbd6c69-dbde-4739-a0f1-e82214deeb2d\n",
-      "Start time = 2024-05-14 14:27:12.978007+00:00\n",
-      "End time = 2024-05-14 14:27:35.599692+00:00\n",
-      "Failed example count = 0\n",
-      "Successful example count = 2\n",
-      "Description = \"QA with model luminous-supreme-control-20240215\"\n",
-      "\n",
-      "Example ID=3d7a7c26-5e01-4d2b-ab06-618a35e035a2\n",
-      "Related Run ID=b66fc08b-068a-4219-bcb5-51637cfaa47e\n",
-      "Output=\"answer='Silicon is a chemical element with symbol Si and atomic number 14. It is a hard, brittle crystalline solid with a blue-grey metallic luster, and is a non metal and semiconductor.' highlights=[ScoredTextHighlight(start=71, end=182, score=1.0)]\"\n",
-      "\n",
-      "Example ID=e47d60ca-60f0-441b-9127-5ea12434b90f\n",
-      "Related Run ID=b66fc08b-068a-4219-bcb5-51637cfaa47e\n",
-      "Output=\"answer='Surface micromachining is a process of building microstructures by deposition and etching structural layers over a substrate.' highlights=[ScoredTextHighlight(start=24, end=131, score=1.0)]\"\n",
-      "\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "print(\n",
     "    f\"Run overview IDs saved in the run repository: {run_repository.run_overview_ids()}\\n\"\n",
@@ -332,17 +268,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "IDs of stored evaluations: []\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# this should demonstrate that there are no stored evaluations yet in our repository\n",
     "print(f\"IDs of stored evaluations: {evaluation_repository.evaluation_overview_ids()}\")"
@@ -357,18 +285,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from intelligence_layer.core.model import Llama2InstructModel\n",
     "from intelligence_layer.evaluation import Evaluator\n",
     "from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic\n",
-    "from intelligence_layer.evaluation.evaluation.elo_graders.elo_qa_grader import EloQaGrader\n",
-    "\n",
     "\n",
-    "\n",
-    "elo_evaluation_logic: EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput] = EloEvaluationLogic()\n",
+    "elo_evaluation_logic: EloEvaluationLogic[\n",
+    "    SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput\n",
+    "] = EloEvaluationLogic()\n",
     "\n",
     "evaluator = Evaluator(\n",
     "    dataset_repository=dataset_repository,\n",
@@ -376,34 +302,15 @@
     "    evaluation_repository=evaluation_repository,\n",
     "    description=\"ELO QA evaluation\",  # this description will be used later to query for specific evaluations\n",
     "    evaluation_logic=elo_evaluation_logic,\n",
-    ")\n"
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypeError",
-     "evalue": "Alternatively overwrite input_type() in <class 'intelligence_layer.evaluation.evaluation.evaluator.Evaluator'>",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
-      "File \u001b[0;32m~/Aleph-Alpha/intelligence-layer-sdk/src/intelligence_layer/evaluation/evaluation/evaluator.py:234\u001b[0m, in \u001b[0;36mEvaluator.input_type\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 234\u001b[0m input_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_types\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mInput\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'Input'",
-      "\nDuring handling of the above exception, another exception occurred:\n",
-      "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m evaluation_overview \u001b[38;5;241m=\u001b[39m \u001b[43mevaluator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate_runs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mrun_repository\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_overview_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/Aleph-Alpha/intelligence-layer-sdk/src/intelligence_layer/evaluation/evaluation/evaluator.py:329\u001b[0m, in \u001b[0;36mEvaluator.evaluate_runs\u001b[0;34m(self, num_examples, abort_on_error, *run_ids)\u001b[0m\n\u001b[1;32m 325\u001b[0m eval_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_evaluation_repository\u001b[38;5;241m.\u001b[39minitialize_evaluation()\n\u001b[1;32m 326\u001b[0m dataset_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28miter\u001b[39m(run_overviews))\u001b[38;5;241m.\u001b[39mdataset_id\n\u001b[1;32m 327\u001b[0m examples \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_repository\u001b[38;5;241m.\u001b[39mexamples(\n\u001b[1;32m 328\u001b[0m dataset_id,\n\u001b[0;32m--> 329\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minput_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m,\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpected_output_type(),\n\u001b[1;32m 331\u001b[0m )\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m examples \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "File \u001b[0;32m~/Aleph-Alpha/intelligence-layer-sdk/src/intelligence_layer/evaluation/evaluation/evaluator.py:236\u001b[0m, in \u001b[0;36mEvaluator.input_type\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 234\u001b[0m input_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_types()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInput\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[0;32m--> 236\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAlternatively overwrite input_type() in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cast(\u001b[38;5;28mtype\u001b[39m[Input], input_type)\n",
-      "\u001b[0;31mTypeError\u001b[0m: Alternatively overwrite input_type() in <class 'intelligence_layer.evaluation.evaluation.evaluator.Evaluator'>"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "\n",
     "evaluation_overview = evaluator.evaluate_runs(*run_repository.run_overview_ids())"
    ]
   },
@@ -467,11 +374,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from intelligence_layer.evaluation import Aggregator\n",
     "from intelligence_layer_experiments.use_cases.elo_usecase.elo_qa_aggregator import (\n",
     "    EloQaAggregationLogic,\n",
     ")\n",
     "\n",
+    "from intelligence_layer.evaluation import Aggregator\n",
+    "\n",
     "aggregator = Aggregator(\n",
     "    evaluation_repository=evaluation_repository,\n",
     "    aggregation_repository=aggregation_repository,\n",
@@ -546,7 +454,9 @@
     "]\n",
     "\n",
     "for model in newly_added_models:\n",
-    "    runner = Runner[SingleChunkQaInput, SingleChunkQaOutput](\n",
+    "    runner = Runner[\n",
+    "        SingleChunkQaInput, SingleChunkQaOutput\n",
+    "    ](\n",
     "        task=SingleChunkQa(model),\n",
     "        dataset_repository=dataset_repository,\n",
     "        run_repository=run_repository,\n",
diff --git a/src/intelligence_layer/evaluation/evaluation/elo_evaluation_logics/elo_qa_evaluation_logic.py b/src/intelligence_layer/evaluation/evaluation/elo_evaluation_logics/elo_qa_evaluation_logic.py
index a2abb6574..b58dcac4e 100644
--- a/src/intelligence_layer/evaluation/evaluation/elo_evaluation_logics/elo_qa_evaluation_logic.py
+++ b/src/intelligence_layer/evaluation/evaluation/elo_evaluation_logics/elo_qa_evaluation_logic.py
@@ -1,15 +1,24 @@
 import math
-from typing import Mapping, Sequence, final
+from typing import Mapping, Sequence
+
 from aleph_alpha_client import Prompt
 from liquid import Template
+
 from intelligence_layer.core.detect_language import Language
 from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
 from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer
 from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
 from intelligence_layer.evaluation.dataset.domain import Example
-from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic, EloGradingInput
+from intelligence_layer.evaluation.evaluation.elo_evaluator import (
+    EloEvaluationLogic,
+    EloGradingInput,
+)
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
-from intelligence_layer.examples.qa.single_chunk_qa import QA_INSTRUCTIONS, SingleChunkQaInput, SingleChunkQaOutput
+from intelligence_layer.examples.qa.single_chunk_qa import (
+    QA_INSTRUCTIONS,
+    SingleChunkQaInput,
+    SingleChunkQaOutput,
+)
 
 
 class EloQaEvaluationLogic(
@@ -133,5 +142,3 @@ def categorize_value(value: float) -> MatchOutcome:
             complete_output.completions[0].log_probs
         )
         return categorize_value(normalized_probability)
-
-    
\ No newline at end of file
diff --git a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py
deleted file mode 100644
index b2f0ca8cc..000000000
--- a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from abc import abstractmethod
-from typing import Generic, Sequence
-
-from pydantic import BaseModel
-
-from intelligence_layer.core.task import Input, Output
-from intelligence_layer.evaluation import MatchOutcome
-from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
-from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
-
-
-class Match(BaseModel):
-    player_a: str
-    player_b: str
-    outcome: MatchOutcome
-
-
-class Matches(BaseModel):
-    matches: Sequence[Match]
-
-
-class EloGradingInput(BaseModel):
-    instruction: str
-    first_completion: str
-    second_completion: str
-
-
-class EloGrader(
-    Generic[
-        Input,
-        Output,
-        ExpectedOutput,
-    ],
-):
-    @abstractmethod
-    def grade(
-        self,
-        output_a: SuccessfulExampleOutput[Output],
-        output_b: SuccessfulExampleOutput[Output],
-        example: Example[Input, ExpectedOutput],
-    ) -> MatchOutcome:
-        pass
diff --git a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py b/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py
deleted file mode 100644
index d8a42070d..000000000
--- a/src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import math
-from typing import Mapping, Sequence
-
-from aleph_alpha_client import Prompt
-from liquid import Template
-
-from intelligence_layer.core import TaskSpan
-from intelligence_layer.core.detect_language import Language
-from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
-from intelligence_layer.core.tracer.tracer import NoOpTracer, Tracer
-from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
-from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
-from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
-    EloGrader,
-    EloGradingInput,
-)
-from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
-from intelligence_layer.examples.qa.single_chunk_qa import (
-    QA_INSTRUCTIONS,
-    SingleChunkQaInput,
-    SingleChunkQaOutput,
-)
-
-
-class EloQaGrader(
-    EloGrader[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
-):
-    INPUT_TEMPLATE = """
-Your task is to compare two answers to an instruction on one metric.
-
-Please make sure you read and understand these instruction carefully. Please keep this document open while reviewing, and refer to it as needed.
-
-The Instruction for the answers was:{instruction}
-
-Evaluation Procedure:
-1. Read both answers carefully and identify the main facts and details they present.
-2. Check if the answers contain any factual errors that are not supported by the instruction.
-3. Evaluate which answer is more correct.
-
-Answer A:{first_completion}
-
-Answer B:{second_completion}
-
-Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
-
-Response: Answer """
-    VALUES = [
-        " A",
-        " B",
-    ]  # The space before the A and B is important due to tokenization
-
-    def __init__(
-        self,
-        model: ControlModel,
-        tracer: Tracer = NoOpTracer(),
-    ):
-        super().__init__()
-        self._model = model
-        self.tracer = tracer
-
-    def _create_grading_input(
-        self,
-        first: SuccessfulExampleOutput[SingleChunkQaOutput],
-        second: SuccessfulExampleOutput[SingleChunkQaOutput],
-        example: Example[SingleChunkQaInput, ExpectedOutput],
-    ) -> EloGradingInput:
-        qa_instruction = Template(
-            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
-        ).render(question=example.input.question)
-
-        no_answer = "There is no answer."
-        return EloGradingInput(
-            instruction=f"{example.input.chunk} {qa_instruction}",
-            first_completion=(
-                first.output.answer if first.output.answer is not None else no_answer
-            ),
-            second_completion=(
-                second.output.answer if second.output.answer is not None else no_answer
-            ),
-        )
-
-    def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
-        text = self.INPUT_TEMPLATE.format(
-            instruction=input.instruction,
-            first_completion=input.first_completion,
-            second_completion=input.second_completion,
-        )
-
-        complete_input = CompleteInput(
-            prompt=Prompt.from_text(text),
-            maximum_tokens=1,
-            log_probs=3,
-            disable_optimizations=True,
-        )
-        complete_output = self._model.complete_task().run(complete_input, task_span)
-
-        return self.calculate_winners(complete_output)
-
-    def grade(
-        self,
-        first: SuccessfulExampleOutput[SingleChunkQaOutput],
-        second: SuccessfulExampleOutput[SingleChunkQaOutput],
-        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
-    ) -> MatchOutcome:
-        grading_input = self._create_grading_input(first, second, example)
-
-        return MatchOutcome(
-            self.do_run(
-                grading_input,
-                self.tracer.task_span(
-                    task_name="elo_qa_run_grader", input=grading_input
-                ),
-            )
-        )
-
-    def calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
-        default_log_prob = float("-inf")
-
-        def get_normalized_prob(
-            log_prob_list: Sequence[Mapping[str, float | None]] | None,
-        ) -> float:
-            assert log_prob_list is not None
-            log_probs = log_prob_list[0]
-            values = [
-                math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
-                for key in self.VALUES
-            ]
-            if all(v == 0 for v in values):
-                raise ValueError(
-                    f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
-                )
-            return values[0] / sum(values)
-
-        def categorize_value(value: float) -> MatchOutcome:
-            if value > 0.7:
-                return MatchOutcome.A_WINS
-            elif 0.3 > value:
-                return MatchOutcome.B_WINS
-            else:
-                return MatchOutcome.DRAW
-
-        normalized_probability = get_normalized_prob(
-            complete_output.completions[0].log_probs
-        )
-        return categorize_value(normalized_probability)
diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py
index e21ece38d..9eb9405e9 100644
--- a/tests/evaluation/test_elo_evaluator.py
+++ b/tests/evaluation/test_elo_evaluator.py
@@ -23,8 +23,8 @@
     RunOverview,
     SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic
-from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
+from intelligence_layer.evaluation.evaluation.elo_evaluator import (
+    EloEvaluationLogic,
     Match,
     Matches,
 )
@@ -34,23 +34,6 @@
 load_dotenv()
 
 
-# class DummyEloQaGrader(EloQaGrader):
-#     def grade(
-#         self,
-#         first: SuccessfulExampleOutput[SingleChunkQaOutput],
-#         second: SuccessfulExampleOutput[SingleChunkQaOutput],
-#         example: Example[SingleChunkQaInput, SingleChunkQaOutput],
-#     ) -> MatchOutcome:
-#         _ = example
-#         if first.run_id < second.run_id:
-#             return MatchOutcome.A_WINS
-#         elif first.run_id > second.run_id:
-#             return MatchOutcome.B_WINS
-#         else:
-#             return MatchOutcome.DRAW
-#
-
-
 class DummyEloQaEvalLogic(
     EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
 ):
@@ -83,11 +66,6 @@ def model(client: AlephAlphaClientProtocol) -> ControlModel:
     return LuminousControlModel(client=client, name="luminous-base-control")
 
 
-# @fixture
-# def dummy_elo_qa_grader(model: ControlModel) -> DummyEloQaGrader:
-#     return DummyEloQaGrader(model=model)
-
-
 @fixture
 def in_memory_dataset_repository() -> InMemoryDatasetRepository:
     return InMemoryDatasetRepository()