From 9fab77e595064f502332f1e8d2c780049a1c6cbf Mon Sep 17 00:00:00 2001
From: Merlin Kallenborn
Date: Mon, 13 May 2024 17:55:51 +0200
Subject: [PATCH] WIP: fix: Test setup

TASK: IL-394
---
 tests/evaluation/test_elo_evaluator.py | 45 +++++++++++++++++---
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py
index 577af32c9..8e927de6c 100644
--- a/tests/evaluation/test_elo_evaluator.py
+++ b/tests/evaluation/test_elo_evaluator.py
@@ -1,8 +1,7 @@
-from typing import Sequence, Tuple, Any, Set, FrozenSet, Mapping
-from uuid import UUID
+from itertools import combinations
+from typing import Sequence, Tuple
 
 from dotenv import load_dotenv
-from pydantic import BaseModel
 from pytest import fixture
 
 from intelligence_layer.connectors import AlephAlphaClientProtocol
@@ -55,6 +54,27 @@ def grade(
         return MatchOutcome.DRAW
 
 
+class DummyEloQaEvalLogic(
+    EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
+):
+    def do_evaluate(
+        self,
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+        *output: SuccessfulExampleOutput[SingleChunkQaOutput],
+    ) -> Matches:
+        pairs = combinations(output, 2)
+        return Matches(
+            matches=[
+                Match(
+                    player_a=first.run_id,
+                    player_b=second.run_id,
+                    outcome=self._grader.grade(first, second, example),
+                )
+                for [first, second] in pairs
+            ]
+        )
+
+
 @fixture
 def model(client: AlephAlphaClientProtocol) -> ControlModel:
     return LuminousControlModel(client=client, name="luminous-base-control")
@@ -81,10 +101,10 @@ def in_memory_evaluation_repository() -> InMemoryEvaluationRepository:
 
 
 @fixture
-def elo_evaluation_logic(
+def dummy_eval_logic(
     dummy_elo_qa_grader: EloQaGrader,
-) -> EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]:
-    return EloEvaluationLogic(grader=dummy_elo_qa_grader)
+) -> DummyEloQaEvalLogic:
+    return DummyEloQaEvalLogic(grader=dummy_elo_qa_grader)
 
 
 @fixture
@@ -92,7 +112,7 @@ def elo_evaluator(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
     in_memory_evaluation_repository: InMemoryEvaluationRepository,
-    elo_evaluation_logic: EloEvaluationLogic[
+    dummy_eval_logic: EloEvaluationLogic[
         SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput
     ],
 ) -> Evaluator[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches]:
@@ -101,7 +121,7 @@ def elo_evaluator(
         in_memory_run_repository,
         in_memory_evaluation_repository,
         "Testing",
-        elo_evaluation_logic,
+        dummy_eval_logic,
     )
 
 
@@ -227,13 +247,6 @@ def test_full_elo_eval_run(  # TODO: Better name
 ) -> None:
     run_ids, _ = qa_setup
     evaluation_overview = elo_evaluator.evaluate_runs(run_ids[0], run_ids[1])
-    new_elo_qa_evaluator = Evaluator(
-        in_memory_dataset_repository,
-        in_memory_run_repository,
-        in_memory_evaluation_repository,
-        "Testing",
-        evaluation_logic=EloEvaluationLogic(grader=dummy_elo_qa_grader),
-    )
+
     # new_evaluation_overview = new_elo_qa_evaluator.evaluate_runs(*run_ids)
     print(evaluation_overview)
-    print(new_elo_qa_evaluator)
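
Note (not part of the patch): below is a minimal, self-contained sketch of the pairwise-match construction that the new DummyEloQaEvalLogic.do_evaluate builds with itertools.combinations. SimpleOutput, SimpleMatch, pair_outputs, and the hard-coded "draw" outcome are stand-ins invented for illustration only; the actual test uses SuccessfulExampleOutput, Match, Matches, and the injected EloQaGrader from intelligence_layer.

    from dataclasses import dataclass
    from itertools import combinations


    @dataclass
    class SimpleOutput:
        # Hypothetical stand-in for SuccessfulExampleOutput[SingleChunkQaOutput].
        run_id: str
        answer: str


    @dataclass
    class SimpleMatch:
        # Hypothetical stand-in for the Match record produced per pair of runs.
        player_a: str
        player_b: str
        outcome: str


    def pair_outputs(outputs: list[SimpleOutput]) -> list[SimpleMatch]:
        # Every unordered pair of run outputs becomes one match; the real logic
        # asks the grader for the outcome, here it is hard-coded to a draw.
        return [
            SimpleMatch(player_a=first.run_id, player_b=second.run_id, outcome="draw")
            for first, second in combinations(outputs, 2)
        ]


    if __name__ == "__main__":
        runs = [
            SimpleOutput("run-1", "a"),
            SimpleOutput("run-2", "b"),
            SimpleOutput("run-3", "c"),
        ]
        # Three runs yield three matches: (run-1, run-2), (run-1, run-3), (run-2, run-3).
        for match in pair_outputs(runs):
            print(match)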