From 9fab77e595064f502332f1e8d2c780049a1c6cbf Mon Sep 17 00:00:00 2001
From: Merlin Kallenborn
Date: Mon, 13 May 2024 17:55:51 +0200
Subject: [PATCH] WIP: fix: Test setup

TASK: IL-394
---
 tests/evaluation/test_elo_evaluator.py | 45 +++++++++++++++++---
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py
index 577af32c9..8e927de6c 100644
--- a/tests/evaluation/test_elo_evaluator.py
+++ b/tests/evaluation/test_elo_evaluator.py
@@ -1,8 +1,7 @@
-from typing import Sequence, Tuple, Any, Set, FrozenSet, Mapping
-from uuid import UUID
+from itertools import combinations
+from typing import Sequence, Tuple
 
 from dotenv import load_dotenv
-from pydantic import BaseModel
 from pytest import fixture
 
 from intelligence_layer.connectors import AlephAlphaClientProtocol
@@ -55,6 +54,27 @@ def grade(
         return MatchOutcome.DRAW
 
 
+class DummyEloQaEvalLogic(
+    EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
+):
+    def do_evaluate(
+        self,
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+        *output: SuccessfulExampleOutput[SingleChunkQaOutput],
+    ) -> Matches:
+        pairs = combinations(output, 2)
+        return Matches(
+            matches=[
+                Match(
+                    player_a=first.run_id,
+                    player_b=second.run_id,
+                    outcome=self._grader.grade(first, second, example),
+                )
+                for [first, second] in pairs
+            ]
+        )
+
+
 @fixture
 def model(client: AlephAlphaClientProtocol) -> ControlModel:
     return LuminousControlModel(client=client, name="luminous-base-control")
@@ -81,10 +101,10 @@ def in_memory_evaluation_repository() -> InMemoryEvaluationRepository:
 
 
 @fixture
-def elo_evaluation_logic(
+def dummy_eval_logic(
     dummy_elo_qa_grader: EloQaGrader,
-) -> EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]:
-    return EloEvaluationLogic(grader=dummy_elo_qa_grader)
+) -> DummyEloQaEvalLogic:
+    return DummyEloQaEvalLogic(grader=dummy_elo_qa_grader)
 
 
 @fixture
@@ -92,7 +112,7 @@ def elo_evaluator(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
     in_memory_evaluation_repository: InMemoryEvaluationRepository,
-    elo_evaluation_logic: EloEvaluationLogic[
+    dummy_eval_logic: EloEvaluationLogic[
         SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput
     ],
 ) -> Evaluator[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches]:
@@ -101,7 +121,7 @@ def elo_evaluator(
         in_memory_run_repository,
         in_memory_evaluation_repository,
         "Testing",
-        elo_evaluation_logic,
+        dummy_eval_logic,
     )
 
 
@@ -227,13 +247,6 @@ def test_full_elo_eval_run(  # TODO: Better name
 ) -> None:
     run_ids, _ = qa_setup
     evaluation_overview = elo_evaluator.evaluate_runs(run_ids[0], run_ids[1])
-    new_elo_qa_evaluator = Evaluator(
-        in_memory_dataset_repository,
-        in_memory_run_repository,
-        in_memory_evaluation_repository,
-        "Testing",
-        evaluation_logic=EloEvaluationLogic(grader=dummy_elo_qa_grader),
-    )
+
     # new_evaluation_overview = new_elo_qa_evaluator.evaluate_runs(*run_ids)
     print(evaluation_overview)
-    print(new_elo_qa_evaluator)
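
Note (not part of the patch): below is a minimal, self-contained sketch of the pairwise-match construction that the new DummyEloQaEvalLogic.do_evaluate builds with itertools.combinations. SimpleOutput, SimpleMatch, pair_outputs, and the hard-coded "draw" outcome are stand-ins invented for illustration only; the actual test uses SuccessfulExampleOutput, Match, Matches, and the injected EloQaGrader from intelligence_layer.

    from dataclasses import dataclass
    from itertools import combinations


    @dataclass
    class SimpleOutput:
        # Hypothetical stand-in for SuccessfulExampleOutput[SingleChunkQaOutput].
        run_id: str
        answer: str


    @dataclass
    class SimpleMatch:
        # Hypothetical stand-in for the Match record produced per pair of runs.
        player_a: str
        player_b: str
        outcome: str


    def pair_outputs(outputs: list[SimpleOutput]) -> list[SimpleMatch]:
        # Every unordered pair of run outputs becomes one match; the real logic
        # asks the grader for the outcome, here it is hard-coded to a draw.
        return [
            SimpleMatch(player_a=first.run_id, player_b=second.run_id, outcome="draw")
            for first, second in combinations(outputs, 2)
        ]


    if __name__ == "__main__":
        runs = [
            SimpleOutput("run-1", "a"),
            SimpleOutput("run-2", "b"),
            SimpleOutput("run-3", "c"),
        ]
        # Three runs yield three matches: (run-1, run-2), (run-1, run-3), (run-2, run-3).
        for match in pair_outputs(runs):
            print(match)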