Commit 9fab77e
WIP: fix: Test setup
TASK: IL-394

MerlinKallenbornAA committed May 13, 2024
1 parent 9f936a2
Showing 1 changed file with 29 additions and 16 deletions.
45 changes: 29 additions & 16 deletions tests/evaluation/test_elo_evaluator.py
@@ -1,8 +1,7 @@
-from typing import Sequence, Tuple, Any, Set, FrozenSet, Mapping
-from uuid import UUID
+from itertools import combinations
+from typing import Sequence, Tuple

-from dotenv import load_dotenv
 from pydantic import BaseModel
 from pytest import fixture

 from intelligence_layer.connectors import AlephAlphaClientProtocol
@@ -55,6 +54,27 @@ def grade(
         return MatchOutcome.DRAW


+class DummyEloQaEvalLogic(
+    EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
+):
+    def do_evaluate(
+        self,
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+        *output: SuccessfulExampleOutput[SingleChunkQaOutput],
+    ) -> Matches:
+        pairs = combinations(output, 2)
+        return Matches(
+            matches=[
+                Match(
+                    player_a=first.run_id,
+                    player_b=second.run_id,
+                    outcome=self._grader.grade(first, second, example),
+                )
+                for first, second in pairs
+            ]
+        )
+
+
 @fixture
 def model(client: AlephAlphaClientProtocol) -> ControlModel:
     return LuminousControlModel(client=client, name="luminous-base-control")
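
The new do_evaluate pairs run outputs round-robin style: itertools.combinations(output, 2) yields every unordered pair exactly once, so n successful runs produce n * (n - 1) / 2 matches per example. Below is a minimal, self-contained sketch of that pairing, not part of the commit; the RunOutput class and run IDs are hypothetical stand-ins for SuccessfulExampleOutput:

from dataclasses import dataclass
from itertools import combinations


@dataclass
class RunOutput:
    run_id: str


outputs = [RunOutput("run-a"), RunOutput("run-b"), RunOutput("run-c")]

# Each unordered pair appears exactly once, and a run never plays itself.
for first, second in combinations(outputs, 2):
    print(first.run_id, "vs", second.run_id)
# run-a vs run-b
# run-a vs run-c
# run-b vs run-c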
@@ -81,18 +101,18 @@ def in_memory_evaluation_repository() -> InMemoryEvaluationRepository:


 @fixture
-def elo_evaluation_logic(
+def dummy_eval_logic(
     dummy_elo_qa_grader: EloQaGrader,
-) -> EloEvaluationLogic[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]:
-    return EloEvaluationLogic(grader=dummy_elo_qa_grader)
+) -> DummyEloQaEvalLogic:
+    return DummyEloQaEvalLogic(grader=dummy_elo_qa_grader)


 @fixture
 def elo_evaluator(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
     in_memory_evaluation_repository: InMemoryEvaluationRepository,
-    elo_evaluation_logic: EloEvaluationLogic[
+    dummy_eval_logic: EloEvaluationLogic[
         SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput
     ],
 ) -> Evaluator[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches]:
@@ -101,7 +121,7 @@ def elo_evaluator(
         in_memory_run_repository,
         in_memory_evaluation_repository,
         "Testing",
-        elo_evaluation_logic,
+        dummy_eval_logic,
     )

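As an aside on the fixture wiring above: pytest resolves fixtures by parameter name, so dummy_eval_logic is built from dummy_elo_qa_grader and injected into elo_evaluator automatically. A minimal sketch of that mechanism, using placeholder fixtures rather than the repository's types:

from pytest import fixture


@fixture
def grader() -> str:
    return "dummy-grader"


@fixture
def eval_logic(grader: str) -> str:
    # pytest sees the `grader` parameter and injects the fixture above.
    return f"logic({grader})"


def test_wiring(eval_logic: str) -> None:
    assert eval_logic == "logic(dummy-grader)"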
@@ -227,13 +247,6 @@ def test_full_elo_eval_run(  # TODO: Better name
 ) -> None:
     run_ids, _ = qa_setup
     evaluation_overview = elo_evaluator.evaluate_runs(run_ids[0], run_ids[1])
-    new_elo_qa_evaluator = Evaluator(
-        in_memory_dataset_repository,
-        in_memory_run_repository,
-        in_memory_evaluation_repository,
-        "Testing",
-        evaluation_logic=EloEvaluationLogic(grader=dummy_elo_qa_grader),
-    )

+    # new_evaluation_overview = new_elo_qa_evaluator.evaluate_runs(*run_ids)
     print(evaluation_overview)
-    print(new_elo_qa_evaluator)
