From 2b2a53a3461e5fb18c714f9bdfc82f0ab9c7d15a Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Tue, 14 May 2024 09:09:05 +0200 Subject: [PATCH] feat: Write test for Elo Qa Evaluator Task: IL-394 --- tests/evaluation/test_elo_evaluator.py | 60 +++++--------------------- 1 file changed, 10 insertions(+), 50 deletions(-) diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py index 8e927de6c..597eb058b 100644 --- a/tests/evaluation/test_elo_evaluator.py +++ b/tests/evaluation/test_elo_evaluator.py @@ -193,60 +193,20 @@ def qa_setup( return run_ids, dataset_id -# def test_choose_winner_should_return_contestant_with_lower_run_id(dummy) -> None: -# # Given -# contestant_a = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") -# contestant_b = SuccessfulExampleOutput[str](run_id="b", example_id="", output="") -# contestant_a2 = SuccessfulExampleOutput[str](run_id="a", example_id="", output="") -# contestant_c = SuccessfulExampleOutput[str](run_id="c", example_id="", output="") -# # When -# match_a_wins =(contestant_a.run_id, contestant_b.run_id) -# match_b_wins = choose_winner(contestant_c.run_id, contestant_b.run_id) -# match_draw = choose_winner(contestant_a.run_id, contestant_a2.run_id) -# # Then -# assert match_a_wins == MatchOutcome.A_WINS -# assert match_b_wins == MatchOutcome.B_WINS -# assert match_draw == MatchOutcome.DRAW - - -def test_do_evaluate_should_build_correct_matches( - dummy_elo_qa_grader: DummyEloQaGrader, - dummy_qa_input: SingleChunkQaInput, - dummy_qa_output: SingleChunkQaOutput, +def test_evauluate_runs_creates_correct_matches_for_elo_qa_eval( + qa_setup: Tuple[Sequence[str], str], + elo_evaluator: Evaluator[ + SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches + ] ) -> None: - example = Example(input=dummy_qa_input, expected_output=dummy_qa_output) - contestant_a = SuccessfulExampleOutput[SingleChunkQaOutput]( - run_id="a", example_id="_", output=dummy_qa_output - ) - contestant_b = SuccessfulExampleOutput[SingleChunkQaOutput]( - run_id="b", example_id="_", output=dummy_qa_output - ) - contestant_c = SuccessfulExampleOutput[SingleChunkQaOutput]( - run_id="c", example_id="_", output=dummy_qa_output - ) - contestants = [contestant_a, contestant_b, contestant_c] - evaluation_logic = EloEvaluationLogic(grader=dummy_elo_qa_grader) - matches = evaluation_logic.do_evaluate(example, *contestants).matches + run_ids, _ = qa_setup + evaluation_overview = elo_evaluator.evaluate_runs(*run_ids) + + matches = list(elo_evaluator.evaluation_lineages(evaluation_overview.id))[0].evaluation.result.matches for match in matches: assert isinstance(match, Match) if match.player_a < match.player_b: assert match.outcome == MatchOutcome.A_WINS elif match.player_a > match.player_b: assert match.outcome == MatchOutcome.B_WINS - - -def test_full_elo_eval_run( # TODO: Better name - qa_setup: Tuple[Sequence[str], str], - elo_evaluator: Evaluator[ - SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches - ], - in_memory_dataset_repository: InMemoryDatasetRepository, - in_memory_run_repository: InMemoryRunRepository, - in_memory_evaluation_repository: InMemoryEvaluationRepository, - dummy_elo_qa_grader: EloQaGrader, -) -> None: - run_ids, _ = qa_setup - evaluation_overview = elo_evaluator.evaluate_runs(run_ids[0], run_ids[1]) - - # new_evaluation_overview = new_elo_qa_evaluator.evaluate_runs(*run_ids) - print(evaluation_overview) + \ No newline at end of file