diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py
index c9d60dd33..24353db7f 100644
--- a/src/documentation/how_tos/example_data.py
+++ b/src/documentation/how_tos/example_data.py
@@ -17,6 +17,12 @@
     SuccessfulExampleOutput,
 )
 from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
+from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
+    ComparisonEvaluation,
+    EloEvaluationLogic,
+    Matches,
+    MatchOutcome,
+)
 
 
 class DummyExample(Example[str, str]):
@@ -42,6 +48,34 @@ def do_evaluate(
         )
 
 
+class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]):
+    def grade(
+        self,
+        first: SuccessfulExampleOutput[str],
+        second: SuccessfulExampleOutput[str],
+        example: Example[str, str],
+    ) -> MatchOutcome:
+        return MatchOutcome.DRAW
+
+    def do_incremental_evaluate(
+        self,
+        example: Example[str, str],
+        outputs: list[SuccessfulExampleOutput[str]],
+        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
+    ) -> Matches:
+        player_a = SuccessfulExampleOutput(run_id="1", example_id="1", output="1")
+        player_b = SuccessfulExampleOutput(run_id="2", example_id="2", output="2")
+        return Matches(
+            comparison_evaluations=[
+                ComparisonEvaluation(
+                    first_player="1",
+                    second_player="2",
+                    outcome=self.grade(player_a, player_b, example),
+                )
+            ]
+        )
+
+
 class DummyAggregation(BaseModel):
     num_evaluations: int
 
diff --git a/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb b/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb
new file mode 100644
index 000000000..e630d7336
--- /dev/null
+++ b/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from documentation.how_tos.example_data import DummyEloEvaluationLogic, example_data\n",
+    "from intelligence_layer.evaluation import (\n",
+    "    IncrementalEvaluator,\n",
+    "    InMemoryEvaluationRepository,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to implement Elo evaluations\n",
+    "0. Run your tasks on the datasets you want to evaluate (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
+    "   - When evaluating multiple runs, all runs must use the same input and output types\n",
+    "1. Initialize all necessary repositories for the `IncrementalEvaluator` and an `EloEvaluationLogic` that is specific to your use case.\n",
+    "2. Run the evaluator to evaluate all examples and create a single `EvaluationOverview`.\n",
+    "3. (Optional) Save the evaluation ID for later use"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 0\n",
+    "\n",
+    "\n",
+    "my_example_data = example_data()\n",
+    "print()\n",
+    "run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
+    "\n",
+    "# Step 1\n",
+    "dataset_repository = my_example_data.dataset_repository\n",
+    "run_repository = my_example_data.run_repository\n",
+    "evaluation_repository = InMemoryEvaluationRepository()\n",
+    "evaluation_logic = DummyEloEvaluationLogic()\n",
+    "\n",
+    "# Step 2\n",
+    "evaluator = IncrementalEvaluator(\n",
+    "    dataset_repository,\n",
+    "    run_repository,\n",
+    "    evaluation_repository,\n",
+    "    \"My dummy evaluation\",\n",
+    "    evaluation_logic,\n",
+    ")\n",
+    "\n",
+    "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
+    "\n",
+    "# Step 3\n",
+    "print(evaluation_overview.id)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "intelligence-layer-aL2cXmJM-py3.11",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py
index c3f5c7551..f6deb4832 100644
--- a/src/intelligence_layer/evaluation/__init__.py
+++ b/src/intelligence_layer/evaluation/__init__.py
@@ -63,23 +63,28 @@
 from .evaluation.evaluator.async_evaluator import (
     AsyncEvaluationRepository as AsyncEvaluationRepository,
 )
-from .evaluation.evaluator.elo_evaluator import (
-    ComparisonEvaluation as ComparisonEvaluation,
-)
-from .evaluation.evaluator.elo_evaluator import EloEvaluationLogic as EloEvaluationLogic
-from .evaluation.evaluator.elo_evaluator import Matches as Matches
-from .evaluation.evaluator.elo_evaluator import MatchOutcome as MatchOutcome
 from .evaluation.evaluator.evaluator import EvaluationLogic as EvaluationLogic
 from .evaluation.evaluator.evaluator import Evaluator as Evaluator
 from .evaluation.evaluator.evaluator import (
     SingleOutputEvaluationLogic as SingleOutputEvaluationLogic,
 )
+from .evaluation.evaluator.incremental_evaluator import (
+    ComparisonEvaluation as ComparisonEvaluation,
+)
+from .evaluation.evaluator.incremental_evaluator import (
+    EloEvaluationLogic as EloEvaluationLogic,
+)
+from .evaluation.evaluator.incremental_evaluator import (
+    EloGradingInput as EloGradingInput,
+)
 from .evaluation.evaluator.incremental_evaluator import (
     IncrementalEvaluationLogic as IncrementalEvaluationLogic,
 )
 from .evaluation.evaluator.incremental_evaluator import (
     IncrementalEvaluator as IncrementalEvaluator,
 )
+from .evaluation.evaluator.incremental_evaluator import Matches as Matches
+from .evaluation.evaluator.incremental_evaluator import MatchOutcome as MatchOutcome
 from .evaluation.file_evaluation_repository import (
     AsyncFileEvaluationRepository as AsyncFileEvaluationRepository,
 )
diff --git a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py
index 006beeeb3..72ff3e15b 100644
--- 
a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py +++ b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py @@ -7,7 +7,7 @@ from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic -from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import ( +from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import ( ComparisonEvaluation, Matches, MatchOutcome, diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index efa2e02ab..e6430e627 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -30,7 +30,7 @@ from intelligence_layer.evaluation.evaluation.evaluator.base_evaluator import ( EvaluationLogicBase, ) -from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import ( +from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import ( ComparisonEvaluation, MatchOutcome, ) diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py deleted file mode 100644 index 5a1a065c1..000000000 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py +++ /dev/null @@ -1,120 +0,0 @@ -from abc import abstractmethod -from enum import Enum -from itertools import combinations -from typing import Sequence, final - -from pydantic import BaseModel - -from intelligence_layer.core import Input, Output -from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput -from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import ( - IncrementalEvaluationLogic, -) -from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput - - -class MatchOutcome(str, Enum): - A_WINS = "a_wins" - DRAW = "draw" - B_WINS = "b_wins" - - @property - def payoff(self) -> tuple[float, float]: - if self == self.A_WINS: - return (1, 0) - if self == self.DRAW: - return (0.5, 0.5) - return (0, 1) - - @staticmethod - def from_rank_literal(rank: int) -> "MatchOutcome": - match rank: - case 1: - return MatchOutcome.A_WINS - case 2: - return MatchOutcome.B_WINS - case 3: - return MatchOutcome.DRAW - case _: - raise ValueError(f"Got unexpected rank {rank}") - - -class ComparisonEvaluation(BaseModel): - first_player: str - second_player: str - outcome: MatchOutcome - - -class Matches(BaseModel): - comparison_evaluations: Sequence[ComparisonEvaluation] - - -class EloGradingInput(BaseModel): - instruction: str - first_completion: str - second_completion: str - - -class EloEvaluationLogic( - IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Matches] -): - def __init__(self) -> None: - super().__init__() - self._previous_run_output_ids: list[set[str]] = [] - - def set_previous_run_output_ids( - self, previous_run_output_ids: list[set[str]] - ) -> None: - self._previous_run_output_ids = previous_run_output_ids - - @final - def do_incremental_evaluate( - self, - example: Example[Input, ExpectedOutput], - outputs: list[SuccessfulExampleOutput[Output]], - already_evaluated_outputs: list[list[SuccessfulExampleOutput[Output]]], - ) -> Matches: - pairs = combinations(outputs, 2) - unique_pre_evaluated_runs: set[str] = set() - - for 
pre_run_output in already_evaluated_outputs: - for current_output in pre_run_output: - unique_pre_evaluated_runs.add(current_output.run_id) - - return Matches( - comparison_evaluations=[ - ComparisonEvaluation( - first_player=player_a.run_id, - second_player=player_b.run_id, - outcome=self.grade(player_a, player_b, example), - ) - for [player_a, player_b] in pairs - if unique_pre_evaluated_runs is None - or len(unique_pre_evaluated_runs) == 0 - or not ( - player_a.run_id in unique_pre_evaluated_runs - and player_b.run_id in unique_pre_evaluated_runs - ) - ] - ) - - @abstractmethod - def grade( - self, - first: SuccessfulExampleOutput[Output], - second: SuccessfulExampleOutput[Output], - example: Example[Input, ExpectedOutput], - ) -> MatchOutcome: - """Returns a :class: `MatchOutcome`for the provided two contestants on the given example. - Defines the use case specific logic how to determine the winner of the two provided outputs. - - - Args: - first: Instance of :class: `SuccessfulExampleOutut[Output]` of the first contestant in the comparison - second: Instance of :class: `SuccessfulExampleOutut[Output]` of the second contestant in the comparison - example: Datapoint of :class: `Example` on which the two outputs were generated - - Return: - Instance of :class: `MatchOutcome` - """ - pass diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py index aaba0f9d1..cf24f55c1 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_evaluator.py @@ -1,5 +1,9 @@ from abc import abstractmethod -from typing import Optional +from enum import Enum +from itertools import combinations +from typing import Optional, Sequence + +from pydantic import BaseModel from intelligence_layer.core import Input, Output from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository @@ -170,3 +174,109 @@ def evaluate_runs( return super().evaluate_runs( *run_ids, num_examples=num_examples, abort_on_error=abort_on_error ) + + +class MatchOutcome(str, Enum): + A_WINS = "a_wins" + DRAW = "draw" + B_WINS = "b_wins" + + @property + def payoff(self) -> tuple[float, float]: + if self == self.A_WINS: + return (1, 0) + if self == self.DRAW: + return (0.5, 0.5) + return (0, 1) + + @staticmethod + def from_rank_literal(rank: int) -> "MatchOutcome": + match rank: + case 1: + return MatchOutcome.A_WINS + case 2: + return MatchOutcome.B_WINS + case 3: + return MatchOutcome.DRAW + case _: + raise ValueError(f"Got unexpected rank {rank}") + + +class ComparisonEvaluation(BaseModel): + first_player: str + second_player: str + outcome: MatchOutcome + + +class Matches(BaseModel): + comparison_evaluations: Sequence[ComparisonEvaluation] + + +class EloGradingInput(BaseModel): + instruction: str + first_completion: str + second_completion: str + + +class EloEvaluationLogic( + IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Matches] +): + def __init__(self) -> None: + super().__init__() + self._previous_run_output_ids: list[set[str]] = [] + + def set_previous_run_output_ids( + self, previous_run_output_ids: list[set[str]] + ) -> None: + self._previous_run_output_ids = previous_run_output_ids + + def do_incremental_evaluate( + self, + example: Example[Input, ExpectedOutput], + outputs: list[SuccessfulExampleOutput[Output]], + already_evaluated_outputs: 
list[list[SuccessfulExampleOutput[Output]]],
+    ) -> Matches:
+        pairs = combinations(outputs, 2)
+        unique_pre_evaluated_runs: set[str] = set()
+
+        for pre_run_output in already_evaluated_outputs:
+            for current_output in pre_run_output:
+                unique_pre_evaluated_runs.add(current_output.run_id)
+
+        return Matches(
+            comparison_evaluations=[
+                ComparisonEvaluation(
+                    first_player=player_a.run_id,
+                    second_player=player_b.run_id,
+                    outcome=self.grade(player_a, player_b, example),
+                )
+                for [player_a, player_b] in pairs
+                if unique_pre_evaluated_runs is None
+                or len(unique_pre_evaluated_runs) == 0
+                or not (
+                    player_a.run_id in unique_pre_evaluated_runs
+                    and player_b.run_id in unique_pre_evaluated_runs
+                )
+            ]
+        )
+
+    @abstractmethod
+    def grade(
+        self,
+        first: SuccessfulExampleOutput[Output],
+        second: SuccessfulExampleOutput[Output],
+        example: Example[Input, ExpectedOutput],
+    ) -> MatchOutcome:
+        """Returns a :class:`MatchOutcome` for the two provided contestants on the given example.
+
+        Defines the use-case-specific logic for determining the winner of the two provided outputs.
+
+        Args:
+            first: Instance of :class:`SuccessfulExampleOutput[Output]` of the first contestant in the comparison.
+            second: Instance of :class:`SuccessfulExampleOutput[Output]` of the second contestant in the comparison.
+            example: Datapoint of :class:`Example` on which the two outputs were generated.
+
+        Returns:
+            Instance of :class:`MatchOutcome`.
+        """
+        pass
diff --git a/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py b/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py
index 081ab2561..f6c3335a8 100644
--- a/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py
+++ b/src/intelligence_layer/examples/qa/elo_qa_evaluation_logic.py
@@ -7,11 +7,10 @@
 from intelligence_layer.core.detect_language import Language
 from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
 from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer
+from intelligence_layer.evaluation import EloEvaluationLogic, MatchOutcome
 from intelligence_layer.evaluation.dataset.domain import Example
-from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
-    EloEvaluationLogic,
+from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
     EloGradingInput,
-    MatchOutcome,
 )
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 from intelligence_layer.examples.qa.single_chunk_qa import (
diff --git a/tests/evaluation/test_elo_calculator.py b/tests/evaluation/test_elo_calculator.py
index a045208ab..1b52b402c 100644
--- a/tests/evaluation/test_elo_calculator.py
+++ b/tests/evaluation/test_elo_calculator.py
@@ -5,7 +5,7 @@
 from pytest import fixture
 
 from intelligence_layer.evaluation import EloCalculator, MatchOutcome, WinRateCalculator
-from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
     ComparisonEvaluation,
 )
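
For readers of this change: beyond the `DummyEloEvaluationLogic` used in the notebook, a use-case-specific grading logic built on the relocated `EloEvaluationLogic` could look like the minimal sketch below. It only relies on names visible in this diff (`EloEvaluationLogic`, `MatchOutcome`, `Example`, `SuccessfulExampleOutput`); the class name and the "shorter answer wins" rule are made up purely for illustration and are not part of the library.

```python
from intelligence_layer.evaluation import EloEvaluationLogic, MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class ShorterAnswerWinsEloLogic(EloEvaluationLogic[str, str, str]):
    """Hypothetical grading logic: the more concise of two outputs wins."""

    def grade(
        self,
        first: SuccessfulExampleOutput[str],
        second: SuccessfulExampleOutput[str],
        example: Example[str, str],
    ) -> MatchOutcome:
        # Equal lengths are treated as a draw; otherwise the shorter output wins.
        if len(first.output) == len(second.output):
            return MatchOutcome.DRAW
        return (
            MatchOutcome.A_WINS
            if len(first.output) < len(second.output)
            else MatchOutcome.B_WINS
        )
```

Because `EloEvaluationLogic` already provides `do_incremental_evaluate`, only `grade` needs to be implemented; such a class can be passed to the `IncrementalEvaluator` exactly as `DummyEloEvaluationLogic` is in the notebook.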
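
Likewise, a rough sketch of how the `Matches` returned by `do_incremental_evaluate` might be consumed downstream. The run ids and the hand-rolled win-rate tally are illustrative assumptions only; the library's own aggregation path is the logic in `elo_aggregation.py` built on `EloCalculator`/`WinRateCalculator`.

```python
from collections import defaultdict

from intelligence_layer.evaluation import ComparisonEvaluation, Matches, MatchOutcome

# Hypothetical match results between two runs with made-up ids.
matches = Matches(
    comparison_evaluations=[
        ComparisonEvaluation(
            first_player="run-a", second_player="run-b", outcome=MatchOutcome.A_WINS
        ),
        ComparisonEvaluation(
            first_player="run-a", second_player="run-b", outcome=MatchOutcome.DRAW
        ),
    ]
)

points: dict[str, float] = defaultdict(float)
games: dict[str, int] = defaultdict(int)
for comparison in matches.comparison_evaluations:
    # MatchOutcome.payoff yields (1, 0), (0.5, 0.5) or (0, 1).
    payoff_a, payoff_b = comparison.outcome.payoff
    points[comparison.first_player] += payoff_a
    points[comparison.second_player] += payoff_b
    games[comparison.first_player] += 1
    games[comparison.second_player] += 1

win_rates = {run_id: points[run_id] / games[run_id] for run_id in points}
print(win_rates)  # {'run-a': 0.75, 'run-b': 0.25}
```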