WIP: Start refactoring based on new (Argilla) Evaluators
Task: IL-394
SebastianNiehusAA committed May 15, 2024
1 parent a0e714e commit 0728eac
Showing 8 changed files with 67 additions and 72 deletions.
16 changes: 11 additions & 5 deletions src/intelligence_layer/evaluation/__init__.py
@@ -6,11 +6,11 @@
from .aggregation.aggregator import Aggregator as Aggregator
from .aggregation.domain import AggregatedEvaluation as AggregatedEvaluation
from .aggregation.domain import AggregationOverview as AggregationOverview
from .aggregation.elo import ComparisonAggregationLogic as ComparisonAggregationLogic
from .aggregation.elo import ComparisonEvaluation as ComparisonEvaluation
from .aggregation.elo import EloCalculator as EloCalculator
from .aggregation.elo import MatchOutcome as MatchOutcome
from .aggregation.elo import WinRateCalculator as WinRateCalculator
from .aggregation.elo_aggregation import (
ComparisonAggregationLogic as ComparisonAggregationLogic,
)
from .aggregation.elo_aggregation import EloCalculator as EloCalculator
from .aggregation.elo_aggregation import WinRateCalculator as WinRateCalculator
from .aggregation.file_aggregation_repository import (
FileAggregationRepository as FileAggregationRepository,
)
@@ -60,6 +60,12 @@
from .evaluation.evaluator.async_evaluator import (
AsyncEvaluationRepository as AsyncEvaluationRepository,
)
from .evaluation.evaluator.elo_evaluator import (
ComparisonEvaluation as ComparisonEvaluation,
)
from .evaluation.evaluator.elo_evaluator import EloEvaluationLogic as EloEvaluationLogic
from .evaluation.evaluator.elo_evaluator import Matches as Matches
from .evaluation.evaluator.elo_evaluator import MatchOutcome as MatchOutcome
from .evaluation.evaluator.evaluator import EvaluationLogic as EvaluationLogic
from .evaluation.evaluator.evaluator import Evaluator as Evaluator
from .evaluation.evaluator.evaluator import (
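With the exports above in place, the relocated Elo types all resolve from the package root. A minimal sketch of the new import surface after this commit; the comments note which submodule now backs each name, per the diff above:

from intelligence_layer.evaluation import (
    ComparisonAggregationLogic,  # from .aggregation.elo_aggregation
    EloCalculator,               # from .aggregation.elo_aggregation
    WinRateCalculator,           # from .aggregation.elo_aggregation
    ComparisonEvaluation,        # from .evaluation.evaluator.elo_evaluator
    EloEvaluationLogic,          # from .evaluation.evaluator.elo_evaluator
    Matches,                     # from .evaluation.evaluator.elo_evaluator
    MatchOutcome,                # from .evaluation.evaluator.elo_evaluator
)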
@@ -1,41 +1,15 @@
import random
from collections import Counter, defaultdict
from enum import Enum
from typing import Iterable, Mapping, Sequence

import numpy as np
from pydantic import BaseModel

from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic


class MatchOutcome(str, Enum):
A_WINS = "a_wins"
DRAW = "draw"
B_WINS = "b_wins"

@property
def payoff(self) -> tuple[float, float]:
if self == self.A_WINS:
return (1, 0)
if self == self.DRAW:
return (0.5, 0.5)
return (0, 1)

@staticmethod
def from_rank_literal(rank: int) -> "MatchOutcome":
match rank:
case 1:
return MatchOutcome.A_WINS
case 2:
return MatchOutcome.B_WINS
case 3:
return MatchOutcome.DRAW
case _:
raise ValueError(f"Got unexpected rank {rank}")


class EloCalculator:
def __init__(
self,
@@ -114,12 +88,6 @@ class AggregatedComparison(BaseModel):
scores: Mapping[str, PlayerScore]


class ComparisonEvaluation(BaseModel):
first: str
second: str
winner: MatchOutcome


class ComparisonAggregationLogic(
AggregationLogic[ComparisonEvaluation, AggregatedComparison]
):
@@ -128,9 +96,9 @@ def aggregate(
) -> AggregatedComparison:
flattened_evaluations = [
(
evaluation.first,
evaluation.second,
evaluation.winner,
evaluation.first_player,
evaluation.second_player,
evaluation.outcome,
)
for evaluation in evaluations
]
Empty file.
@@ -14,10 +14,7 @@
RecordData,
)
from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output
from intelligence_layer.evaluation.aggregation.elo import (
ComparisonEvaluation,
MatchOutcome,
)
from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.domain import (
@@ -303,9 +300,9 @@ def from_record(
self, argilla_evaluation: ArgillaEvaluation
) -> ComparisonEvaluation:
return ComparisonEvaluation(
first=argilla_evaluation.metadata["first"],
second=argilla_evaluation.metadata["second"],
winner=MatchOutcome.from_rank_literal(
first_player=argilla_evaluation.metadata["first"],
second_player=argilla_evaluation.metadata["second"],
outcome=MatchOutcome.from_rank_literal(
int(argilla_evaluation.responses["winner"])
),
)
@@ -1,24 +1,50 @@
from abc import abstractmethod
from enum import Enum
from itertools import combinations
from typing import Sequence, final

from pydantic import BaseModel

from intelligence_layer.core import Input, Output
from intelligence_layer.evaluation import EvaluationLogic
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class Match(BaseModel):
player_a: str
player_b: str
class MatchOutcome(str, Enum):
A_WINS = "a_wins"
DRAW = "draw"
B_WINS = "b_wins"

@property
def payoff(self) -> tuple[float, float]:
if self == self.A_WINS:
return (1, 0)
if self == self.DRAW:
return (0.5, 0.5)
return (0, 1)

@staticmethod
def from_rank_literal(rank: int) -> "MatchOutcome":
match rank:
case 1:
return MatchOutcome.A_WINS
case 2:
return MatchOutcome.B_WINS
case 3:
return MatchOutcome.DRAW
case _:
raise ValueError(f"Got unexpected rank {rank}")


class ComparisonEvaluation(BaseModel):
first_player: str
second_player: str
outcome: MatchOutcome


class Matches(BaseModel):
matches: Sequence[Match]
matches: Sequence[ComparisonEvaluation]


class EloGradingInput(BaseModel):
@@ -45,12 +71,12 @@ def do_evaluate(
pairs = combinations(output, 2)
return Matches(
matches=[
Match(
player_a=first.run_id,
player_b=second.run_id,
outcome=self.grade(first, second, example),
ComparisonEvaluation(
first_player=player_a.run_id,
second_player=player_b.run_id,
outcome=self.grade(player_a, player_b, example),
)
for [first, second] in pairs
for [player_a, player_b] in pairs
]
)
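For reference, a quick sketch of the renamed match model in use. The run IDs are placeholders (real ones come from SuccessfulExampleOutput.run_id, as in do_evaluate above), and the import path assumes the root exports added to __init__.py in this commit:

from intelligence_layer.evaluation import ComparisonEvaluation, Matches, MatchOutcome

# "run-a" and "run-b" are illustrative run IDs, not values from this repository.
evaluation = ComparisonEvaluation(
    first_player="run-a",
    second_player="run-b",
    outcome=MatchOutcome.from_rank_literal(1),  # rank 1 maps to A_WINS
)
assert evaluation.outcome.payoff == (1, 0)

matches = Matches(matches=[evaluation])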

@@ -7,9 +7,9 @@
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.evaluation.elo_evaluator import (
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
EloEvaluationLogic,
EloGradingInput,
)
6 changes: 3 additions & 3 deletions tests/evaluation/test_argilla_evaluator.py
@@ -330,9 +330,9 @@ def test_argilla_aggregation_logic_works() -> None:
argilla_aggregation_logic = ComparisonAggregationLogic()
evaluations = (
ComparisonEvaluation(
first="player_1",
second="player_2" if i < 9000 else "player_3",
winner=MatchOutcome.from_rank_literal(
first_player="player_1",
second_player="player_2" if i < 9000 else "player_3",
outcome=MatchOutcome.from_rank_literal(
random.choices([1, 2, 3], [0.5, 0.25, 0.25], k=1)[0]
),
)
16 changes: 7 additions & 9 deletions tests/evaluation/test_elo_evaluator.py
@@ -13,22 +13,20 @@
)
from intelligence_layer.core.tracer.tracer import NoOpTracer, Tracer
from intelligence_layer.evaluation import (
ComparisonEvaluation,
EloEvaluationLogic,
EvaluationLogic,
Evaluator,
Example,
ExampleOutput,
InMemoryDatasetRepository,
InMemoryEvaluationRepository,
InMemoryRunRepository,
Matches,
MatchOutcome,
RunOverview,
SuccessfulExampleOutput,
)
from intelligence_layer.evaluation.evaluation.elo_evaluator import (
EloEvaluationLogic,
Match,
Matches,
)
from intelligence_layer.evaluation.evaluation.evaluator import EvaluationLogic
from intelligence_layer.examples import SingleChunkQaInput, SingleChunkQaOutput

load_dotenv()
@@ -188,8 +186,8 @@ def test_evaluate_runs_creates_correct_matches_for_elo_qa_eval(
matches = eval_result.matches

for match in matches:
assert isinstance(match, Match)
if match.player_a < match.player_b:
assert isinstance(match, ComparisonEvaluation)
if match.first_player < match.second_player:
assert match.outcome == MatchOutcome.A_WINS
elif match.player_a > match.player_b:
elif match.first_player > match.second_player:
assert match.outcome == MatchOutcome.B_WINS
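Taken together, the renamed fields flow from the evaluators into ComparisonAggregationLogic. A minimal sketch, assuming aggregate() accepts an iterable of ComparisonEvaluation as the updated test_argilla_aggregation_logic_works suggests; player names are illustrative:

from intelligence_layer.evaluation import (
    ComparisonAggregationLogic,
    ComparisonEvaluation,
    MatchOutcome,
)

# Illustrative player names; aggregate() is assumed to take an iterable of
# ComparisonEvaluation, mirroring the updated Argilla aggregation test above.
evaluations = [
    ComparisonEvaluation(
        first_player="player_1",
        second_player="player_2",
        outcome=MatchOutcome.A_WINS,
    ),
    ComparisonEvaluation(
        first_player="player_1",
        second_player="player_3",
        outcome=MatchOutcome.DRAW,
    ),
]
aggregated = ComparisonAggregationLogic().aggregate(evaluations)
# aggregated.scores maps each player name to a PlayerScore.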
