Separate Elo aggregation in EloAggregationAdapter
JohannesWesch committed May 15, 2024
1 parent 0728eac commit c306a29
Showing 7 changed files with 100 additions and 81 deletions.
2 changes: 1 addition & 1 deletion src/intelligence_layer/evaluation/__init__.py
@@ -7,7 +7,7 @@
from .aggregation.domain import AggregatedEvaluation as AggregatedEvaluation
from .aggregation.domain import AggregationOverview as AggregationOverview
from .aggregation.elo_aggregation import (
-    ComparisonAggregationLogic as ComparisonAggregationLogic,
+    ComparisonEvaluationAggregationLogic as ComparisonEvaluationAggregationLogic,
)
from .aggregation.elo_aggregation import EloCalculator as EloCalculator
from .aggregation.elo_aggregation import WinRateCalculator as WinRateCalculator
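The only change here is the re-exported name, so downstream imports shift accordingly. A minimal sketch of the new import path, assuming the re-export above:

from intelligence_layer.evaluation import ComparisonEvaluationAggregationLogic

aggregation_logic = ComparisonEvaluationAggregationLogic()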
138 changes: 77 additions & 61 deletions src/intelligence_layer/evaluation/aggregation/elo_aggregation.py
@@ -8,6 +8,58 @@
from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator
from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches


class PlayerScore(BaseModel):
elo: float
elo_standard_error: float
win_rate: float
num_matches: int


class AggregatedComparison(BaseModel):
scores: Mapping[str, PlayerScore]


class EloAggregationAdapter:
@staticmethod
def aggregate(evaluations: Iterable[ComparisonEvaluation]) -> AggregatedComparison:
evaluations = list(evaluations)
player_counter = Counter(
player
for comparison_evaluation in evaluations
for player in [
comparison_evaluation.first_player,
comparison_evaluation.second_player,
]
)

player_counts = dict(player_counter)
players = player_counts.keys()

accumulators = {p: MeanAccumulator() for p in players}
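        # Elo ratings depend on the order in which matches are processed, so we
        # average over 100 shuffled passes; the MeanAccumulator then also yields
        # a standard error for each player's rating.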
for _ in range(100):
elo_calc = EloCalculator(players)
random.shuffle(evaluations)
elo_calc.calculate(evaluations)
for p in players:
accumulators[p].add(elo_calc.ratings[p])

win_rate_calc = WinRateCalculator(players)
win_rate = win_rate_calc.calculate(evaluations)

return AggregatedComparison(
scores={
p: PlayerScore(
elo=acc.extract(),
elo_standard_error=acc.standard_error(),
win_rate=win_rate[p],
num_matches=player_counts[p],
)
for p, acc in accumulators.items()
},
)


class EloCalculator:
@@ -48,85 +100,49 @@ def _calc_difs(
actual_b - expected_win_rate_b
)

-    def calculate(self, matches: Sequence[tuple[str, str, MatchOutcome]]) -> None:
-        for a, b, o in matches:
-            dif_a, dif_b = self._calc_difs(o, a, b)
-            self.ratings[a] += dif_a
-            self.ratings[b] += dif_b
-            self._match_counts[a] += 1
-            self._match_counts[b] += 1
+    def calculate(self, matches: Sequence[ComparisonEvaluation]) -> None:
+        for match in matches:
+            dif_a, dif_b = self._calc_difs(
+                match.outcome, match.first_player, match.second_player
+            )
+            self.ratings[match.first_player] += dif_a
+            self.ratings[match.second_player] += dif_b
+            self._match_counts[match.first_player] += 1
+            self._match_counts[match.second_player] += 1


class WinRateCalculator:
def __init__(self, players: Iterable[str]) -> None:
self.match_count: dict[str, int] = {p: 0 for p in players}
self.win_count: dict[str, float] = {p: 0 for p in players}

-    def calculate(
-        self, matches: Sequence[tuple[str, str, MatchOutcome]]
-    ) -> Mapping[str, float]:
-        for a, b, o in matches:
-            self.match_count[a] += 1
-            self.match_count[b] += 1
-            self.win_count[a] += o.payoff[0]
-            self.win_count[b] += o.payoff[1]
+    def calculate(self, matches: Sequence[ComparisonEvaluation]) -> Mapping[str, float]:
+        for match in matches:
+            self.match_count[match.first_player] += 1
+            self.match_count[match.second_player] += 1
+            self.win_count[match.first_player] += match.outcome.payoff[0]
+            self.win_count[match.second_player] += match.outcome.payoff[1]

return {
player: self.win_count[player] / match_count
for player, match_count in self.match_count.items()
}


-class PlayerScore(BaseModel):
-    elo: float
-    elo_standard_error: float
-    win_rate: float
-    num_matches: int
-
-
-class AggregatedComparison(BaseModel):
-    scores: Mapping[str, PlayerScore]


-class ComparisonAggregationLogic(
+class ComparisonEvaluationAggregationLogic(
AggregationLogic[ComparisonEvaluation, AggregatedComparison]
):
def aggregate(
self, evaluations: Iterable[ComparisonEvaluation]
) -> AggregatedComparison:
-        flattened_evaluations = [
-            (
-                evaluation.first_player,
-                evaluation.second_player,
-                evaluation.outcome,
-            )
-            for evaluation in evaluations
-        ]
-        player_counter = Counter(
-            player for match in flattened_evaluations for player in [match[0], match[1]]
-        )
-        player_counts = dict(player_counter)
-        players = player_counts.keys()
-
-        accumulators = {p: MeanAccumulator() for p in players}
-        for _ in range(100):
-            elo_calc = EloCalculator(players)
-            random.shuffle(flattened_evaluations)
-            elo_calc.calculate(flattened_evaluations)
-            for p in players:
-                accumulators[p].add(elo_calc.ratings[p])
-
-        win_rate_calc = WinRateCalculator(players)
-        win_rate = win_rate_calc.calculate(flattened_evaluations)
-
-        return AggregatedComparison(
-            scores={
-                p: PlayerScore(
-                    elo=acc.extract(),
-                    elo_standard_error=acc.standard_error(),
-                    win_rate=win_rate[p],
-                    num_matches=player_counts[p],
-                )
-                for p, acc in accumulators.items()
-            },
-        )
+        return EloAggregationAdapter.aggregate(evaluations)
+
+
+class MatchesAggregationLogic(AggregationLogic[Matches, AggregatedComparison]):
+    def aggregate(self, evaluations: Iterable[Matches]) -> AggregatedComparison:
+        flattened_matches = [
+            comparison_evaluation
+            for match in evaluations
+            for comparison_evaluation in match.comparison_evaluations
+        ]
+        return EloAggregationAdapter.aggregate(flattened_matches)
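For orientation, a minimal usage sketch of the extracted adapter — the run ids and outcomes are made up for illustration; ComparisonEvaluation and MatchOutcome are imported exactly as at the top of this module:

from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
from intelligence_layer.evaluation.aggregation.elo_aggregation import (
    EloAggregationAdapter,
)

evaluations = [
    ComparisonEvaluation(
        first_player="run-a", second_player="run-b", outcome=MatchOutcome.A_WINS
    ),
    ComparisonEvaluation(
        first_player="run-a", second_player="run-b", outcome=MatchOutcome.B_WINS
    ),
]
aggregated = EloAggregationAdapter.aggregate(evaluations)
print(aggregated.scores["run-a"].elo, aggregated.scores["run-a"].win_rate)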
4 changes: 2 additions & 2 deletions src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py
@@ -44,7 +44,7 @@ class ComparisonEvaluation(BaseModel):


class Matches(BaseModel):
-    matches: Sequence[ComparisonEvaluation]
+    comparison_evaluations: Sequence[ComparisonEvaluation]


class EloGradingInput(BaseModel):
@@ -70,7 +70,7 @@ def do_evaluate(
) -> Matches:
pairs = combinations(output, 2)
return Matches(
-        matches=[
+        comparison_evaluations=[
ComparisonEvaluation(
first_player=player_a.run_id,
second_player=player_b.run_id,
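Since Matches now exposes comparison_evaluations, consumers flatten it through the new MatchesAggregationLogic. A short sketch under the same assumptions as above (hypothetical run ids):

from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
from intelligence_layer.evaluation.aggregation.elo_aggregation import (
    MatchesAggregationLogic,
)
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches

matches = Matches(
    comparison_evaluations=[
        ComparisonEvaluation(
            first_player="run-a", second_player="run-b", outcome=MatchOutcome.A_WINS
        )
    ]
)
aggregated = MatchesAggregationLogic().aggregate([matches])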
4 changes: 2 additions & 2 deletions tests/evaluation/test_argilla_evaluator.py
@@ -16,8 +16,8 @@
ArgillaEvaluationLogic,
ArgillaEvaluator,
AsyncInMemoryEvaluationRepository,
-    ComparisonAggregationLogic,
    ComparisonEvaluation,
+    ComparisonEvaluationAggregationLogic,
DatasetRepository,
Example,
InMemoryDatasetRepository,
@@ -327,7 +327,7 @@ def test_argilla_evaluator_abort_on_error_works(


def test_argilla_aggregation_logic_works() -> None:
-    argilla_aggregation_logic = ComparisonAggregationLogic()
+    argilla_aggregation_logic = ComparisonEvaluationAggregationLogic()
evaluations = (
ComparisonEvaluation(
first_player="player_1",
7 changes: 5 additions & 2 deletions tests/evaluation/test_elo_calculator.py
@@ -5,6 +5,9 @@
from pytest import fixture

from intelligence_layer.evaluation import EloCalculator, MatchOutcome, WinRateCalculator
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
ComparisonEvaluation,
)


@fixture
@@ -33,7 +36,7 @@ def test_match_outcome_serializes() -> None:


def test_elo_calculator_works(
-    players: Sequence[str], matches: Sequence[tuple[str, str, MatchOutcome]]
+    players: Sequence[str], matches: Sequence[ComparisonEvaluation]
) -> None:
elo_calculator = EloCalculator(players)
elo_calculator.calculate(matches)
@@ -52,7 +55,7 @@ def test_elo_calculator_works(


def test_win_rate_calculator_works(
-    players: Sequence[str], matches: Sequence[tuple[str, str, MatchOutcome]]
+    players: Sequence[str], matches: Sequence[ComparisonEvaluation]
) -> None:
win_rate_calculator = WinRateCalculator(players)
scores = win_rate_calculator.calculate(matches)
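The matches fixture itself lies outside this hunk; under the new signature it would plausibly be built from ComparisonEvaluation objects, e.g. (a hypothetical reconstruction, not the actual fixture):

from itertools import combinations

@fixture
def matches(players: Sequence[str]) -> Sequence[ComparisonEvaluation]:
    return [
        ComparisonEvaluation(
            first_player=player_a, second_player=player_b, outcome=MatchOutcome.A_WINS
        )
        for player_a, player_b in combinations(players, 2)
    ]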
2 changes: 1 addition & 1 deletion tests/evaluation/test_elo_evaluator.py
@@ -183,7 +183,7 @@ def test_evaluate_runs_creates_correct_matches_for_elo_qa_eval(
0
].evaluation.result
assert isinstance(eval_result, Matches)
-    matches = eval_result.matches
+    matches = eval_result.comparison_evaluations

for match in matches:
assert isinstance(match, ComparisonEvaluation)
24 changes: 12 additions & 12 deletions tests/evaluation/test_instruct_comparison_argilla_evaluator.py
@@ -19,8 +19,8 @@
Aggregator,
ArgillaEvaluator,
AsyncInMemoryEvaluationRepository,
-    ComparisonAggregationLogic,
    ComparisonEvaluation,
+    ComparisonEvaluationAggregationLogic,
EloCalculator,
Example,
ExampleOutput,
@@ -109,8 +109,8 @@ def any_instruct_output() -> CompleteOutput:


@fixture
-def argilla_aggregation_logic() -> ComparisonAggregationLogic:
-    return ComparisonAggregationLogic()
+def argilla_aggregation_logic() -> ComparisonEvaluationAggregationLogic:
+    return ComparisonEvaluationAggregationLogic()


def create_dummy_dataset(
@@ -165,7 +165,7 @@ def test_evaluate_run_submits_pairwise_comparison_records(
in_memory_run_repository: InMemoryRunRepository,
async_in_memory_evaluation_repository: AsyncInMemoryEvaluationRepository,
in_memory_aggregation_repository: InMemoryAggregationRepository,
-    argilla_aggregation_logic: ComparisonAggregationLogic,
+    argilla_aggregation_logic: ComparisonEvaluationAggregationLogic,
any_instruct_output: CompleteOutput,
argilla_fake: ArgillaFake,
) -> None:
@@ -244,10 +244,10 @@ def test_elo_calculating_works_as_expected() -> None:
player1 = "player1"
player2 = "player2"
matches = [
-        (
-            player1,
-            player2,
-            MatchOutcome.A_WINS,
+        ComparisonEvaluation(
+            first_player=player1,
+            second_player=player2,
+            outcome=MatchOutcome.A_WINS,
)
for _ in range(10)
]
@@ -258,10 +258,10 @@
assert elo.ratings[player2] < 1500

comeback_matches = [
-        (
-            player1,
-            player2,
-            MatchOutcome.B_WINS,
+        ComparisonEvaluation(
+            first_player=player1,
+            second_player=player2,
+            outcome=MatchOutcome.B_WINS,
)
for i in range(10)
]
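Read together with the assertions above, the intended behavior: repeated A_WINS results push player1 above the 1500 base rating, and a win-rate pass over the same matches should give player1 a rate of 1.0, assuming MatchOutcome.A_WINS pays off as (1, 0) — a condensed sketch:

matches = [
    ComparisonEvaluation(
        first_player="player1", second_player="player2", outcome=MatchOutcome.A_WINS
    )
    for _ in range(10)
]
elo = EloCalculator(["player1", "player2"])
elo.calculate(matches)
assert elo.ratings["player1"] > 1500  # winner climbs above the base rating
assert elo.ratings["player2"] < 1500

win_rates = WinRateCalculator(["player1", "player2"]).calculate(matches)
assert win_rates["player1"] == 1.0  # assumes A_WINS payoff is (1, 0)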
