feat: Move calculate_winners to EloGrader
TASK: IL-394
SebastianNiehusAA committed May 7, 2024
1 parent 6ec8918 commit e1b0a35
Showing 2 changed files with 64 additions and 58 deletions.
@@ -1,9 +1,10 @@
import math
from abc import abstractmethod
from typing import Generic, Sequence
from typing import Generic, Mapping, Sequence

from pydantic import BaseModel

from intelligence_layer.core import NoOpTracer, Task, Tracer
from intelligence_layer.core import CompleteOutput, NoOpTracer, Task, Tracer
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
@@ -34,6 +35,11 @@ class EloGrader(
ExpectedOutput,
],
):
VALUES = [
" A",
" B",
] # The space before the A and B is important due to tokenization
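# Note: BPE-style vocabularies typically encode " A" and "A" as distinct
# tokens, so the lookup keys used in calculate_winners must match the token
# the model actually generates after the prompt.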

def __init__(self, tracer: Tracer = NoOpTracer()):
self.tracer = tracer

@@ -54,3 +60,34 @@ def run_grader(
example: Example[Input, ExpectedOutput], # TODO Generalize away from Llama
) -> Match:
pass

def calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
default_log_prob = float("-inf")

def get_normalized_prob(
log_prob_list: Sequence[Mapping[str, float | None]] | None,
) -> float:
assert log_prob_list is not None
log_probs = log_prob_list[0]
values = [
math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
for key in self.VALUES
]
if all(v == 0 for v in values):
raise ValueError(
f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
)
return values[0] / sum(values)

def categorize_value(value: float) -> MatchOutcome:
if value > 0.7:
return MatchOutcome.A_WINS
elif 0.3 > value:
return MatchOutcome.B_WINS
else:
return MatchOutcome.DRAW

normalized_probability = get_normalized_prob(
complete_output.completions[0].log_probs
)
return categorize_value(normalized_probability)
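
For illustration, a minimal sketch of how calculate_winners maps raw token logprobs to a MatchOutcome, using made-up logprob values in the same mapping shape that the code above reads:

import math

log_probs = {" A": math.log(0.6), " B": math.log(0.2)}  # hypothetical values
values = [math.exp(log_probs.get(key, float("-inf"))) for key in [" A", " B"]]
normalized_a = values[0] / sum(values)  # 0.6 / (0.6 + 0.2) = 0.75
# 0.75 > 0.7, so this example would be categorized as MatchOutcome.A_WINS.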
@@ -1,7 +1,14 @@
from aleph_alpha_client import Prompt
from liquid import Template

from intelligence_layer.core import TaskSpan
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.model import ControlModel, Llama2InstructModel
from intelligence_layer.core.model import (
CompleteInput,
ControlModel,
Llama2InstructModel,
)
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
EloGrader,
@@ -38,10 +45,6 @@ class EloQaGrader(
Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
Response: Answer """
VALUES = [
" A",
" B",
] # The space before the A and B is important due to tokenization

def __init__(self, model: ControlModel = Llama2InstructModel()):
super().__init__()
@@ -68,18 +71,23 @@ def _create_grading_input(
),
)

# def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
# text = self.INPUT_TEMPLATE.format(
# instruction=input.instruction,
# first_completion=input.first_completion,
# second_completion=input.second_completion,
# )
#
# complete_input = self._create_complete_input(Prompt.from_text(text))
# complete_output = self._model.complete_task().run(complete_input, task_span)
#
# return self._calculate_winners(complete_output)
#
def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
text = self.INPUT_TEMPLATE.format(
instruction=input.instruction,
first_completion=input.first_completion,
second_completion=input.second_completion,
)

complete_input = CompleteInput(
prompt=Prompt.from_text(text),
maximum_tokens=1,
log_probs=3,
disable_optimizations=True,
)
complete_output = self._model.complete_task().run(complete_input, task_span)

return self.calculate_winners(complete_output)
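
As a hedged illustration of the design choice above (logprob numbers are made up): maximum_tokens=1 restricts the completion to the single expected answer token, and log_probs=3 requests logprobs for the top 3 candidate tokens, which is exactly the mapping that calculate_winners consumes.

# Hypothetical shape of complete_output.completions[0].log_probs:
#   [{" A": -0.29, " B": -1.61, " the": -5.3}]
# calculate_winners normalizes the " A" / " B" entries and applies the
# 0.7 / 0.3 thresholds defined on EloGrader.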

def run_grader(
self,
first: SuccessfulExampleOutput[SingleChunkQaOutput],
@@ -100,42 +108,3 @@ def run_grader(
)

#
# def _create_complete_input(self, prompt: Prompt) -> CompleteInput:
# return CompleteInput(
# prompt=prompt,
# maximum_tokens=1,
# log_probs=3,
# disable_optimizations=True,
# )
#
# def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
# default_log_prob = float("-inf")
#
# def get_normalized_prob(
# log_prob_list: Sequence[Mapping[str, float | None]] | None,
# ) -> float:
# assert log_prob_list is not None
# log_probs = log_prob_list[0]
# values = [
# math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
# for key in self.VALUES
# ]
# if all(v == 0 for v in values):
# raise ValueError(
# f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
# )
# normalized_A_prob = values[0] / sum(values)
# return normalized_A_prob
#
# def categorize_value(value: float) -> MatchOutcome:
# if value > 0.7:
# return MatchOutcome.A_WINS
# elif 0.3 > value:
# return MatchOutcome.B_WINS
# else:
# return MatchOutcome.DRAW
#
# normalized_probability = get_normalized_prob(
# complete_output.completions[0].log_probs
# )
# return categorize_value(normalized_probability)
