feat: WIP add first part of more comprehensive elo test
TASK: IL-394
SebastianNiehusAA authored and MerlinKallenbornAA committed May 6, 2024
1 parent 16dde84 commit ecc8ad2
Showing 3 changed files with 447 additions and 54 deletions.
103 changes: 62 additions & 41 deletions src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
@@ -1,13 +1,19 @@
import random
from abc import abstractmethod
from itertools import combinations
from typing import Sequence

from pydantic import BaseModel

from intelligence_layer.core import Input, NoOpTracer, Output, Tracer
from intelligence_layer.core import Input, Language, NoOpTracer, Output, Task, Tracer
from intelligence_layer.evaluation import EvaluationLogic
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.llama_grader import (
LlamaGradingInput, LlamaGrader, LlamaQaGrader
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
from intelligence_layer.examples.qa.single_chunk_qa import QA_INSTRUCTIONS


class Match(BaseModel):
@@ -16,50 +22,65 @@ class Match(BaseModel):
outcome: MatchOutcome


class EloEvaluationLogic(EvaluationLogic[Input, Output, str, list[Match]]):
# def __init__(
# self,
# # client: Client,
# tracer: Tracer = NoOpTracer(),
# ):
# self._tracer = tracer
# # self._grader = Grader( ## TODO
# # LlamaControlModel(name="llama-2-70b-chat", client=client)
# # )
class Matches(BaseModel):
matches: Sequence[Match]


class EloEvaluationLogic(EvaluationLogic[Input, Output, ExpectedOutput, Matches]):
"""Evaluation logic for a pair-wise ELO comparison.
Args:
grader: The :class:`Task` that performs the grading, i.e. the actual comparison of two run outputs.
tracer: :class:`Tracer` for tracking and debugging.
"""

def __init__(
self,
grader: LlamaQaGrader,
tracer: Tracer = NoOpTracer(),
):
self.tracer = tracer
self.grader = grader



@abstractmethod
def do_evaluate(
self,
example: Example[Input, str],
*output: SuccessfulExampleOutput[Output],
) -> list[Match]:
# pairs = combinations(output, 2)
# return Matches(
# matches=[
# self._run_grader(first, second, example)
# for [first, second] in pairs
# if self._high_priority_runs is None ##TODO: Adapts to iterative elo class
# or len(self._high_priority_runs) == 0
# or first.run_id in self._high_priority_runs
# or second.run_id in self._high_priority_runs
# ]
# )
pass
*output: SuccessfulExampleOutput[str],
) -> Matches:
pairs = combinations(output, 2)
return Matches(
matches=[
self._run_grader(
first.run_id,
second.run_id,
self.grader.create_grading_input(first, second, example),
)
for [first, second] in pairs
]
)
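# Example (hypothetical run ids): with three successful outputs whose run ids are
# "run-a", "run-b" and "run-c", combinations(output, 2) yields the pairs
# ("run-a", "run-b"), ("run-a", "run-c") and ("run-b", "run-c"), so every pair of
# runs is graded exactly once per example.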


def _run_grader(
self,
first: SuccessfulExampleOutput[Output],
second: SuccessfulExampleOutput[Output],
example: Example[Input, str],
first_id: str,
second_id: str,
grading_input: LlamaGradingInput, # TODO Generalize away from Llama
) -> Match:
pass
# if random.choice([True, False]):
# first, second = second, first
#
#
#
# return Match(
# outcome='str',
# player_a=first.run_id,
# player_b=second.run_id,
# )
"""Compare two run outputs to each other and return a :class:`Match` that contains the result of the comparison.
Args:
first_id: `str`, id of the first run (player A) in the comparison.
second_id: `str`, id of the second run (player B) in the comparison.
grading_input: # TODO
Returns: :class:`Match` that contains the result of the comparison
"""

grading_output = self.grader.do_run(grading_input, self.tracer.task_span())

return Match(
outcome=grading_output,
player_a=first_id,
player_b=second_id,
)
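A minimal usage sketch (not part of the commit) of how the pieces above are meant to fit together; the model import path, the grader instruction string, and the example/run-output variables are assumptions for illustration, and the leftover @abstractmethod decorator on do_evaluate would have to be dropped before EloEvaluationLogic can be instantiated:

from intelligence_layer.core import LlamaControlModel, NoOpTracer

grader = LlamaQaGrader(
    model=LlamaControlModel(name="llama-2-70b-chat"),
    grader_instruction="Decide which answer is more correct.",
)
evaluation_logic = EloEvaluationLogic(grader=grader, tracer=NoOpTracer())

# `example` and the run outputs are placeholders; they would come from a dataset
# repository and two runs of the QA task under comparison.
matches = evaluation_logic.do_evaluate(example, first_run_output, second_run_output)
for match in matches.matches:
    print(match.player_a, match.player_b, match.outcome)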
@@ -0,0 +1,235 @@
from abc import abstractmethod
import math
from typing import Mapping, Optional, Sequence

from aleph_alpha_client import Prompt
from pydantic import BaseModel

from intelligence_layer.core import (
CompleteInput,
CompleteOutput,
ControlModel,
Task,
TaskSpan,
)
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
from liquid import Template

from intelligence_layer.examples.qa.single_chunk_qa import QA_INSTRUCTIONS

class LlamaGradingInput(BaseModel):
instruction: str
first_completion: str
second_completion: str


class LlamaGrader(Task[LlamaGradingInput, MatchOutcome]):
@abstractmethod
def create_grading_input(
self,
first: SuccessfulExampleOutput[Output],
second: SuccessfulExampleOutput[Output],
example: Optional[Example[Input, ExpectedOutput]],
) -> LlamaGradingInput:
# TODO: General GradingInputClass or similar. Match with Arg für _run_grader
# TODO: Move below code to llama/task grader for specific implementation
# no_result = "There is no result."
# grading_input = LlamaGradingInput(
# instruction=f"{example.input.chunk} {grader_instruction}",
# first_completion=(
# first.output.answer if first.output.answer is not None else no_result
# ),
# second_completion=(
# second.output.answer if second.output.answer is not None else no_result
# ),
# )
pass

@abstractmethod
def run_grader(
self,
first_id: str,
second_id: str,
grading_input: LlamaGradingInput, # TODO Generalize away from Llama
) -> "Match":  # Match lives in elo_evaluator.py; the string annotation avoids a circular import
pass


class LlamaQaGrader(LlamaGrader):
INPUT_TEMPLATE = """
Your task is to compare two answers to an instruction on one metric.
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
The Instruction for the answers was:{instruction}
Evaluation Procedure:
1. Read both answers carefully and identify the main facts and details they present.
2. Check if the answers contain any factual errors that are not supported by the instruction.
3. Evaluate which answer is more correct.
Answer A:{first_completion}
Answer B:{second_completion}
Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
Response: Answer """
VALUES = [
" A",
" B",
] # The space before the A and B is important due to tokenization

def __init__(self, model: ControlModel, grader_instruction: str):
super().__init__()
self._model = model
self._grader_instruction = grader_instruction

def do_run(self, input: LlamaGradingInput, task_span: TaskSpan) -> MatchOutcome:
text = self.INPUT_TEMPLATE.format(
instruction=input.instruction,
first_completion=input.first_completion,
second_completion=input.second_completion,
)

complete_input = self._create_complete_input(Prompt.from_text(text))
complete_output = self._model.complete_task().run(complete_input, task_span)

return self._calculate_winners(complete_output)

def create_grading_input(
self,
first: SuccessfulExampleOutput[Output],
second: SuccessfulExampleOutput[Output],
example: Optional[Example[Input, ExpectedOutput]],
) -> LlamaGradingInput:
qa_instruction = Template(
QA_INSTRUCTIONS[Language("en")].unformatted_instruction
).render(question=example.input.question)
# TODO: qa_instruction is rendered but not yet used in the instruction passed to the grader.

no_answer = "There is no answer."
grading_input = LlamaGradingInput(
instruction=f"{example.input.chunk} {self._grader_instruction}",
first_completion=(
first.output.answer if first.output.answer is not None else no_answer
),
second_completion=(
second.output.answer if second.output.answer is not None else no_answer
),
)
return grading_input


def _run_grader(
self,
first_id: str,
second_id: str,
grading_input: LlamaGradingInput, # TODO Generalize away from Llama
) -> "Match":  # Match lives in elo_evaluator.py; the string annotation avoids a circular import
"""Compare two run outputs to each other and return a :class:`Match` that contains the result of the comparison.
Args:
first_id: `str`, id of the first run (player A) in the comparison.
second_id: `str`, id of the second run (player B) in the comparison.
grading_input: # TODO
Returns: :class:`Match` that contains the result of the comparison
"""

# TODO: self.tracer is never set on this task; a tracer still has to be wired in here.
grading_output = self.do_run(grading_input, self.tracer.task_span())

return Match(
outcome=grading_output,
player_a=first_id,
player_b=second_id,
)

def _create_complete_input(self, prompt: Prompt) -> CompleteInput:
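# Only a single completion token is needed: the grader reads the returned log-probs
# for the candidate tokens " A" and " B" rather than the completion text itself.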
return CompleteInput(
prompt=prompt,
maximum_tokens=1,
log_probs=3,
disable_optimizations=True,
)

def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
default_log_prob = float("-inf")

def get_normalized_prob(
log_prob_list: Sequence[Mapping[str, float | None]] | None,
) -> float:
assert log_prob_list is not None
log_probs = log_prob_list[0]
values = [
math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
for key in self.VALUES
]
if all(v == 0 for v in values):
raise ValueError(
f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
)
normalized_A_prob = values[0] / sum(values)
return normalized_A_prob

def categorize_value(value: float) -> MatchOutcome:
if value > 0.7:
return MatchOutcome.A_WINS
elif 0.3 > value:
return MatchOutcome.B_WINS
else:
return MatchOutcome.DRAW

normalized_probability = get_normalized_prob(
complete_output.completions[0].log_probs
)
return categorize_value(normalized_probability)
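# Worked example for _calculate_winners (illustrative numbers only): if the sampled
# token's top log-probs contain {" A": -0.105, " B": -2.303}, then exp(-0.105) ~ 0.90
# and exp(-2.303) ~ 0.10, so the normalized probability of " A" is 0.90 / (0.90 + 0.10) = 0.90.
# Since 0.90 > 0.7 the outcome is MatchOutcome.A_WINS; below 0.3 it would be B_WINS,
# and anything in between counts as a DRAW.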