Commit
feat: WIP Fix generics and types, some refactoring
TASK: IL-394
SebastianNiehusAA committed May 7, 2024
1 parent 16dde84 commit 6ec8918
Showing 4 changed files with 401 additions and 115 deletions.
82 changes: 29 additions & 53 deletions src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
@@ -1,65 +1,41 @@
-from abc import abstractmethod
-from typing import Sequence
+from itertools import combinations
 
-from pydantic import BaseModel
-
 from intelligence_layer.core import Input, NoOpTracer, Output, Tracer
 from intelligence_layer.evaluation import EvaluationLogic
-from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
-from intelligence_layer.evaluation.dataset.domain import Example
+from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
+    EloGrader,
+    Matches,
+)
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 
 
-class Match(BaseModel):
-    player_a: str
-    player_b: str
-    outcome: MatchOutcome
-
-
-class EloEvaluationLogic(EvaluationLogic[Input, Output, str, list[Match]]):
-    # def __init__(
-    #     self,
-    #     # client: Client,
-    #     tracer: Tracer = NoOpTracer(),
-    # ):
-    #     self._tracer = tracer
-    #     # self._grader = Grader( ## TODO
-    #     #     LlamaControlModel(name="llama-2-70b-chat", client=client)
-    #     # )
+class EloEvaluationLogic(EvaluationLogic[Input, Output, ExpectedOutput, Matches]):
+    """Evaluation logic for a pair-wise ELO comparison.
+    Args:
+        grader: The :class:`Task` that perform the grading, i.e. the actual comparison of two run outputs.
+        tracer: :class:`Tracer` for tracking and debugging
+    """
 
-    @abstractmethod
-    def do_evaluate(
+    def __init__(
         self,
-        example: Example[Input, str],
-        *output: SuccessfulExampleOutput[Output],
-    ) -> list[Match]:
-        # pairs = combinations(output, 2)
-        # return Matches(
-        #     matches=[
-        #         self._run_grader(first, second, example)
-        #         for [first, second] in pairs
-        #         if self._high_priority_runs is None ##TODO: Adapts to iterative elo class
-        #         or len(self._high_priority_runs) == 0
-        #         or first.run_id in self._high_priority_runs
-        #         or second.run_id in self._high_priority_runs
-        #     ]
-        # )
-        pass
+        grader: EloGrader[Input, Output, ExpectedOutput],
+        tracer: Tracer = NoOpTracer(),
+    ):
+        self.tracer = tracer
+        self.grader = grader
 
-    def _run_grader(
+    def do_evaluate(
         self,
-        first: SuccessfulExampleOutput[Output],
-        second: SuccessfulExampleOutput[Output],
-        example: Example[Input, str],
-    ) -> Match:
-        pass
-        # if random.choice([True, False]):
-        #     first, second = second, first
-        #
-        #
-        #
-        # return Match(
-        #     outcome='str',
-        #     player_a=first.run_id,
-        #     player_b=second.run_id,
-        # )
+        example: Example[Input, ExpectedOutput],
+        *output: SuccessfulExampleOutput[Output],
+    ) -> Matches:
+        pairs = combinations(output, 2)
+        return Matches(
+            matches=[
+                self.grader.run_grader(first, second, example)
+                for [first, second] in pairs
+            ]
+        )
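The new do_evaluate grades every unordered pair of run outputs, so n outputs produce n*(n-1)/2 matches. A minimal standalone sketch of just that pairing step, with plain run-id strings standing in for SuccessfulExampleOutput objects (illustrative only, not part of this commit):

from itertools import combinations

# Three runs stand in for three SuccessfulExampleOutput instances.
run_ids = ["run-a", "run-b", "run-c"]

# combinations(..., 2) yields each unordered pair exactly once; this is what
# do_evaluate hands to grader.run_grader for every pair of competing runs.
pairs = list(combinations(run_ids, 2))
print(pairs)  # [('run-a', 'run-b'), ('run-a', 'run-c'), ('run-b', 'run-c')]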
56 changes: 56 additions & 0 deletions src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py
@@ -0,0 +1,56 @@
from abc import abstractmethod
from typing import Generic, Sequence

from pydantic import BaseModel

from intelligence_layer.core import NoOpTracer, Task, Tracer
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class Match(BaseModel):
    player_a: str
    player_b: str
    outcome: MatchOutcome


class Matches(BaseModel):
    matches: Sequence[Match]


class EloGradingInput(BaseModel):
    instruction: str
    first_completion: str
    second_completion: str


class EloGrader(
    Task[EloGradingInput, MatchOutcome],
    Generic[
        Input,
        Output,
        ExpectedOutput,
    ],
):
    def __init__(self, tracer: Tracer = NoOpTracer()):
        self.tracer = tracer

    # @abstractmethod
    # def create_grading_input(
    #     self,
    #     first: SuccessfulExampleOutput[Output],
    #     second: SuccessfulExampleOutput[Output],
    #     example: Example[Input, ExpectedOutput],
    # ) -> EloGradingInput:
    #     pass

    @abstractmethod
    def run_grader(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Example[Input, ExpectedOutput],  # TODO Generalize away from Llama
    ) -> Match:
        pass
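As a reading aid, a hedged sketch of what a concrete grader on top of this base class could look like. The class name AlwaysDrawGrader, its always-draw behaviour, and the TaskSpan import (taken from the commented-out do_run signature in the QA grader below) are assumptions for illustration, not part of the commit:

from intelligence_layer.core import TaskSpan
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    EloGradingInput,
    Match,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class AlwaysDrawGrader(EloGrader[str, str, str]):
    """Illustrative subclass binding Input, Output and ExpectedOutput to str."""

    def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
        # A real grader would prompt a model here; this sketch always declares a draw.
        return MatchOutcome.DRAW

    def run_grader(
        self,
        first: SuccessfulExampleOutput[str],
        second: SuccessfulExampleOutput[str],
        example: Example[str, str],
    ) -> Match:
        grading_input = EloGradingInput(
            instruction=str(example.input),
            first_completion=str(first.output),
            second_completion=str(second.output),
        )
        # Same pattern as the QA grader below: grade inside a task span and
        # record which run played which side.
        outcome = self.do_run(
            grading_input,
            self.tracer.task_span(task_name="always_draw_grader", input=grading_input),
        )
        return Match(player_a=first.run_id, player_b=second.run_id, outcome=outcome)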
141 changes: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
from liquid import Template

from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.model import ControlModel, Llama2InstructModel
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    EloGradingInput,
    Match,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
from intelligence_layer.examples.qa.single_chunk_qa import (
    QA_INSTRUCTIONS,
    SingleChunkQaInput,
    SingleChunkQaOutput,
)


class EloQaGrader(
    EloGrader[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
):
    INPUT_TEMPLATE = """
Your task is to compare two answers to an instruction on one metric.
Please make sure you read and understand these instruction carefully. Please keep this document open while reviewing, and refer to it as needed.
The Instruction for the answers was:{instruction}
Evaluation Procedure:
1. Read both answers carefully and identify the main facts and details they present.
2. Check if the answers contain any factual errors that are not supported by the instruction.
3. Evaluate which answer is more correct.
Answer A:{first_completion}
Answer B:{second_completion}
Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
Response: Answer """
    VALUES = [
        " A",
        " B",
    ]  # The space before the A and B is important due to tokenization

    def __init__(self, model: ControlModel = Llama2InstructModel()):
        super().__init__()
        self._model = model

    def _create_grading_input(
        self,
        first: SuccessfulExampleOutput[SingleChunkQaOutput],
        second: SuccessfulExampleOutput[SingleChunkQaOutput],
        example: Example[SingleChunkQaInput, ExpectedOutput],
    ) -> EloGradingInput:
        qa_instruction = Template(
            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
        ).render(question=example.input.question)

        no_answer = "There is no answer."
        return EloGradingInput(
            instruction=f"{example.input.chunk} {qa_instruction}",
            first_completion=(
                first.output.answer if first.output.answer is not None else no_answer
            ),
            second_completion=(
                second.output.answer if second.output.answer is not None else no_answer
            ),
        )

    # def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
    #     text = self.INPUT_TEMPLATE.format(
    #         instruction=input.instruction,
    #         first_completion=input.first_completion,
    #         second_completion=input.second_completion,
    #     )
    #
    #     complete_input = self._create_complete_input(Prompt.from_text(text))
    #     complete_output = self._model.complete_task().run(complete_input, task_span)
    #
    #     return self._calculate_winners(complete_output)
    #
    def run_grader(
        self,
        first: SuccessfulExampleOutput[SingleChunkQaOutput],
        second: SuccessfulExampleOutput[SingleChunkQaOutput],
        example: Example[SingleChunkQaInput, ExpectedOutput],
    ) -> Match:
        grading_input = self._create_grading_input(first, second, example)

        return Match(
            outcome=self.do_run(
                grading_input,
                self.tracer.task_span(
                    task_name="elo_qa_run_grader", input=grading_input
                ),
            ),
            player_a=first.run_id,
            player_b=second.run_id,
        )

    #
    # def _create_complete_input(self, prompt: Prompt) -> CompleteInput:
    #     return CompleteInput(
    #         prompt=prompt,
    #         maximum_tokens=1,
    #         log_probs=3,
    #         disable_optimizations=True,
    #     )
    #
    # def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
    #     default_log_prob = float("-inf")
    #
    #     def get_normalized_prob(
    #         log_prob_list: Sequence[Mapping[str, float | None]] | None,
    #     ) -> float:
    #         assert log_prob_list is not None
    #         log_probs = log_prob_list[0]
    #         values = [
    #             math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
    #             for key in self.VALUES
    #         ]
    #         if all(v == 0 for v in values):
    #             raise ValueError(
    #                 f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
    #             )
    #         normalized_A_prob = values[0] / sum(values)
    #         return normalized_A_prob
    #
    #     def categorize_value(value: float) -> MatchOutcome:
    #         if value > 0.7:
    #             return MatchOutcome.A_WINS
    #         elif 0.3 > value:
    #             return MatchOutcome.B_WINS
    #         else:
    #             return MatchOutcome.DRAW
    #
    #     normalized_probability = get_normalized_prob(
    #         complete_output.completions[0].log_probs
    #     )
    #     return categorize_value(normalized_probability)
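The commented-out _calculate_winners above encodes the intended decision rule: exponentiate the log-probs of the " A" and " B" tokens, normalize them into a probability for answer A, and call the match for A above 0.7, for B below 0.3, and a draw in between. A standalone sketch of just that arithmetic (the function name and the sample log-probs are invented for illustration):

import math


def categorize_answer_a_probability(log_prob_a: float, log_prob_b: float) -> str:
    # Convert log-probs of the " A" / " B" tokens into a normalized probability for A.
    prob_a, prob_b = math.exp(log_prob_a), math.exp(log_prob_b)
    normalized_a = prob_a / (prob_a + prob_b)
    # Thresholds mirror the commented-out categorize_value above.
    if normalized_a > 0.7:
        return "A_WINS"
    if normalized_a < 0.3:
        return "B_WINS"
    return "DRAW"


print(categorize_answer_a_probability(-0.1, -3.0))  # A clearly favoured -> "A_WINS"
print(categorize_answer_a_probability(-0.7, -0.7))  # equal log-probs -> "DRAW"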
