Commit
feat: WIP Fix generics and types, some refactoring
TASK: IL-394
1 parent 16dde84 · commit 6ec8918
Showing 4 changed files with 401 additions and 115 deletions.
82 changes: 29 additions & 53 deletions
src/intelligence_layer/evaluation/evaluation/elo_evaluator.py
@@ -1,65 +1,41 @@
from abc import abstractmethod
from typing import Sequence

from pydantic import BaseModel
from itertools import combinations

from intelligence_layer.core import Input, NoOpTracer, Output, Tracer
from intelligence_layer.evaluation import EvaluationLogic
from intelligence_layer.evaluation.aggregation.elo import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    Matches,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class Match(BaseModel):
    player_a: str
    player_b: str
    outcome: MatchOutcome
class EloEvaluationLogic(EvaluationLogic[Input, Output, ExpectedOutput, Matches]):
    """Evaluation logic for a pair-wise ELO comparison.
    Args:
        grader: The :class:`Task` that perform the grading, i.e. the actual comparison of two run outputs.
        tracer: :class:`Tracer` for tracking and debugging
class EloEvaluationLogic(EvaluationLogic[Input, Output, str, list[Match]]):
    # def __init__(
    #     self,
    #     # client: Client,
    #     tracer: Tracer = NoOpTracer(),
    # ):
    #     self._tracer = tracer
    #     # self._grader = Grader(  ## TODO
    #     #     LlamaControlModel(name="llama-2-70b-chat", client=client)
    #     # )
    """

    @abstractmethod
    def do_evaluate(
    def __init__(
        self,
        example: Example[Input, str],
        *output: SuccessfulExampleOutput[Output],
    ) -> list[Match]:
        # pairs = combinations(output, 2)
        # return Matches(
        #     matches=[
        #         self._run_grader(first, second, example)
        #         for [first, second] in pairs
        #         if self._high_priority_runs is None  ##TODO: Adapts to iterative elo class
        #         or len(self._high_priority_runs) == 0
        #         or first.run_id in self._high_priority_runs
        #         or second.run_id in self._high_priority_runs
        #     ]
        # )
        pass
        grader: EloGrader[Input, Output, ExpectedOutput],
        tracer: Tracer = NoOpTracer(),
    ):
        self.tracer = tracer
        self.grader = grader

    def _run_grader(
    def do_evaluate(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Example[Input, str],
    ) -> Match:
        pass
        # if random.choice([True, False]):
        #     first, second = second, first
        #
        #
        #
        # return Match(
        #     outcome='str',
        #     player_a=first.run_id,
        #     player_b=second.run_id,
        # )
        example: Example[Input, ExpectedOutput],
        *output: SuccessfulExampleOutput[Output],
    ) -> Matches:
        pairs = combinations(output, 2)
        return Matches(
            matches=[
                self.grader.run_grader(first, second, example)
                for [first, second] in pairs
            ]
        )
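
The new do_evaluate builds one Match per unordered pair of run outputs via itertools.combinations. A minimal sketch of that pairing behaviour, using plain strings as stand-in run ids (illustrative only, not part of the commit):

from itertools import combinations

# Stand-in run ids; in the evaluator each element would be a SuccessfulExampleOutput.
run_ids = ["run-a", "run-b", "run-c"]

# combinations(outputs, 2) yields every unordered pair exactly once,
# so three runs produce three pair-wise matches.
pairs = list(combinations(run_ids, 2))
print(pairs)  # [('run-a', 'run-b'), ('run-a', 'run-c'), ('run-b', 'run-c')]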
56 changes: 56 additions & 0 deletions
src/intelligence_layer/evaluation/evaluation/elo_graders/elo_grader.py
@@ -0,0 +1,56 @@
from abc import abstractmethod
from typing import Generic, Sequence

from pydantic import BaseModel

from intelligence_layer.core import NoOpTracer, Task, Tracer
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class Match(BaseModel):
    player_a: str
    player_b: str
    outcome: MatchOutcome


class Matches(BaseModel):
    matches: Sequence[Match]


class EloGradingInput(BaseModel):
    instruction: str
    first_completion: str
    second_completion: str


class EloGrader(
    Task[EloGradingInput, MatchOutcome],
    Generic[
        Input,
        Output,
        ExpectedOutput,
    ],
):
    def __init__(self, tracer: Tracer = NoOpTracer()):
        self.tracer = tracer

    # @abstractmethod
    # def create_grading_input(
    #     self,
    #     first: SuccessfulExampleOutput[Output],
    #     second: SuccessfulExampleOutput[Output],
    #     example: Example[Input, ExpectedOutput],
    # ) -> EloGradingInput:
    #     pass

    @abstractmethod
    def run_grader(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Example[Input, ExpectedOutput],  # TODO Generalize away from Llama
    ) -> Match:
        pass
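
EloGrader is generic over Input, Output and ExpectedOutput, extends Task[EloGradingInput, MatchOutcome], and leaves run_grader abstract. A minimal, hypothetical subclass (not part of the commit; the TaskSpan import path and the Task do_run contract are assumed from the commented code elsewhere in this commit) that declares every comparison a draw illustrates the contract a concrete grader has to satisfy:

from intelligence_layer.core import TaskSpan
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    EloGradingInput,
    Match,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class AlwaysDrawGrader(EloGrader[Input, Output, ExpectedOutput]):
    # Task also requires do_run; here it simply returns the fixed outcome.
    def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
        return MatchOutcome.DRAW

    def run_grader(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Example[Input, ExpectedOutput],
    ) -> Match:
        # A real grader would compare the two outputs; this stub always reports a draw.
        return Match(
            player_a=first.run_id,
            player_b=second.run_id,
            outcome=MatchOutcome.DRAW,
        )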
141 changes: 141 additions & 0 deletions
src/intelligence_layer/evaluation/evaluation/elo_graders/elo_qa_grader.py
@@ -0,0 +1,141 @@
from liquid import Template

from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.model import ControlModel, Llama2InstructModel
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.evaluation.elo_graders.elo_grader import (
    EloGrader,
    EloGradingInput,
    Match,
)
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
from intelligence_layer.examples.qa.single_chunk_qa import (
    QA_INSTRUCTIONS,
    SingleChunkQaInput,
    SingleChunkQaOutput,
)


class EloQaGrader(
    EloGrader[SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput]
):
    INPUT_TEMPLATE = """
Your task is to compare two answers to an instruction on one metric.
Please make sure you read and understand these instruction carefully. Please keep this document open while reviewing, and refer to it as needed.
The Instruction for the answers was:{instruction}
Evaluation Procedure:
1. Read both answers carefully and identify the main facts and details they present.
2. Check if the answers contain any factual errors that are not supported by the instruction.
3. Evaluate which answer is more correct.
Answer A:{first_completion}
Answer B:{second_completion}
Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
Response: Answer """
    VALUES = [
        " A",
        " B",
    ]  # The space before the A and B is important due to tokenization

    def __init__(self, model: ControlModel = Llama2InstructModel()):
        super().__init__()
        self._model = model

    def _create_grading_input(
        self,
        first: SuccessfulExampleOutput[SingleChunkQaOutput],
        second: SuccessfulExampleOutput[SingleChunkQaOutput],
        example: Example[SingleChunkQaInput, ExpectedOutput],
    ) -> EloGradingInput:
        qa_instruction = Template(
            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
        ).render(question=example.input.question)

        no_answer = "There is no answer."
        return EloGradingInput(
            instruction=f"{example.input.chunk} {qa_instruction}",
            first_completion=(
                first.output.answer if first.output.answer is not None else no_answer
            ),
            second_completion=(
                second.output.answer if second.output.answer is not None else no_answer
            ),
        )

    # def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
    #     text = self.INPUT_TEMPLATE.format(
    #         instruction=input.instruction,
    #         first_completion=input.first_completion,
    #         second_completion=input.second_completion,
    #     )
    #
    #     complete_input = self._create_complete_input(Prompt.from_text(text))
    #     complete_output = self._model.complete_task().run(complete_input, task_span)
    #
    #     return self._calculate_winners(complete_output)
    #
    def run_grader(
        self,
        first: SuccessfulExampleOutput[SingleChunkQaOutput],
        second: SuccessfulExampleOutput[SingleChunkQaOutput],
        example: Example[SingleChunkQaInput, ExpectedOutput],
    ) -> Match:
        grading_input = self._create_grading_input(first, second, example)

        return Match(
            outcome=self.do_run(
                grading_input,
                self.tracer.task_span(
                    task_name="elo_qa_run_grader", input=grading_input
                ),
            ),
            player_a=first.run_id,
            player_b=second.run_id,
        )

    #
    # def _create_complete_input(self, prompt: Prompt) -> CompleteInput:
    #     return CompleteInput(
    #         prompt=prompt,
    #         maximum_tokens=1,
    #         log_probs=3,
    #         disable_optimizations=True,
    #     )
    #
    # def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
    #     default_log_prob = float("-inf")
    #
    #     def get_normalized_prob(
    #         log_prob_list: Sequence[Mapping[str, float | None]] | None,
    #     ) -> float:
    #         assert log_prob_list is not None
    #         log_probs = log_prob_list[0]
    #         values = [
    #             math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
    #             for key in self.VALUES
    #         ]
    #         if all(v == 0 for v in values):
    #             raise ValueError(
    #                 f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
    #             )
    #         normalized_A_prob = values[0] / sum(values)
    #         return normalized_A_prob
    #
    #     def categorize_value(value: float) -> MatchOutcome:
    #         if value > 0.7:
    #             return MatchOutcome.A_WINS
    #         elif 0.3 > value:
    #             return MatchOutcome.B_WINS
    #         else:
    #             return MatchOutcome.DRAW
    #
    #     normalized_probability = get_normalized_prob(
    #         complete_output.completions[0].log_probs
    #     )
    #     return categorize_value(normalized_probability)
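
Taken together, the refactoring lets the generic evaluation logic be parameterised with a concrete grader. A rough usage sketch, assuming the still-commented-out do_run of EloQaGrader gets implemented and an Aleph Alpha client/token is configured for Llama2InstructModel (dataset and run setup omitted):

from intelligence_layer.core import NoOpTracer
from intelligence_layer.core.model import Llama2InstructModel
from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic
from intelligence_layer.evaluation.evaluation.elo_graders.elo_qa_grader import EloQaGrader

# EloQaGrader defaults to Llama2InstructModel; it is passed explicitly here for clarity.
grader = EloQaGrader(model=Llama2InstructModel())
evaluation_logic = EloEvaluationLogic(grader=grader, tracer=NoOpTracer())

# do_evaluate pairs up all successful run outputs for one example and grades each pair:
# matches = evaluation_logic.do_evaluate(example, *successful_example_outputs)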