feat: WIP add first part of more comprehensive elo test
TASK: IL-394
1 parent 16dde84 · commit ecc8ad2
Showing 3 changed files with 447 additions and 54 deletions.
235 changes: 235 additions & 0 deletions
src/intelligence_layer/evaluation/evaluation/elo_graders/llama_grader.py
@@ -0,0 +1,235 @@
from abc import abstractmethod
import math
from typing import Mapping, Optional, Sequence

from aleph_alpha_client import Prompt
from pydantic import BaseModel

from intelligence_layer.core import (
    CompleteInput,
    CompleteOutput,
    ControlModel,
    Task,
    TaskSpan,
)
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.task import Input, Output

# NOTE: `Match` is assumed to be exported from intelligence_layer.evaluation
# alongside `MatchOutcome`; it is used in type hints and return values below.
from intelligence_layer.evaluation import Match, MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
from liquid import Template

from intelligence_layer.examples.qa.single_chunk_qa import QA_INSTRUCTIONS


class LlamaGradingInput(BaseModel):
    instruction: str
    first_completion: str
    second_completion: str

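# The two abstract methods below define the grader interface:
# `create_grading_input` packs two run outputs (and, optionally, the example)
# into a LlamaGradingInput, and `run_grader` turns that input into a `Match`
# between the two run ids.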
class LlamaGrader(Task[LlamaGradingInput, MatchOutcome]):
    @abstractmethod
    def create_grading_input(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Optional[Example[Input, ExpectedOutput]],
    ) -> LlamaGradingInput:
        # TODO: General GradingInputClass or similar. Match with argument for _run_grader
        # TODO: Move below code to llama/task grader for specific implementation
        # no_result = "There is no result."
        # grading_input = LlamaGradingInput(
        #     instruction=f"{example.input.chunk} {grader_instruction}",
        #     first_completion=(
        #         first.output.answer if first.output.answer is not None else no_result
        #     ),
        #     second_completion=(
        #         second.output.answer if second.output.answer is not None else no_result
        #     ),
        # )
        pass

    @abstractmethod
    def run_grader(
        self,
        first_id: str,
        second_id: str,
        grading_input: LlamaGradingInput,  # TODO Generalize away from Llama
    ) -> Match:
        pass

from abc import abstractmethod
import math
from typing import Mapping, Optional, Sequence

from aleph_alpha_client import Prompt
from pydantic import BaseModel

from intelligence_layer.core import (
    CompleteInput,
    CompleteOutput,
    ControlModel,
    Task,
    TaskSpan,
)
from intelligence_layer.core.task import Input, Output
from intelligence_layer.evaluation import MatchOutcome
from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput


class LlamaGradingInput(BaseModel):
    instruction: str
    first_completion: str
    second_completion: str


class LlamaGrader(Task[LlamaGradingInput, MatchOutcome]):
    @abstractmethod
    def create_grading_input(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Optional[Example[Input, ExpectedOutput]],
    ) -> LlamaGradingInput:
        pass

    @abstractmethod
    def run_grader(
        self,
        first_id: str,
        second_id: str,
        grading_input: LlamaGradingInput,  # TODO Generalize away from Llama
    ) -> Match:
        pass


class LlamaQaGrader(LlamaGrader):
    INPUT_TEMPLATE = """
Your task is to compare two answers to an instruction on one metric.
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
The Instruction for the answers was: {instruction}
Evaluation Procedure:
1. Read both answers carefully and identify the main facts and details they present.
2. Check if the answers contain any factual errors that are not supported by the instruction.
3. Evaluate which answer is more correct.
Answer A: {first_completion}
Answer B: {second_completion}
Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
Response: Answer """
    VALUES = [
        " A",
        " B",
    ]  # The space before the A and B is important due to tokenization

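    # The prompt ends with "Response: Answer ", so the model only has to emit a
    # single token, " A" or " B"; `_calculate_winners` below compares the
    # log-probabilities of these two candidate tokens rather than parsing text.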
    def __init__(self, model: ControlModel, grader_instruction: str):
        super().__init__()
        self._model = model
        self._grader_instruction = grader_instruction

    def do_run(self, input: LlamaGradingInput, task_span: TaskSpan) -> MatchOutcome:
        text = self.INPUT_TEMPLATE.format(
            instruction=input.instruction,
            first_completion=input.first_completion,
            second_completion=input.second_completion,
        )

        complete_input = self._create_complete_input(Prompt.from_text(text))
        complete_output = self._model.complete_task().run(complete_input, task_span)

        return self._calculate_winners(complete_output)

    def _create_grading_input(
        self,
        first: SuccessfulExampleOutput[Output],
        second: SuccessfulExampleOutput[Output],
        example: Optional[Example[Input, ExpectedOutput]],
    ) -> LlamaGradingInput:
        qa_instruction = Template(
            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
        ).render(question=example.input.question)

        no_answer = "There is no answer."
        grading_input = LlamaGradingInput(
            instruction=f"{example.input.chunk} {self._grader_instruction}",
            first_completion=(
                first.output.answer if first.output.answer is not None else no_answer
            ),
            second_completion=(
                second.output.answer if second.output.answer is not None else no_answer
            ),
        )
        return grading_input

    def _run_grader(
        self,
        first_id: str,
        second_id: str,
        grading_input: LlamaGradingInput,  # TODO Generalize away from Llama
    ) -> Match:
        """Compare two run outputs to each other and return a :class:`Match` that contains the result of the comparison.

        Args:
            first_id: `str`, id of the first example of the comparison.
            second_id: `str`, id of the second example of the comparison.
            grading_input: # TODO

        Returns:
            :class:`Match` that contains the result of the comparison.
        """
        grading_output = self.do_run(grading_input, self.tracer.task_span())

        return Match(
            outcome=grading_output,
            player_a=first_id,
            player_b=second_id,
        )

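    # maximum_tokens=1 restricts the completion to a single token and
    # log_probs=3 requests the top-3 token log-probabilities, which
    # `_calculate_winners` reads to decide the outcome.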
    def _create_complete_input(self, prompt: Prompt) -> CompleteInput:
        return CompleteInput(
            prompt=prompt,
            maximum_tokens=1,
            log_probs=3,
            disable_optimizations=True,
        )

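    # The top log-probs of the generated token are converted to probabilities
    # and normalized over the " A"/" B" candidates; a share above 0.7 counts as
    # a win for A, below 0.3 as a win for B, and anything in between as a draw.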
    def _calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
        default_log_prob = float("-inf")

        def get_normalized_prob(
            log_prob_list: Sequence[Mapping[str, float | None]] | None,
        ) -> float:
            assert log_prob_list is not None
            log_probs = log_prob_list[0]
            values = [
                math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
                for key in self.VALUES
            ]
            if all(v == 0 for v in values):
                raise ValueError(
                    f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
                )
            normalized_a_prob = values[0] / sum(values)
            return normalized_a_prob

        def categorize_value(value: float) -> MatchOutcome:
            if value > 0.7:
                return MatchOutcome.A_WINS
            elif value < 0.3:
                return MatchOutcome.B_WINS
            else:
                return MatchOutcome.DRAW

        normalized_probability = get_normalized_prob(
            complete_output.completions[0].log_probs
        )
        return categorize_value(normalized_probability)
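For intuition, here is a small standalone sketch of the scoring math in `_calculate_winners`; the log-prob values below are made up for illustration and are not part of the committed file:

import math

# Hypothetical top log-probs returned for the single completion token.
log_probs = {" A": -0.2, " B": -2.5}

# Same normalization as above: convert to probabilities and take A's share.
p_a, p_b = math.exp(log_probs[" A"]), math.exp(log_probs[" B"])
normalized_a_prob = p_a / (p_a + p_b)  # roughly 0.91 here

# Thresholds from categorize_value: > 0.7 -> A wins, < 0.3 -> B wins, else draw.
print("A wins" if normalized_a_prob > 0.7 else "B wins" if normalized_a_prob < 0.3 else "Draw")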