WIP: IL-259
Merlin Kallenborn committed Feb 21, 2024
1 parent d043d70 commit 0c3d7a1
Showing 11 changed files with 196 additions and 218 deletions.
15 changes: 7 additions & 8 deletions src/intelligence_layer/evaluation/__init__.py
@@ -1,4 +1,11 @@
from .accumulator import MeanAccumulator as MeanAccumulator
from .argilla import ArgillaEvaluator as ArgillaEvaluator
from .argilla import (
InstructComparisonArgillaAggregationLogic as InstructComparisonArgillaAggregationLogic,
)
from .argilla import (
InstructComparisonArgillaEvaluationLogic as InstructComparisonArgillaEvaluationLogic,
)
from .data_storage.aggregation_repository import (
AggregationRepository as AggregationRepository,
)
@@ -49,19 +56,11 @@
from .elo import PayoffMatrix as PayoffMatrix
from .elo import PlayerScore as PlayerScore
from .elo import WinRateCalculator as WinRateCalculator
from .evaluator import ArgillaEvaluator as ArgillaEvaluator
from .evaluator import BaseEvaluator as BaseEvaluator
from .evaluator import Evaluator as Evaluator
from .graders import BleuGrader as BleuGrader
from .graders import RougeGrader as RougeGrader
from .graders import RougeScores as RougeScores
from .hugging_face import HuggingFaceDatasetRepository as HuggingFaceDatasetRepository
from .instruct_comparison_argilla_evaluator import (
InstructComparisonArgillaAggregationLogic as InstructComparisonArgillaAggregationLogic,
)
from .instruct_comparison_argilla_evaluator import (
InstructComparisonArgillaEvaluationLogic as InstructComparisonArgillaEvaluationLogic,
)
from .runner import Runner as Runner

__all__ = [symbol for symbol in dir()]
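
As a quick illustration (a sketch, not part of this commit): with these re-exports in place, the Instruct-comparison Argilla logic should still be importable from the package root even though it now lives in `.argilla`.

# Sketch: the public import path stays stable after the module move.
from intelligence_layer.evaluation import (
    InstructComparisonArgillaAggregationLogic,
    InstructComparisonArgillaEvaluationLogic,
)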
@@ -1,38 +1,102 @@
import random
from abc import ABC, abstractmethod
from itertools import combinations
from typing import Iterable, Mapping, Optional

from pydantic import BaseModel

from intelligence_layer.connectors import Field
from intelligence_layer.connectors.argilla.argilla_client import (
ArgillaClient,
ArgillaEvaluation,
Field,
Question,
RecordData,
RecordDataSequence,
)
from intelligence_layer.core.complete import InstructInput, PromptOutput
from intelligence_layer.evaluation import (
ArgillaEvaluationRepository,
EvaluationRepository,
Example,
MeanAccumulator,
from intelligence_layer.core import Input, InstructInput, Output, PromptOutput
from intelligence_layer.evaluation.data_storage.run_repository import RunRepository
from intelligence_layer.evaluation.data_storage.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.evaluator import Evaluator

from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
from intelligence_layer.evaluation.domain import (
AggregatedEvaluation,
SuccessfulExampleOutput, ExpectedOutput, Example,
)
from intelligence_layer.evaluation.elo import (
AutomatedEloComparison,
EloCalculator,
EloComparison,
PlayerScore,
WinRateCalculator,
build_tournaments,
)
from intelligence_layer.evaluation.evaluator import (
AggregationLogic,
ArgillaEvaluationLogic,
build_tournaments, PlayerScore,
)


class ArgillaEvaluationLogic(
EvaluationLogic[Input, Output, ExpectedOutput, RecordDataSequence], ABC
):
def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
return self._to_record(example, *output)

@abstractmethod
def _to_record(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
"""This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`
Args:
example: The example to be translated.
output: The output of the example that was run.
"""
...
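
For orientation, a minimal sketch of a concrete subclass (not part of this commit): the field names in `content` and `metadata` are made up, and it assumes `RecordDataSequence` wraps its records in a `records` field.

class CompletionArgillaEvaluationLogic(
    ArgillaEvaluationLogic[InstructInput, PromptOutput, None]
):
    def _to_record(
        self,
        example: Example[InstructInput, None],
        *output: SuccessfulExampleOutput[PromptOutput],
    ) -> RecordDataSequence:
        # One Argilla record per run output, carrying the run id as metadata.
        return RecordDataSequence(
            records=[
                RecordData(
                    content={
                        "instruction": example.input.instruction,
                        "completion": single_output.output.completion,
                    },
                    example_id=example.id,
                    metadata={"run_id": single_output.run_id},
                )
                for single_output in output
            ]
        )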


class ArgillaEvaluator(
Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation],
ABC,
):
"""Evaluator used to integrate with Argilla (https://github.com/argilla-io/argilla).
Use this evaluator if you want to collect human evaluations with minimal setup.
This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method.
Args:
evaluation_repository: The repository that will be used to store evaluation results.
dataset_repository: The repository containing the examples used for the evaluation.
description: Human-readable description of the evaluator.
# TODO: docstrings
"""

def __init__(
self,
dataset_repository: DatasetRepository,
run_repository: RunRepository,
evaluation_repository: ArgillaEvaluationRepository,
aggregation_repository: AggregationRepository,
description: str,
evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation],
) -> None:
super().__init__(
dataset_repository,
run_repository,
evaluation_repository,
aggregation_repository,
description,
evaluation_logic, # type: ignore
aggregation_logic, # TODO: check if the non-matching types of the evaluation logic and aggregation logic (in the line above) are a problem
)

def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore
return ArgillaEvaluation
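
A wiring sketch for the new constructor (not part of this commit): it only illustrates the argument order added above. The repository and logic instances are assumed to be built elsewhere, and since the class is still marked ABC in this WIP state, a thin concrete subclass may be needed in practice.

def build_argilla_evaluator(
    dataset_repository: DatasetRepository,
    run_repository: RunRepository,
    evaluation_repository: ArgillaEvaluationRepository,
    aggregation_repository: AggregationRepository,
    evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
    aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation],
) -> ArgillaEvaluator[Input, Output, ExpectedOutput, AggregatedEvaluation]:
    # Argument order mirrors ArgillaEvaluator.__init__ above.
    return ArgillaEvaluator(
        dataset_repository,
        run_repository,
        evaluation_repository,
        aggregation_repository,
        "human eval via Argilla",
        evaluation_logic,
        aggregation_logic,
    )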


class AggregatedInstructComparison(BaseModel):
scores: Mapping[str, PlayerScore]

@@ -97,29 +161,25 @@ def __init__(
def _to_record(
self,
example: Example[InstructInput, None],
*outputs: PromptOutput,
*outputs: SuccessfulExampleOutput[PromptOutput],
) -> RecordDataSequence:
def create_record_data(
first: PromptOutput,
second: PromptOutput,
first: SuccessfulExampleOutput[PromptOutput],
second: SuccessfulExampleOutput[PromptOutput],
) -> RecordData:
if random.choice([True, False]):
first, second = second, first
return RecordData(
content={
self._fields["KEY_INSTRUCTION"].name: example.input.instruction,
self._fields["KEY_INPUT"].name: example.input.input or "",
self._fields["KEY_RESPONSE_1"].name: first.completion,
self._fields["KEY_RESPONSE_2"].name: second.completion,
self._fields["KEY_RESPONSE_1"].name: first.output.completion,
self._fields["KEY_RESPONSE_2"].name: second.output.completion,
},
example_id=example.id,
metadata={
self._fields[
"KEY_RESPONSE_1"
].name: "1", # TODO: first.run_id (SuccessfulExampleOutput[PromptOutput])
self._fields[
"KEY_RESPONSE_2"
].name: "2", # TODO: second.run_id (SuccessfulExampleOutput[PromptOutput])
self._fields["KEY_RESPONSE_1"].name: first.run_id,
self._fields["KEY_RESPONSE_2"].name: second.run_id,
},
)

@@ -131,7 +191,7 @@ def create_record_data(
if self._high_priority_runs is None
or any(
run_id in self._high_priority_runs
for run_id in ["1", "2"] # TODO first.run_id, second.run_id
for run_id in [first.run_id, second.run_id]
)
]
)
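
The list comprehension above pairs run outputs and keeps only pairs that involve a high-priority run. A standalone sketch of that selection pattern (illustrative names, not library code):

from itertools import combinations
from typing import Optional, Sequence

def select_run_pairs(
    run_ids: Sequence[str], high_priority_runs: Optional[frozenset[str]]
) -> list[tuple[str, str]]:
    # Keep every unordered pair when no priority set is given; otherwise keep
    # only pairs in which at least one run is high priority.
    return [
        (first, second)
        for first, second in combinations(run_ids, 2)
        if high_priority_runs is None
        or any(run_id in high_priority_runs for run_id in (first, second))
    ]

# e.g. select_run_pairs(["a", "b", "c"], frozenset({"a"})) == [("a", "b"), ("a", "c")]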
49 changes: 49 additions & 0 deletions src/intelligence_layer/evaluation/base_logic.py
@@ -0,0 +1,49 @@
from abc import ABC, abstractmethod
from typing import Generic, Iterable

from intelligence_layer.core import Input, Output
from intelligence_layer.evaluation import Evaluation, Example, ExpectedOutput
from intelligence_layer.evaluation.domain import (
AggregatedEvaluation,
SuccessfulExampleOutput,
)


class AggregationLogic(ABC, Generic[Evaluation, AggregatedEvaluation]):
@abstractmethod
def aggregate(self, evaluations: Iterable[Evaluation]) -> AggregatedEvaluation:
"""`Evaluator`-specific method for aggregating individual `Evaluations` into report-like `Aggregated Evaluation`.
This method is responsible for taking the results of an evaluation run and aggregating all the results.
It should create an `AggregatedEvaluation` class and return it at the end.
Args:
evaluations: The results from running `eval_and_aggregate_runs` with a :class:`Task`.
Returns:
The aggregated results of an evaluation run with a :class:`Dataset`.
"""
...
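
A minimal sketch of a concrete aggregation (not from this commit): the `ScoreEvaluation` and `MeanScore` models are illustrative stand-ins for use-case-specific types.

from statistics import mean

from pydantic import BaseModel


class ScoreEvaluation(BaseModel):
    score: float


class MeanScore(BaseModel):
    mean_score: float


class MeanScoreAggregationLogic(AggregationLogic[ScoreEvaluation, MeanScore]):
    def aggregate(self, evaluations: Iterable[ScoreEvaluation]) -> MeanScore:
        # Collapse all per-example scores into a single report value.
        scores = [evaluation.score for evaluation in evaluations]
        return MeanScore(mean_score=mean(scores) if scores else 0.0)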


class EvaluationLogic(ABC, Generic[Input, Output, ExpectedOutput, Evaluation]):
@abstractmethod
def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> Evaluation:
"""Executes the evaluation for this use-case.
Responsible for comparing the input & expected output of a task to the
actually generated output.
Args:
TODO: find a better way to describe this
example: The example whose input was passed to the :class:`Task` to produce the output.
output: Output of the :class:`Task` that shall be evaluated.
Returns:
The metrics that come from the evaluated :class:`Task`.
"""
pass
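
And a minimal sketch of a concrete evaluation logic (not from this commit): it assumes a plain string task and that `Example` exposes an `expected_output` field; `ExactMatchEvaluation` is an illustrative model.

from pydantic import BaseModel


class ExactMatchEvaluation(BaseModel):
    correct: bool


class ExactMatchEvaluationLogic(EvaluationLogic[str, str, str, ExactMatchEvaluation]):
    def do_evaluate(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> ExactMatchEvaluation:
        # Mark the example as correct if any run produced the expected string.
        return ExactMatchEvaluation(
            correct=any(o.output == example.expected_output for o in output)
        )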