feat: move shared evaluation behavior to superclass
NiklasKoehneckeAA committed May 13, 2024
1 parent c84de71 commit 6383ed0
Showing 4 changed files with 256 additions and 100 deletions.
4 changes: 2 additions & 2 deletions src/intelligence_layer/evaluation/aggregation/aggregator.py
@@ -112,7 +112,7 @@ def __init__(
def _get_types(self) -> Mapping[str, type]:
"""Type magic function that gets the actual types of the generic parameters.
-        Traverses the inheritance history of `BaseEvaluator`-subclass to find an actual type every time a TypeVar is replaced.
+        Traverses the inheritance history of `AggregationLogic`-subclass to find an actual type every time a TypeVar is replaced.
Returns:
Name of generic parameter to the type found.
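[Editor's note: the "type magic" described in this docstring can be illustrated with a small, self-contained sketch. The names below are assumptions for illustration, not the library's implementation: a class walks its `__orig_bases__` to map each `TypeVar` to the concrete type a subclass bound it to.]

from typing import Generic, TypeVar, get_args, get_origin

A = TypeVar("A")

class AggregationLogicSketch(Generic[A]):
    @classmethod
    def resolved_types(cls) -> dict[str, type]:
        # Walk the generic bases recorded on the subclass and pair each
        # declared TypeVar with the argument it was instantiated with.
        mapping: dict[str, type] = {}
        for base in getattr(cls, "__orig_bases__", ()):
            origin = get_origin(base)
            if origin is None:
                continue
            for var, arg in zip(origin.__parameters__, get_args(base)):
                mapping[var.__name__] = arg
        return mapping

class MeanAggregation(AggregationLogicSketch[float]):
    pass

print(MeanAggregation.resolved_types())  # {'A': <class 'float'>}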
@@ -186,7 +186,7 @@ def aggregate_evaluation(
) -> AggregationOverview[AggregatedEvaluation]:
"""Aggregates all evaluations into an overview that includes high-level statistics.
-        Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`.
+        Aggregates :class:`Evaluation`s according to the implementation of :func:`AggregationLogic.aggregate`.
Args:
eval_ids: An overview of the evaluation to be aggregated. Does not include
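[Editor's note: to make the delegation concrete, here is a hedged sketch of what an `AggregationLogic`-style implementation might look like. The `aggregate` signature and all names are assumptions based on the docstring, not the library's API.]

from dataclasses import dataclass
from statistics import mean
from typing import Iterable

@dataclass
class ExampleEvaluation:
    score: float

@dataclass
class AggregatedScore:
    mean_score: float
    evaluation_count: int

class MeanScoreAggregationLogic:
    def aggregate(self, evaluations: Iterable[ExampleEvaluation]) -> AggregatedScore:
        # Reduce per-example evaluations into one high-level statistic.
        scores = [evaluation.score for evaluation in evaluations]
        return AggregatedScore(
            mean_score=mean(scores) if scores else 0.0,
            evaluation_count=len(scores),
        )

print(MeanScoreAggregationLogic().aggregate([ExampleEvaluation(0.2), ExampleEvaluation(0.8)]))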
@@ -72,18 +72,16 @@ class ArgillaEvaluator(AsyncEvaluator[Input, Output, ExpectedOutput, Evaluation]
Use this evaluator if you would like to easily do human eval.
This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
Arguments:
dataset_repository: The repository with the examples that will be taken for the evaluation.
run_repository: The repository of the runs to evaluate.
evaluation_repository: The repository that will be used to store evaluation results.
description: Human-readable description for the evaluator.
evaluation_logic: The logic to use for evaluation.
argilla_client: The client to interface with argilla.
workspace_id: The argilla workspace id where datasets are created for evaluation.
Generics:
Input: Interface to be passed to the :class:`Task` that shall be evaluated.
Output: Type of the output of the :class:`Task` to be evaluated.
ExpectedOutput: Output that is expected from the run with the supplied input.
ArgillaEvaluation: Interface of the metrics that come from the Argilla task`.
+    See the :class:`EvaluatorBase` for more information.
"""

def __init__(
@@ -103,11 +101,11 @@ def __init__(
run_repository,
evaluation_repository,
description,
-            evaluation_logic,  # type: ignore
+            evaluation_logic,
)
self._client = argilla_client
self._workspace_id = workspace_id
-        self._evaluation_logic: ArgillaEvaluationLogic[  # type: ignore
+        self._evaluation_logic: ArgillaEvaluationLogic[
Input, Output, ExpectedOutput, Evaluation
]
self._evaluation_repository: AsyncEvaluationRepository
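[Editor's note: the re-annotation of `self._evaluation_logic` above, together with the dropped `# type: ignore` comments, uses a standard narrowing pattern: a subclass re-declares an attribute assigned by the superclass with a more specific type. A minimal sketch with illustrative names:]

class EvaluationLogic: ...

class ArgillaLogicSketch(EvaluationLogic):
    def to_record(self) -> str:
        return "record"

class EvaluatorBaseSketch:
    def __init__(self, logic: EvaluationLogic) -> None:
        self._logic = logic

class ArgillaEvaluatorSketch(EvaluatorBaseSketch):
    # A bare annotation (no assignment) narrows the inherited attribute's
    # type, so later uses type-check without a "type: ignore" comment.
    _logic: ArgillaLogicSketch

    def __init__(self, logic: ArgillaLogicSketch) -> None:
        super().__init__(logic)

    def submit(self) -> str:
        # Type-checks: _logic is known to be ArgillaLogicSketch here.
        return self._logic.to_record()

print(ArgillaEvaluatorSketch(ArgillaLogicSketch()).submit())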
@@ -127,7 +125,7 @@ def submit(

run_overviews = self._load_run_overviews(*run_ids)
submit_count = 0
-        for example, outputs in self.retrieve_eval_logic_input(
+        for example, outputs in self._retrieve_eval_logic_input(
run_overviews, num_examples=num_examples
):
record_sequence = self._evaluation_logic._to_record(example, *outputs)
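[Editor's note: the loop above hands each example and its outputs to `_to_record`, which turns them into Argilla records for human annotators. A self-contained sketch of that shape follows; the field names are illustrative assumptions, not Argilla's schema.]

from dataclasses import dataclass

@dataclass
class Example:
    input_text: str
    expected_output: str

@dataclass
class RecordData:
    content: dict[str, str]

def to_record(example: Example, output: str) -> RecordData:
    # Show the annotator the task input next to the model output to be judged.
    return RecordData(content={"input": example.input_text, "output": output})

print(to_record(Example("Summarize: ...", "reference summary"), "model summary").content)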
@@ -11,10 +11,10 @@
from intelligence_layer.evaluation.evaluation.evaluation_repository import (
EvaluationRepository,
)
-from intelligence_layer.evaluation.evaluation.evaluator import Evaluator
+from intelligence_layer.evaluation.evaluation.evaluator import EvaluatorBase


-class AsyncEvaluator(Evaluator[Input, Output, ExpectedOutput, Evaluation], ABC):
+class AsyncEvaluator(EvaluatorBase[Input, Output, ExpectedOutput, Evaluation], ABC):
@abstractmethod
def submit(
self,
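[Editor's note: `AsyncEvaluator` captures a submit-now / collect-later workflow: `submit` sends examples out for (typically human) evaluation, and results are fetched once annotators finish. Below is a self-contained sketch of that pattern; everything beyond the `submit` name is an assumption for illustration.]

from abc import ABC, abstractmethod

class AsyncEvaluatorSketch(ABC):
    @abstractmethod
    def submit(self, *run_ids: str) -> str:
        """Send examples out for evaluation; return a handle to fetch results."""

    @abstractmethod
    def retrieve(self, handle: str) -> dict:
        """Collect whatever evaluations have been completed so far."""

class InMemoryHumanEval(AsyncEvaluatorSketch):
    def __init__(self) -> None:
        self._pending: dict[str, list[str]] = {}

    def submit(self, *run_ids: str) -> str:
        # Record the submission and hand back a handle for later retrieval.
        handle = f"eval-{len(self._pending)}"
        self._pending[handle] = list(run_ids)
        return handle

    def retrieve(self, handle: str) -> dict:
        return {"runs": self._pending.get(handle, [])}

evaluator = InMemoryHumanEval()
handle = evaluator.submit("run-1", "run-2")
print(evaluator.retrieve(handle))  # {'runs': ['run-1', 'run-2']}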
