feat: move shared evaluation behavior to superclass
NiklasKoehneckeAA committed May 13, 2024
1 parent c84de71 commit 6383ed0
Showing 4 changed files with 256 additions and 100 deletions.
4 changes: 2 additions & 2 deletions src/intelligence_layer/evaluation/aggregation/aggregator.py
@@ -112,7 +112,7 @@ def __init__(
def _get_types(self) -> Mapping[str, type]:
"""Type magic function that gets the actual types of the generic parameters.
-        Traverses the inheritance history of `BaseEvaluator`-subclass to find an actual type every time a TypeVar is replaced.
+        Traverses the inheritance history of `AggregationLogic`-subclass to find an actual type every time a TypeVar is replaced.
Returns:
Name of generic parameter to the type found.
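[Editor's note: the "type magic" described in this docstring can be illustrated with a small, self-contained sketch. The names below are assumptions for illustration, not the library's implementation: a class walks its `__orig_bases__` to map each `TypeVar` to the concrete type a subclass bound it to.]

from typing import Generic, TypeVar, get_args, get_origin

A = TypeVar("A")

class AggregationLogicSketch(Generic[A]):
    @classmethod
    def resolved_types(cls) -> dict[str, type]:
        # Walk the generic bases recorded on the subclass and pair each
        # declared TypeVar with the argument it was instantiated with.
        mapping: dict[str, type] = {}
        for base in getattr(cls, "__orig_bases__", ()):
            origin = get_origin(base)
            if origin is None:
                continue
            for var, arg in zip(origin.__parameters__, get_args(base)):
                mapping[var.__name__] = arg
        return mapping

class MeanAggregation(AggregationLogicSketch[float]):
    pass

print(MeanAggregation.resolved_types())  # {'A': <class 'float'>}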
@@ -186,7 +186,7 @@ def aggregate_evaluation(
) -> AggregationOverview[AggregatedEvaluation]:
"""Aggregates all evaluations into an overview that includes high-level statistics.
-        Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`.
+        Aggregates :class:`Evaluation`s according to the implementation of :func:`AggregationLogic.aggregate`.
Args:
eval_ids: An overview of the evaluation to be aggregated. Does not include
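[Editor's note: to make the delegation concrete, here is a hedged sketch of what an `AggregationLogic`-style implementation might look like. The `aggregate` signature and all names are assumptions based on the docstring, not the library's API.]

from dataclasses import dataclass
from statistics import mean
from typing import Iterable

@dataclass
class ExampleEvaluation:
    score: float

@dataclass
class AggregatedScore:
    mean_score: float
    evaluation_count: int

class MeanScoreAggregationLogic:
    def aggregate(self, evaluations: Iterable[ExampleEvaluation]) -> AggregatedScore:
        # Reduce per-example evaluations into one high-level statistic.
        scores = [evaluation.score for evaluation in evaluations]
        return AggregatedScore(
            mean_score=mean(scores) if scores else 0.0,
            evaluation_count=len(scores),
        )

print(MeanScoreAggregationLogic().aggregate([ExampleEvaluation(0.2), ExampleEvaluation(0.8)]))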
@@ -72,18 +72,16 @@ class ArgillaEvaluator(AsyncEvaluator[Input, Output, ExpectedOutput, Evaluation]
Use this evaluator if you would like to easily do human eval.
This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
Arguments:
dataset_repository: The repository with the examples that will be taken for the evaluation.
run_repository: The repository of the runs to evaluate.
evaluation_repository: The repository that will be used to store evaluation results.
description: Human-readable description for the evaluator.
evaluation_logic: The logic to use for evaluation.
argilla_client: The client to interface with argilla.
workspace_id: The argilla workspace id where datasets are created for evaluation.
Generics:
Input: Interface to be passed to the :class:`Task` that shall be evaluated.
Output: Type of the output of the :class:`Task` to be evaluated.
ExpectedOutput: Output that is expected from the run with the supplied input.
ArgillaEvaluation: Interface of the metrics that come from the Argilla task`.
+    See the :class:`EvaluatorBase` for more information.
"""

def __init__(
@@ -103,11 +101,11 @@ def __init__(
run_repository,
evaluation_repository,
description,
-            evaluation_logic,  # type: ignore
+            evaluation_logic,
)
self._client = argilla_client
self._workspace_id = workspace_id
-        self._evaluation_logic: ArgillaEvaluationLogic[  # type: ignore
+        self._evaluation_logic: ArgillaEvaluationLogic[
Input, Output, ExpectedOutput, Evaluation
]
self._evaluation_repository: AsyncEvaluationRepository
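[Editor's note: the re-annotation of `self._evaluation_logic` above, together with the dropped `# type: ignore` comments, uses a standard narrowing pattern: a subclass re-declares an attribute assigned by the superclass with a more specific type. A minimal sketch with illustrative names:]

class EvaluationLogic: ...

class ArgillaLogicSketch(EvaluationLogic):
    def to_record(self) -> str:
        return "record"

class EvaluatorBaseSketch:
    def __init__(self, logic: EvaluationLogic) -> None:
        self._logic = logic

class ArgillaEvaluatorSketch(EvaluatorBaseSketch):
    # A bare annotation (no assignment) narrows the inherited attribute's
    # type, so later uses type-check without a "type: ignore" comment.
    _logic: ArgillaLogicSketch

    def __init__(self, logic: ArgillaLogicSketch) -> None:
        super().__init__(logic)

    def submit(self) -> str:
        # Type-checks: _logic is known to be ArgillaLogicSketch here.
        return self._logic.to_record()

print(ArgillaEvaluatorSketch(ArgillaLogicSketch()).submit())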
@@ -127,7 +125,7 @@ def submit(

run_overviews = self._load_run_overviews(*run_ids)
submit_count = 0
-        for example, outputs in self.retrieve_eval_logic_input(
+        for example, outputs in self._retrieve_eval_logic_input(
run_overviews, num_examples=num_examples
):
record_sequence = self._evaluation_logic._to_record(example, *outputs)
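[Editor's note: the loop above hands each example and its outputs to `_to_record`, which turns them into Argilla records for human annotators. A self-contained sketch of that shape follows; the field names are illustrative assumptions, not Argilla's schema.]

from dataclasses import dataclass

@dataclass
class Example:
    input_text: str
    expected_output: str

@dataclass
class RecordData:
    content: dict[str, str]

def to_record(example: Example, output: str) -> RecordData:
    # Show the annotator the task input next to the model output to be judged.
    return RecordData(content={"input": example.input_text, "output": output})

print(to_record(Example("Summarize: ...", "reference summary"), "model summary").content)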
@@ -11,10 +11,10 @@
from intelligence_layer.evaluation.evaluation.evaluation_repository import (
EvaluationRepository,
)
-from intelligence_layer.evaluation.evaluation.evaluator import Evaluator
+from intelligence_layer.evaluation.evaluation.evaluator import EvaluatorBase


-class AsyncEvaluator(Evaluator[Input, Output, ExpectedOutput, Evaluation], ABC):
+class AsyncEvaluator(EvaluatorBase[Input, Output, ExpectedOutput, Evaluation], ABC):
@abstractmethod
def submit(
self,
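[Editor's note: `AsyncEvaluator` captures a submit-now / collect-later workflow: `submit` sends examples out for (typically human) evaluation, and results are fetched once annotators finish. Below is a self-contained sketch of that pattern; everything beyond the `submit` name is an assumption for illustration.]

from abc import ABC, abstractmethod

class AsyncEvaluatorSketch(ABC):
    @abstractmethod
    def submit(self, *run_ids: str) -> str:
        """Send examples out for evaluation; return a handle to fetch results."""

    @abstractmethod
    def retrieve(self, handle: str) -> dict:
        """Collect whatever evaluations have been completed so far."""

class InMemoryHumanEval(AsyncEvaluatorSketch):
    def __init__(self) -> None:
        self._pending: dict[str, list[str]] = {}

    def submit(self, *run_ids: str) -> str:
        # Record the submission and hand back a handle for later retrieval.
        handle = f"eval-{len(self._pending)}"
        self._pending[handle] = list(run_ids)
        return handle

    def retrieve(self, handle: str) -> dict:
        return {"runs": self._pending.get(handle, [])}

evaluator = InMemoryHumanEval()
handle = evaluator.submit("run-1", "run-2")
print(evaluator.retrieve(handle))  # {'runs': ['run-1', 'run-2']}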
