WIP: IL-259
Merlin Kallenborn committed Feb 21, 2024
1 parent d043d70 commit 0c3d7a1
Showing 11 changed files with 196 additions and 218 deletions.
15 changes: 7 additions & 8 deletions src/intelligence_layer/evaluation/__init__.py
@@ -1,4 +1,11 @@
from .accumulator import MeanAccumulator as MeanAccumulator
from .argilla import ArgillaEvaluator as ArgillaEvaluator
from .argilla import (
InstructComparisonArgillaAggregationLogic as InstructComparisonArgillaAggregationLogic,
)
from .argilla import (
InstructComparisonArgillaEvaluationLogic as InstructComparisonArgillaEvaluationLogic,
)
from .data_storage.aggregation_repository import (
AggregationRepository as AggregationRepository,
)
@@ -49,19 +56,11 @@
from .elo import PayoffMatrix as PayoffMatrix
from .elo import PlayerScore as PlayerScore
from .elo import WinRateCalculator as WinRateCalculator
from .evaluator import ArgillaEvaluator as ArgillaEvaluator
from .evaluator import BaseEvaluator as BaseEvaluator
from .evaluator import Evaluator as Evaluator
from .graders import BleuGrader as BleuGrader
from .graders import RougeGrader as RougeGrader
from .graders import RougeScores as RougeScores
from .hugging_face import HuggingFaceDatasetRepository as HuggingFaceDatasetRepository
from .instruct_comparison_argilla_evaluator import (
InstructComparisonArgillaAggregationLogic as InstructComparisonArgillaAggregationLogic,
)
from .instruct_comparison_argilla_evaluator import (
InstructComparisonArgillaEvaluationLogic as InstructComparisonArgillaEvaluationLogic,
)
from .runner import Runner as Runner

__all__ = [symbol for symbol in dir()]
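
As a quick illustration (a sketch, not part of this commit): with these re-exports in place, the Instruct-comparison Argilla logic should still be importable from the package root even though it now lives in `.argilla`.

# Sketch: the public import path stays stable after the module move.
from intelligence_layer.evaluation import (
    InstructComparisonArgillaAggregationLogic,
    InstructComparisonArgillaEvaluationLogic,
)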
@@ -1,38 +1,102 @@
import random
from abc import ABC, abstractmethod
from itertools import combinations
from typing import Iterable, Mapping, Optional

from pydantic import BaseModel

from intelligence_layer.connectors import Field
from intelligence_layer.connectors.argilla.argilla_client import (
ArgillaClient,
ArgillaEvaluation,
Field,
Question,
RecordData,
RecordDataSequence,
)
from intelligence_layer.core.complete import InstructInput, PromptOutput
from intelligence_layer.evaluation import (
ArgillaEvaluationRepository,
EvaluationRepository,
Example,
MeanAccumulator,
from intelligence_layer.core import Input, InstructInput, Output, PromptOutput
from intelligence_layer.evaluation.data_storage.run_repository import RunRepository
from intelligence_layer.evaluation.data_storage.dataset_repository import DatasetRepository
from intelligence_layer.evaluation.evaluator import Evaluator

from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
from intelligence_layer.evaluation.domain import (
AggregatedEvaluation,
SuccessfulExampleOutput, ExpectedOutput, Example,
)
from intelligence_layer.evaluation.elo import (
AutomatedEloComparison,
EloCalculator,
EloComparison,
PlayerScore,
WinRateCalculator,
build_tournaments,
)
from intelligence_layer.evaluation.evaluator import (
AggregationLogic,
ArgillaEvaluationLogic,
build_tournaments, PlayerScore,
)


class ArgillaEvaluationLogic(
EvaluationLogic[Input, Output, ExpectedOutput, RecordDataSequence], ABC
):
def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
return self._to_record(example, *output)

@abstractmethod
def _to_record(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> RecordDataSequence:
"""This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`
Args:
example: The example to be translated.
output: The output of the example that was run.
"""
...
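
For orientation, a minimal sketch of a concrete subclass (not part of this commit): the field names in `content` and `metadata` are made up, and it assumes `RecordDataSequence` wraps its records in a `records` field.

class CompletionArgillaEvaluationLogic(
    ArgillaEvaluationLogic[InstructInput, PromptOutput, None]
):
    def _to_record(
        self,
        example: Example[InstructInput, None],
        *output: SuccessfulExampleOutput[PromptOutput],
    ) -> RecordDataSequence:
        # One Argilla record per run output, carrying the run id as metadata.
        return RecordDataSequence(
            records=[
                RecordData(
                    content={
                        "instruction": example.input.instruction,
                        "completion": single_output.output.completion,
                    },
                    example_id=example.id,
                    metadata={"run_id": single_output.run_id},
                )
                for single_output in output
            ]
        )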


class ArgillaEvaluator(
Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation],
ABC,
):
"""Evaluator used to integrate with Argilla (https://github.com/argilla-io/argilla).
Use this evaluator if you want to collect human evaluations with minimal setup.
This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method.
Args:
evaluation_repository: The repository that will be used to store evaluation results.
dataset_repository: The repository containing the examples used for the evaluation.
description: Human-readable description of the evaluator.
# TODO: docstrings
"""

def __init__(
self,
dataset_repository: DatasetRepository,
run_repository: RunRepository,
evaluation_repository: ArgillaEvaluationRepository,
aggregation_repository: AggregationRepository,
description: str,
evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation],
) -> None:
super().__init__(
dataset_repository,
run_repository,
evaluation_repository,
aggregation_repository,
description,
evaluation_logic, # type: ignore
aggregation_logic, # TODO: check if the non-matching types of the evaluation logic and aggregation logic (in the line above) are a problem
)

def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore
return ArgillaEvaluation
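
A wiring sketch for the new constructor (not part of this commit): it only illustrates the argument order added above. The repository and logic instances are assumed to be built elsewhere, and since the class is still marked ABC in this WIP state, a thin concrete subclass may be needed in practice.

def build_argilla_evaluator(
    dataset_repository: DatasetRepository,
    run_repository: RunRepository,
    evaluation_repository: ArgillaEvaluationRepository,
    aggregation_repository: AggregationRepository,
    evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput],
    aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation],
) -> ArgillaEvaluator[Input, Output, ExpectedOutput, AggregatedEvaluation]:
    # Argument order mirrors ArgillaEvaluator.__init__ above.
    return ArgillaEvaluator(
        dataset_repository,
        run_repository,
        evaluation_repository,
        aggregation_repository,
        "human eval via Argilla",
        evaluation_logic,
        aggregation_logic,
    )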


class AggregatedInstructComparison(BaseModel):
scores: Mapping[str, PlayerScore]

@@ -97,29 +161,25 @@ def __init__(
def _to_record(
self,
example: Example[InstructInput, None],
*outputs: PromptOutput,
*outputs: SuccessfulExampleOutput[PromptOutput],
) -> RecordDataSequence:
def create_record_data(
first: PromptOutput,
second: PromptOutput,
first: SuccessfulExampleOutput[PromptOutput],
second: SuccessfulExampleOutput[PromptOutput],
) -> RecordData:
if random.choice([True, False]):
first, second = second, first
return RecordData(
content={
self._fields["KEY_INSTRUCTION"].name: example.input.instruction,
self._fields["KEY_INPUT"].name: example.input.input or "",
self._fields["KEY_RESPONSE_1"].name: first.completion,
self._fields["KEY_RESPONSE_2"].name: second.completion,
self._fields["KEY_RESPONSE_1"].name: first.output.completion,
self._fields["KEY_RESPONSE_2"].name: second.output.completion,
},
example_id=example.id,
metadata={
self._fields[
"KEY_RESPONSE_1"
].name: "1", # TODO: first.run_id (SuccessfulExampleOutput[PromptOutput])
self._fields[
"KEY_RESPONSE_2"
].name: "2", # TODO: second.run_id (SuccessfulExampleOutput[PromptOutput])
self._fields["KEY_RESPONSE_1"].name: first.run_id,
self._fields["KEY_RESPONSE_2"].name: second.run_id,
},
)

@@ -131,7 +191,7 @@ def create_record_data(
if self._high_priority_runs is None
or any(
run_id in self._high_priority_runs
for run_id in ["1", "2"] # TODO first.run_id, second.run_id
for run_id in [first.run_id, second.run_id]
)
]
)
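
The list comprehension above pairs run outputs and keeps only pairs that involve a high-priority run. A standalone sketch of that selection pattern (illustrative names, not library code):

from itertools import combinations
from typing import Optional, Sequence

def select_run_pairs(
    run_ids: Sequence[str], high_priority_runs: Optional[frozenset[str]]
) -> list[tuple[str, str]]:
    # Keep every unordered pair when no priority set is given; otherwise keep
    # only pairs in which at least one run is high priority.
    return [
        (first, second)
        for first, second in combinations(run_ids, 2)
        if high_priority_runs is None
        or any(run_id in high_priority_runs for run_id in (first, second))
    ]

# e.g. select_run_pairs(["a", "b", "c"], frozenset({"a"})) == [("a", "b"), ("a", "c")]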
49 changes: 49 additions & 0 deletions src/intelligence_layer/evaluation/base_logic.py
@@ -0,0 +1,49 @@
from abc import ABC, abstractmethod
from typing import Generic, Iterable

from intelligence_layer.core import Input, Output
from intelligence_layer.evaluation import Evaluation, Example, ExpectedOutput
from intelligence_layer.evaluation.domain import (
AggregatedEvaluation,
SuccessfulExampleOutput,
)


class AggregationLogic(ABC, Generic[Evaluation, AggregatedEvaluation]):
@abstractmethod
def aggregate(self, evaluations: Iterable[Evaluation]) -> AggregatedEvaluation:
"""`Evaluator`-specific method for aggregating individual `Evaluations` into report-like `Aggregated Evaluation`.
This method is responsible for taking the results of an evaluation run and aggregating all the results.
It should create an `AggregatedEvaluation` class and return it at the end.
Args:
evaluations: The results from running `eval_and_aggregate_runs` with a :class:`Task`.
Returns:
The aggregated results of an evaluation run with a :class:`Dataset`.
"""
...
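
A minimal sketch of a concrete aggregation (not from this commit): the `ScoreEvaluation` and `MeanScore` models are illustrative stand-ins for use-case-specific types.

from statistics import mean

from pydantic import BaseModel


class ScoreEvaluation(BaseModel):
    score: float


class MeanScore(BaseModel):
    mean_score: float


class MeanScoreAggregationLogic(AggregationLogic[ScoreEvaluation, MeanScore]):
    def aggregate(self, evaluations: Iterable[ScoreEvaluation]) -> MeanScore:
        # Collapse all per-example scores into a single report value.
        scores = [evaluation.score for evaluation in evaluations]
        return MeanScore(mean_score=mean(scores) if scores else 0.0)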


class EvaluationLogic(ABC, Generic[Input, Output, ExpectedOutput, Evaluation]):
@abstractmethod
def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*output: SuccessfulExampleOutput[Output],
) -> Evaluation:
"""Executes the evaluation for this use-case.
Responsible for comparing the input & expected output of a task to the
actually generated output.
Args:
TODO: find a better way to describe this
example: The example whose input was passed to the :class:`Task` to produce the output.
output: Output of the :class:`Task` that shall be evaluated.
Returns:
The metrics that come from the evaluated :class:`Task`.
"""
pass
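
And a minimal sketch of a concrete evaluation logic (not from this commit): it assumes a plain string task and that `Example` exposes an `expected_output` field; `ExactMatchEvaluation` is an illustrative model.

from pydantic import BaseModel


class ExactMatchEvaluation(BaseModel):
    correct: bool


class ExactMatchEvaluationLogic(EvaluationLogic[str, str, str, ExactMatchEvaluation]):
    def do_evaluate(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> ExactMatchEvaluation:
        # Mark the example as correct if any run produced the expected string.
        return ExactMatchEvaluation(
            correct=any(o.output == example.expected_output for o in output)
        )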