
Commit

IL-259 fix TODOs
Merlin Kallenborn committed Feb 21, 2024
1 parent ce43c27 commit f313de6
Showing 4 changed files with 30 additions and 19 deletions.
18 changes: 14 additions & 4 deletions src/intelligence_layer/evaluation/argilla.py
@@ -80,11 +80,21 @@ class ArgillaEvaluator(
     This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
     After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method.
-    Args:
+    Arguments:
         dataset_repository: The repository with the examples that will be taken for the evaluation.
         run_repository: The repository of the runs to evaluate.
         evaluation_repository: The repository that will be used to store evaluation results.
-        dataset_repository: The repository with the examples that will be taken for the evaluation
-        description: human-readable description for the evaluator
-        # TODO: docstrings
+        aggregation_repository: The repository that will be used to store aggregation results.
+        description: Human-readable description for the evaluator.
+        evaluation_logic: The logic to use for evaluation.
+        aggregation_logic: The logic to aggregate the evaluations.
+    Generics:
+        Input: Interface to be passed to the :class:`Task` that shall be evaluated.
+        Output: Type of the output of the :class:`Task` to be evaluated.
+        ExpectedOutput: Output that is expected from the run with the supplied input.
+        ArgillaEvaluation: Interface of the metrics that come from the Argilla task.
+        AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`.
     """

     def __init__(
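For orientation, the pieces named in this docstring might be wired up roughly as follows. This is an illustrative sketch, not code from the commit: the import path, the exact constructor signature, and the helper name build_argilla_evaluator are assumptions; only the keyword names and the `aggregate_evaluation` method come from the docstring above.

from intelligence_layer.evaluation.argilla import ArgillaEvaluator  # assumed import path


def build_argilla_evaluator(
    dataset_repository,      # examples whose inputs/outputs get sent to Argilla
    run_repository,          # task runs whose outputs are to be annotated
    evaluation_repository,   # where the ArgillaEvaluation results end up
    aggregation_repository,  # where aggregated results are stored
    evaluation_logic,        # builds the records shown to annotators
    aggregation_logic,       # turns ArgillaEvaluation records into an aggregate
) -> ArgillaEvaluator:
    # Keyword names mirror the argument list documented above.
    return ArgillaEvaluator(
        dataset_repository=dataset_repository,
        run_repository=run_repository,
        evaluation_repository=evaluation_repository,
        aggregation_repository=aggregation_repository,
        description="human feedback collected via Argilla",
        evaluation_logic=evaluation_logic,
        aggregation_logic=aggregation_logic,
    )


# Once annotators have labelled the records in Argilla, results are fetched with
# the `aggregate_evaluation` method mentioned in the docstring; its parameters
# are not shown in this commit, so they are omitted here.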
7 changes: 3 additions & 4 deletions src/intelligence_layer/evaluation/base_logic.py
@@ -35,15 +35,14 @@ def do_evaluate(
         example: Example[Input, ExpectedOutput],
         *output: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
-        """Executes the evaluation for this use-case.
+        """Executes the evaluation for this specific example.
         Responsible for comparing the input & expected output of a task to the
         actually generated output.
         Args:
-            TODO: find a better way to describe this
-            example: The data example data whose input was passed to the :class:`Task` to produce the output.
-            output: Output of the :class:`Task` that shall be evaluated.
+            example: Input data of :class:`Task` to produce the output.
+            output: Output of the :class:`Task`.
         Returns:
             The metrics that come from the evaluated :class:`Task`.
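To make the hook above concrete, here is a minimal sketch of an evaluation logic for a string-matching use case, modelled on the summarize example later in this commit. The import path, the generic parameter order of EvaluationLogic, and the ExactMatch* names are assumptions.

from pydantic import BaseModel

# Assumed import path; these classes live under src/intelligence_layer/evaluation.
from intelligence_layer.evaluation import EvaluationLogic, Example, SuccessfulExampleOutput


class ExactMatchEvaluation(BaseModel):
    """Hypothetical per-example result."""

    correct: bool


class ExactMatchEvaluationLogic(EvaluationLogic[str, str, str, ExactMatchEvaluation]):
    # Generic order assumed to be Input, Output, ExpectedOutput, Evaluation.
    def do_evaluate(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> ExactMatchEvaluation:
        # Compare the task's actual output against the expected output stored
        # with the example, as described in the docstring above.
        assert len(output) == 1
        return ExactMatchEvaluation(correct=output[0].output == example.expected_output)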
12 changes: 8 additions & 4 deletions src/intelligence_layer/evaluation/evaluator.py
@@ -83,13 +83,17 @@ class Evaluator(
     """Evaluator that can handle automatic evaluation scenarios.
     This evaluator should be used for automatic eval. A user still has to implement
-    :func:`BaseEvaluator.do_evaluate` and :func:`BaseEvaluator.aggregate`.
+    :class:`EvaluationLogic` and :class:`AggregationLogic`.
-    #TODO fix arguments
     Arguments:
         dataset_repository: The repository with the examples that will be taken for the evaluation.
         run_repository: The repository of the runs to evaluate.
         evaluation_repository: The repository that will be used to store evaluation results.
-        dataset_repository: The repository with the examples that will be taken for the evaluation
-        description: human-readable description for the evaluator
+        aggregation_repository: The repository that will be used to store aggregation results.
+        description: Human-readable description for the evaluator.
+        evaluation_logic: The logic to use for evaluation.
+        aggregation_logic: The logic to aggregate the evaluations.
     Generics:
         Input: Interface to be passed to the :class:`Task` that shall be evaluated.
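Putting the argument list above together, constructing an automatic evaluator might look roughly like this. It is a sketch only: the InMemory* repository class names and the exact constructor signature are assumptions, and the logic arguments stand for user-defined subclasses of EvaluationLogic and AggregationLogic.

# Hypothetical wiring; the InMemory* class names are assumptions and may differ
# from what the package actually exports.
from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryAggregationRepository,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
)

evaluator = Evaluator(
    dataset_repository=InMemoryDatasetRepository(),
    run_repository=InMemoryRunRepository(),
    evaluation_repository=InMemoryEvaluationRepository(),
    aggregation_repository=InMemoryAggregationRepository(),
    description="exact-match evaluation of my task",
    evaluation_logic=ExactMatchEvaluationLogic(),  # from the sketch above
    aggregation_logic=my_aggregation_logic,        # stand-in for a user-defined AggregationLogic subclass
)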
12 changes: 5 additions & 7 deletions src/intelligence_layer/use_cases/summarize/summarize.py
@@ -93,7 +93,6 @@ class AggregatedSummarizeEvaluation(BaseModel):
     Attributes:
         aggregate_bleu: average over BLEU-scores
         aggregate_rouge: average over ROUGE-scores
-        evaluation: The actual evaluations
     """

     aggregate_bleu: float
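The two aggregate fields are plain means over the per-example scores. A minimal sketch of that aggregation step, assuming SummarizeEvaluation exposes per-example bleu and rouge fields (the code that constructs it is truncated below) and assuming the import path from the file location:

from statistics import mean
from typing import Iterable

from intelligence_layer.use_cases.summarize.summarize import (  # assumed import path
    AggregatedSummarizeEvaluation,
    SummarizeEvaluation,
)


def aggregate_summarize_evaluations(
    evaluations: Iterable[SummarizeEvaluation],
) -> AggregatedSummarizeEvaluation:
    scores = list(evaluations)
    # Field names `bleu` and `rouge` on SummarizeEvaluation are assumed.
    return AggregatedSummarizeEvaluation(
        aggregate_bleu=mean(e.bleu for e in scores),
        aggregate_rouge=mean(e.rouge for e in scores),
    )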
@@ -130,10 +129,10 @@ def do_evaluate(
         assert len(output) == 1
         single_output = output[0].output
         bleu_score = self.bleu_grader.calculate_bleu(
-            single_output.summary, example.expected_output  # TODO
+            single_output.summary, example.expected_output
         )
         rouge_score = self.rouge_grader.calculate_rouge(
-            single_output.summary, example.expected_output  # TODO
+            single_output.summary, example.expected_output
         )

         return SummarizeEvaluation(
@@ -163,14 +162,13 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    # mypy expects *args where this method only uses one output
-    def do_evaluate(  # type: ignore
+    def do_evaluate(
         self,
         example: Example[LongContextSummarizeInput, str],
-        *output: LongContextSummarizeOutput,
+        *output: SuccessfulExampleOutput[LongContextSummarizeOutput],
     ) -> SummarizeEvaluation:
         assert len(output) == 1
-        single_output = output[0]
+        single_output = output[0].output
         joint_summary = " ".join(
             partial_summary.summary
             for partial_summary in single_output.partial_summaries
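The last hunk is the actual TODO fix for mypy: the override now takes SuccessfulExampleOutput[LongContextSummarizeOutput], matching the base hook in base_logic.py above, so the explanatory comment and the `# type: ignore` could be dropped. The wrapper carries the task output one level deeper, which is why the body changes to output[0].output. A small sketch of the resulting access pattern (attribute names other than `output` are taken from the diff itself):

# `output[0]` is a SuccessfulExampleOutput wrapping run metadata plus the task output;
# the LongContextSummarizeOutput itself now sits under `.output`.
single_output = output[0].output
joint_summary = " ".join(
    partial_summary.summary for partial_summary in single_output.partial_summaries
)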
