From 151a331ead5f421ab98eb26237d56538fae71282 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Thu, 2 May 2024 16:51:32 +0200
Subject: [PATCH] feat: Allow IncrementalEvaluator to operate with lists of previous evaluation_ids.

Task: IL-315
---
 .../evaluation/evaluation/evaluator.py  | 74 ++++++++++++++---
 tests/evaluation/test_diff_evaluator.py | 80 ++++++++++++++-----
 2 files changed, 124 insertions(+), 30 deletions(-)

diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator.py
index 53d265354..42065419d 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -70,9 +70,11 @@ class IncrementalEvaluationLogic(
 ):
     def __init__(self) -> None:
         super().__init__()
-        self._previous_run_output_ids: set[str] = set()
+        self._previous_run_output_ids: list[set[str]] = []
 
-    def set_previous_run_output_ids(self, previous_run_output_ids: set[str]) -> None:
+    def set_previous_run_output_ids(
+        self, previous_run_output_ids: list[set[str]]
+    ) -> None:
         self._previous_run_output_ids = previous_run_output_ids
 
     def do_evaluate(
@@ -80,12 +82,19 @@ def do_evaluate(
         example: Example[Input, ExpectedOutput],
         *outputs: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
-        evaluated_outputs = [
+        flattened_run_output_ids: set[str] = set()
+        evaluated_outputs = []
+        for run_output_ids in self._previous_run_output_ids:
+            flattened_run_output_ids = flattened_run_output_ids.union(run_output_ids)
+            evaluated_outputs.append(
+                [output for output in outputs if output.run_id in run_output_ids]
+            )
+
+        new_outputs = [
             output
             for output in outputs
-            if output.run_id in self._previous_run_output_ids
+            if output.run_id not in flattened_run_output_ids
         ]
-        new_outputs = [output for output in outputs if output not in evaluated_outputs]
         return self.do_incremental_evaluate(example, new_outputs, evaluated_outputs)
 
     @abstractmethod
@@ -93,7 +102,7 @@ def do_incremental_evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         outputs: list[SuccessfulExampleOutput[Output]],
-        evaluated_outputs: list[SuccessfulExampleOutput[Output]],
+        evaluated_outputs: list[list[SuccessfulExampleOutput[Output]]],
     ) -> Evaluation:
         pass
 
@@ -484,6 +493,22 @@ def evaluation_lineage(
 
 
 class IncrementalEvaluator(Evaluator[Input, Output, ExpectedOutput, Evaluation]):
+    """:class:`Evaluator` for evaluating additional runs on top of previous evaluations. Intended for use with :class:`IncrementalEvaluationLogic`.
+
+    Args:
+        dataset_repository: The repository with the examples that will be taken for the evaluation.
+        run_repository: The repository of the runs to evaluate.
+        evaluation_repository: The repository that will be used to store evaluation results.
+        description: Human-readable description for the evaluator.
+        incremental_evaluation_logic: The logic to use for evaluation.
+
+    Generics:
+        Input: Interface to be passed to the :class:`Task` that shall be evaluated.
+        Output: Type of the output of the :class:`Task` to be evaluated.
+        ExpectedOutput: Output that is expected from the run with the supplied input.
+        Evaluation: Interface of the metrics that come from the evaluated :class:`Task`.
+    """
+
     def __init__(
         self,
         dataset_repository: DatasetRepository,
@@ -505,17 +530,44 @@ def __init__(
     def evaluate_additional_runs(
         self,
         *run_ids: str,
-        previous_evaluation_id: Optional[str] = None,
+        previous_evaluation_ids: Optional[list[str]] = None,
         num_examples: Optional[int] = None,
        abort_on_error: bool = False,
     ) -> EvaluationOverview:
-        previous_run_ids = set()
+        """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_ids`.
+
+        For each set of successful outputs in the referenced runs,
+        :func:`EvaluationLogic.do_evaluate` is called and eval metrics are produced and
+        stored in the provided :class:`EvaluationRepository`.
+
+        Args:
+            run_ids: The runs to be evaluated. Each run is expected to have the same
+                dataset as input (which implies their tasks have the same input-type)
+                and their tasks have the same output-type. For each example in the
+                dataset referenced by the runs the outputs of all runs are collected
+                and, if all of them were successful, they are passed on to the
+                implementation-specific evaluation. The method compares all runs of the provided ids to each other.
+            previous_evaluation_ids: IDs of previous evaluations whose run outputs are treated as already evaluated.
+            num_examples: The number of examples which should be evaluated from the given runs.
+                Always the first n examples of the dataset are used.
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
+
+        Returns:
+            EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
+                returned but instead stored in the :class:`EvaluationRepository` provided in the
+                __init__.
+        """
+
+        previous_run_ids = []
+        previous_evaluation_ids = previous_evaluation_ids or []
 
-        if previous_evaluation_id is not None:
+        for previous_evaluation_id in previous_evaluation_ids:
+            prev_run_ids: set[str] = set()
             lineages = self.evaluation_lineages(previous_evaluation_id)
             for lineage in lineages:
                 for output in lineage.outputs:
-                    previous_run_ids.add(output.run_id)
+                    prev_run_ids.add(output.run_id)
+            previous_run_ids.append(prev_run_ids)
 
         cast(
             IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
@@ -534,7 +586,7 @@ def evaluate_runs(
         cast(
             IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
             self._evaluation_logic,
-        ).set_previous_run_output_ids(set())
+        ).set_previous_run_output_ids([])
         return super().evaluate_runs(
             *run_ids, num_examples=num_examples, abort_on_error=abort_on_error
         )
diff --git a/tests/evaluation/test_diff_evaluator.py b/tests/evaluation/test_diff_evaluator.py
index 6ea1fdbab..f0e408ea4 100644
--- a/tests/evaluation/test_diff_evaluator.py
+++ b/tests/evaluation/test_diff_evaluator.py
@@ -22,7 +22,7 @@
 
 class DummyEvaluation(BaseModel):
     new_run_ids: list[str]
-    old_run_ids: list[str]
+    old_run_ids: list[list[str]]
 
 
 class DummyIncrementalLogic(IncrementalEvaluationLogic[str, str, str, DummyEvaluation]):
@@ -33,11 +33,14 @@ def do_incremental_evaluate(
         self,
         example: Example[str, str],
         outputs: list[SuccessfulExampleOutput[str]],
-        evaluated_outputs: list[SuccessfulExampleOutput[str]],
+        evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
     ) -> DummyEvaluation:
         return DummyEvaluation(
             new_run_ids=[output.run_id for output in outputs],
-            old_run_ids=[output.run_id for output in evaluated_outputs],
+            old_run_ids=[
+                [output.run_id for output in evaluated_output]
+                for evaluated_output in evaluated_outputs
+            ],
         )
 
 
@@ -60,13 +63,13 @@ def test_incremental_evaluator_should_filter_previous_run_ids() -> None:
     )
     run_repository = InMemoryRunRepository()
 
-    old_runner = Runner(
-        task=DummyTask("Task0"),
+    first_runner = Runner(
+        task=DummyTask("Task1"),
         dataset_repository=dataset_repository,
         run_repository=run_repository,
-        description="test_runner_0",
+        description="test_runner_1",
     )
-    old_run = old_runner.run_dataset(dataset.id)
+    first_run = first_runner.run_dataset(dataset.id)
 
     evaluation_repository = InMemoryEvaluationRepository()
     evaluator = IncrementalEvaluator(
@@ -76,25 +79,64 @@ def test_incremental_evaluator_should_filter_previous_run_ids() -> None:
         description="test_incremental_evaluator",
         incremental_evaluation_logic=DummyIncrementalLogic(),
     )
-    evaluation_overview = evaluator.evaluate_additional_runs(old_run.id)
+    first_evaluation_overview = evaluator.evaluate_additional_runs(first_run.id)
 
-    new_runner = Runner(
+    second_runner = Runner(
         task=DummyTask("Task2"),
         dataset_repository=dataset_repository,
         run_repository=run_repository,
         description="test_runner_2",
     )
-    new_run = new_runner.run_dataset(dataset.id)
+    second_run = second_runner.run_dataset(dataset.id)
 
-    # When
-    new_evaluation_overview = evaluator.evaluate_additional_runs(
-        old_run.id, new_run.id, previous_evaluation_id=evaluation_overview.id
+    second_evaluation_overview = evaluator.evaluate_additional_runs(
+        first_run.id,
+        second_run.id,
+        previous_evaluation_ids=[first_evaluation_overview.id],
     )
 
-    # Then
-    result = next(
-        iter(evaluator.evaluation_lineages(new_evaluation_overview.id))
+    second_result = next(
+        iter(evaluator.evaluation_lineages(second_evaluation_overview.id))
     ).evaluation.result
-    assert isinstance(result, DummyEvaluation)
-    assert result.new_run_ids == [new_run.id]
-    assert result.old_run_ids == [old_run.id]
+    assert isinstance(second_result, DummyEvaluation)
+    assert second_result.new_run_ids == [second_run.id]
+    assert second_result.old_run_ids == [[first_run.id]]
+
+    independent_runner = Runner(
+        task=DummyTask("TaskIndependent"),
+        dataset_repository=dataset_repository,
+        run_repository=run_repository,
+        description="test_runner_independent",
+    )
+    independent_run = independent_runner.run_dataset(dataset.id)
+    independent_evaluation_overview = evaluator.evaluate_additional_runs(
+        independent_run.id
+    )
+
+    third_runner = Runner(
+        task=DummyTask("Task3"),
+        dataset_repository=dataset_repository,
+        run_repository=run_repository,
+        description="test_runner_3",
+    )
+
+    third_run = third_runner.run_dataset(dataset.id)
+
+    third_evaluation_overview = evaluator.evaluate_additional_runs(
+        first_run.id,
+        second_run.id,
+        independent_run.id,
+        third_run.id,
+        previous_evaluation_ids=[
+            second_evaluation_overview.id,
+            independent_evaluation_overview.id,
+        ],
+    )
+
+    third_result = next(
+        iter(evaluator.evaluation_lineages(third_evaluation_overview.id))
+    ).evaluation.result
+    assert isinstance(third_result, DummyEvaluation)
+    assert third_result.new_run_ids == [third_run.id]
+    assert sorted(third_result.old_run_ids[0]) == sorted([first_run.id, second_run.id])
+    assert sorted(third_result.old_run_ids[1]) == sorted([independent_run.id])
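
Usage note (not part of the patch): a minimal sketch of how the new `previous_evaluation_ids` parameter is intended to be used, pieced together from the docstring and the test above. The repository objects, `DummyTask`, `DummyIncrementalLogic`, and the runner wiring are assumed to be set up as in tests/evaluation/test_diff_evaluator.py; the variable names here are illustrative only.

    # First evaluation: nothing has been evaluated yet, so every run counts as "new".
    evaluator = IncrementalEvaluator(
        dataset_repository=dataset_repository,
        run_repository=run_repository,
        evaluation_repository=evaluation_repository,
        description="incremental evaluation",
        incremental_evaluation_logic=DummyIncrementalLogic(),
    )
    first_overview = evaluator.evaluate_additional_runs(first_run.id)

    # Second evaluation: outputs whose run_id appears in any evaluation listed in
    # `previous_evaluation_ids` are handed to `do_incremental_evaluate` as one list of
    # already-evaluated outputs per previous evaluation; only the remaining runs
    # (here: second_run) arrive as new outputs.
    second_overview = evaluator.evaluate_additional_runs(
        first_run.id,
        second_run.id,
        previous_evaluation_ids=[first_overview.id],
    )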