From 151a331ead5f421ab98eb26237d56538fae71282 Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Thu, 2 May 2024 16:51:32 +0200
Subject: [PATCH] feat: Allow IncrementalEvaluator to operate with lists of previous evaluation_ids.

Task: IL-315
---
 .../evaluation/evaluation/evaluator.py  | 74 ++++++++++++++---
 tests/evaluation/test_diff_evaluator.py | 80 ++++++++++++++-----
 2 files changed, 124 insertions(+), 30 deletions(-)

diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator.py
index 53d265354..42065419d 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -70,9 +70,11 @@ class IncrementalEvaluationLogic(
 ):
     def __init__(self) -> None:
         super().__init__()
-        self._previous_run_output_ids: set[str] = set()
+        self._previous_run_output_ids: list[set[str]] = []
 
-    def set_previous_run_output_ids(self, previous_run_output_ids: set[str]) -> None:
+    def set_previous_run_output_ids(
+        self, previous_run_output_ids: list[set[str]]
+    ) -> None:
         self._previous_run_output_ids = previous_run_output_ids
 
     def do_evaluate(
@@ -80,12 +82,19 @@ def do_evaluate(
         example: Example[Input, ExpectedOutput],
         *outputs: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
-        evaluated_outputs = [
+        flattened_run_output_ids: set[str] = set()
+        evaluated_outputs = []
+        for run_output_ids in self._previous_run_output_ids:
+            flattened_run_output_ids = flattened_run_output_ids.union(run_output_ids)
+            evaluated_outputs.append(
+                [output for output in outputs if output.run_id in run_output_ids]
+            )
+
+        new_outputs = [
             output
             for output in outputs
-            if output.run_id in self._previous_run_output_ids
+            if output.run_id not in flattened_run_output_ids
         ]
-        new_outputs = [output for output in outputs if output not in evaluated_outputs]
         return self.do_incremental_evaluate(example, new_outputs, evaluated_outputs)
 
     @abstractmethod
@@ -93,7 +102,7 @@ def do_incremental_evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         outputs: list[SuccessfulExampleOutput[Output]],
-        evaluated_outputs: list[SuccessfulExampleOutput[Output]],
+        evaluated_outputs: list[list[SuccessfulExampleOutput[Output]]],
     ) -> Evaluation:
         pass
 
@@ -484,6 +493,22 @@ def evaluation_lineage(
 
 
 class IncrementalEvaluator(Evaluator[Input, Output, ExpectedOutput, Evaluation]):
+    """:class:`Evaluator` for evaluating additional runs on top of previous evaluations. Intended for use with :class:`IncrementalEvaluationLogic`.
+
+    Args:
+        dataset_repository: The repository with the examples that will be taken for the evaluation.
+        run_repository: The repository of the runs to evaluate.
+        evaluation_repository: The repository that will be used to store evaluation results.
+        description: Human-readable description for the evaluator.
+        incremental_evaluation_logic: The logic to use for evaluation.
+
+    Generics:
+        Input: Interface to be passed to the :class:`Task` that shall be evaluated.
+        Output: Type of the output of the :class:`Task` to be evaluated.
+        ExpectedOutput: Output that is expected from the run with the supplied input.
+        Evaluation: Interface of the metrics that come from the evaluated :class:`Task`.
+    """
+
     def __init__(
         self,
         dataset_repository: DatasetRepository,
@@ -505,17 +530,44 @@ def __init__(
     def evaluate_additional_runs(
         self,
         *run_ids: str,
-        previous_evaluation_id: Optional[str] = None,
+        previous_evaluation_ids: Optional[list[str]] = None,
         num_examples: Optional[int] = None,
        abort_on_error: bool = False,
     ) -> EvaluationOverview:
-        previous_run_ids = set()
+        """Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_ids`.
+
+        For each set of successful outputs in the referenced runs,
+        :func:`EvaluationLogic.do_evaluate` is called and eval metrics are produced and
+        stored in the provided :class:`EvaluationRepository`.
+
+        Args:
+            run_ids: The runs to be evaluated. Each run is expected to have the same
+                dataset as input (which implies their tasks have the same input-type)
+                and their tasks have the same output-type. For each example in the
+                dataset referenced by the runs the outputs of all runs are collected
+                and, if all of them were successful, they are passed on to the
+                implementation-specific evaluation. The method compares all runs of the provided ids to each other.
+            previous_evaluation_ids: IDs of previous evaluations whose run outputs are treated as already evaluated.
+            num_examples: The number of examples which should be evaluated from the given runs.
+                Always the first n examples of the dataset are used.
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
+
+        Returns:
+            EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
+                returned but instead stored in the :class:`EvaluationRepository` provided in the
+                __init__.
+        """
+
+        previous_run_ids = []
+        previous_evaluation_ids = previous_evaluation_ids or []
 
-        if previous_evaluation_id is not None:
+        for previous_evaluation_id in previous_evaluation_ids:
+            prev_run_ids: set[str] = set()
             lineages = self.evaluation_lineages(previous_evaluation_id)
             for lineage in lineages:
                 for output in lineage.outputs:
-                    previous_run_ids.add(output.run_id)
+                    prev_run_ids.add(output.run_id)
+            previous_run_ids.append(prev_run_ids)
 
         cast(
             IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
@@ -534,7 +586,7 @@ def evaluate_runs(
         cast(
             IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
             self._evaluation_logic,
-        ).set_previous_run_output_ids(set())
+        ).set_previous_run_output_ids([])
         return super().evaluate_runs(
             *run_ids, num_examples=num_examples, abort_on_error=abort_on_error
         )
diff --git a/tests/evaluation/test_diff_evaluator.py b/tests/evaluation/test_diff_evaluator.py
index 6ea1fdbab..f0e408ea4 100644
--- a/tests/evaluation/test_diff_evaluator.py
+++ b/tests/evaluation/test_diff_evaluator.py
@@ -22,7 +22,7 @@
 
 class DummyEvaluation(BaseModel):
     new_run_ids: list[str]
-    old_run_ids: list[str]
+    old_run_ids: list[list[str]]
 
 
 class DummyIncrementalLogic(IncrementalEvaluationLogic[str, str, str, DummyEvaluation]):
@@ -33,11 +33,14 @@ def do_incremental_evaluate(
         self,
         example: Example[str, str],
         outputs: list[SuccessfulExampleOutput[str]],
-        evaluated_outputs: list[SuccessfulExampleOutput[str]],
+        evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
     ) -> DummyEvaluation:
         return DummyEvaluation(
             new_run_ids=[output.run_id for output in outputs],
-            old_run_ids=[output.run_id for output in evaluated_outputs],
+            old_run_ids=[
+                [output.run_id for output in evaluated_output]
+                for evaluated_output in evaluated_outputs
+            ],
         )
 
 
@@ -60,13 +63,13 @@ def test_incremental_evaluator_should_filter_previous_run_ids() -> None:
     )
     run_repository = InMemoryRunRepository()
 
-    old_runner = Runner(
-        task=DummyTask("Task0"),
+    first_runner = Runner(
+        task=DummyTask("Task1"),
         dataset_repository=dataset_repository,
         run_repository=run_repository,
-        description="test_runner_0",
+        description="test_runner_1",
     )
-    old_run = old_runner.run_dataset(dataset.id)
+    first_run = first_runner.run_dataset(dataset.id)
 
     evaluation_repository = InMemoryEvaluationRepository()
     evaluator = IncrementalEvaluator(
@@ -76,25 +79,64 @@ def test_incremental_evaluator_should_filter_previous_run_ids() -> None:
         description="test_incremental_evaluator",
         incremental_evaluation_logic=DummyIncrementalLogic(),
     )
-    evaluation_overview = evaluator.evaluate_additional_runs(old_run.id)
+    first_evaluation_overview = evaluator.evaluate_additional_runs(first_run.id)
 
-    new_runner = Runner(
+    second_runner = Runner(
         task=DummyTask("Task2"),
         dataset_repository=dataset_repository,
         run_repository=run_repository,
         description="test_runner_2",
     )
-    new_run = new_runner.run_dataset(dataset.id)
+    second_run = second_runner.run_dataset(dataset.id)
 
-    # When
-    new_evaluation_overview = evaluator.evaluate_additional_runs(
-        old_run.id, new_run.id, previous_evaluation_id=evaluation_overview.id
+    second_evaluation_overview = evaluator.evaluate_additional_runs(
+        first_run.id,
+        second_run.id,
+        previous_evaluation_ids=[first_evaluation_overview.id],
     )
 
-    # Then
-    result = next(
-        iter(evaluator.evaluation_lineages(new_evaluation_overview.id))
+    second_result = next(
+        iter(evaluator.evaluation_lineages(second_evaluation_overview.id))
     ).evaluation.result
-    assert isinstance(result, DummyEvaluation)
-    assert result.new_run_ids == [new_run.id]
-    assert result.old_run_ids == [old_run.id]
+    assert isinstance(second_result, DummyEvaluation)
+    assert second_result.new_run_ids == [second_run.id]
+    assert second_result.old_run_ids == [[first_run.id]]
+
+    independent_runner = Runner(
+        task=DummyTask("TaskIndependent"),
+        dataset_repository=dataset_repository,
+        run_repository=run_repository,
+        description="test_runner_independent",
+    )
+    independent_run = independent_runner.run_dataset(dataset.id)
+    independent_evaluation_overview = evaluator.evaluate_additional_runs(
+        independent_run.id
+    )
+
+    third_runner = Runner(
+        task=DummyTask("Task3"),
+        dataset_repository=dataset_repository,
+        run_repository=run_repository,
+        description="test_runner_3",
+    )
+
+    third_run = third_runner.run_dataset(dataset.id)
+
+    third_evaluation_overview = evaluator.evaluate_additional_runs(
+        first_run.id,
+        second_run.id,
+        independent_run.id,
+        third_run.id,
+        previous_evaluation_ids=[
+            second_evaluation_overview.id,
+            independent_evaluation_overview.id,
+        ],
+    )
+
+    third_result = next(
+        iter(evaluator.evaluation_lineages(third_evaluation_overview.id))
+    ).evaluation.result
+    assert isinstance(third_result, DummyEvaluation)
+    assert third_result.new_run_ids == [third_run.id]
+    assert sorted(third_result.old_run_ids[0]) == sorted([first_run.id, second_run.id])
+    assert sorted(third_result.old_run_ids[1]) == sorted([independent_run.id])
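
Usage note (not part of the patch): a minimal sketch of how the new `previous_evaluation_ids` parameter is intended to be used, pieced together from the docstring and the test above. The repository objects, `DummyTask`, `DummyIncrementalLogic`, and the runner wiring are assumed to be set up as in tests/evaluation/test_diff_evaluator.py; the variable names here are illustrative only.

    # First evaluation: nothing has been evaluated yet, so every run counts as "new".
    evaluator = IncrementalEvaluator(
        dataset_repository=dataset_repository,
        run_repository=run_repository,
        evaluation_repository=evaluation_repository,
        description="incremental evaluation",
        incremental_evaluation_logic=DummyIncrementalLogic(),
    )
    first_overview = evaluator.evaluate_additional_runs(first_run.id)

    # Second evaluation: outputs whose run_id appears in any evaluation listed in
    # `previous_evaluation_ids` are handed to `do_incremental_evaluate` as one list of
    # already-evaluated outputs per previous evaluation; only the remaining runs
    # (here: second_run) arrive as new outputs.
    second_overview = evaluator.evaluate_additional_runs(
        first_run.id,
        second_run.id,
        previous_evaluation_ids=[first_overview.id],
    )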