feat: Allow IncrementalEvaluator to operate with lists of previous evaluation_ids.

Task: IL-315
SebastianNiehusAA committed May 2, 2024
1 parent b67d3e8 commit 151a331
Showing 2 changed files with 124 additions and 30 deletions.
74 changes: 63 additions & 11 deletions src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -70,30 +70,39 @@ class IncrementalEvaluationLogic(
):
def __init__(self) -> None:
super().__init__()
self._previous_run_output_ids: set[str] = set()
self._previous_run_output_ids: list[set[str]] = []

def set_previous_run_output_ids(self, previous_run_output_ids: set[str]) -> None:
def set_previous_run_output_ids(
self, previous_run_output_ids: list[set[str]]
) -> None:
self._previous_run_output_ids = previous_run_output_ids

def do_evaluate(
self,
example: Example[Input, ExpectedOutput],
*outputs: SuccessfulExampleOutput[Output],
) -> Evaluation:
evaluated_outputs = [
flattened_run_output_ids: set[str] = set()
evaluated_outputs = []
for run_output_ids in self._previous_run_output_ids:
flattened_run_output_ids = flattened_run_output_ids.union(run_output_ids)
evaluated_outputs.append(
[output for output in outputs if output.run_id in run_output_ids]
)

new_outputs = [
output
for output in outputs
if output.run_id in self._previous_run_output_ids
if output.run_id not in flattened_run_output_ids
]
new_outputs = [output for output in outputs if output not in evaluated_outputs]
return self.do_incremental_evaluate(example, new_outputs, evaluated_outputs)

@abstractmethod
def do_incremental_evaluate(
self,
example: Example[Input, ExpectedOutput],
outputs: list[SuccessfulExampleOutput[Output]],
evaluated_outputs: list[SuccessfulExampleOutput[Output]],
evaluated_outputs: list[list[SuccessfulExampleOutput[Output]]],
) -> Evaluation:
pass
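The core change in do_evaluate is a partitioning step: outputs are grouped into one list per previous evaluation, and only outputs whose run_id appears in none of the previous ID sets are passed on as new. Below is a minimal standalone sketch of that logic; FakeOutput and the run IDs are hypothetical stand-ins, not part of the commit.

# Minimal sketch of the partitioning in do_evaluate; FakeOutput is a simplified
# stand-in for SuccessfulExampleOutput and all run IDs below are hypothetical.
from dataclasses import dataclass

@dataclass
class FakeOutput:
    run_id: str

# One set of run IDs per previous evaluation.
previous_run_output_ids = [{"run-1", "run-2"}, {"run-3"}]
outputs = [FakeOutput("run-1"), FakeOutput("run-2"), FakeOutput("run-3"), FakeOutput("run-4")]

flattened: set[str] = set()
evaluated_outputs = []
for run_output_ids in previous_run_output_ids:
    # Collect every already-evaluated run ID and group its outputs per evaluation.
    flattened = flattened.union(run_output_ids)
    evaluated_outputs.append([o for o in outputs if o.run_id in run_output_ids])
new_outputs = [o for o in outputs if o.run_id not in flattened]

# evaluated_outputs == [[FakeOutput("run-1"), FakeOutput("run-2")], [FakeOutput("run-3")]]
# new_outputs == [FakeOutput("run-4")]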

@@ -484,6 +493,22 @@ def evaluation_lineage(


class IncrementalEvaluator(Evaluator[Input, Output, ExpectedOutput, Evaluation]):
""":class:`Evaluator` for evaluating additional runs on top of previous evaluations. Intended for use with :class:`IncrementalEvaluationLogic`.
Args:
dataset_repository: The repository with the examples that will be taken for the evaluation.
run_repository: The repository of the runs to evaluate.
evaluation_repository: The repository that will be used to store evaluation results.
description: Human-readable description for the evaluator.
incremental_evaluation_logic: The logic to use for evaluation.
Generics:
Input: Interface to be passed to the :class:`Task` that shall be evaluated.
Output: Type of the output of the :class:`Task` to be evaluated.
ExpectedOutput: Output that is expected from the run with the supplied input.
Evaluation: Interface of the metrics that come from the evaluated :class:`Task`.
"""

def __init__(
self,
dataset_repository: DatasetRepository,
@@ -505,17 +530,44 @@ def __init__(
def evaluate_additional_runs(
self,
*run_ids: str,
previous_evaluation_id: Optional[str] = None,
previous_evaluation_ids: Optional[list[str]] = None,
num_examples: Optional[int] = None,
abort_on_error: bool = False,
) -> EvaluationOverview:
previous_run_ids = set()
"""Evaluate all runs while considering which runs have already been evaluated according to `previous_evaluation_id`.
For each set of successful outputs in the referenced runs,
:func:`EvaluationLogic.do_evaluate` is called and eval metrics are produced &
stored in the provided :class:`EvaluationRepository`.
Args:
run_ids: The runs to be evaluated. Each run is expected to have the same
dataset as input (which implies their tasks have the same input-type)
and their tasks have the same output-type. For each example in the
dataset referenced by the runs the outputs of all runs are collected
and if all of them were successful they are passed on to the implementation
specific evaluation. The method compares all runs of the provided ids to each other.
previous_evaluation_ids: IDs of previous evaluations whose runs are treated as already evaluated.
num_examples: The number of examples which should be evaluated from the given runs.
Always the first n examples of the dataset are evaluated.
abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
Returns:
EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
returned but instead stored in the :class:`EvaluationRepository` provided in the
__init__.
"""

previous_run_ids = []
previous_evaluation_ids = previous_evaluation_ids or []

if previous_evaluation_id is not None:
for previous_evaluation_id in previous_evaluation_ids:
prev_run_ids: set[str] = set()
lineages = self.evaluation_lineages(previous_evaluation_id)
for lineage in lineages:
for output in lineage.outputs:
previous_run_ids.add(output.run_id)
prev_run_ids.add(output.run_id)
previous_run_ids.append(prev_run_ids)

cast(
IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
Expand All @@ -534,7 +586,7 @@ def evaluate_runs(
cast(
IncrementalEvaluationLogic[Input, Output, ExpectedOutput, Evaluation],
self._evaluation_logic,
).set_previous_run_output_ids(set())
).set_previous_run_output_ids([])
return super().evaluate_runs(
*run_ids, num_examples=num_examples, abort_on_error=abort_on_error
)
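With this change, several earlier evaluation overviews can be folded into one incremental evaluation. The following usage sketch mirrors the test below; `evaluator`, `run_a`, `run_b`, and `run_c` are assumed names (an IncrementalEvaluator and three completed runs over the same dataset), not part of the commit.

# Sketch of the updated API under the assumptions named above.
first_overview = evaluator.evaluate_additional_runs(run_a.id)
second_overview = evaluator.evaluate_additional_runs(run_b.id)

# Evaluate a third run on top of both earlier evaluations: outputs from run_a and
# run_b reach do_incremental_evaluate as already-evaluated groups, while only
# run_c's outputs are treated as new.
third_overview = evaluator.evaluate_additional_runs(
    run_a.id,
    run_b.id,
    run_c.id,
    previous_evaluation_ids=[first_overview.id, second_overview.id],
)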
80 changes: 61 additions & 19 deletions tests/evaluation/test_diff_evaluator.py
@@ -22,7 +22,7 @@

class DummyEvaluation(BaseModel):
new_run_ids: list[str]
old_run_ids: list[str]
old_run_ids: list[list[str]]


class DummyIncrementalLogic(IncrementalEvaluationLogic[str, str, str, DummyEvaluation]):
@@ -33,11 +33,14 @@ def do_incremental_evaluate(
self,
example: Example[str, str],
outputs: list[SuccessfulExampleOutput[str]],
evaluated_outputs: list[SuccessfulExampleOutput[str]],
evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],
) -> DummyEvaluation:
return DummyEvaluation(
new_run_ids=[output.run_id for output in outputs],
old_run_ids=[output.run_id for output in evaluated_outputs],
old_run_ids=[
[output.run_id for output in evaluated_output]
for evaluated_output in evaluated_outputs
],
)


@@ -60,13 +63,13 @@ def test_incremental_evaluator_should_filter_previous_run_ids() -> None:
)

run_repository = InMemoryRunRepository()
old_runner = Runner(
task=DummyTask("Task0"),
first_runner = Runner(
task=DummyTask("Task1"),
dataset_repository=dataset_repository,
run_repository=run_repository,
description="test_runner_0",
description="test_runner_1",
)
old_run = old_runner.run_dataset(dataset.id)
first_run = first_runner.run_dataset(dataset.id)

evaluation_repository = InMemoryEvaluationRepository()
evaluator = IncrementalEvaluator(
@@ -76,25 +79,64 @@
description="test_incremental_evaluator",
incremental_evaluation_logic=DummyIncrementalLogic(),
)
evaluation_overview = evaluator.evaluate_additional_runs(old_run.id)
first_evaluation_overview = evaluator.evaluate_additional_runs(first_run.id)

new_runner = Runner(
second_runner = Runner(
task=DummyTask("Task2"),
dataset_repository=dataset_repository,
run_repository=run_repository,
description="test_runner_2",
)
new_run = new_runner.run_dataset(dataset.id)
second_run = second_runner.run_dataset(dataset.id)

# When
new_evaluation_overview = evaluator.evaluate_additional_runs(
old_run.id, new_run.id, previous_evaluation_id=evaluation_overview.id
second_evaluation_overview = evaluator.evaluate_additional_runs(
first_run.id,
second_run.id,
previous_evaluation_ids=[first_evaluation_overview.id],
)

# Then
result = next(
iter(evaluator.evaluation_lineages(new_evaluation_overview.id))
second_result = next(
iter(evaluator.evaluation_lineages(second_evaluation_overview.id))
).evaluation.result
assert isinstance(result, DummyEvaluation)
assert result.new_run_ids == [new_run.id]
assert result.old_run_ids == [old_run.id]
assert isinstance(second_result, DummyEvaluation)
assert second_result.new_run_ids == [second_run.id]
assert second_result.old_run_ids == [[first_run.id]]

independent_runner = Runner(
task=DummyTask("TaskIndependent"),
dataset_repository=dataset_repository,
run_repository=run_repository,
description="test_runner_independent",
)
independent_run = independent_runner.run_dataset(dataset.id)
independent_evaluation_overview = evaluator.evaluate_additional_runs(
independent_run.id
)

third_runner = Runner(
task=DummyTask("Task3"),
dataset_repository=dataset_repository,
run_repository=run_repository,
description="test_runner_3",
)

third_run = third_runner.run_dataset(dataset.id)

third_evaluation_overview = evaluator.evaluate_additional_runs(
first_run.id,
second_run.id,
independent_run.id,
third_run.id,
previous_evaluation_ids=[
second_evaluation_overview.id,
independent_evaluation_overview.id,
],
)

third_result = next(
iter(evaluator.evaluation_lineages(third_evaluation_overview.id))
).evaluation.result
assert isinstance(third_result, DummyEvaluation)
assert third_result.new_run_ids == [third_run.id]
assert sorted(third_result.old_run_ids[0]) == sorted([first_run.id, second_run.id])
assert sorted(third_result.old_run_ids[1]) == sorted([independent_run.id])
