From 47fec3d4594384393d31076b5fe4f5ff5fedef95 Mon Sep 17 00:00:00 2001 From: Valentina Galata Date: Wed, 21 Feb 2024 15:31:57 +0100 Subject: [PATCH] refactor: split evaluation and aggregation Task: IL-259 --- src/examples/classification.ipynb | 8 +- src/examples/document_index.ipynb | 4 +- src/examples/evaluation.ipynb | 37 +-- src/examples/human_evaluation.ipynb | 54 +++-- src/examples/performance_tips.ipynb | 6 +- src/examples/qa.ipynb | 2 +- src/examples/quickstart_task.ipynb | 163 ++++++++----- src/examples/summarize.ipynb | 1 + src/intelligence_layer/evaluation/__init__.py | 1 + .../evaluation/aggregator.py | 223 ++++++++++++++++++ src/intelligence_layer/evaluation/argilla.py | 48 +++- .../evaluation/evaluator.py | 141 +---------- src/intelligence_layer/evaluation/run.py | 9 +- tests/evaluation/test_argilla_evaluator.py | 88 +++++-- tests/evaluation/test_evaluator.py | 187 ++++++++++----- ...t_instruct_comparison_argilla_evaluator.py | 46 +++- tests/use_cases/classify/test_classify.py | 32 ++- .../classify/test_prompt_based_classify.py | 46 +++- tests/use_cases/summarize/test_summarize.py | 58 +++-- 19 files changed, 775 insertions(+), 379 deletions(-) create mode 100644 src/intelligence_layer/evaluation/aggregator.py diff --git a/src/examples/classification.ipynb b/src/examples/classification.ipynb index 0883b9a3d..bb512fffb 100644 --- a/src/examples/classification.ipynb +++ b/src/examples/classification.ipynb @@ -43,13 +43,13 @@ "outputs": [], "source": [ "from os import getenv\n", - "from intelligence_layer.connectors import LimitedConcurrencyClient\n", - "\n", - "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", - "from intelligence_layer.core import Chunk, InMemoryTracer\n", "\n", "from dotenv import load_dotenv\n", "\n", + "from intelligence_layer.connectors import LimitedConcurrencyClient\n", + "from intelligence_layer.core import Chunk, InMemoryTracer\n", + "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", + "\n", "load_dotenv()\n", "\n", "text_to_classify = Chunk(\"In the distant future, a space exploration party embarked on a thrilling journey to the uncharted regions of the galaxy. 
\\n\\\n", diff --git a/src/examples/document_index.ipynb b/src/examples/document_index.ipynb index 454d219ed..48173b5e1 100644 --- a/src/examples/document_index.ipynb +++ b/src/examples/document_index.ipynb @@ -47,10 +47,10 @@ "source": [ "from os import getenv\n", "\n", - "from intelligence_layer.connectors import DocumentIndexClient\n", - "\n", "from dotenv import load_dotenv\n", "\n", + "from intelligence_layer.connectors import DocumentIndexClient\n", + "\n", "load_dotenv()\n", "\n", "\n", diff --git a/src/examples/evaluation.ipynb b/src/examples/evaluation.ipynb index 1c1276c49..15e3aefd9 100644 --- a/src/examples/evaluation.ipynb +++ b/src/examples/evaluation.ipynb @@ -50,7 +50,7 @@ " InMemoryRunRepository,\n", " InMemoryDatasetRepository,\n", " InMemoryAggregationRepository,\n", - " Runner,\n", + " Runner, Aggregator,\n", ")\n", "from intelligence_layer.use_cases import (\n", " PromptBasedClassify,\n", @@ -58,7 +58,6 @@ " SingleLabelClassifyAggregationLogic,\n", ")\n", "\n", - "\n", "load_dotenv()\n", "\n", "client = LimitedConcurrencyClient.from_token(os.getenv(\"AA_TOKEN\"))\n", @@ -75,9 +74,13 @@ " dataset_repository,\n", " run_repository,\n", " evaluation_repository,\n", - " aggregation_repository,\n", - " \"singel-label-classify\",\n", + " \"single-label-classify\",\n", " evaluation_logic,\n", + ")\n", + "aggregator = Aggregator(\n", + " evaluation_repository,\n", + " aggregation_repository,\n", + " \"single-label-classify\",\n", " aggregation_logic,\n", ")\n", "runner = Runner(task, dataset_repository, run_repository, \"prompt-based-classify\")" @@ -114,10 +117,10 @@ "])\n", "\n", "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", - "print(\"Statistics: \", aggregation_overview.statistics)\n", - "\n" + "print(\"Statistics: \", aggregation_overview.statistics)" ] }, { @@ -140,7 +143,7 @@ "dataset = load_dataset(\"cardiffnlp/tweet_topic_multi\")\n", "test_set_name = \"validation_random\"\n", "all_data = list(dataset[test_set_name])\n", - "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now\n" + "data = all_data[:25] # this has 573 datapoints, let's take a look at 25 for now" ] }, { @@ -157,7 +160,7 @@ "metadata": {}, "outputs": [], "source": [ - "data[1]\n" + "data[1]" ] }, { @@ -215,7 +218,8 @@ "outputs": [], "source": [ "run_overview = runner.run_dataset(dataset_id)\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "aggregation_overview.raise_on_evaluation_failure()" ] }, @@ -288,9 +292,13 @@ " dataset_repository,\n", " run_repository,\n", " evaluation_repository,\n", - " aggregation_repository,\n", " \"multi-label-classify\",\n", " eval_logic,\n", + ")\n", + "embedding_based_classify_aggregator = Aggregator(\n", + " evaluation_repository,\n", + " aggregation_repository,\n", + " \"multi-label-classify\",\n", " aggregation_logic,\n", ")\n", "embedding_based_classify_runner = Runner(\n", @@ -308,8 +316,9 @@ "outputs": [], "source": [ "embedding_based_classify_run_result = embedding_based_classify_runner.run_dataset(dataset_id)\n", - "embedding_based_classify_evaluation_result = 
embedding_based_classify_evaluator.eval_and_aggregate_runs(embedding_based_classify_run_result.id)\n", - "embedding_based_classify_evaluation_result.raise_on_evaluation_failure()" + "embedding_based_classify_evaluation_result = embedding_based_classify_evaluator.evaluate_runs(embedding_based_classify_run_result.id)\n", + "embedding_based_classify_aggregation_result = embedding_based_classify_aggregator.aggregate_evaluation(embedding_based_classify_evaluation_result.id)\n", + "embedding_based_classify_aggregation_result.raise_on_evaluation_failure()" ] }, { @@ -318,7 +327,7 @@ "metadata": {}, "outputs": [], "source": [ - "embedding_based_classify_evaluation_result.statistics.macro_avg" + "embedding_based_classify_aggregation_result.statistics.macro_avg" ] }, { diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb index 33e19ba82..841c6df4a 100644 --- a/src/examples/human_evaluation.ipynb +++ b/src/examples/human_evaluation.ipynb @@ -42,20 +42,26 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "from typing import Iterable, cast\n", + "\n", + "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", - "from intelligence_layer.core import (\n", - " InstructInput, \n", - " Instruct, \n", - " PromptOutput\n", - ")\n", + "from pydantic import BaseModel\n", + "\n", "from intelligence_layer.connectors import (\n", - " LimitedConcurrencyClient, \n", - " Question, \n", - " ArgillaEvaluation, \n", - " DefaultArgillaClient, \n", - " Field, \n", + " LimitedConcurrencyClient,\n", + " Question,\n", + " ArgillaEvaluation,\n", + " DefaultArgillaClient,\n", + " Field,\n", " RecordData\n", ")\n", + "from intelligence_layer.core import (\n", + " InstructInput,\n", + " Instruct,\n", + " PromptOutput\n", + ")\n", "from intelligence_layer.evaluation import (\n", " ArgillaEvaluator,\n", " AggregationLogic,\n", @@ -70,10 +76,7 @@ " Runner,\n", " SuccessfulExampleOutput\n", ")\n", - "from typing import Iterable, cast, Sequence\n", - "from datasets import load_dataset\n", - "import os\n", - "from pydantic import BaseModel\n", + "from intelligence_layer.evaluation.argilla import ArgillaAggregator\n", "\n", "load_dotenv()\n", "\n", @@ -318,14 +321,14 @@ " def _to_record(\n", " self,\n", " example: Example[InstructInput, None],\n", - " example_outputs: SuccessfulExampleOutput[PromptOutput],\n", + " *example_outputs: SuccessfulExampleOutput[PromptOutput],\n", " ) -> RecordDataSequence:\n", " return RecordDataSequence(\n", " records=[\n", " RecordData(\n", " content={\n", " \"input\": example.input.instruction,\n", - " \"output\": example_outputs.output.completion,\n", + " \"output\": example_outputs[0].output.completion,\n", " },\n", " example_id=example.id,\n", " )\n", @@ -340,16 +343,16 @@ "eval_logic = InstructArgillaEvaluationLogic()\n", "aggregation_logic = InstructArgillaAggregationLogic()\n", "\n", + "argilla_evaluation_repository = ArgillaEvaluationRepository(\n", + " evaluation_repository, argilla_client, workspace_id, fields, questions\n", + ")\n", + "\n", "evaluator = ArgillaEvaluator(\n", " dataset_repository,\n", " run_repository,\n", - " ArgillaEvaluationRepository(\n", - " evaluation_repository, argilla_client, workspace_id, fields, questions\n", - " ),\n", - " aggregation_repository,\n", + " argilla_evaluation_repository,\n", " \"instruct\",\n", " eval_logic,\n", - " aggregation_logic,\n", ")" ] }, @@ -388,8 +391,15 @@ "metadata": {}, "outputs": [], "source": [ + "aggregator = ArgillaAggregator(\n", + " argilla_evaluation_repository,\n", + 
" aggregation_repository,\n", + " \"instruct\",\n", + " aggregation_logic,\n", + ")\n", + "\n", "if eval_overview:\n", - " output = evaluator.aggregate_evaluation(eval_overview.id)\n", + " output = aggregator.aggregate_evaluation(eval_overview.id)\n", " print(output.statistics)" ] } diff --git a/src/examples/performance_tips.ipynb b/src/examples/performance_tips.ipynb index 69227c04e..80300fd77 100644 --- a/src/examples/performance_tips.ipynb +++ b/src/examples/performance_tips.ipynb @@ -44,11 +44,13 @@ "metadata": {}, "outputs": [], "source": [ - "from intelligence_layer.core.task import Task\n", - "from intelligence_layer.core.tracer import TaskSpan, NoOpTracer\n", "import time\n", "from typing import Any\n", "\n", + "from intelligence_layer.core.task import Task\n", + "from intelligence_layer.core.tracer import TaskSpan, NoOpTracer\n", + "\n", + "\n", "class DummyTask(Task):\n", " def do_run(self, input: Any, task_span: TaskSpan) -> Any:\n", " time.sleep(2)\n", diff --git a/src/examples/qa.ipynb b/src/examples/qa.ipynb index fa9e15bae..b69c37599 100644 --- a/src/examples/qa.ipynb +++ b/src/examples/qa.ipynb @@ -19,8 +19,8 @@ "outputs": [], "source": [ "from os import getenv\n", - "from dotenv import load_dotenv\n", "\n", + "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb index 0b873ccf5..010259124 100644 --- a/src/examples/quickstart_task.ipynb +++ b/src/examples/quickstart_task.ipynb @@ -21,10 +21,10 @@ "Output = TypeVar(\"Output\", bound=PydanticSerializable)\n", "\n", "class Task(ABC, Generic[Input, Output]):\n", - "    @abstractmethod\n", - "    def do_run(self, input: Input, task_span: TaskSpan) -> Output:\n", - "        \"\"\"Executes the process for this use-case.\"\"\"\n", - "        ...\n", + " @abstractmethod\n", + " def do_run(self, input: Input, task_span: TaskSpan) -> Output:\n", + " \"\"\"Executes the process for this use-case.\"\"\"\n", + " ...\n", "```\n", "\n", "For every task, we have to define an `Input`, an `Output` and how we would like to run it. Since these can vary so much, we make no assumptions about a `Task`'s implementation. \n", @@ -261,45 +261,130 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have all parts in place, let's implement our `KeywordExtractionEvaluator`." + "Now that we have all parts in place, let's run our task which will produce the results for evaluation." 
] }, { "cell_type": "code", - "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "from intelligence_layer.core import NoOpTracer\n", + "from intelligence_layer.evaluation import (\n", + " InMemoryDatasetRepository,\n", + " InMemoryRunRepository,\n", + " Runner,\n", + " Example\n", + ")\n", "from statistics import mean\n", "from typing import Iterable\n", - "from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic\n", "\n", + "dataset_repository = InMemoryDatasetRepository()\n", + "run_repository = InMemoryRunRepository()\n", + "\n", + "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", + "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", + "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", + "\n", + "single_example_dataset = dataset_repository.create_dataset(\n", + " examples=[Example(input=input, expected_output=expected_output)]\n", + ")\n", + "\n", + "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())" + ], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's build an evaluator.\n", + "For this, we need to implement a method doing the actual evaluation in a `EvaluationLogic` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation import (\n", + " Evaluator,\n", + " InMemoryEvaluationRepository,\n", + " Example\n", + ")\n", + "from intelligence_layer.evaluation.base_logic import SingleOutputEvaluationLogic\n", "\n", "class KeywordExtractionEvaluationLogic(\n", - " EvaluationLogic[\n", + " SingleOutputEvaluationLogic[\n", " KeywordExtractionInput,\n", " KeywordExtractionOutput,\n", " KeywordExtractionExpectedOutput,\n", " KeywordExtractionEvaluation,\n", " ]\n", "):\n", - " def do_evaluate(\n", + " def do_evaluate_single_output(\n", " self,\n", - " input: KeywordExtractionInput,\n", - " output: KeywordExtractionOutput,\n", - " expected_output: KeywordExtractionExpectedOutput,\n", + " example: Example[KeywordExtractionInput, KeywordExtractionOutput],\n", + " output: KeywordExtractionExpectedOutput,\n", " ) -> KeywordExtractionEvaluation:\n", - " true_positives = output.keywords & expected_output.keywords\n", - " false_positives = output.keywords - expected_output.keywords\n", - " false_negatives = expected_output.keywords - output.keywords\n", + " true_positives = output.keywords & output.keywords\n", + " false_positives = output.keywords - output.keywords\n", + " false_negatives = output.keywords - output.keywords\n", " return KeywordExtractionEvaluation(\n", - " true_positive_rate=len(true_positives) / len(expected_output.keywords),\n", + " true_positive_rate=len(true_positives) / len(output.keywords),\n", " true_positives=true_positives,\n", " false_positives=false_positives,\n", " false_negatives=false_negatives,\n", - " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, we can create an evaluator and run it on our data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_repository = InMemoryEvaluationRepository()\n", + "evaluation_logic = KeywordExtractionEvaluationLogic()\n", + "evaluator = Evaluator(\n", + " dataset_repository,\n", + " run_repository,\n", + " evaluation_repository,\n", + " \"keyword-extraction\",\n", + " evaluation_logic,\n", + ")\n", + "\n", + "evaluation_overview = evaluator.evaluate_runs(run_overview.id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To aggregate the evaluation results, we have to implement a method doing this in a `AggregationLogic` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation import (\n", + " InMemoryAggregationRepository,\n", + " Example, Aggregator,\n", + ")\n", + "from intelligence_layer.evaluation.base_logic import AggregationLogic\n", + "\n", "\n", - "# this is needed later for the aggregation\n", "class KeywordExtractionAggregationLogic(\n", " AggregationLogic[\n", " KeywordExtractionEvaluation,\n", @@ -322,7 +407,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's run this on a single example." + "Let's create now an aggregator and generate evaluation statistics from the previously generated evaluation results." ] }, { @@ -331,45 +416,16 @@ "metadata": {}, "outputs": [], "source": [ - "from intelligence_layer.core import NoOpTracer\n", - "from intelligence_layer.evaluation import (\n", - " Evaluator,\n", - " InMemoryDatasetRepository,\n", - " InMemoryEvaluationRepository,\n", - " InMemoryRunRepository,\n", - " InMemoryAggregationRepository,\n", - " Runner,\n", - " Example,\n", - ")\n", - "\n", - "dataset_repository = InMemoryDatasetRepository()\n", - "run_repository = InMemoryRunRepository()\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "aggregation_repository = InMemoryAggregationRepository()\n", - "evaluation_logic = KeywordExtractionEvaluationLogic()\n", "aggregation_logic = KeywordExtractionAggregationLogic()\n", - "\n", - "\n", - "evaluator = Evaluator(\n", - " dataset_repository,\n", - " run_repository,\n", + "aggregator = Aggregator(\n", " evaluation_repository,\n", " aggregation_repository,\n", " \"keyword-extraction\",\n", - " evaluation_logic,\n", " aggregation_logic,\n", ")\n", - "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", - "\n", - "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", - "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", - "\n", - "single_example_dataset = dataset_repository.create_dataset(\n", - " examples=[Example(input=input, expected_output=expected_output)]\n", - ")\n", "\n", - "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run_overview.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", "print(\"Statistics: \", aggregation_overview.statistics)" ] @@ -378,7 +434,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have implemented our aggregate method, let's run a dataset with some example data." + "Now that we have implemented all required methods, let's run a dataset with some more examples." 
] }, { @@ -417,7 +473,8 @@ ")\n", "\n", "run = runner.run_dataset(dataset_id)\n", - "aggregation_overview = evaluator.eval_and_aggregate_runs(run.id)\n", + "evaluation_overview = evaluator.evaluate_runs(run.id)\n", + "aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)\n", "\n", "pprint(aggregation_overview)" ] @@ -440,7 +497,7 @@ "last_example_result = run_repository.example_trace(\n", " next(iter(aggregation_overview.run_overviews())).id, examples[-1].id\n", ")\n", - "last_example_result.trace\n" + "last_example_result.trace" ] }, { diff --git a/src/examples/summarize.ipynb b/src/examples/summarize.ipynb index aa2313a1b..17f1d9164 100644 --- a/src/examples/summarize.ipynb +++ b/src/examples/summarize.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "from os import getenv\n", + "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()\n", diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index e4248fe91..615b1ea3e 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -1,4 +1,5 @@ from .accumulator import MeanAccumulator as MeanAccumulator +from .aggregator import Aggregator as Aggregator from .argilla import ArgillaEvaluationLogic as ArgillaEvaluationLogic from .argilla import ArgillaEvaluator as ArgillaEvaluator from .argilla import ( diff --git a/src/intelligence_layer/evaluation/aggregator.py b/src/intelligence_layer/evaluation/aggregator.py new file mode 100644 index 000000000..94d0bf0e9 --- /dev/null +++ b/src/intelligence_layer/evaluation/aggregator.py @@ -0,0 +1,223 @@ +from functools import lru_cache +from typing import ( + Callable, + Generic, + Iterable, + Iterator, + Mapping, + TypeVar, + cast, + final, + get_args, + get_origin, +) +from uuid import uuid4 + +from intelligence_layer.core.tracer import utc_now +from intelligence_layer.evaluation.base_logic import AggregationLogic +from intelligence_layer.evaluation.data_storage.aggregation_repository import ( + AggregationRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + EvaluationRepository, +) +from intelligence_layer.evaluation.domain import ( + AggregatedEvaluation, + AggregationOverview, + Evaluation, + EvaluationOverview, + FailedExampleEvaluation, +) + +T = TypeVar("T") + + +class CountingFilterIterable(Iterable[T]): + def __init__( + self, wrapped_iterable: Iterable[T], filter: Callable[[T], bool] + ) -> None: + self._wrapped_iterator = iter(wrapped_iterable) + self._filter = filter + self._included_count = 0 + self._excluded_count = 0 + + def __next__(self) -> T: + e = next(self._wrapped_iterator) + while not self._filter(e): + self._excluded_count += 1 + e = next(self._wrapped_iterator) + self._included_count += 1 + return e + + def __iter__(self) -> Iterator[T]: + return self + + def included_count(self) -> int: + return self._included_count + + def excluded_count(self) -> int: + return self._excluded_count + + +class Aggregator(Generic[Evaluation, AggregatedEvaluation]): + """Aggregator that can handle automatic aggregation of evaluation scenarios. + + This aggregator should be used for automatic eval. A user still has to implement + :class: `AggregationLogic`. + + + Arguments: + evaluation_repository: The repository that will be used to store evaluation results. + aggregation_repository: The repository that will be used to store aggregation results. + description: Human-readable description for the evaluator. 
+ aggregation_logic: The logic to aggregate the evaluations. + + Generics: + Evaluation: Interface of the metrics that come from the evaluated :class:`Task`. + AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. + """ + + def __init__( + self, + evaluation_repository: EvaluationRepository, + aggregation_repository: AggregationRepository, + description: str, + aggregation_logic: AggregationLogic[Evaluation, AggregatedEvaluation], + ) -> None: + self._evaluation_repository = evaluation_repository + self._aggregation_repository = aggregation_repository + self._aggregation_logic = aggregation_logic + self.description = description + + @lru_cache(maxsize=1) + def _get_types(self) -> Mapping[str, type]: + """Type magic function that gets the actual types of the generic parameters. + + Traverses the inheritance history of `BaseEvaluator`-subclass to find an actual type every time a TypeVar is replaced. + + Returns: + Name of generic parameter to the type found. + """ + + def is_eligible_subclass(parent: type) -> bool: + return hasattr(parent, "__orig_bases__") and issubclass( + parent, AggregationLogic + ) + + def update_types() -> None: + num_types_set = 0 + for current_index, current_type in enumerate(current_types): + if type(current_type) is not TypeVar: + type_var_count = num_types_set - 1 + for element_index, element in enumerate(type_list): + if type(element) is TypeVar: + type_var_count += 1 + if type_var_count == current_index: + break + assert type_var_count == current_index + type_list[element_index] = current_type + num_types_set += 1 + + # mypy does not know __orig_bases__ + base_types = AggregationLogic.__orig_bases__[1] # type: ignore + type_list: list[type | TypeVar] = list(get_args(base_types)) + + possible_parent_classes = [ + p + for p in reversed(type(self._aggregation_logic).__mro__) + if is_eligible_subclass(p) + ] + for parent in possible_parent_classes: + # mypy does not know __orig_bases__ + for base in parent.__orig_bases__: # type: ignore + origin = get_origin(base) + if origin is None or not issubclass(origin, AggregationLogic): + continue + current_types = list(get_args(base)) + update_types() + + return { + name: param_type + for name, param_type in zip( + (a.__name__ for a in get_args(base_types)), type_list + ) + if type(param_type) is not TypeVar + } + + def evaluation_type(self) -> type[Evaluation]: + """Returns the type of the evaluation result of an example. + + This can be used to retrieve properly typed evaluations of an evaluation run + from a :class:`EvaluationRepository` + + Returns: + Returns the type of the evaluation result of an example. + """ + try: + evaluation_type = self._get_types()["Evaluation"] + except KeyError: + raise TypeError( + f"Alternatively overwrite evaluation_type() in {type(self)}" + ) + return cast(type[Evaluation], evaluation_type) + + @final + def aggregate_evaluation( + self, *eval_ids: str + ) -> AggregationOverview[AggregatedEvaluation]: + """Aggregates all evaluations into an overview that includes high-level statistics. + + Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`. + + Args: + evaluation_overview: An overview of the evaluation to be aggregated. Does not include + actual evaluations as these will be retrieved from the repository. + + Returns: + An overview of the aggregated evaluation. 
+ """ + + def load_eval_overview(eval_id: str) -> EvaluationOverview: + evaluation_overview = self._evaluation_repository.evaluation_overview( + eval_id + ) + if not evaluation_overview: + raise ValueError( + f"No PartialEvaluationOverview found for eval-id: {eval_id}" + ) + return evaluation_overview + + evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) + + nested_evaluations = [ + self._evaluation_repository.example_evaluations( + overview.id, self.evaluation_type() + ) + for overview in evaluation_overviews + ] + example_evaluations = [ + eval for sublist in nested_evaluations for eval in sublist + ] + + successful_evaluations = CountingFilterIterable( + (example_eval.result for example_eval in example_evaluations), + lambda evaluation: not isinstance(evaluation, FailedExampleEvaluation), + ) + id = str(uuid4()) + start = utc_now() + statistics = self._aggregation_logic.aggregate( + cast(Iterable[Evaluation], successful_evaluations) + ) + + aggregation_overview = AggregationOverview( + evaluation_overviews=frozenset(evaluation_overviews), + id=id, + start=start, + end=utc_now(), + successful_evaluation_count=successful_evaluations.included_count(), + crashed_during_eval_count=successful_evaluations.excluded_count(), + description=self.description, + statistics=statistics, + ) + self._aggregation_repository.store_aggregation_overview(aggregation_overview) + return aggregation_overview diff --git a/src/intelligence_layer/evaluation/argilla.py b/src/intelligence_layer/evaluation/argilla.py index f4e781f84..41e288ea1 100644 --- a/src/intelligence_layer/evaluation/argilla.py +++ b/src/intelligence_layer/evaluation/argilla.py @@ -13,6 +13,7 @@ RecordData, ) from intelligence_layer.core import Input, InstructInput, Output, PromptOutput +from intelligence_layer.evaluation import Aggregator from intelligence_layer.evaluation.accumulator import MeanAccumulator from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic from intelligence_layer.evaluation.data_storage.aggregation_repository import ( @@ -71,30 +72,26 @@ def _to_record( class ArgillaEvaluator( - Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation], + Evaluator[Input, Output, ExpectedOutput, ArgillaEvaluation], ABC, ): """Evaluator used to integrate with Argilla (https://github.com/argilla-io/argilla). Use this evaluator if you would like to easily do human eval. This evaluator runs a dataset and sends the input and output to Argilla to be evaluated. - After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method. Arguments: dataset_repository: The repository with the examples that will be taken for the evaluation. run_repository: The repository of the runs to evaluate. evaluation_repository: The repository that will be used to store evaluation results. - aggregation_repository: The repository that will be used to store aggregation results. description: Human-readable description for the evaluator. evaluation_logic: The logic to use for evaluation. - aggregation_logic: The logic to aggregate the evaluations. Generics: Input: Interface to be passed to the :class:`Task` that shall be evaluated. Output: Type of the output of the :class:`Task` to be evaluated. ExpectedOutput: Output that is expected from the run with the supplied input. ArgillaEvaluation: Interface of the metrics that come from the Argilla task`. - AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. 
""" def __init__( @@ -102,25 +99,58 @@ def __init__( dataset_repository: DatasetRepository, run_repository: RunRepository, evaluation_repository: ArgillaEvaluationRepository, - aggregation_repository: AggregationRepository, description: str, evaluation_logic: ArgillaEvaluationLogic[Input, Output, ExpectedOutput], - aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation], ) -> None: super().__init__( dataset_repository, run_repository, evaluation_repository, - aggregation_repository, description, evaluation_logic, # type: ignore - aggregation_logic, # TODO: check if the non-matching types of the evaluation logic and aggregation logic (in the line above) are a problem ) def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore return ArgillaEvaluation +class ArgillaAggregator( + Aggregator[ArgillaEvaluation, AggregatedEvaluation], + ABC, +): + """Aggregator used to aggregate Argilla (https://github.com/argilla-io/argilla) evaluations. + + You can fetch the results by using the `aggregate_evaluation` method. + + Arguments: + evaluation_repository: The repository that will be used to store evaluation results. + aggregation_repository: The repository that will be used to store aggregation results. + description: Human-readable description for the evaluator. + aggregation_logic: The logic to aggregate the evaluations. + + Generics: + ArgillaEvaluation: Interface of the metrics that come from the Argilla task`. + AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. + """ + + def evaluation_type(self) -> type[ArgillaEvaluation]: # type: ignore + return ArgillaEvaluation + + def __init__( + self, + evaluation_repository: ArgillaEvaluationRepository, + aggregation_repository: AggregationRepository, + description: str, + aggregation_logic: AggregationLogic[ArgillaEvaluation, AggregatedEvaluation], + ) -> None: + super().__init__( + evaluation_repository, + aggregation_repository, + description, + aggregation_logic, + ) + + class AggregatedInstructComparison(BaseModel): scores: Mapping[str, PlayerScore] diff --git a/src/intelligence_layer/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluator.py index 36b34da95..6d5b9788e 100644 --- a/src/intelligence_layer/evaluation/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluator.py @@ -1,10 +1,8 @@ from concurrent.futures import ThreadPoolExecutor from functools import lru_cache from typing import ( - Callable, Generic, Iterable, - Iterator, Mapping, Optional, Sequence, @@ -15,16 +13,12 @@ get_args, get_origin, ) -from uuid import uuid4 from tqdm import tqdm from intelligence_layer.core.task import Input, Output from intelligence_layer.core.tracer import utc_now -from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic -from intelligence_layer.evaluation.data_storage.aggregation_repository import ( - AggregationRepository, -) +from intelligence_layer.evaluation.base_logic import EvaluationLogic from intelligence_layer.evaluation.data_storage.dataset_repository import ( DatasetRepository, ) @@ -33,8 +27,6 @@ ) from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( - AggregatedEvaluation, - AggregationOverview, Evaluation, EvaluationOverview, Example, @@ -47,60 +39,26 @@ SuccessfulExampleOutput, ) -T = TypeVar("T") - - -class CountingFilterIterable(Iterable[T]): - def __init__( - self, wrapped_iterable: Iterable[T], filter: Callable[[T], bool] - ) -> None: - 
self._wrapped_iterator = iter(wrapped_iterable) - self._filter = filter - self._included_count = 0 - self._excluded_count = 0 - - def __next__(self) -> T: - e = next(self._wrapped_iterator) - while not self._filter(e): - self._excluded_count += 1 - e = next(self._wrapped_iterator) - self._included_count += 1 - return e - def __iter__(self) -> Iterator[T]: - return self - - def included_count(self) -> int: - return self._included_count - - def excluded_count(self) -> int: - return self._excluded_count - - -class Evaluator( - Generic[Input, Output, ExpectedOutput, Evaluation, AggregatedEvaluation] -): +class Evaluator(Generic[Input, Output, ExpectedOutput, Evaluation]): """Evaluator that can handle automatic evaluation scenarios. This evaluator should be used for automatic eval. A user still has to implement - :class:`EvaluationLogic` and :class: `AggregationLogic`. + :class:`EvaluationLogic`. Arguments: dataset_repository: The repository with the examples that will be taken for the evaluation. run_repository: The repository of the runs to evaluate. evaluation_repository: The repository that will be used to store evaluation results. - aggregation_repository: The repository that will be used to store aggregation results. description: Human-readable description for the evaluator. evaluation_logic: The logic to use for evaluation. - aggregation_logic: The logic to aggregate the evaluations. Generics: Input: Interface to be passed to the :class:`Task` that shall be evaluated. Output: Type of the output of the :class:`Task` to be evaluated. ExpectedOutput: Output that is expected from the run with the supplied input. Evaluation: Interface of the metrics that come from the evaluated :class:`Task`. - AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`. """ def __init__( @@ -108,18 +66,13 @@ def __init__( dataset_repository: DatasetRepository, run_repository: RunRepository, evaluation_repository: EvaluationRepository, - aggregation_repository: AggregationRepository, description: str, evaluation_logic: EvaluationLogic[Input, Output, ExpectedOutput, Evaluation], - aggregation_logic: AggregationLogic[Evaluation, AggregatedEvaluation], ) -> None: self._dataset_repository = dataset_repository self._run_repository = run_repository self._evaluation_repository = evaluation_repository - self._aggregation_repository = aggregation_repository - self._evaluation_logic = evaluation_logic - self._aggregation_logic = aggregation_logic self.description = description @lru_cache(maxsize=1) @@ -152,8 +105,8 @@ def update_types() -> None: num_types_set += 1 # mypy does not know __orig_bases__ - base_evaluator_bases = EvaluationLogic.__orig_bases__[1] # type: ignore - type_list: list[type | TypeVar] = list(get_args(base_evaluator_bases)) + base_types = EvaluationLogic.__orig_bases__[1] # type: ignore + type_list: list[type | TypeVar] = list(get_args(base_types)) possible_parent_classes = [ p for p in reversed(type(self._evaluation_logic).__mro__) @@ -171,7 +124,7 @@ def update_types() -> None: return { name: param_type for name, param_type in zip( - (a.__name__ for a in get_args(base_evaluator_bases)), type_list + (a.__name__ for a in get_args(base_types)), type_list ) if type(param_type) is not TypeVar } @@ -360,67 +313,6 @@ def evaluate( return partial_overview - @final - def aggregate_evaluation( - self, *eval_ids: str - ) -> AggregationOverview[AggregatedEvaluation]: - """Aggregates all evaluations into an overview that includes high-level statistics. 
- - Aggregates :class:`Evaluation`s according to the implementation of :func:`BaseEvaluator.aggregate`. - - Args: - evaluation_overview: An overview of the evaluation to be aggregated. Does not include - actual evaluations as these will be retrieved from the repository. - - Returns: - An overview of the aggregated evaluation. - """ - - def load_eval_overview(eval_id: str) -> EvaluationOverview: - evaluation_overview = self._evaluation_repository.evaluation_overview( - eval_id - ) - if not evaluation_overview: - raise ValueError( - f"No PartialEvaluationOverview found for eval-id: {eval_id}" - ) - return evaluation_overview - - evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) - - nested_evaluations = [ - self._evaluation_repository.example_evaluations( - overview.id, self.evaluation_type() - ) - for overview in evaluation_overviews - ] - example_evaluations = [ - eval for sublist in nested_evaluations for eval in sublist - ] - - successful_evaluations = CountingFilterIterable( - (example_eval.result for example_eval in example_evaluations), - lambda evaluation: not isinstance(evaluation, FailedExampleEvaluation), - ) - id = str(uuid4()) - start = utc_now() - statistics = self._aggregation_logic.aggregate( - cast(Iterable[Evaluation], successful_evaluations) - ) - - aggregation_overview = AggregationOverview( - evaluation_overviews=frozenset(evaluation_overviews), - id=id, - start=start, - end=utc_now(), - successful_evaluation_count=successful_evaluations.included_count(), - crashed_during_eval_count=successful_evaluations.excluded_count(), - description=self.description, - statistics=statistics, - ) - self._aggregation_repository.store_aggregation_overview(aggregation_overview) - return aggregation_overview - @final def evaluate( self, @@ -440,24 +332,3 @@ def evaluate( self._evaluation_repository.store_example_evaluation( ExampleEvaluation(eval_id=eval_id, example_id=example.id, result=result) ) - - @final - def eval_and_aggregate_runs( - self, *run_ids: str - ) -> AggregationOverview[AggregatedEvaluation]: - """Evaluates an entire dataset in a threaded manner and aggregates the results into an `AggregatedEvaluation`. - - This will call the `run` method for each example in the dataset. - Finally, it will call the `aggregate` method and return the aggregated results. - - Args: - dataset_id: id of the dataset that will be used to evaluate a :class:`Task`. - The actual data is loaded from the :class:`DatasetRepository` passed to `__init__` - tracer: Optional tracer used for extra tracing. - Traces are always saved in the evaluation repository. - - Returns: - The aggregated results of an evaluation run with a dataset. 
- """ - partial_evaluation_overview = self.evaluate_runs(*run_ids) - return self.aggregate_evaluation(partial_evaluation_overview.id) diff --git a/src/intelligence_layer/evaluation/run.py b/src/intelligence_layer/evaluation/run.py index 12200d89d..525bab45b 100644 --- a/src/intelligence_layer/evaluation/run.py +++ b/src/intelligence_layer/evaluation/run.py @@ -9,6 +9,7 @@ from intelligence_layer.connectors.limited_concurrency_client import ( LimitedConcurrencyClient, ) +from intelligence_layer.evaluation import Aggregator from intelligence_layer.evaluation.data_storage.aggregation_repository import ( FileAggregationRepository, ) @@ -107,12 +108,14 @@ def main(cli_args: Sequence[str]) -> None: dataset_repository, runner_repository, evaluation_repository, - aggregation_repository, description, eval_logic, - aggregation_logic, ) - evaluator.eval_and_aggregate_runs(run_overview_id) + aggregator = Aggregator( + evaluation_repository, aggregation_repository, description, aggregation_logic + ) + evaluation_overview = evaluator.evaluate_runs(run_overview_id) + aggregator.aggregate_evaluation(evaluation_overview.id) if __name__ == "__main__": diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/test_argilla_evaluator.py index 9aa19deae..fd6e4aafe 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/test_argilla_evaluator.py @@ -21,6 +21,7 @@ Runner, SuccessfulExampleOutput, ) +from intelligence_layer.evaluation.argilla import ArgillaAggregator from intelligence_layer.evaluation.base_logic import AggregationLogic from intelligence_layer.evaluation.data_storage.aggregation_repository import ( InMemoryAggregationRepository, @@ -131,21 +132,9 @@ def arg() -> StubArgillaClient: return StubArgillaClient() -@fixture -def string_argilla_evaluator( - in_memory_dataset_repository: InMemoryDatasetRepository, - in_memory_run_repository: InMemoryRunRepository, - in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, - stub_argilla_client: StubArgillaClient, -) -> ArgillaEvaluator[ - DummyStringInput, - DummyStringOutput, - DummyStringOutput, - DummyAggregatedEvaluation, -]: - stub_argilla_client._expected_workspace_id = "workspace-id" - questions = [ +@fixture() +def argilla_questions() -> Sequence[Question]: + return [ Question( name="question", title="title", @@ -153,31 +142,81 @@ def string_argilla_evaluator( options=[1], ) ] - fields = [ + + +@fixture() +def argilla_fields() -> Sequence[Field]: + return [ Field(name="output", title="Output"), Field(name="input", title="Input"), ] + + +@fixture +def argilla_evaluation_repository( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaEvaluationRepository: + stub_argilla_client._expected_workspace_id = "workspace-id" + stub_argilla_client._expected_questions = argilla_questions + stub_argilla_client._expected_fields = argilla_fields + workspace_id = stub_argilla_client._expected_workspace_id - eval_repository = ArgillaEvaluationRepository( + return ArgillaEvaluationRepository( in_memory_evaluation_repository, stub_argilla_client, workspace_id, - fields, - questions, + argilla_fields, + argilla_questions, ) + +@fixture +def string_argilla_evaluator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + argilla_evaluation_repository: 
ArgillaEvaluationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaEvaluator[DummyStringInput, DummyStringOutput, DummyStringOutput,]: evaluator = ArgillaEvaluator( in_memory_dataset_repository, in_memory_run_repository, + argilla_evaluation_repository, + "dummy-string-task", + DummyStringTaskArgillaEvaluationLogic(), + ) + return evaluator + + +@fixture +def string_argilla_aggregator( + argilla_evaluation_repository: ArgillaEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + stub_argilla_client: StubArgillaClient, + argilla_questions: Sequence[Question], + argilla_fields: Sequence[Field], +) -> ArgillaAggregator[DummyAggregatedEvaluation,]: + workspace_id = stub_argilla_client._expected_workspace_id + + eval_repository = ArgillaEvaluationRepository( + argilla_evaluation_repository, + stub_argilla_client, + workspace_id, + argilla_fields, + argilla_questions, + ) + + evaluator = ArgillaAggregator( eval_repository, in_memory_aggregation_repository, "dummy-string-task", - DummyStringTaskArgillaEvaluationLogic(), DummyStringTaskArgillaAggregationLogic(), ) - stub_argilla_client._expected_questions = questions - stub_argilla_client._expected_fields = fields return evaluator @@ -200,7 +239,6 @@ def test_argilla_evaluator_can_do_sync_evaluation( DummyStringInput, DummyStringOutput, DummyStringOutput, - DummyAggregatedEvaluation, ], string_argilla_runner: Runner[DummyStringInput, DummyStringOutput], string_dataset_id: str, @@ -230,8 +268,8 @@ def test_argilla_evaluator_can_aggregate_evaluation( DummyStringInput, DummyStringOutput, DummyStringOutput, - DummyAggregatedEvaluation, ], + string_argilla_aggregator: ArgillaAggregator[DummyAggregatedEvaluation], string_argilla_runner: Runner[DummyStringInput, DummyStringOutput], string_dataset_id: str, ) -> None: @@ -240,7 +278,7 @@ def test_argilla_evaluator_can_aggregate_evaluation( ) run_overview = string_argilla_runner.run_dataset(string_dataset_id) eval_overview = string_argilla_evaluator.evaluate_runs(run_overview.id) - aggregated_eval_overview = string_argilla_evaluator.aggregate_evaluation( + aggregated_eval_overview = string_argilla_aggregator.aggregate_evaluation( eval_overview.id ) assert aggregated_eval_overview.statistics.score == argilla_client._score diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py index 214c60b90..bab2fc5a0 100644 --- a/tests/evaluation/test_evaluator.py +++ b/tests/evaluation/test_evaluator.py @@ -6,6 +6,7 @@ from intelligence_layer.core import InMemoryTaskSpan, InMemoryTracer, NoOpTracer, Tracer from intelligence_layer.core.task import Input, Output, Task from intelligence_layer.evaluation import ( + Aggregator, Evaluation, Evaluator, Example, @@ -146,19 +147,30 @@ def dummy_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, dummy_eval_logic: DummyEvaluationLogic, - dummy_aggregate_logic: DummyAggregationLogic, -) -> Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList -]: +) -> Evaluator[str, str, None, DummyEvaluation]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "dummy-evaluator", dummy_eval_logic, + ) + + +@fixture +def 
dummy_aggregator( + in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + dummy_eval_logic: DummyEvaluationLogic, + dummy_aggregate_logic: DummyAggregationLogic, +) -> Aggregator[DummyEvaluation, DummyAggregatedEvaluationWithResultList]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "dummy-evaluator", dummy_aggregate_logic, ) @@ -194,30 +206,42 @@ def comparing_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, comparing_eval_logic: ComparingEvaluationLogic, - comparing_aggregation_logic: ComparingAggregationLogic, -) -> Evaluator[str, str, None, ComparisonEvaluation, ComparisonAggregation]: +) -> Evaluator[str, str, None, ComparisonEvaluation]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "comparing-evaluator", comparing_eval_logic, + ) + + +@fixture +def comparing_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + comparing_aggregation_logic: ComparingAggregationLogic, +) -> Aggregator[ComparisonEvaluation, ComparisonAggregation]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "comparing-evaluator", comparing_aggregation_logic, ) def test_eval_and_aggregate_runs_returns_generic_statistics( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], dataset_id: str, ) -> None: run_overview = dummy_runner.run_dataset(dataset_id) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) assert next(iter(aggregation_overview.run_overviews())).dataset_id == dataset_id assert aggregation_overview.successful_evaluation_count == 1 @@ -225,15 +249,17 @@ def test_eval_and_aggregate_runs_returns_generic_statistics( def test_eval_and_aggregate_runs_uses_passed_tracer( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], ) -> None: in_memory_tracer = InMemoryTracer() run_overview = dummy_runner.run_dataset(dataset_id, in_memory_tracer) - dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + dummy_aggregator.aggregate_evaluation(evaluation_overview.id) entries = in_memory_tracer.entries assert len(entries) == 3 @@ -241,8 +267,9 @@ def test_eval_and_aggregate_runs_uses_passed_tracer( def test_eval_and_aggregate_runs_stores_example_evaluations( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, 
DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], @@ -256,7 +283,9 @@ def test_eval_and_aggregate_runs_stores_example_evaluations( assert dataset is not None run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer()) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + examples = list(dataset) eval_overview = next(iter(aggregation_overview.evaluation_overviews)) success_result = evaluation_repository.example_evaluation( @@ -283,8 +312,9 @@ def test_eval_and_aggregate_runs_stores_example_evaluations( def test_eval_and_aggregate_runs_stores_example_traces( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dataset_id: str, dummy_runner: Runner[str, str], @@ -297,16 +327,18 @@ def test_eval_and_aggregate_runs_stores_example_traces( assert dataset is not None run_overview = dummy_runner.run_dataset(dataset_id) - evaluation_run_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + examples = list(dataset) success_result = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[0].id + aggregation_overview.run_ids[0], examples[0].id ) failure_result_task = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[1].id + aggregation_overview.run_ids[0], examples[1].id ) failure_result_eval = run_repository.example_trace( - evaluation_run_overview.run_ids[0], examples[2].id + aggregation_overview.run_ids[0], examples[2].id ) assert success_result @@ -318,16 +350,18 @@ def test_eval_and_aggregate_runs_stores_example_traces( def test_eval_and_aggregate_runs_stores_aggregated_results( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], dataset_id: str, ) -> None: - aggregation_repository = dummy_evaluator._aggregation_repository + aggregation_repository = dummy_aggregator._aggregation_repository run_overview = dummy_runner.run_dataset(dataset_id) - aggregation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) loaded_evaluation_run_overview = aggregation_repository.aggregation_overview( aggregation_overview.id, DummyAggregatedEvaluationWithResultList ) @@ -336,40 +370,39 @@ def test_eval_and_aggregate_runs_stores_aggregated_results( def test_evaluate_can_evaluate_multiple_runs( - comparing_evaluator: Evaluator[ - str, str, None, ComparisonEvaluation, ComparisonAggregation - ], + comparing_evaluator: Evaluator[str, str, None, ComparisonEvaluation], + comparing_aggregator: 
Aggregator[ComparisonEvaluation, ComparisonAggregation], string_dataset_id: str, dummy_runner: Runner[str, str], ) -> None: run_overview1 = dummy_runner.run_dataset(string_dataset_id) run_overview2 = dummy_runner.run_dataset(string_dataset_id) - partial_overview = comparing_evaluator.evaluate_runs( + evaluation_overview = comparing_evaluator.evaluate_runs( run_overview1.id, run_overview2.id ) - - eval_overview = comparing_evaluator.aggregate_evaluation(partial_overview.id) - assert eval_overview.statistics.equal_ratio == 1 + aggregation_overview = comparing_aggregator.aggregate_evaluation( + evaluation_overview.id + ) + assert aggregation_overview.statistics.equal_ratio == 1 def test_aggregate_evaluation_can_aggregate_multiple_evals( - comparing_evaluator: Evaluator[ - str, str, None, ComparisonEvaluation, ComparisonAggregation - ], + comparing_evaluator: Evaluator[str, str, None, ComparisonEvaluation], + comparing_aggregator: Aggregator[ComparisonEvaluation, ComparisonAggregation], string_dataset_id: str, dummy_runner: Runner[str, str], ) -> None: run_overview_1 = dummy_runner.run_dataset(string_dataset_id) run_overview_2 = dummy_runner.run_dataset(string_dataset_id) - partial_overview_1 = comparing_evaluator.evaluate_runs(run_overview_1.id) - partial_overview_2 = comparing_evaluator.evaluate_runs( + evaluation_overview_1 = comparing_evaluator.evaluate_runs(run_overview_1.id) + evaluation_overview_2 = comparing_evaluator.evaluate_runs( run_overview_1.id, run_overview_2.id ) - aggregation_overview = comparing_evaluator.aggregate_evaluation( - partial_overview_1.id, partial_overview_1.id, partial_overview_2.id + aggregation_overview = comparing_aggregator.aggregate_evaluation( + evaluation_overview_1.id, evaluation_overview_1.id, evaluation_overview_2.id ) assert len(list(aggregation_overview.run_overviews())) == 2 @@ -393,7 +426,6 @@ class AggregatedEvaluationType(BaseModel): "Output": str, "ExpectedOutput": type(None), "Evaluation": EvaluationType, - # "AggregatedEvaluation": AggregatedEvaluationType, # TODO: fix after Evaluation and Aggregation have been split } A = TypeVar("A", bound=BaseModel) @@ -422,15 +454,51 @@ class GreatGrandChildEvaluationLogic( pass timmy: Evaluator[ - str, str, None, EvaluationType, AggregatedEvaluationType + str, + str, + None, + EvaluationType, ] = Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "dummy", evaluation_logic=GreatGrandChildEvaluationLogic(), - aggregation_logic=ChildAggregationLogic(), + ) + who_is_timmy = timmy._get_types() + + assert who_is_timmy == types + + +def test_aggregator_type_magic_works( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, +) -> None: + class EvaluationType(BaseModel): + pass + + class AggregatedEvaluationType(BaseModel): + pass + + types = { + "Evaluation": EvaluationType, + "AggregatedEvaluation": AggregatedEvaluationType, + } + + class ChildAggregationLogic(AggregationLogic[Evaluation, AggregatedEvaluationType]): + def aggregate( + self, evaluations: Iterable[Evaluation] + ) -> AggregatedEvaluationType: + return None # type: ignore + + class GrandChildAggregationLogic(ChildAggregationLogic[EvaluationType]): + pass + + timmy: Aggregator[EvaluationType, AggregatedEvaluationType] = Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "dummy", + aggregation_logic=GrandChildAggregationLogic(), ) who_is_timmy = 
timmy._get_types() @@ -438,28 +506,29 @@ class GreatGrandChildEvaluationLogic( def test_eval_and_aggregate_runs_only_runs_n_examples( - dummy_evaluator: Evaluator[ - str, str, None, DummyEvaluation, DummyAggregatedEvaluationWithResultList + dummy_evaluator: Evaluator[str, str, None, DummyEvaluation], + dummy_aggregator: Aggregator[ + DummyEvaluation, DummyAggregatedEvaluationWithResultList ], dummy_runner: Runner[str, str], good_dataset_id: str, ) -> None: run_overview = dummy_runner.run_dataset(good_dataset_id) - evaluation_overview = dummy_evaluator.eval_and_aggregate_runs(run_overview.id) - partial_evaluation_overview = dummy_evaluator.evaluate_runs( - run_overview.id, num_examples=2 - ) - evaluation_overview_n = dummy_evaluator.aggregate_evaluation( - partial_evaluation_overview.id + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id) + + evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id, num_examples=2) + aggregation_overview_n = dummy_aggregator.aggregate_evaluation( + evaluation_overview.id ) assert ( - evaluation_overview.successful_evaluation_count - + evaluation_overview.crashed_during_eval_count + aggregation_overview.successful_evaluation_count + + aggregation_overview.crashed_during_eval_count == 3 ) assert ( - evaluation_overview_n.successful_evaluation_count - + evaluation_overview_n.crashed_during_eval_count + aggregation_overview_n.successful_evaluation_count + + aggregation_overview_n.crashed_during_eval_count == 2 ) diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index abb37ad95..cd15d725e 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -19,6 +19,7 @@ from intelligence_layer.core.prompt_template import RichPrompt from intelligence_layer.core.tracer import utc_now from intelligence_layer.evaluation import ( + ArgillaEvaluationRepository, ArgillaEvaluator, EloCalculator, Example, @@ -33,6 +34,7 @@ ) from intelligence_layer.evaluation.argilla import ( AggregatedInstructComparison, + ArgillaAggregator, create_instruct_comparison_argilla_evaluation_classes, ) from intelligence_layer.evaluation.data_storage.aggregation_repository import ( @@ -80,15 +82,27 @@ def argilla_fake() -> ArgillaClient: return ArgillaFake() +@fixture +def argilla_repository( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + argilla_fake: ArgillaClient, +) -> ArgillaEvaluationRepository: + ( + evaluation_logic, + evaluation_repository, + ) = create_instruct_comparison_argilla_evaluation_classes( + "workspace", in_memory_evaluation_repository, argilla_fake, None + ) + return evaluation_repository + + @fixture def evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, argilla_fake: ArgillaClient, - argilla_aggregation_logic: InstructComparisonArgillaAggregationLogic, -) -> ArgillaEvaluator[InstructInput, PromptOutput, None, AggregatedInstructComparison]: +) -> ArgillaEvaluator[InstructInput, PromptOutput, None]: ( evaluation_logic, evaluation_repository, @@ -99,9 +113,21 @@ def evaluator( in_memory_dataset_repository, in_memory_run_repository, evaluation_repository, - 
in_memory_aggregation_repository, "instruct-evaluator", evaluation_logic, + ) + + +@fixture +def aggregator( + argilla_repository: ArgillaEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + argilla_aggregation_logic: InstructComparisonArgillaAggregationLogic, +) -> ArgillaAggregator[AggregatedInstructComparison]: + return ArgillaAggregator( + argilla_repository, + in_memory_aggregation_repository, + "instruct-evaluator", argilla_aggregation_logic, ) @@ -121,12 +147,8 @@ def any_instruct_output() -> PromptOutput: def test_evaluate_run_submits_pairwise_comparison_records( - evaluator: ArgillaEvaluator[ - InstructInput, - PromptOutput, - None, - AggregatedInstructComparison, - ], + evaluator: ArgillaEvaluator[InstructInput, PromptOutput, None], + aggregator: ArgillaAggregator[AggregatedInstructComparison], in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, any_instruct_output: PromptOutput, @@ -172,7 +194,7 @@ def test_evaluate_run_submits_pairwise_comparison_records( for record_data in argilla_fake.record_data(evaluation_overview.id) ) == sorted(pairs) - elo_score = evaluator.aggregate_evaluation(evaluation_overview.id) + elo_score = aggregator.aggregate_evaluation(evaluation_overview.id) scores = elo_score.statistics.scores # lower id always wins, should be sorted for i in range(run_count - 1): @@ -197,10 +219,8 @@ def test_evaluate_run_only_evaluates_high_priority( in_memory_dataset_repository, in_memory_run_repository, eval_repository, - in_memory_aggregation_repository, "instruct-evaluator", eval_logic, - argilla_aggregation_logic, ) run_count = 10 diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py index b97b34c25..0c6bf5b2e 100644 --- a/tests/use_cases/classify/test_classify.py +++ b/tests/use_cases/classify/test_classify.py @@ -5,6 +5,7 @@ from intelligence_layer.connectors import AlephAlphaClientProtocol from intelligence_layer.core import Chunk, Task from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, Example, InMemoryDatasetRepository, @@ -146,23 +147,32 @@ def classify_evaluator( in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, multi_label_classify_evaluation_logic: MultiLabelClassifyEvaluationLogic, - multi_label_classify_aggregation_logic: MultiLabelClassifyAggregationLogic, ) -> Evaluator[ ClassifyInput, MultiLabelClassifyOutput, Sequence[str], MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "multi-label-classify", multi_label_classify_evaluation_logic, + ) + + +@fixture +def classify_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + multi_label_classify_aggregation_logic: MultiLabelClassifyAggregationLogic, +) -> Aggregator[MultiLabelClassifyEvaluation, AggregatedMultiLabelClassifyEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "multi-label-classify", multi_label_classify_aggregation_logic, ) @@ -188,7 +198,6 @@ def test_multi_label_classify_evaluator_single_example( MultiLabelClassifyOutput, Sequence[str], 
MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, ], classify_runner: Runner[ClassifyInput, MultiLabelClassifyOutput], ) -> None: @@ -214,14 +223,19 @@ def test_multi_label_classify_evaluator_full_dataset( MultiLabelClassifyOutput, Sequence[str], MultiLabelClassifyEvaluation, - AggregatedMultiLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + MultiLabelClassifyEvaluation, AggregatedMultiLabelClassifyEvaluation ], classify_runner: Runner[ClassifyInput, MultiLabelClassifyOutput], ) -> None: run_overview = classify_runner.run_dataset(multiple_entries_dataset_name) - evaluation = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert set(["positive", "negative", "finance", "school"]) == set( - evaluation.statistics.class_metrics.keys() + assert {"positive", "negative", "finance", "school"} == set( + aggregation_overview.statistics.class_metrics.keys() ) diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py index f8f5e9066..5b349061d 100644 --- a/tests/use_cases/classify/test_prompt_based_classify.py +++ b/tests/use_cases/classify/test_prompt_based_classify.py @@ -7,16 +7,15 @@ ) from intelligence_layer.core import Chunk, InMemoryTracer, NoOpTracer from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, Example, + InMemoryAggregationRepository, InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, RunRepository, ) -from intelligence_layer.evaluation.data_storage.aggregation_repository import ( - InMemoryAggregationRepository, -) from intelligence_layer.evaluation.evaluator import Evaluator from intelligence_layer.use_cases.classify.classify import ( AggregatedSingleLabelClassifyEvaluation, @@ -51,23 +50,35 @@ def classify_evaluator( in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, single_label_classify_eval_logic: SingleLabelClassifyEvaluationLogic, - single_label_classify_aggregation_logic: SingleLabelClassifyAggregationLogic, ) -> Evaluator[ ClassifyInput, SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, - AggregatedSingleLabelClassifyEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "single-label-classify", single_label_classify_eval_logic, + ) + + +@fixture +def classify_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + single_label_classify_aggregation_logic: SingleLabelClassifyAggregationLogic, +) -> Aggregator[ + SingleLabelClassifyEvaluation, + AggregatedSingleLabelClassifyEvaluation, +]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "single-label-classify", single_label_classify_aggregation_logic, ) @@ -178,7 +189,6 @@ def test_can_evaluate_classify( SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, - AggregatedSingleLabelClassifyEvaluation, ], prompt_based_classify: PromptBasedClassify, ) -> None: @@ -210,6 +220,9 @@ def test_can_aggregate_evaluations( SingleLabelClassifyOutput, Sequence[str], 
SingleLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + SingleLabelClassifyEvaluation, AggregatedSingleLabelClassifyEvaluation, ], in_memory_dataset_repository: InMemoryDatasetRepository, @@ -235,9 +248,12 @@ def test_can_aggregate_evaluations( ) run_overview = classify_runner.run_dataset(dataset_name) - evaluation_overview = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert evaluation_overview.statistics.percentage_correct == 0.5 + assert aggregation_overview.statistics.percentage_correct == 0.5 def test_aggregating_evaluations_works_with_empty_list( @@ -246,6 +262,9 @@ def test_aggregating_evaluations_works_with_empty_list( SingleLabelClassifyOutput, Sequence[str], SingleLabelClassifyEvaluation, + ], + classify_aggregator: Aggregator[ + SingleLabelClassifyEvaluation, AggregatedSingleLabelClassifyEvaluation, ], classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput], @@ -253,6 +272,9 @@ def test_aggregating_evaluations_works_with_empty_list( ) -> None: dataset_id = in_memory_dataset_repository.create_dataset([]) run_overview = classify_runner.run_dataset(dataset_id) - evaluation_overview = classify_evaluator.eval_and_aggregate_runs(run_overview.id) + evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) + aggregation_overview = classify_aggregator.aggregate_evaluation( + evaluation_overview.id + ) - assert evaluation_overview.statistics.percentage_correct == 0 + assert aggregation_overview.statistics.percentage_correct == 0 diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index fbee3847f..e43702cbe 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -2,6 +2,7 @@ from intelligence_layer.core import Chunk, Language, NoOpTracer from intelligence_layer.evaluation import ( + Aggregator, DatasetRepository, EvaluationRepository, Example, @@ -50,23 +51,27 @@ def single_chunk_summarize_evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, single_chunk_summarize_eval_logic: SingleChunkSummarizeEvaluationLogic, - single_chunk_summarize_aggregation_logic: SingleChunkSummarizeAggregationLogic, -) -> Evaluator[ - SingleChunkSummarizeInput, - SummarizeOutput, - str, - SummarizeEvaluation, - AggregatedSummarizeEvaluation, -]: +) -> Evaluator[SingleChunkSummarizeInput, SummarizeOutput, str, SummarizeEvaluation,]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "single-chunk-summarize", single_chunk_summarize_eval_logic, + ) + + +@fixture +def single_chunk_summarize_aggregator( + in_memory_evaluation_repository: InMemoryEvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + single_chunk_summarize_aggregation_logic: SingleChunkSummarizeAggregationLogic, +) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "single-chunk-summarize", single_chunk_summarize_aggregation_logic, ) @@ -100,23 +105,32 @@ def long_context_summarize_evaluator( 
in_memory_dataset_repository: DatasetRepository, in_memory_run_repository: RunRepository, in_memory_evaluation_repository: EvaluationRepository, - in_memory_aggregation_repository: InMemoryAggregationRepository, long_context_summarize_evaluation_logic: LongContextSummarizeEvaluationLogic, - long_context_summarize_aggregation_logic: LongContextSummarizeAggregationLogic, ) -> Evaluator[ LongContextSummarizeInput, LongContextSummarizeOutput, str, SummarizeEvaluation, - AggregatedSummarizeEvaluation, ]: return Evaluator( in_memory_dataset_repository, in_memory_run_repository, in_memory_evaluation_repository, - in_memory_aggregation_repository, "long-context-summarize", long_context_summarize_evaluation_logic, + ) + + +@fixture +def long_context_summarize_aggregator( + in_memory_evaluation_repository: EvaluationRepository, + in_memory_aggregation_repository: InMemoryAggregationRepository, + long_context_summarize_aggregation_logic: LongContextSummarizeAggregationLogic, +) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation,]: + return Aggregator( + in_memory_evaluation_repository, + in_memory_aggregation_repository, + "long-context-summarize", long_context_summarize_aggregation_logic, ) @@ -141,6 +155,9 @@ def test_single_chunk_summarize_evaluator( SummarizeOutput, str, SummarizeEvaluation, + ], + single_chunk_summarize_aggregator: Aggregator[ + SummarizeEvaluation, AggregatedSummarizeEvaluation, ], single_chunk_summarize_runner: Runner[str, str], @@ -162,9 +179,12 @@ def test_single_chunk_summarize_evaluator( ) run_overview = single_chunk_summarize_runner.run_dataset(dataset_name) - aggregation_overview = single_chunk_summarize_evaluator.eval_and_aggregate_runs( + evaluation_overview = single_chunk_summarize_evaluator.evaluate_runs( run_overview.id ) + aggregation_overview = single_chunk_summarize_aggregator.aggregate_evaluation( + evaluation_overview.id + ) assert aggregation_overview.successful_evaluation_count == 2 individual_evaluation_id = next(iter(aggregation_overview.evaluation_overviews)).id @@ -194,6 +214,9 @@ def test_long_context_summarize_evaluator( LongContextSummarizeOutput, str, SummarizeEvaluation, + ], + long_context_summarize_aggregator: Aggregator[ + SummarizeEvaluation, AggregatedSummarizeEvaluation, ], long_context_summarize_runner: Runner[str, str], @@ -214,9 +237,12 @@ def test_long_context_summarize_evaluator( ) run_overview = long_context_summarize_runner.run_dataset(dataset_name) - aggregation_overview = long_context_summarize_evaluator.eval_and_aggregate_runs( + evaluation_overview = long_context_summarize_evaluator.evaluate_runs( run_overview.id ) + aggregation_overview = long_context_summarize_aggregator.aggregate_evaluation( + evaluation_overview.id + ) assert aggregation_overview.successful_evaluation_count == 2 individual_evaluation_id = next(iter(aggregation_overview.evaluation_overviews)).id
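Reviewer note (not part of the patch): the test changes above all follow the same two-step wiring introduced by this refactor, so a condensed sketch may help while reading the hunks. Everything below is an illustration only, not code from this change set: my_task, my_evaluation_logic, my_aggregation_logic, examples, and the "my-task" description string are hypothetical placeholders for whichever concrete Task / EvaluationLogic / AggregationLogic pair a given test uses, and the import paths are assumed to mirror the test modules touched here.

from intelligence_layer.evaluation import (
    Aggregator,
    InMemoryAggregationRepository,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)
from intelligence_layer.evaluation.evaluator import Evaluator

# Independent repositories; the evaluation repository is the hand-off point
# shared by the Evaluator (which writes evaluations) and the Aggregator
# (which reads them back for aggregation).
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()
aggregation_repository = InMemoryAggregationRepository()

runner = Runner(my_task, dataset_repository, run_repository, "my-task")  # my_task is a placeholder Task
evaluator = Evaluator(
    dataset_repository,
    run_repository,
    evaluation_repository,
    "my-task",
    my_evaluation_logic,  # placeholder EvaluationLogic instance
)
aggregator = Aggregator(
    evaluation_repository,
    aggregation_repository,
    "my-task",
    my_aggregation_logic,  # placeholder AggregationLogic instance
)

dataset_id = dataset_repository.create_dataset(examples)  # examples is a placeholder sequence
run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_runs(run_overview.id)
aggregation_overview = aggregator.aggregate_evaluation(evaluation_overview.id)
print(aggregation_overview.statistics)

The only coupling left between the two stages is the shared evaluation repository and the overview id handed from evaluate_runs to aggregate_evaluation, which is why the fixtures above now construct the Evaluator without an aggregation repository and the Aggregator without dataset or run repositories.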