From c0370bab37857ebafbdc645d0d0574e4a8315b1c Mon Sep 17 00:00:00 2001 From: Merlin Kallenborn Date: Tue, 21 May 2024 16:45:52 +0200 Subject: [PATCH] feat: Add How-to describing complete incremental evaluation workflow TASK: IL-313 --- CHANGELOG.md | 2 +- README.md | 2 +- src/documentation/how_tos/example_data.py | 16 ++ ...complete_incremental_evaluation_flow.ipynb | 205 ++++++++++++++++++ .../how_to_implement_elo_evaluations.ipynb | 4 +- 5 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 666254782..82be2fb01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ... ### New Features -... + - Add `how_to_implement_complete_incremental_evaluation_flow` ### Fixes - The document index client now correctly URL-encodes document names in its queries. diff --git a/README.md b/README.md index 28b2b19bd..f33b089f8 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ The how-tos are quick lookups about how to do things. Compared to the tutorials, | [...retrieve data for analysis](./src/documentation/how_tos/how_to_retrieve_data_for_analysis.ipynb) | Retrieve experiment data in multiple different ways | | [...implement a custom human evaluation](./src/documentation/how_tos/how_to_human_evaluation_via_argilla.ipynb) | Necessary steps to create an evaluation with humans as a judge via Argilla | | [...implement elo evaluations](./src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb) | Evaluate runs and create ELO ranking for them | - +| [...implement complete incremental evaluation flow](./src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb) | Run complete incremental evaluation flow from runner to aggregation | # Models Currently, we support a bunch of models accessible via the Aleph Alpha API. Depending on your local setup, you may even have additional models available.
diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py index 24353db7f..5affd555b 100644 --- a/src/documentation/how_tos/example_data.py +++ b/src/documentation/how_tos/example_data.py @@ -20,6 +20,7 @@ from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import ( ComparisonEvaluation, EloEvaluationLogic, + IncrementalEvaluationLogic, Matches, MatchOutcome, ) @@ -48,6 +49,21 @@ def do_evaluate( ) +class DummyIncrementalEvaluationLogic( + IncrementalEvaluationLogic[str, str, str, DummyEvaluation] +): + def do_incremental_evaluate( + self, + example: Example[str, str], + outputs: list[SuccessfulExampleOutput[str]], + already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]], + ) -> DummyEvaluation: + output_str = "(" + (", ".join(o.output for o in outputs)) + ")" + return DummyEvaluation( + eval=f"{example.input}, {example.expected_output}, {output_str} -> evaluation" + ) + + class DummyEloEvaluationLogic(EloEvaluationLogic[str, str, str]): def grade( self, diff --git a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb new file mode 100644 index 000000000..163e15238 --- /dev/null +++ b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from documentation.how_tos.example_data import (\n", + " DummyAggregationLogic,\n", + " DummyEvaluation,\n", + " DummyExample,\n", + " DummyTask,\n", + ")\n", + "from intelligence_layer.evaluation import (\n", + " Aggregator,\n", + " IncrementalEvaluator,\n", + " InMemoryAggregationRepository,\n", + " InMemoryEvaluationRepository,\n", + " InMemoryRunRepository,\n", + " Runner,\n", + ")\n", + "from intelligence_layer.evaluation.dataset.domain import Example\n", + 
"from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n", + " InMemoryDatasetRepository,\n", + ")\n", + "from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (\n", + " IncrementalEvaluationLogic,\n", + ")\n", + "from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to implement complete incremental evaluation workflows from running (multiple) tasks to aggregation\n", + "This notebook outlines how to:\n", + " - run multiple tasks and configurations on the same dataset\n", + " - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n", + " - run aggregation on these evaluations\n", + " \n", + "## Step-by-Step Guide\n", + "1. Setup:\n", + "- Initialize all necessary repositories: \n", + " - dataset\n", + " - run\n", + " - evaluation\n", + " - aggregation\n", + "- Create dataset from example(s)\n", + "- Initialize task(s)\n", + "- Initialize `Runner` for each task \n", + "2. Run task(s) for the dataset (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n", + "3. Compose a list of IDs of runs you want to evaluate.\n", + "4. Define and initialize an `IncrementalEvaluationLogic`; This is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)) but you also have to implement your own `do_incremental_evaluate` method\n", + "5. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n", + "6. Call the `evaluate_runs` method of the `IncrementalEvaluator` to evaluate the run(s) and create a single `EvaluationOverview`\n", + "7.
Aggregate your evaluation of the run(s) using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n", + "\n", + "#### Steps for addition of new runs \n", + "8. Define and run some new task(s)\n", + "9. Define a list of the IDs of previous evaluations whose runs should not be re-evaluated\n", + "10. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n", + " - `run_ids`: Runs to be included in the evaluation results, including those that have been evaluated before\n", + " - `previous_evaluation_ids`: IDs of previous evaluations whose runs are **not** to be re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method\n", + "11. Aggregate all your `EvaluationOverview`s in your `EvaluationRepository`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preparation\n", + "examples = [\n", + " DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n", + "]\n", + "\n", + "# Step 1\n", + "dataset_repository = InMemoryDatasetRepository()\n", + "run_repository = InMemoryRunRepository()\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "aggregation_repository = InMemoryAggregationRepository()\n", + "\n", + "my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n", + "\n", + "first_task = DummyTask()\n", + "first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n", + "\n", + "# Step 2\n", + "first_run_overview = first_runner.run_dataset(my_dataset.id)\n", + "print(f\"ID of first run: {first_run_overview.id}\")\n", + "\n", + "# Step 3\n", + "run_overview_ids_for_first_evaluation = []\n", + "for run_overview in run_repository.run_overviews():\n", + " if (\n", + " run_overview.description == \"MyFirstRun\"\n", + " ): ## This is the filter for all the runs you want to include\n", + "
run_overview_ids_for_first_evaluation.append(run_overview.id)\n", + "\n", + "\n", + "# Step 4\n", + "class DummyIncrementalEvaluationLogic(\n", + " IncrementalEvaluationLogic[str, str, str, DummyEvaluation]\n", + "):\n", + " def do_incremental_evaluate(\n", + " self,\n", + " example: Example[str, str],\n", + " outputs: list[SuccessfulExampleOutput[str]],\n", + " already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n", + " ) -> DummyEvaluation:\n", + " output_str = \"(\" + (\", \".join(o.output for o in outputs)) + \")\"\n", + " return DummyEvaluation(\n", + " eval=f\"{example.input}, {example.expected_output}, {output_str}, {already_evaluated_outputs} -> evaluation\"\n", + " )\n", + "\n", + "\n", + "incremental_evaluation_logic = DummyIncrementalEvaluationLogic()\n", + "\n", + "# Step 5\n", + "incremental_evaluator = IncrementalEvaluator(\n", + " dataset_repository,\n", + " run_repository,\n", + " evaluation_repository,\n", + " \"My incremental evaluation\",\n", + " incremental_evaluation_logic,\n", + ")\n", + "\n", + "# Step 6\n", + "evaluation_overview_first_task = incremental_evaluator.evaluate_runs(\n", + " *run_overview_ids_for_first_evaluation\n", + ")\n", + "\n", + "# Step 7\n", + "aggregation_logic = DummyAggregationLogic()\n", + "aggregator = Aggregator(\n", + " evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n", + ")\n", + "first_aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_repository.evaluation_overview_ids()\n", + ")\n", + "print(f\"First aggregation: {first_aggregation_overview}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Addition of new task/run\n", + "# Step 8\n", + "second_task = DummyTask()\n", + "second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n", + "second_run_overview = second_runner.run_dataset(my_dataset.id)\n", + "print(f\"ID of second run: 
{second_run_overview.id}\")\n", + "\n", + "# Step 9\n", + "already_evaluated_run_ids = evaluation_repository.evaluation_overview_ids()\n", + "\n", + "# Step 10\n", + "incremental_evaluator.evaluate_additional_runs(\n", + " *run_repository.run_overview_ids(),\n", + " previous_evaluation_ids=already_evaluated_run_ids,\n", + ")\n", + "\n", + "# Step 11\n", + "second_aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_repository.evaluation_overview_ids()\n", + ")\n", + "print(second_aggregation_overview)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb b/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb index e630d7336..7e3fb880a 100644 --- a/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb @@ -56,7 +56,7 @@ "evaluation_repository = InMemoryEvaluationRepository()\n", "evaluation_logic = DummyEloEvaluationLogic()\n", "\n", - "# Step 3\n", + "# Step 2\n", "evaluator = IncrementalEvaluator(\n", " dataset_repository,\n", " run_repository,\n", @@ -67,7 +67,7 @@ "\n", "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n", "\n", - "# Step 4\n", + "# Step 3\n", "print(evaluation_overview.id)" ] }