From 8274c366008cbc504bb3432a741be729f062b4de Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Thu, 23 May 2024 10:57:04 +0200
Subject: [PATCH] feat: Rename how-to implement incremental evaluation and
 make it more concise

TASK: IL-313
---
 CHANGELOG.md                                  |   2 +-
 README.md                                     |   2 +-
 ...complete_incremental_evaluation_flow.ipynb | 205 ------------------
 ..._to_implement_incremental_evaluation.ipynb | 153 +++++++++++++
 4 files changed, 155 insertions(+), 207 deletions(-)
 delete mode 100644 src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
 create mode 100644 src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82be2fb01..01fa80a0b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@
 ...
 ### New Features
- - Add `how_to_implement_complete_incremental_evaluation_flow`
+ - Add `how_to_implement_incremental_evaluation`.
 
 ### Fixes
 - The document index client now correctly URL-encodes document names in its queries.

diff --git a/README.md b/README.md
index f33b089f8..dcd03d42b 100644
--- a/README.md
+++ b/README.md
@@ -180,7 +180,7 @@ The how-tos are quick lookups about how to do things. Compared to the tutorials,
 | [...retrieve data for analysis](./src/documentation/how_tos/how_to_retrieve_data_for_analysis.ipynb) | Retrieve experiment data in multiple different ways |
 | [...implement a custom human evaluation](./src/documentation/how_tos/how_to_human_evaluation_via_argilla.ipynb) | Necessary steps to create an evaluation with humans as a judge via Argilla |
 | [...implement elo evaluations](./src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb) | Evaluate runs and create ELO ranking for them |
-| [...implement complete incremental evaluation flow](./src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb) | Run complete incremental evaluation flow from runner to aggretation
+| [...implement incremental evaluation](./src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb) | Implement and run an incremental evaluation |

 # Models
 Currently, we support a bunch of models accessible via the Aleph Alpha API. Depending on your local setup, you may even have additional models available.
diff --git a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
deleted file mode 100644
index 163e15238..000000000
--- a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb
+++ /dev/null
@@ -1,205 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from documentation.how_tos.example_data import (\n",
-    "    DummyAggregationLogic,\n",
-    "    DummyEvaluation,\n",
-    "    DummyExample,\n",
-    "    DummyTask,\n",
-    ")\n",
-    "from intelligence_layer.evaluation import (\n",
-    "    Aggregator,\n",
-    "    IncrementalEvaluator,\n",
-    "    InMemoryAggregationRepository,\n",
-    "    InMemoryEvaluationRepository,\n",
-    "    InMemoryRunRepository,\n",
-    "    Runner,\n",
-    ")\n",
-    "from intelligence_layer.evaluation.dataset.domain import Example\n",
-    "from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n",
-    "    InMemoryDatasetRepository,\n",
-    ")\n",
-    "from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (\n",
-    "    IncrementalEvaluationLogic,\n",
-    ")\n",
-    "from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# How to implement complete incremental evaluation workflows from running (multiple) tasks to aggregation\n",
-    "This notebook outlines how to:\n",
-    " - run multiple tasks and configurations on the same dataset\n",
-    " - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n",
-    " - run aggregation on these evaluations\n",
-    " \n",
-    "## Step-by-Step Guide\n",
-    "1. Setup:\n",
-    "- Initialize all necessary repositories: \n",
-    "  - dataset\n",
-    "  - run\n",
-    "  - evaluation\n",
-    "  - aggregation\n",
-    "- Create dataset from example(s)\n",
-    "- Initialized task(s)\n",
-    "- Initialize `Runner` for each task \n",
-    "2. Run task(s) for the dataset (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
-    "3. Compose a list of IDs of runs you want to evaluate.\n",
-    "4. Define and initialize an `IncrementalEvaluationLogic`; This is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)) but you also have to implement your own `do_incremental_evaluate` method\n",
-    "5. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n",
-    "6. Call the `evaluate_runs` method of the `IncrementalEvaluator` to evaluate the run(s) and create a single `EvaluationOverview`\n",
-    "7. Aggregate your evaluation of the run(s) using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n",
-    "\n",
-    "#### Steps for addition of new runs \n",
-    "8. Define and run some new task(s)\n",
-    "9. Define a list for runs that should not be re-evaluated\n",
-    "10. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n",
-    "    - `run_ids`: Runs to be included in the evaluation results, including those that have been evaluated before\n",
-    "    - `previous_evaluation_ids`: Runs **not** to be re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method\n",
-    "11. Aggregate all your `EvaluationOverview`s in your `EvaluationRepository`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Preparation\n",
-    "examples = [\n",
-    "    DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n",
-    "]\n",
-    "\n",
-    "# Step 1\n",
-    "dataset_repository = InMemoryDatasetRepository()\n",
-    "run_repository = InMemoryRunRepository()\n",
-    "evaluation_repository = InMemoryEvaluationRepository()\n",
-    "aggregation_repository = InMemoryAggregationRepository()\n",
-    "\n",
-    "my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n",
-    "\n",
-    "first_task = DummyTask()\n",
-    "first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n",
-    "\n",
-    "# Step 2\n",
-    "first_run_overview = first_runner.run_dataset(my_dataset.id)\n",
-    "print(f\"ID of first run: {first_run_overview.id}\")\n",
-    "\n",
-    "# Step 3\n",
-    "run_overview_ids_for_first_evaluation = []\n",
-    "for run_overview in run_repository.run_overviews():\n",
-    "    if (\n",
-    "        run_overview.description == \"MyFirstRun\"\n",
-    "    ):  ## This is filter for all the runs you want to include\n",
-    "        run_overview_ids_for_first_evaluation.append(run_overview.id)\n",
-    "\n",
-    "\n",
-    "# Step 4\n",
-    "class DummyIncrementalEvaluationLogic(\n",
-    "    IncrementalEvaluationLogic[str, str, str, DummyEvaluation]\n",
-    "):\n",
-    "    def do_incremental_evaluate(\n",
-    "        self,\n",
-    "        example: Example[str, str],\n",
-    "        outputs: list[SuccessfulExampleOutput[str]],\n",
-    "        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n",
-    "    ) -> DummyEvaluation:\n",
-    "        output_str = \"(\" + (\", \".join(o.output for o in outputs)) + \")\"\n",
-    "        return DummyEvaluation(\n",
-    "            eval=f\"{example.input}, {example.expected_output}, {output_str}, {already_evaluated_outputs} -> evaluation\"\n",
-    "        )\n",
-    "\n",
-    "\n",
-    "incremental_evaluation_logic = DummyIncrementalEvaluationLogic()\n",
-    "\n",
-    "# Step 5\n",
-    "incremental_evaluator = IncrementalEvaluator(\n",
-    "    dataset_repository,\n",
-    "    run_repository,\n",
-    "    evaluation_repository,\n",
-    "    \"My incremental evaluation\",\n",
-    "    incremental_evaluation_logic,\n",
-    ")\n",
-    "\n",
-    "# Step 6\n",
-    "evaluation_overview_first_task = incremental_evaluator.evaluate_runs(\n",
-    "    *run_overview_ids_for_first_evaluation\n",
-    ")\n",
-    "\n",
-    "# Step 7\n",
-    "aggregation_logic = DummyAggregationLogic()\n",
-    "aggregator = Aggregator(\n",
-    "    evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n",
-    ")\n",
-    "first_aggregation_overview = aggregator.aggregate_evaluation(\n",
-    "    *evaluation_repository.evaluation_overview_ids()\n",
-    ")\n",
-    "print(f\"First aggregation: {first_aggregation_overview}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Addition of new task/run\n",
-    "# Step 8\n",
-    "second_task = DummyTask()\n",
-    "second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n",
-    "second_run_overview = second_runner.run_dataset(my_dataset.id)\n",
-    "print(f\"ID of second run: {second_run_overview.id}\")\n",
-    "\n",
-    "# Step 9\n",
-    "already_evaluated_run_ids = evaluation_repository.evaluation_overview_ids()\n",
-    "\n",
-    "# Step 10\n",
-    "incremental_evaluator.evaluate_additional_runs(\n",
-    "    *run_repository.run_overview_ids(),\n",
-    "    previous_evaluation_ids=already_evaluated_run_ids,\n",
-    ")\n",
-    "\n",
-    "# Step 11\n",
-    "second_aggregation_overview = aggregator.aggregate_evaluation(\n",
-    "    *evaluation_repository.evaluation_overview_ids()\n",
-    ")\n",
-    "print(second_aggregation_overview)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb
new file mode 100644
index 000000000..52278c53e
--- /dev/null
+++ b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from documentation.how_tos.example_data import (\n",
+    "    DummyAggregationLogic,\n",
+    "    DummyEvaluation,\n",
+    "    example_data,\n",
+    ")\n",
+    "from intelligence_layer.evaluation import (\n",
+    "    Aggregator,\n",
+    "    Example,\n",
+    "    IncrementalEvaluationLogic,\n",
+    "    IncrementalEvaluator,\n",
+    "    InMemoryAggregationRepository,\n",
+    "    InMemoryEvaluationRepository,\n",
+    "    SuccessfulExampleOutput,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to implement incremental evaluation\n",
+    "This notebook outlines how to perform evaluations in an incremental fashion, i.e., how to add additional runs to existing evaluations without recalculating the results of previously evaluated runs.\n",
+    " \n",
+    "## Step-by-Step Guide\n",
+    "0. Run your tasks on the datasets on which you want to evaluate them (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n",
+    "    - When evaluating multiple runs, all runs must use the same input and output data types\n",
+    "1. Initialize all necessary repositories and define your `IncrementalEvaluationLogic`; it is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)), but you additionally have to implement your own `do_incremental_evaluate` method\n",
+    "2. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n",
+    "3. Call the `evaluate_runs` method of the `IncrementalEvaluator`\n",
+    "4. Aggregate your evaluations using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n",
+    "\n",
+    "#### Steps for the addition of new runs\n",
+    "5. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n",
+    "    - `run_ids`: IDs of all runs to be included in the evaluation results, including those that have been evaluated before\n",
+    "    - `previous_evaluation_ids`: IDs of previous evaluations whose runs should **not** be re-evaluated; how their outputs are used depends on your implementation of the `do_incremental_evaluate` method\n",
+    "6. Aggregate all your `EvaluationOverview`s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 0\n",
+    "# example_data() provides repositories with a dataset and already executed task runs.\n",
+    "my_example_data = example_data()\n",
+    "\n",
+    "dataset_repository = my_example_data.dataset_repository\n",
+    "run_repository = my_example_data.run_repository\n",
+    "\n",
+    "# Step 1\n",
+    "evaluation_repository = InMemoryEvaluationRepository()\n",
+    "aggregation_repository = InMemoryAggregationRepository()\n",
+    "\n",
+    "\n",
+    "class DummyIncrementalEvaluationLogic(\n",
+    "    IncrementalEvaluationLogic[str, str, str, DummyEvaluation]\n",
+    "):\n",
+    "    def do_incremental_evaluate(\n",
+    "        self,\n",
+    "        example: Example[str, str],\n",
+    "        outputs: list[SuccessfulExampleOutput[str]],\n",
+    "        already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n",
+    "    ) -> DummyEvaluation:\n",
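+    "        # `outputs` contains the outputs of the runs being evaluated in this\n",
+    "        # call; `already_evaluated_outputs` contains, per previous evaluation,\n",
+    "        # the outputs that were already evaluated and can be incorporated\n",
+    "        # without re-evaluation.\n",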
- "# Step 11\n", - "second_aggregation_overview = aggregator.aggregate_evaluation(\n", - " *evaluation_repository.evaluation_overview_ids()\n", - ")\n", - "print(second_aggregation_overview)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb new file mode 100644 index 000000000..52278c53e --- /dev/null +++ b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from documentation.how_tos.example_data import (\n", + " DummyAggregationLogic,\n", + " DummyEvaluation,\n", + " DummyExample,\n", + " example_data,\n", + ")\n", + "from intelligence_layer.evaluation import (\n", + " Aggregator,\n", + " Example,\n", + " IncrementalEvaluationLogic,\n", + " IncrementalEvaluator,\n", + " InMemoryAggregationRepository,\n", + " InMemoryEvaluationRepository,\n", + " SuccessfulExampleOutput,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to implement incremental evaluation\n", + "This notebook outlines how to perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation.\n", + " \n", + "## Step-by-Step Guide\n", + "0. Run your tasks on the datasets on which you want to evaluate them (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n", + " - When evaluating multiple runs, all of them need the same data types \n", + "1. Initialize all necessary repositories and define your `IncrementalEvaluationLogic`; It is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)) but you additionally have to implement your own `do_incremental_evaluate` method\n", + "2. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n", + "3. Call the `evaluate_runs` method of the `IncrementalEvaluator`\n", + "4. Aggregate your evaluations using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n", + "\n", + "#### Steps for addition of new runs \n", + "5. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n", + " - `run_ids`: Runs to be included in the evaluation results, including those that have been evaluated before\n", + " - `previous_evaluation_ids`: Runs **not** to be re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method\n", + "6. 
+    "incremental_evaluator.evaluate_additional_runs(\n",
+    "    *run_ids,\n",
+    "    previous_evaluation_ids=evaluation_repository.evaluation_overview_ids(),\n",
+    ")\n",
+    "\n",
+    "# Step 6\n",
+    "second_aggregation_overview = aggregator.aggregate_evaluation(\n",
+    "    *evaluation_repository.evaluation_overview_ids()\n",
+    ")\n",
+    "print(second_aggregation_overview)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}