diff --git a/src/documentation/human_evaluation.ipynb b/src/documentation/human_evaluation.ipynb index 142d21965..992e98fed 100644 --- a/src/documentation/human_evaluation.ipynb +++ b/src/documentation/human_evaluation.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import shutil\n", "from pathlib import Path\n", - "from typing import Iterable, cast\n", + "from typing import Iterable\n", "\n", "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", @@ -21,7 +21,6 @@ " LimitedConcurrencyClient,\n", " Question,\n", " RecordData,\n", - " ArgillaEvaluation,\n", ")\n", "from intelligence_layer.core import (\n", " CompleteOutput,\n", @@ -31,8 +30,10 @@ ")\n", "from intelligence_layer.evaluation import (\n", " AggregationLogic,\n", + " Aggregator,\n", " ArgillaEvaluationLogic,\n", " ArgillaEvaluator,\n", + " AsyncFileEvaluationRepository,\n", " Example,\n", " FileAggregationRepository,\n", " FileDatasetRepository,\n", @@ -40,8 +41,6 @@ " RecordDataSequence,\n", " Runner,\n", " SuccessfulExampleOutput,\n", - " AsyncFileEvaluationRepository,\n", - " Aggregator,\n", ")\n", "\n", "load_dotenv()\n", @@ -112,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -128,21 +127,9 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset({\n", - " features: ['prompt', 'completion', 'meta'],\n", - " num_rows: 327\n", - "})\n", - "dict_keys(['id', 'motivation_app', 'prompt', 'input', 'completion', 'source', 'category', 'subcategory'])\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(dataset)\n", "print(dataset[\"meta\"][0].keys())" @@ -157,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -177,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -199,20 +186,9 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'human-evaluation-dataset'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dataset_id.name" ] @@ -230,17 +206,9 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Running: 5it [00:05, 1.05s/it]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = LuminousControlModel(name=\"luminous-base-control\", client=client)\n", "task = Instruct(model=model)\n", @@ -271,19 +239,26 @@ "metadata": {}, "source": [ "At the end of our evaluation we want a float score $s \\in [1,5]$ describing the model performance.\n", - "We define this as an `InstructAggregatedEvaluation`, which will be used in our aggregation later." + "We define this as an `InstructAggregatedEvaluation`, which will be used in our aggregation later.\n", + "\n", + "We also define the `InstructEvaluation`, which represents an evaluation of a single entry, which we will aggregate later." 
] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class InstructAggregatedEvaluation(BaseModel):\n", " general_rating: float | None\n", " fluency: float | None\n", - " evaluated_examples: int" + " evaluated_examples: int\n", + "\n", + "\n", + "class InstructEvaluation(BaseModel):\n", + " general_rating: float\n", + " fluency: float" ] }, { @@ -291,13 +266,14 @@ "metadata": {}, "source": [ "We can now start to define our human evaluation. This is done with `Questions` and `Fields`. \n", - "`Fields` define what a user has to evaluate. In our example, this will be the model input (Instruction) and output (Model Completion). Note that the field names have to match the content keys from the `RecordData` which we will define later in our `InstructArgillaEvaluationLogic`. \n", - "`Questions` are what a user has to answer in order to evaluate the `Fields`. The `name` property will later be used to access the human ratings in the aggregation step. In our case we ask how complete and how fluent the completions are." + "`Fields` define what a user has to evaluate. In our example, this will be the model input (Instruction) and output (Model Completion). \n", + "`Questions` are what a user has to answer in order to evaluate the `Fields`. The `name` property will later be used to access the human ratings. \n", + "Both of these are passed to the `ArgillaEvaluationLogic` to create `RecordData` to convert data back and forth from Argilla. " ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -329,74 +305,36 @@ "Our defined fields and questions will look like this:\n", "![Argilla Interface](../../assets/argilla_interface.png)\n", "\n", - "We can now define our `InstructArgillaEvaluationLogic` and `InstructArgillaAggregationLogic`.\n", - "They have to implement the two abstract methods `_to_record` and `aggregate` respectively.\n", + "We can now define our `InstructArgillaEvaluationLogic` to translate our data to specific Argilla formats .\n", + "The logic has to implement the two abstract methods `to_record` and `from_record`.\n", "Lets look at the documentation:" ] }, { "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on function to_record in module intelligence_layer.evaluation.evaluation.evaluator.argilla_evaluator:\n", - "\n", - "to_record(self, example: intelligence_layer.evaluation.dataset.domain.Example, *output: intelligence_layer.evaluation.run.domain.SuccessfulExampleOutput) -> intelligence_layer.evaluation.evaluation.evaluator.argilla_evaluator.RecordDataSequence\n", - " This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`\n", - " \n", - " \n", - " Args:\n", - " example: The example to be translated.\n", - " output: The output of the example that was run.\n", - "\n", - "----------------------------------------------------------------------------------------------------\n", - "Help on function aggregate in module intelligence_layer.evaluation.aggregation.aggregator:\n", - "\n", - "aggregate(self, evaluations: Iterable[+Evaluation]) -> +AggregatedEvaluation\n", - " `Evaluator`-specific method for aggregating individual `Evaluations` into report-like `Aggregated Evaluation`.\n", - " \n", - " This method is responsible for taking the results of an evaluation run and aggregating all the results.\n", - " It 
should create an `AggregatedEvaluation` class and return it at the end.\n", - "    \n", - "    Args:\n", - "        evaluations: The results from running `eval_and_aggregate_runs` with a :class:`Task`.\n", - "    \n", - "    Returns:\n", - "        The aggregated results of an evaluation run with a :class:`Dataset`.\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "help(ArgillaEvaluationLogic.to_record)\n", "print(\"-\" * 100)\n", - "help(AggregationLogic.aggregate)" + "help(ArgillaEvaluationLogic.from_record)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of performing the evaluation, the `ArgillaEvaluationLogic` is responsible for converting the evaluation data to a format that is accepted by Argilla. During the evaluation, these records will simply be submitted to Argilla. \n", + "Instead of performing the evaluation, the `ArgillaEvaluationLogic` is responsible for converting the evaluation data to a format that is accepted by Argilla. During the evaluation, these records will simply be submitted to Argilla and retrieved later.\n", "We will now create everything we need to submit these evaluations to our Argilla instance." ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "\n", - "\n", - "class InstructEvaluation(BaseModel):\n", - "    general_rating: float\n", - "    fluency: float\n", - "\n", "class InstructArgillaEvaluationLogic(\n", "    ArgillaEvaluationLogic[\n", "        InstructInput,\n", "        CompleteOutput,\n", "        None,\n", "        InstructEvaluation,\n", "    ]\n", "):\n", "    def to_record(\n", "        self,\n", "        example: Example[InstructInput, None],\n", "        *example_outputs: SuccessfulExampleOutput[CompleteOutput],\n", "    ) -> RecordDataSequence:\n", "        return RecordDataSequence(\n", "            records=[\n", "                RecordData(\n", "                    content={\n", - "                        \"input\": example.input.instruction,\n", - "                        \"output\": example_outputs[0].output.completion,\n", + "                        self.fields[\"input\"].name: example.input.instruction,\n", + "                        self.fields[\"output\"].name: example_outputs[\n", + "                            0\n", + "                        ].output.completion,\n", "                    },\n", "                    example_id=example.id,\n", "                )\n", "            ]\n", "        )\n", - "    \n", + "\n", "    def from_record(self, argilla_evaluation: ArgillaEvaluation) -> InstructEvaluation:\n", - "        return InstructEvaluation(general_rating=argilla_evaluation.responses[\"general_rating\"], fluency=argilla_evaluation.responses[\"fluency\"])\n", + "        return InstructEvaluation(\n", + "            general_rating=argilla_evaluation.responses[\"general_rating\"],\n", + "            fluency=argilla_evaluation.responses[\"fluency\"],\n", + "        )\n", "\n", "\n", "argilla_client = DefaultArgillaClient()\n", @@ -431,9 +374,7 @@ "\n", "dataset_repository = FileDatasetRepository(REPOSITORY_ROOT_PATH)\n", "run_repository = FileRunRepository(REPOSITORY_ROOT_PATH)\n", - "evaluation_repository = AsyncFileEvaluationRepository(\n", - "    REPOSITORY_ROOT_PATH\n", - ")\n", + "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", "\n", "eval_logic = InstructArgillaEvaluationLogic(fields, questions)\n", "evaluator = ArgillaEvaluator(\n", @@ -443,7 +384,7 @@ "    \"instruct-evaluation\",\n", "    eval_logic,\n", "    argilla_client=argilla_client,\n", - "    workspace_id=workspace_id\n", + "    workspace_id=workspace_id,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After setting up the `ArgillaEvaluator`, the `evaluate_runs` methods posts the records to the Argilla instance." + "After setting up the `ArgillaEvaluator`, the `submit` method posts the records to the Argilla instance." 
] }, { "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation Overview ID = 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5\n", - "Start time = 2024-05-13 18:22:42.271727\n", - "Submitted Evaluations = 5\n", - "Description = \"instruct-evaluation\"\n", - "Run Overviews={\n", - "Run Overview ID = 5fece010-9d8f-4eb3-bf55-4abda031ed25\n", - "Dataset ID = 2f40e028-f4ea-4018-bdcb-24e62b38d057\n", - "Start time = 2024-05-13 16:16:21.485698+00:00\n", - "End time = 2024-05-13 16:16:34.460946+00:00\n", - "Failed example count = 0\n", - "Successful example count = 5\n", - "Description = \"instruct-run\"\n", - "}\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# either remember the id from before (run_overview.id) or retrieve as below\n", "run_id = [\n", @@ -497,9 +417,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While the evaluation says that 5 examples were successfully evaluated, no real evaluation has happened yet. \n", "If we try to perform an aggregation right now, it will have no evaluations, as none of the submitted records were evaluated by humans through Argilla yet. \n", - "The aggregation fetches only the results that were already evaluated.\n", + "The next steps fetches only results that have been evaluated already\n", "\n", "---\n", "\n", @@ -509,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -522,31 +441,53 @@ "metadata": {}, "source": [ "These splits can then be filered by, as shown below. \n", - "\"drawing\"" + "\"drawing\"\n", + "\n", + "To finish the evaluation, we can retrieve the evaluated examples as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", + "\n", + "# either remember the id from before (eval_overview.id) or retrieve as below\n", + "eval_id = [\n", + " overview.id\n", + " for overview in evaluation_repository.partial_evaluation_overviews()\n", + " if overview.description == \"instruct-evaluation\"\n", + "][0]\n", + "\n", + "evaluation_overview = evaluator.retrieve(eval_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "\n", + "Note that all examples that are not yet evaluated in argilla are noted as `failed_examples` and not passed to the next step.\n", "\n", "---\n", "\n", - "For the Aggregation, we first need to define our `AggregationLogic` that has to take an `ArgillaEvaluation` as an input. As output, we use the `InstructAggregatedEvaluation` we defined earlier." + "For the Aggregation, we first need to define our `AggregationLogic` that takes our previously defined types as input and output. Here, we use `InstructEvaluation` and `InstructAggregatedEvaluation`." 
] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class InstructArgillaAggregationLogic(\n", - " AggregationLogic[ArgillaEvaluation, InstructAggregatedEvaluation]\n", + " AggregationLogic[InstructEvaluation, InstructAggregatedEvaluation]\n", "):\n", " def aggregate(\n", " self,\n", - " evaluations: Iterable[ArgillaEvaluation],\n", + " evaluations: Iterable[InstructEvaluation],\n", " ) -> InstructAggregatedEvaluation:\n", " evaluations = list(evaluations)\n", "\n", @@ -558,13 +499,12 @@ " )\n", "\n", " general_rating = sum(\n", - " cast(float, evaluation.responses[\"general_rating\"])\n", - " for evaluation in evaluations\n", + " evaluation.general_rating for evaluation in evaluations\n", " ) / len(evaluations)\n", "\n", - " fluency = sum(\n", - " cast(float, evaluation.responses[\"fluency\"]) for evaluation in evaluations\n", - " ) / len(evaluations)\n", + " fluency = sum(evaluation.fluency for evaluation in evaluations) / len(\n", + " evaluations\n", + " )\n", "\n", " return InstructAggregatedEvaluation(\n", " general_rating=general_rating,\n", @@ -580,42 +520,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With this, we can define our `ArgillaAggregator` and retrieve the aggregation of all records that have been evaluated." + "With this, we can define our `Aggregator` and aggregate all evaluations. This step is the same as non-human evaluation." ] }, { "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Repository does not contain an evaluation with id: 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[54], line 11\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# either remember the id from before (eval_overview.id) or retrieve as below\u001b[39;00m\n\u001b[1;32m 5\u001b[0m eval_id \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 6\u001b[0m overview\u001b[38;5;241m.\u001b[39mid\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m overview \u001b[38;5;129;01min\u001b[39;00m evaluation_repository\u001b[38;5;241m.\u001b[39mpartial_evaluation_overviews()\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m overview\u001b[38;5;241m.\u001b[39mdescription \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstruct-evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ][\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 11\u001b[0m evaluation_overview \u001b[38;5;241m=\u001b[39m \u001b[43mevaluator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mretrieve\u001b[49m\u001b[43m(\u001b[49m\u001b[43meval_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m aggregator \u001b[38;5;241m=\u001b[39m Aggregator(\n\u001b[1;32m 14\u001b[0m evaluation_repository,\n\u001b[1;32m 15\u001b[0m aggregation_repository,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstruct-aggregation\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 17\u001b[0m aggregation_logic,\n\u001b[1;32m 18\u001b[0m )\n\u001b[1;32m 20\u001b[0m output \u001b[38;5;241m=\u001b[39m aggregator\u001b[38;5;241m.\u001b[39maggregate_evaluation(eval_id)\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py:190\u001b[0m, in 
\u001b[0;36mArgillaEvaluator.retrieve\u001b[0;34m(self, evaluation_id)\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m evaluation \u001b[38;5;129;01min\u001b[39;00m evaluations:\n\u001b[1;32m 188\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_evaluation_repository\u001b[38;5;241m.\u001b[39mstore_example_evaluation(evaluation)\n\u001b[1;32m 189\u001b[0m num_failed_evaluations \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\n\u001b[0;32m--> 190\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_evaluation_repository\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfailed_example_evaluations\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartial_evaluation_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluation_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 193\u001b[0m )\n\u001b[1;32m 194\u001b[0m num_not_yet_evaluated_evals \u001b[38;5;241m=\u001b[39m partial_overview\u001b[38;5;241m.\u001b[39msubmitted_evaluation_count \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mlen\u001b[39m(\n\u001b[1;32m 195\u001b[0m evaluations\n\u001b[1;32m 196\u001b[0m )\n\u001b[1;32m 198\u001b[0m overview \u001b[38;5;241m=\u001b[39m EvaluationOverview(\n\u001b[1;32m 199\u001b[0m run_overviews\u001b[38;5;241m=\u001b[39mpartial_overview\u001b[38;5;241m.\u001b[39mrun_overviews,\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39mpartial_evaluation_id,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;241m+\u001b[39m num_failed_evaluations,\n\u001b[1;32m 207\u001b[0m )\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/evaluation_repository.py:195\u001b[0m, in \u001b[0;36mEvaluationRepository.failed_example_evaluations\u001b[0;34m(self, evaluation_id, evaluation_type)\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfailed_example_evaluations\u001b[39m(\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28mself\u001b[39m, evaluation_id: \u001b[38;5;28mstr\u001b[39m, evaluation_type: \u001b[38;5;28mtype\u001b[39m[Evaluation]\n\u001b[1;32m 184\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Sequence[ExampleEvaluation[Evaluation]]:\n\u001b[1;32m 185\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all failed :class:`ExampleEvaluation`s for the given evaluation overview ID sorted by their example ID.\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \n\u001b[1;32m 187\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124;03m A :class:`Sequence` of failed :class:`ExampleEvaluation`s.\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 195\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_evaluations\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluation_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevaluation_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [r \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m results \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(r\u001b[38;5;241m.\u001b[39mresult, FailedExampleEvaluation)]\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/file_evaluation_repository.py:81\u001b[0m, in \u001b[0;36mFileSystemEvaluationRepository.example_evaluations\u001b[0;34m(self, evaluation_id, evaluation_type)\u001b[0m\n\u001b[1;32m 79\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eval_directory(evaluation_id)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists(path):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 82\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository does not contain an evaluation with id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevaluation_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 83\u001b[0m )\n\u001b[1;32m 85\u001b[0m example_evaluations: \u001b[38;5;28mlist\u001b[39m[ExampleEvaluation[Evaluation]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 86\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_names(path):\n", - "\u001b[0;31mValueError\u001b[0m: Repository does not contain an evaluation with id: 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "\n", - "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", - "\n", "aggregation_repository = FileAggregationRepository(REPOSITORY_ROOT_PATH)\n", - "# either remember the id from before (eval_overview.id) or retrieve as below\n", - "eval_id = [\n", - " overview.id\n", - " for overview in evaluation_repository.partial_evaluation_overviews()\n", - " if overview.description == \"instruct-evaluation\"\n", - "][0]\n", - "\n", - "evaluation_overview = evaluator.retrieve(eval_id)\n", "\n", "aggregator = Aggregator(\n", " evaluation_repository,\n", diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 19bcf81ff..991a67654 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -55,17 +55,33 @@ def to_record( example: Example[Input, ExpectedOutput], *output: SuccessfulExampleOutput[Output], ) -> RecordDataSequence: - """This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData` + """This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`. + + The specific format depends on the `fields`. Args: example: The example to be translated. output: The output of the example that was run. + + Returns: + A :class:`RecordDataSequence` that contains entries that should be evaluated in Argilla. """ ... @abstractmethod - def from_record(self, argilla_evaluation: ArgillaEvaluation) -> Evaluation: ... + def from_record(self, argilla_evaluation: ArgillaEvaluation) -> Evaluation: + """This method takes the specific Argilla evaluation format and converts into a compatible :class:`Evaluation`. + + The format of argilla_evaluation.responses depends on the `questions` attribute. 
+ Each `name` of a question will be a key in the `argilla_evaluation.responses` mapping. + + Args: + argilla_evaluation: Argilla-specific data for a single evaluation. + + Returns: + An :class:`Evaluation` that contains all evaluation-specific data. + """ class ArgillaEvaluator(AsyncEvaluator[Input, Output, ExpectedOutput, Evaluation]):
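# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the diff above: a concrete subclass
# showing how the to_record/from_record contract documented in these docstrings
# is typically filled in. It mirrors the notebook's InstructArgillaEvaluationLogic
# and assumes the notebook's field names ("input", "output"), question names
# ("general_rating", "fluency"), and its InstructEvaluation model.
from pydantic import BaseModel

from intelligence_layer.connectors import ArgillaEvaluation, RecordData
from intelligence_layer.core import CompleteOutput, InstructInput
from intelligence_layer.evaluation import (
    ArgillaEvaluationLogic,
    Example,
    RecordDataSequence,
    SuccessfulExampleOutput,
)


class InstructEvaluation(BaseModel):
    general_rating: float
    fluency: float


class InstructArgillaEvaluationLogic(
    ArgillaEvaluationLogic[InstructInput, CompleteOutput, None, InstructEvaluation]
):
    def to_record(
        self,
        example: Example[InstructInput, None],
        *example_outputs: SuccessfulExampleOutput[CompleteOutput],
    ) -> RecordDataSequence:
        # The content keys must match the names of the configured `fields`.
        return RecordDataSequence(
            records=[
                RecordData(
                    content={
                        self.fields["input"].name: example.input.instruction,
                        self.fields["output"].name: example_outputs[0].output.completion,
                    },
                    example_id=example.id,
                )
            ]
        )

    def from_record(self, argilla_evaluation: ArgillaEvaluation) -> InstructEvaluation:
        # Each `name` of the configured `questions` appears as a key in `responses`.
        return InstructEvaluation(
            general_rating=argilla_evaluation.responses["general_rating"],
            fluency=argilla_evaluation.responses["fluency"],
        )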