diff --git a/src/documentation/human_evaluation.ipynb b/src/documentation/human_evaluation.ipynb index 142d21965..992e98fed 100644 --- a/src/documentation/human_evaluation.ipynb +++ b/src/documentation/human_evaluation.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import shutil\n", "from pathlib import Path\n", - "from typing import Iterable, cast\n", + "from typing import Iterable\n", "\n", "from datasets import load_dataset\n", "from dotenv import load_dotenv\n", @@ -21,7 +21,6 @@ " LimitedConcurrencyClient,\n", " Question,\n", " RecordData,\n", - " ArgillaEvaluation,\n", ")\n", "from intelligence_layer.core import (\n", " CompleteOutput,\n", @@ -31,8 +30,10 @@ ")\n", "from intelligence_layer.evaluation import (\n", " AggregationLogic,\n", + " Aggregator,\n", " ArgillaEvaluationLogic,\n", " ArgillaEvaluator,\n", + " AsyncFileEvaluationRepository,\n", " Example,\n", " FileAggregationRepository,\n", " FileDatasetRepository,\n", @@ -40,8 +41,6 @@ " RecordDataSequence,\n", " Runner,\n", " SuccessfulExampleOutput,\n", - " AsyncFileEvaluationRepository,\n", - " Aggregator,\n", ")\n", "\n", "load_dotenv()\n", @@ -112,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -128,21 +127,9 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset({\n", - " features: ['prompt', 'completion', 'meta'],\n", - " num_rows: 327\n", - "})\n", - "dict_keys(['id', 'motivation_app', 'prompt', 'input', 'completion', 'source', 'category', 'subcategory'])\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(dataset)\n", "print(dataset[\"meta\"][0].keys())" @@ -157,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -177,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -199,20 +186,9 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'human-evaluation-dataset'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dataset_id.name" ] @@ -230,17 +206,9 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Running: 5it [00:05, 1.05s/it]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model = LuminousControlModel(name=\"luminous-base-control\", client=client)\n", "task = Instruct(model=model)\n", @@ -271,19 +239,26 @@ "metadata": {}, "source": [ "At the end of our evaluation we want a float score $s \\in [1,5]$ describing the model performance.\n", - "We define this as an `InstructAggregatedEvaluation`, which will be used in our aggregation later." + "We define this as an `InstructAggregatedEvaluation`, which will be used in our aggregation later.\n", + "\n", + "We also define the `InstructEvaluation`, which represents an evaluation of a single entry, which we will aggregate later." 
] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class InstructAggregatedEvaluation(BaseModel):\n", " general_rating: float | None\n", " fluency: float | None\n", - " evaluated_examples: int" + " evaluated_examples: int\n", + "\n", + "\n", + "class InstructEvaluation(BaseModel):\n", + " general_rating: float\n", + " fluency: float" ] }, { @@ -291,13 +266,14 @@ "metadata": {}, "source": [ "We can now start to define our human evaluation. This is done with `Questions` and `Fields`. \n", - "`Fields` define what a user has to evaluate. In our example, this will be the model input (Instruction) and output (Model Completion). Note that the field names have to match the content keys from the `RecordData` which we will define later in our `InstructArgillaEvaluationLogic`. \n", - "`Questions` are what a user has to answer in order to evaluate the `Fields`. The `name` property will later be used to access the human ratings in the aggregation step. In our case we ask how complete and how fluent the completions are." + "`Fields` define what a user has to evaluate. In our example, this will be the model input (Instruction) and output (Model Completion). \n", + "`Questions` are what a user has to answer in order to evaluate the `Fields`. The `name` property will later be used to access the human ratings. \n", + "Both of these are passed to the `ArgillaEvaluationLogic` to create `RecordData` to convert data back and forth from Argilla. " ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -329,74 +305,36 @@ "Our defined fields and questions will look like this:\n", "![Argilla Interface](../../assets/argilla_interface.png)\n", "\n", - "We can now define our `InstructArgillaEvaluationLogic` and `InstructArgillaAggregationLogic`.\n", - "They have to implement the two abstract methods `_to_record` and `aggregate` respectively.\n", + "We can now define our `InstructArgillaEvaluationLogic` to translate our data to specific Argilla formats .\n", + "The logic has to implement the two abstract methods `to_record` and `from_record`.\n", "Lets look at the documentation:" ] }, { "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on function to_record in module intelligence_layer.evaluation.evaluation.evaluator.argilla_evaluator:\n", - "\n", - "to_record(self, example: intelligence_layer.evaluation.dataset.domain.Example, *output: intelligence_layer.evaluation.run.domain.SuccessfulExampleOutput) -> intelligence_layer.evaluation.evaluation.evaluator.argilla_evaluator.RecordDataSequence\n", - " This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`\n", - " \n", - " \n", - " Args:\n", - " example: The example to be translated.\n", - " output: The output of the example that was run.\n", - "\n", - "----------------------------------------------------------------------------------------------------\n", - "Help on function aggregate in module intelligence_layer.evaluation.aggregation.aggregator:\n", - "\n", - "aggregate(self, evaluations: Iterable[+Evaluation]) -> +AggregatedEvaluation\n", - " `Evaluator`-specific method for aggregating individual `Evaluations` into report-like `Aggregated Evaluation`.\n", - " \n", - " This method is responsible for taking the results of an evaluation run and aggregating all the results.\n", - " It 
should create an `AggregatedEvaluation` class and return it at the end.\n", - "    \n", - "    Args:\n", - "        evaluations: The results from running `eval_and_aggregate_runs` with a :class:`Task`.\n", - "    \n", - "    Returns:\n", - "        The aggregated results of an evaluation run with a :class:`Dataset`.\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "help(ArgillaEvaluationLogic.to_record)\n", "print(\"-\" * 100)\n", - "help(AggregationLogic.aggregate)" + "help(ArgillaEvaluationLogic.from_record)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of performing the evaluation, the `ArgillaEvaluationLogic` is responsible for converting the evaluation data to a format that is accepted by Argilla. During the evaluation, these records will simply be submitted to Argilla. \n", + "Instead of performing the evaluation, the `ArgillaEvaluationLogic` is responsible for converting the evaluation data to a format that is accepted by Argilla. During the evaluation, these records will simply be submitted to Argilla and retrieved later.\n", "We will now create everything we need to submit these evaluations to our Argilla instance." ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "\n", - "\n", - "class InstructEvaluation(BaseModel):\n", - "    general_rating: float\n", - "    fluency: float\n", - "\n", "class InstructArgillaEvaluationLogic(\n", "    ArgillaEvaluationLogic[\n", "        InstructInput,\n", "        CompleteOutput,\n", "        None,\n", "        InstructEvaluation,\n", "    ]\n", "):\n", "    def to_record(\n", "        self,\n", "        example: Example[InstructInput, None],\n", "        *example_outputs: SuccessfulExampleOutput[CompleteOutput],\n", "    ) -> RecordDataSequence:\n", "        return RecordDataSequence(\n", "            records=[\n", "                RecordData(\n", "                    content={\n", - "                        \"input\": example.input.instruction,\n", - "                        \"output\": example_outputs[0].output.completion,\n", + "                        self.fields[\"input\"].name: example.input.instruction,\n", + "                        self.fields[\"output\"].name: example_outputs[\n", + "                            0\n", + "                        ].output.completion,\n", "                    },\n", "                    example_id=example.id,\n", "                )\n", "            ]\n", "        )\n", - "    \n", + "\n", "    def from_record(self, argilla_evaluation: ArgillaEvaluation) -> InstructEvaluation:\n", - "        return InstructEvaluation(general_rating=argilla_evaluation.responses[\"general_rating\"], fluency=argilla_evaluation.responses[\"fluency\"])\n", + "        return InstructEvaluation(\n", + "            general_rating=argilla_evaluation.responses[\"general_rating\"],\n", + "            fluency=argilla_evaluation.responses[\"fluency\"],\n", + "        )\n", "\n", "\n", "argilla_client = DefaultArgillaClient()\n", @@ -431,9 +374,7 @@ "\n", "dataset_repository = FileDatasetRepository(REPOSITORY_ROOT_PATH)\n", "run_repository = FileRunRepository(REPOSITORY_ROOT_PATH)\n", - "evaluation_repository = AsyncFileEvaluationRepository(\n", - "    REPOSITORY_ROOT_PATH\n", - ")\n", + "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", "\n", "eval_logic = InstructArgillaEvaluationLogic(fields, questions)\n", "evaluator = ArgillaEvaluator(\n", @@ -443,7 +384,7 @@ "    \"instruct-evaluation\",\n", "    eval_logic,\n", "    argilla_client=argilla_client,\n", - "    workspace_id=workspace_id\n", + "    workspace_id=workspace_id,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After setting up the `ArgillaEvaluator`, the `evaluate_runs` methods posts the records to the Argilla instance." + "After setting up the `ArgillaEvaluator`, the `submit` method posts the records to the Argilla instance." 
] }, { "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation Overview ID = 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5\n", - "Start time = 2024-05-13 18:22:42.271727\n", - "Submitted Evaluations = 5\n", - "Description = \"instruct-evaluation\"\n", - "Run Overviews={\n", - "Run Overview ID = 5fece010-9d8f-4eb3-bf55-4abda031ed25\n", - "Dataset ID = 2f40e028-f4ea-4018-bdcb-24e62b38d057\n", - "Start time = 2024-05-13 16:16:21.485698+00:00\n", - "End time = 2024-05-13 16:16:34.460946+00:00\n", - "Failed example count = 0\n", - "Successful example count = 5\n", - "Description = \"instruct-run\"\n", - "}\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# either remember the id from before (run_overview.id) or retrieve as below\n", "run_id = [\n", @@ -497,9 +417,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "While the evaluation says that 5 examples were successfully evaluated, no real evaluation has happened yet. \n", "If we try to perform an aggregation right now, it will have no evaluations, as none of the submitted records were evaluated by humans through Argilla yet. \n", - "The aggregation fetches only the results that were already evaluated.\n", + "The next steps fetches only results that have been evaluated already\n", "\n", "---\n", "\n", @@ -509,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -522,31 +441,53 @@ "metadata": {}, "source": [ "These splits can then be filered by, as shown below. \n", - "\"drawing\"" + "\"drawing\"\n", + "\n", + "To finish the evaluation, we can retrieve the evaluated examples as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", + "\n", + "# either remember the id from before (eval_overview.id) or retrieve as below\n", + "eval_id = [\n", + " overview.id\n", + " for overview in evaluation_repository.partial_evaluation_overviews()\n", + " if overview.description == \"instruct-evaluation\"\n", + "][0]\n", + "\n", + "evaluation_overview = evaluator.retrieve(eval_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "\n", + "Note that all examples that are not yet evaluated in argilla are noted as `failed_examples` and not passed to the next step.\n", "\n", "---\n", "\n", - "For the Aggregation, we first need to define our `AggregationLogic` that has to take an `ArgillaEvaluation` as an input. As output, we use the `InstructAggregatedEvaluation` we defined earlier." + "For the Aggregation, we first need to define our `AggregationLogic` that takes our previously defined types as input and output. Here, we use `InstructEvaluation` and `InstructAggregatedEvaluation`." 
] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class InstructArgillaAggregationLogic(\n", - " AggregationLogic[ArgillaEvaluation, InstructAggregatedEvaluation]\n", + " AggregationLogic[InstructEvaluation, InstructAggregatedEvaluation]\n", "):\n", " def aggregate(\n", " self,\n", - " evaluations: Iterable[ArgillaEvaluation],\n", + " evaluations: Iterable[InstructEvaluation],\n", " ) -> InstructAggregatedEvaluation:\n", " evaluations = list(evaluations)\n", "\n", @@ -558,13 +499,12 @@ " )\n", "\n", " general_rating = sum(\n", - " cast(float, evaluation.responses[\"general_rating\"])\n", - " for evaluation in evaluations\n", + " evaluation.general_rating for evaluation in evaluations\n", " ) / len(evaluations)\n", "\n", - " fluency = sum(\n", - " cast(float, evaluation.responses[\"fluency\"]) for evaluation in evaluations\n", - " ) / len(evaluations)\n", + " fluency = sum(evaluation.fluency for evaluation in evaluations) / len(\n", + " evaluations\n", + " )\n", "\n", " return InstructAggregatedEvaluation(\n", " general_rating=general_rating,\n", @@ -580,42 +520,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With this, we can define our `ArgillaAggregator` and retrieve the aggregation of all records that have been evaluated." + "With this, we can define our `Aggregator` and aggregate all evaluations. This step is the same as non-human evaluation." ] }, { "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Repository does not contain an evaluation with id: 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[54], line 11\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# either remember the id from before (eval_overview.id) or retrieve as below\u001b[39;00m\n\u001b[1;32m 5\u001b[0m eval_id \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 6\u001b[0m overview\u001b[38;5;241m.\u001b[39mid\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m overview \u001b[38;5;129;01min\u001b[39;00m evaluation_repository\u001b[38;5;241m.\u001b[39mpartial_evaluation_overviews()\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m overview\u001b[38;5;241m.\u001b[39mdescription \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstruct-evaluation\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m ][\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 11\u001b[0m evaluation_overview \u001b[38;5;241m=\u001b[39m \u001b[43mevaluator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mretrieve\u001b[49m\u001b[43m(\u001b[49m\u001b[43meval_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m aggregator \u001b[38;5;241m=\u001b[39m Aggregator(\n\u001b[1;32m 14\u001b[0m evaluation_repository,\n\u001b[1;32m 15\u001b[0m aggregation_repository,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstruct-aggregation\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 17\u001b[0m aggregation_logic,\n\u001b[1;32m 18\u001b[0m )\n\u001b[1;32m 20\u001b[0m output \u001b[38;5;241m=\u001b[39m aggregator\u001b[38;5;241m.\u001b[39maggregate_evaluation(eval_id)\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py:190\u001b[0m, in 
\u001b[0;36mArgillaEvaluator.retrieve\u001b[0;34m(self, evaluation_id)\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m evaluation \u001b[38;5;129;01min\u001b[39;00m evaluations:\n\u001b[1;32m 188\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_evaluation_repository\u001b[38;5;241m.\u001b[39mstore_example_evaluation(evaluation)\n\u001b[1;32m 189\u001b[0m num_failed_evaluations \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\n\u001b[0;32m--> 190\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_evaluation_repository\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfailed_example_evaluations\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartial_evaluation_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluation_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 193\u001b[0m )\n\u001b[1;32m 194\u001b[0m num_not_yet_evaluated_evals \u001b[38;5;241m=\u001b[39m partial_overview\u001b[38;5;241m.\u001b[39msubmitted_evaluation_count \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mlen\u001b[39m(\n\u001b[1;32m 195\u001b[0m evaluations\n\u001b[1;32m 196\u001b[0m )\n\u001b[1;32m 198\u001b[0m overview \u001b[38;5;241m=\u001b[39m EvaluationOverview(\n\u001b[1;32m 199\u001b[0m run_overviews\u001b[38;5;241m=\u001b[39mpartial_overview\u001b[38;5;241m.\u001b[39mrun_overviews,\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39mpartial_evaluation_id,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;241m+\u001b[39m num_failed_evaluations,\n\u001b[1;32m 207\u001b[0m )\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/evaluation_repository.py:195\u001b[0m, in \u001b[0;36mEvaluationRepository.failed_example_evaluations\u001b[0;34m(self, evaluation_id, evaluation_type)\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfailed_example_evaluations\u001b[39m(\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28mself\u001b[39m, evaluation_id: \u001b[38;5;28mstr\u001b[39m, evaluation_type: \u001b[38;5;28mtype\u001b[39m[Evaluation]\n\u001b[1;32m 184\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Sequence[ExampleEvaluation[Evaluation]]:\n\u001b[1;32m 185\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all failed :class:`ExampleEvaluation`s for the given evaluation overview ID sorted by their example ID.\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \n\u001b[1;32m 187\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124;03m A :class:`Sequence` of failed :class:`ExampleEvaluation`s.\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 195\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_evaluations\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluation_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevaluation_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [r \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m results \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(r\u001b[38;5;241m.\u001b[39mresult, FailedExampleEvaluation)]\n", - "File \u001b[0;32m~/intelligence-layer/src/intelligence_layer/evaluation/evaluation/file_evaluation_repository.py:81\u001b[0m, in \u001b[0;36mFileSystemEvaluationRepository.example_evaluations\u001b[0;34m(self, evaluation_id, evaluation_type)\u001b[0m\n\u001b[1;32m 79\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_eval_directory(evaluation_id)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexists(path):\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 82\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository does not contain an evaluation with id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevaluation_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 83\u001b[0m )\n\u001b[1;32m 85\u001b[0m example_evaluations: \u001b[38;5;28mlist\u001b[39m[ExampleEvaluation[Evaluation]] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 86\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file_name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_names(path):\n", - "\u001b[0;31mValueError\u001b[0m: Repository does not contain an evaluation with id: 41d5a7ac-c7f1-4e22-9ef3-538bf98ecbc5" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "\n", - "evaluation_repository = AsyncFileEvaluationRepository(REPOSITORY_ROOT_PATH)\n", - "\n", "aggregation_repository = FileAggregationRepository(REPOSITORY_ROOT_PATH)\n", - "# either remember the id from before (eval_overview.id) or retrieve as below\n", - "eval_id = [\n", - " overview.id\n", - " for overview in evaluation_repository.partial_evaluation_overviews()\n", - " if overview.description == \"instruct-evaluation\"\n", - "][0]\n", - "\n", - "evaluation_overview = evaluator.retrieve(eval_id)\n", "\n", "aggregator = Aggregator(\n", " evaluation_repository,\n", diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py index 19bcf81ff..991a67654 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py @@ -55,17 +55,33 @@ def to_record( example: Example[Input, ExpectedOutput], *output: SuccessfulExampleOutput[Output], ) -> RecordDataSequence: - """This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData` + """This method is responsible for translating the `Example` and `Output` of the task to :class:`RecordData`. + + The specific format depends on the `fields`. Args: example: The example to be translated. output: The output of the example that was run. + + Returns: + A :class:`RecordDataSequence` that contains entries that should be evaluated in Argilla. """ ... @abstractmethod - def from_record(self, argilla_evaluation: ArgillaEvaluation) -> Evaluation: ... + def from_record(self, argilla_evaluation: ArgillaEvaluation) -> Evaluation: + """This method takes the specific Argilla evaluation format and converts into a compatible :class:`Evaluation`. + + The format of argilla_evaluation.responses depends on the `questions` attribute. 
+ Each `name` of a question will be a key in the `argilla_evaluation.responses` mapping. + + Args: + argilla_evaluation: Argilla-specific data for a single evaluation. + + Returns: + An :class:`Evaluation` that contains all evaluation-specific data. + """ class ArgillaEvaluator(AsyncEvaluator[Input, Output, ExpectedOutput, Evaluation]):
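# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the diff above: a concrete subclass
# showing how the to_record/from_record contract documented in these docstrings
# is typically filled in. It mirrors the notebook's InstructArgillaEvaluationLogic
# and assumes the notebook's field names ("input", "output"), question names
# ("general_rating", "fluency"), and its InstructEvaluation model.
from pydantic import BaseModel

from intelligence_layer.connectors import ArgillaEvaluation, RecordData
from intelligence_layer.core import CompleteOutput, InstructInput
from intelligence_layer.evaluation import (
    ArgillaEvaluationLogic,
    Example,
    RecordDataSequence,
    SuccessfulExampleOutput,
)


class InstructEvaluation(BaseModel):
    general_rating: float
    fluency: float


class InstructArgillaEvaluationLogic(
    ArgillaEvaluationLogic[InstructInput, CompleteOutput, None, InstructEvaluation]
):
    def to_record(
        self,
        example: Example[InstructInput, None],
        *example_outputs: SuccessfulExampleOutput[CompleteOutput],
    ) -> RecordDataSequence:
        # The content keys must match the names of the configured `fields`.
        return RecordDataSequence(
            records=[
                RecordData(
                    content={
                        self.fields["input"].name: example.input.instruction,
                        self.fields["output"].name: example_outputs[0].output.completion,
                    },
                    example_id=example.id,
                )
            ]
        )

    def from_record(self, argilla_evaluation: ArgillaEvaluation) -> InstructEvaluation:
        # Each `name` of the configured `questions` appears as a key in `responses`.
        return InstructEvaluation(
            general_rating=argilla_evaluation.responses["general_rating"],
            fluency=argilla_evaluation.responses["fluency"],
        )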