
Commit

Implement reviewer comments
TASK:
- IL-347
FlorianSchepersAA committed Apr 3, 2024
1 parent 24ec873 commit a63e2a9
Showing 5 changed files with 80 additions and 70 deletions.
2 changes: 1 addition & 1 deletion src/examples/evaluation.ipynb
@@ -401,7 +401,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.7"
}
},
"nbformat": 4,
138 changes: 74 additions & 64 deletions src/examples/user_journey.ipynb
@@ -1,5 +1,37 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"from intelligence_layer.core import InMemoryTracer, LuminousControlModel, TextChunk\n",
"from intelligence_layer.evaluation import (\n",
" Aggregator,\n",
" Evaluator,\n",
" Example,\n",
" InMemoryAggregationRepository,\n",
" InMemoryDatasetRepository,\n",
" InMemoryEvaluationRepository,\n",
" InMemoryRunRepository,\n",
" Runner,\n",
")\n",
"from intelligence_layer.use_cases import (\n",
" ClassifyInput,\n",
" PromptBasedClassify,\n",
" SingleLabelClassifyAggregationLogic,\n",
" SingleLabelClassifyEvaluation,\n",
" SingleLabelClassifyEvaluationLogic,\n",
" SingleLabelClassifyOutput,\n",
")\n",
"import json\n",
"\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -49,9 +81,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Luckily, the Intelligence provides some classification tasks out of the box.\n",
"Luckily, the Intelligence Layer provides some classification tasks out of the box.\n",
"\n",
"Let's import it and run!\n"
"Let's run it!\n"
]
},
{
@@ -60,12 +92,6 @@
"metadata": {},
"outputs": [],
"source": [
"from intelligence_layer.core import TextChunk, InMemoryTracer\n",
"from intelligence_layer.use_cases import PromptBasedClassify, ClassifyInput\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"# instantiating the default task\n",
"prompt_based_classify = PromptBasedClassify()\n",
"\n",
@@ -112,9 +138,6 @@
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"\n",
"with open(\"data/classify_examples.json\", \"r\") as file:\n",
" labeled_examples: list[dict[str, str]] = json.load(file)\n",
"\n",
@@ -125,7 +148,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The Intelligence layer offers support to run task evaluations.\n",
"The Intelligence Layer offers support to run task evaluations.\n",
"\n",
"First, we have to create a dataset inside a repository.\n",
"There are different repositories (that persist datasets in different ways), but an `InMemoryDatasetRepository` will do for now.\n"
@@ -137,8 +160,6 @@
"metadata": {},
"outputs": [],
"source": [
"from intelligence_layer.evaluation import InMemoryDatasetRepository, Example\n",
"\n",
"dataset_repository = InMemoryDatasetRepository()\n",
"\n",
"dataset_id = dataset_repository.create_dataset(\n",
@@ -182,20 +203,6 @@
"metadata": {},
"outputs": [],
"source": [
"from intelligence_layer.evaluation import (\n",
" Evaluator,\n",
" InMemoryEvaluationRepository,\n",
" InMemoryRunRepository,\n",
" InMemoryAggregationRepository,\n",
" Runner,\n",
" Aggregator,\n",
")\n",
"from intelligence_layer.use_cases import (\n",
" SingleLabelClassifyEvaluationLogic,\n",
" SingleLabelClassifyAggregationLogic,\n",
")\n",
"\n",
"\n",
"# we need a few repositories to store runs, evals and aggregated evaluations\n",
"run_repository = InMemoryRunRepository()\n",
"evaluation_repository = InMemoryEvaluationRepository()\n",
@@ -259,7 +266,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, let's aggregate all individual evaluations to get seom eval statistics."
"Finally, let's aggregate all individual evaluations to get some eval statistics."
]
},
{
@@ -290,12 +297,6 @@
"metadata": {},
"outputs": [],
"source": [
"from intelligence_layer.use_cases import (\n",
" SingleLabelClassifyOutput,\n",
" SingleLabelClassifyEvaluation,\n",
")\n",
"\n",
"\n",
"def get_failed_examples(run_id: str, eval_id: str, dataset_id: str, first_n: int):\n",
" overview = [\n",
" {\n",
@@ -304,11 +305,11 @@
" \"result\": sorted(\n",
" list(\n",
" next(\n",
" e\n",
" for e in run_repository.example_outputs(\n",
" example\n",
" for example in run_repository.example_outputs(\n",
" run_id, SingleLabelClassifyOutput\n",
" )\n",
" if e.example_id == example.id\n",
" if example.example_id == example.id\n",
" ).output.scores.items()\n",
" ),\n",
" key=lambda i: i[1],\n",
@@ -324,7 +325,7 @@
" dataset_id=dataset_id, input_type=ClassifyInput, expected_output_type=str\n",
" )\n",
" ]\n",
" return [e for e in overview if not e[\"eval\"].correct][:first_n]\n",
" return [example for example in overview if not example[\"eval\"].correct][:first_n]\n",
"\n",
"\n",
"get_failed_examples(run_overview.id, eval_overview.id, dataset_id, 3)"
@@ -447,28 +448,7 @@
"source": [
"Cool, this already got us up to 62%!\n",
"\n",
"Notice, how we don't actually tell our classification task, what each class means; we only supply it with all the labels.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_failed_examples(\n",
" run_overview_prompt_adjusted.id,\n",
" eval_overview_prompt_adjusted.id,\n",
" cleaned_dataset_id,\n",
" 3,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So far, we have only used `luminous-base-control`. Let's upgrade to a bigger model!\n"
"So far, we only used the `luminous-base-control` model. Let's see if we can improve our classifications by upgrading to a bigger model!"
]
},
{
@@ -477,8 +457,6 @@
"metadata": {},
"outputs": [],
"source": [
"from intelligence_layer.core import LuminousControlModel\n",
"\n",
"classify_with_extended = PromptBasedClassify(\n",
" instruction=adjusted_prompt, model=LuminousControlModel(\"luminous-supreme-control\")\n",
")"
@@ -518,6 +496,38 @@
"source": [
"aggregation_overview_with_extended"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So using a bigger model slightly improved our results to 66.66%.\n",
"\n",
"Feel free to further play around and improve our classification example. \n",
"\n",
"Notice, for instance, that so far we do not tell our classification task what each class means. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"get_failed_examples(\n",
" run_overview_prompt_adjusted.id,\n",
" eval_overview_prompt_adjusted.id,\n",
" cleaned_dataset_id,\n",
" 3,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model has to 'guess' what we mean by each class purely from the given labels. In order to takle this issue you could use the `PromptBasedClassifyWithDefinitions` task to also provide a short description for each class."
]
}
],
"metadata": {
@@ -536,7 +546,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.7"
}
},
"nbformat": 4,
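The new closing cell of the notebook points to `PromptBasedClassifyWithDefinitions` without showing it in use. Below is a minimal sketch of how that might look; `LabelWithDefinition`, its import path, the constructor keyword, and the `scores` attribute on the output are assumptions inferred from this diff, not confirmed by it, so verify them against the library before relying on them.

```python
# Hedged sketch: LabelWithDefinition, its import location, the constructor keyword,
# and the output's `scores` attribute are assumptions, not confirmed by this commit.
from dotenv import load_dotenv
from intelligence_layer.core import InMemoryTracer, TextChunk
from intelligence_layer.use_cases import (
    ClassifyInput,
    LabelWithDefinition,  # assumed helper pairing a label name with a short definition
    PromptBasedClassifyWithDefinitions,
)

load_dotenv()

# One short description per class, so the model no longer has to guess what a label means.
labels = [
    LabelWithDefinition(name="Finance", definition="Invoices, payroll and budgeting questions."),
    LabelWithDefinition(name="Sales", definition="Pricing requests and new customer inquiries."),
]

# Constructor keyword is an assumption; the diff only shows the internal
# `self._labels_with_definitions` attribute.
classify_with_definitions = PromptBasedClassifyWithDefinitions(labels_with_definitions=labels)

output = classify_with_definitions.run(
    ClassifyInput(
        chunk=TextChunk("Hello, could you send me a quote for 100 licenses?"),
        labels=frozenset(label.name for label in labels),
    ),
    InMemoryTracer(),
)
# Mirrors how the notebook inspects SingleLabelClassifyOutput.scores.
print(sorted(output.scores.items(), key=lambda item: item[1], reverse=True))
```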
6 changes: 3 additions & 3 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -93,9 +93,9 @@ class AggregatedSingleLabelClassifyEvaluation(BaseModel):
Attributes:
percentage_correct: Percentage of answers that were considered to be correct.
confusion_matrix: How often each label was confused with each other.
by_label: Each label along with a couple aggregated statistics.
missing_labels: Each label missing from the results accompanied by the missing count.
confusion_matrix: A matrix showing the predicted classifications vs the expected classifications.
by_label: Each label alongside counts of how often it was expected or predicted.
missing_labels: Each expected label that is missing from the task input's set of possible labels, along with its number of occurrences.
"""

percentage_correct: float
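To make the reworded attribute descriptions above concrete, here is a small, library-independent illustration of the same aggregate statistics computed by hand over toy (expected, predicted) pairs. It is not the Intelligence Layer implementation, only a sketch of what each field describes.

```python
from collections import Counter

# Toy evaluation results: (expected label, predicted label) per example.
pairs = [("Finance", "Finance"), ("Sales", "Finance"), ("Sales", "Sales"), ("Legal", "Finance")]
# Labels the task input actually offered; "Legal" is deliberately missing.
possible_labels = {"Finance", "Sales"}

# Percentage of answers that were considered to be correct.
percentage_correct = sum(expected == predicted for expected, predicted in pairs) / len(pairs)

# Predicted classifications vs. the expected classifications.
confusion_matrix = Counter((predicted, expected) for expected, predicted in pairs)

# Each label alongside counts of how often it was expected or predicted.
all_labels = {name for pair in pairs for name in pair}
by_label = {
    label: {
        "expected_count": sum(expected == label for expected, _ in pairs),
        "predicted_count": sum(predicted == label for _, predicted in pairs),
    }
    for label in all_labels
}

# Expected labels missing from the task input's possible labels, with their counts.
missing_labels = Counter(expected for expected, _ in pairs if expected not in possible_labels)

print(percentage_correct, dict(confusion_matrix), by_label, dict(missing_labels))
```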
Original file line number Diff line number Diff line change
@@ -71,7 +71,7 @@ def format_input(text: str, labels: frozenset[str]) -> str:
if label.name in labels
)
return f"""Labels:
{', '.join(l.name for l in self._labels_with_definitions)}
{', '.join(label.name for label in self._labels_with_definitions if label.name in labels)}
Definitions:
{definitions}
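The one-line change above restricts the joined label names to the labels actually requested in the classify input. A self-contained toy illustration of the before/after behaviour follows; the `Label` dataclass is a stand-in, not the library's own type.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Label:
    name: str
    definition: str


# Stand-in for the task's configured labels with definitions.
labels_with_definitions = [
    Label("Finance", "Invoices and payroll."),
    Label("Sales", "Pricing and quotes."),
    Label("Legal", "Contracts and compliance."),
]
# Labels present in the classify input.
requested = frozenset({"Finance", "Sales"})

# Before the fix: every configured label leaked into the prompt.
print(", ".join(label.name for label in labels_with_definitions))
# After the fix: only labels requested in the input are listed.
print(", ".join(label.name for label in labels_with_definitions if label.name in requested))
```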
2 changes: 1 addition & 1 deletion tests/evaluation/test_repository_navigator.py
@@ -392,7 +392,7 @@ def test_evaluation_lineages_to_pandas(
for lineage in lineages:
for output in lineage.outputs:
row = df.loc[
lineage.example.id, lineage.evaluation.evaluation_id, output.run_id # type: ignore
lineage.example.id, lineage.evaluation.evaluation_id, output.run_id
]
assert lineage.example.input == row.input
assert lineage.example.expected_output == row.expected_output
