feat: Add property for sorted scores of SingleLabelClassifyOutput (#699)
Co-authored-by: Sebastian Niehus <[email protected]>
SebastianNiehusAA and SebastianNiehusTNG authored Apr 4, 2024
1 parent 053725a commit 9d9b26a
Showing 3 changed files with 13 additions and 15 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -5,7 +5,7 @@
### Breaking Changes

### New Features
- feature: Add SingleLabelClassifyFailedExampleIterator for easy retrieval of failed examples.
- feature: Add sorted_scores property to `SingleLabelClassifyOutput`.
- feature: Error information is printed to the console on failed runs and evaluations.
- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object
- feature: The `Runner.run_dataset(..)` and `Evaluator.evaluate_run(..)` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
17 changes: 7 additions & 10 deletions src/examples/user_journey.ipynb
@@ -19,6 +19,7 @@
" Runner,\n",
" evaluation_lineages_to_pandas,\n",
")\n",
"from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation\n",
"from intelligence_layer.use_cases import (\n",
" ClassifyInput,\n",
" PromptBasedClassify,\n",
@@ -27,7 +28,6 @@
")\n",
"import json\n",
"\n",
"\n",
"load_dotenv()"
]
},
@@ -110,7 +110,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Hmm, we have some results, but they aren't really legible (yet)."
"Hmm, we have some results, but they aren't really legible (yet).\n",
"So let's look at the sorted individual results for more clarity: "
]
},
{
@@ -119,15 +120,16 @@
"metadata": {},
"outputs": [],
"source": [
"[sorted(list(o.scores.items()), key=lambda i: i[1], reverse=True)[0] for o in outputs]"
"[o.sorted_scores for o in outputs]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It appears that the Finance Department can fix my laptop and the Comms people can reward free credits...\n",
"We probably have to do some finetuning of our classification approach.\n",
"For the first example 'Communications' gets the highest score, while for the second example the 'Communications' is the clear winner.\n",
"This suggests that the Finance Department can fix my laptop and the Comms people can reward free credits ... Not very likely.\n",
"We probably have to do some fine-tuning of our classification approach.\n",
"\n",
"However, let's first make sure that this evidence is not anecdotal.\n",
"For this, we need to do some eval. Luckily, we have by now got access to a few more examples...\n"
@@ -313,11 +315,6 @@
"metadata": {},
"outputs": [],
"source": [
"# from intelligence_layer.evaluation import evaluation_lineages_to_pandas\n",
"\n",
"\n",
"from intelligence_layer.evaluation import FailedExampleEvaluation\n",
"\n",
"passed_lineages = [\n",
" lineage\n",
" for lineage in evaluator.evaluation_lineages(eval_overview.id)\n",
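For context on the notebook change above: the old cell sorted each output's scores inline and kept only the top (label, probability) pair, whereas the new cell uses the `sorted_scores` property and lists the full ranking per output. A minimal sketch of the difference, using a made-up scores dict rather than real `PromptBasedClassify` output:

```python
# Made-up scores for one output; real values come from PromptBasedClassify.
scores = {"Communications": 0.25, "Finance": 0.65, "IT": 0.10}

# Old notebook cell: sort inline and keep only the single top pair.
top_only = sorted(list(scores.items()), key=lambda i: i[1], reverse=True)[0]
# ('Finance', 0.65)

# New notebook cell (via o.sorted_scores): the full ranking, highest first.
full_ranking = sorted(scores.items(), key=lambda item: item[1], reverse=True)
# [('Finance', 0.65), ('Communications', 0.25), ('IT', 0.10)]
```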
9 changes: 5 additions & 4 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -39,6 +39,10 @@ class SingleLabelClassifyOutput(BaseModel):

scores: Mapping[str, Probability]

@property
def sorted_scores(self) -> list[tuple[str, Probability]]:
return sorted(self.scores.items(), key=lambda item: item[1], reverse=True)


class MultiLabelClassifyOutput(BaseModel):
"""Output for a multi label classification task.
@@ -143,14 +147,11 @@ def do_evaluate_single_output(
example: Example[ClassifyInput, str],
output: SingleLabelClassifyOutput,
) -> SingleLabelClassifyEvaluation:
sorted_classes = sorted(
output.scores.items(), key=lambda item: item[1], reverse=True
)
if example.expected_output not in example.input.labels:
warn_message = f"[WARNING] Example with ID '{example.id}' has expected label '{example.expected_output}', which is not part of the example's input labels."
warnings.warn(warn_message, RuntimeWarning)

predicted = sorted_classes[0][0]
predicted = output.sorted_scores[0][0]
if predicted == example.expected_output:
correct = True
else:
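Taken together, the diff moves the score sorting into the output model itself and reuses it in `do_evaluate_single_output`. A minimal, self-contained sketch of how the new property behaves; `Probability` is simplified to a plain `float` here, and the example scores are made up:

```python
from typing import Mapping

from pydantic import BaseModel

# Simplified stand-in for the library's Probability type.
Probability = float


class SingleLabelClassifyOutput(BaseModel):
    scores: Mapping[str, Probability]

    @property
    def sorted_scores(self) -> list[tuple[str, Probability]]:
        # Labels ordered from most to least probable, as added in this commit.
        return sorted(self.scores.items(), key=lambda item: item[1], reverse=True)


# Made-up output; real ones are produced by PromptBasedClassify.
output = SingleLabelClassifyOutput(
    scores={"IT": 0.20, "Finance": 0.70, "Communications": 0.10}
)

print(output.sorted_scores)        # [('Finance', 0.7), ('IT', 0.2), ('Communications', 0.1)]
print(output.sorted_scores[0][0])  # 'Finance' -- how do_evaluate_single_output picks the predicted label
```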
