From 5475833f25ffff9e9e833067a61e598ca1b4dc7e Mon Sep 17 00:00:00 2001
From: Sebastian Niehus
Date: Wed, 3 Apr 2024 17:59:24 +0200
Subject: [PATCH] feat: Add FailedExampleIterator for retrieval of failed
 examples and adapt user_journey.ipynb

Task IL-367
---
 src/examples/user_journey.ipynb                |  4 +-
 .../use_cases/classify/classify.py             | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb
index 6516a1aac..cd07e5a22 100644
--- a/src/examples/user_journey.ipynb
+++ b/src/examples/user_journey.ipynb
@@ -386,9 +386,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This confirms it: some expected labels are missing. Let's try fixing this.\n",
-    "\n",
-    "We can do this two ways: Adjust our set of labels or adjust the eval set. In this case, we'll do the latter.\n"
+    "\n"
    ]
   },
   {
diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py
index 9f3522b8c..7e28e0271 100644
--- a/src/intelligence_layer/use_cases/classify/classify.py
+++ b/src/intelligence_layer/use_cases/classify/classify.py
@@ -7,10 +7,15 @@
 from intelligence_layer.core import TextChunk
 from intelligence_layer.evaluation import (
     AggregationLogic,
+    DatasetRepository,
+    EvaluationRepository,
     Example,
     MeanAccumulator,
+    RepositoryNavigator,
+    RunRepository,
     SingleOutputEvaluationLogic,
 )
+from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation
 
 Probability = NewType("Probability", float)
 
@@ -111,6 +116,11 @@ def aggregate(
             confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
             by_label[evaluation.predicted]["predicted"] += 1
             by_label[evaluation.expected]["expected"] += 1
+
+        if len(missing_labels) > 0:
+            warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`."
+            warnings.warn(warn_message, RuntimeWarning)
+
         return AggregatedSingleLabelClassifyEvaluation(
             percentage_correct=acc.extract(),
             confusion_matrix=confusion_matrix,
@@ -158,6 +168,40 @@ def do_evaluate_single_output(
         )
 
 
+class FailedExampleIterator:
+    def __init__(
+        self,
+        dataset_repository: DatasetRepository,
+        run_repository: RunRepository,
+        evaluation_repository: EvaluationRepository,
+    ):
+        self.repository_navigator = RepositoryNavigator(
+            dataset_repository, run_repository, evaluation_repository
+        )
+
+    # TODO: Add test
+    def get_examples(
+        self, evaluation_overview_id: str, first_n: int = 0
+    ) -> Iterable[Example[ClassifyInput, str]]:
+        evaluation_lineages = self.repository_navigator.evaluation_lineages(
+            evaluation_id=evaluation_overview_id,
+            input_type=ClassifyInput,
+            expected_output_type=str,
+            output_type=SingleLabelClassifyOutput,
+            evaluation_type=SingleLabelClassifyEvaluation,
+        )
+        count_yielded = 0
+        for lineage in evaluation_lineages:
+            if first_n != 0 and count_yielded >= first_n:
+                break
+            if (
+                isinstance(lineage.evaluation.result, FailedExampleEvaluation)
+                or not lineage.evaluation.result.correct
+            ):
+                count_yielded += 1
+                yield lineage.example
+
+
 class MultiLabelClassifyEvaluation(BaseModel):
     """The evaluation of a single multi-label classification example.
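
Editor's usage sketch (not part of the patch): one way the new FailedExampleIterator could be wired up after an evaluation run. The constructor and get_examples signature come from the diff above; the in-memory repository classes, the module path in the second import, the evaluation id value, and the Example field names are assumptions for illustration.

    # Hypothetical wiring; repository classes and module paths are assumptions.
    from intelligence_layer.evaluation import (
        InMemoryDatasetRepository,
        InMemoryEvaluationRepository,
        InMemoryRunRepository,
    )
    from intelligence_layer.use_cases.classify.classify import FailedExampleIterator

    failed_example_iterator = FailedExampleIterator(
        dataset_repository=InMemoryDatasetRepository(),
        run_repository=InMemoryRunRepository(),
        evaluation_repository=InMemoryEvaluationRepository(),
    )

    # "my-evaluation-id" is a placeholder for the id of a previously stored
    # evaluation overview. first_n=10 caps the number of yielded examples;
    # first_n=0 (the default) yields every failed or misclassified example.
    for example in failed_example_iterator.get_examples("my-evaluation-id", first_n=10):
        print(example.input, example.expected_output)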