feat: Add FailedExampleIterator for retrieval of failed examples and adapt user_journey.ipynb

Task IL-367
SebastianNiehusTNG authored and JohannesWesch committed Apr 4, 2024
1 parent b51d98f commit 5475833
Showing 2 changed files with 45 additions and 3 deletions.
4 changes: 1 addition & 3 deletions src/examples/user_journey.ipynb
@@ -386,9 +386,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This confirms it: some expected labels are missing. Let's try fixing this.\n",
"\n",
"We can do this two ways: Adjust our set of labels or adjust the eval set. In this case, we'll do the latter.\n"
"\n"
]
},
{
44 changes: 44 additions & 0 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -7,10 +7,15 @@
from intelligence_layer.core import TextChunk
from intelligence_layer.evaluation import (
AggregationLogic,
DatasetRepository,
EvaluationRepository,
Example,
MeanAccumulator,
RepositoryNavigator,
RunRepository,
SingleOutputEvaluationLogic,
)
from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation

Probability = NewType("Probability", float)

@@ -111,6 +116,11 @@ def aggregate(
confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1
by_label[evaluation.predicted]["predicted"] += 1
by_label[evaluation.expected]["expected"] += 1

if len(missing_labels) > 0:
warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`."
warnings.warn(warn_message, RuntimeWarning)

return AggregatedSingleLabelClassifyEvaluation(
percentage_correct=acc.extract(),
confusion_matrix=confusion_matrix,
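
The new warning points the caller at the aggregated statistics for the full list of missing labels. Below is a minimal sketch of how a caller might surface that field; the `aggregator` argument and its `aggregate_evaluation` call are assumptions about the library's aggregation API and are not part of this diff.

import warnings


def aggregate_and_report_missing_labels(aggregator, evaluation_overview_id: str):
    # Sketch only: `aggregator` is assumed to be an already-configured
    # intelligence_layer Aggregator; the `statistics.missing_labels` field
    # follows the warning text introduced in this commit.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        overview = aggregator.aggregate_evaluation(evaluation_overview_id)

    if any(issubclass(w.category, RuntimeWarning) for w in caught):
        # Detailed list of expected labels never seen in the evaluation inputs.
        print(overview.statistics.missing_labels)
    return overview
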
@@ -158,6 +168,40 @@ def do_evaluate_single_output(
)


class FailedExampleIterator:
def __init__(
self,
dataset_repository: DatasetRepository,
run_repository: RunRepository,
evaluation_repository: EvaluationRepository,
):
self.repository_navigator = RepositoryNavigator(
dataset_repository, run_repository, evaluation_repository
)

# TODO: Add test
def get_examples(
self, evaluation_overview_id: str, first_n: int = 0
) -> Iterable[Example[ClassifyInput, str]]:
evaluation_lineages = self.repository_navigator.evaluation_lineages(
evaluation_id=evaluation_overview_id,
input_type=ClassifyInput,
expected_output_type=str,
output_type=SingleLabelClassifyOutput,
evaluation_type=SingleLabelClassifyEvaluation,
)
count_yielded = 0
for lineage in evaluation_lineages:
if first_n != 0 and count_yielded >= first_n:
break
if (
isinstance(lineage.evaluation.result, FailedExampleEvaluation)
or not lineage.evaluation.result.correct
):
count_yielded += 1
yield lineage.example


class MultiLabelClassifyEvaluation(BaseModel):
"""The evaluation of a single multi-label classification example.

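For reference, a minimal usage sketch of the new FailedExampleIterator. The in-memory repository classes, the import path (derived from the file location in this diff), and the evaluation overview id are illustrative assumptions, not part of this commit; in practice the repositories would be the ones that already hold the dataset, run, and evaluation data.

from intelligence_layer.evaluation import (
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
)
from intelligence_layer.use_cases.classify.classify import FailedExampleIterator

# Assumed setup: empty in-memory repositories stand in for the repositories
# used during the actual evaluation run.
failed_example_iterator = FailedExampleIterator(
    dataset_repository=InMemoryDatasetRepository(),
    run_repository=InMemoryRunRepository(),
    evaluation_repository=InMemoryEvaluationRepository(),
)

# Yields examples that either failed outright or were classified incorrectly;
# first_n=10 stops after ten such examples (first_n=0 means no limit).
for example in failed_example_iterator.get_examples(
    "my-evaluation-overview-id", first_n=10
):
    print(example.input.chunk, example.expected_output)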