From d8f097020d70654baaff8c4cc071a750480414c5 Mon Sep 17 00:00:00 2001 From: Sebastian Niehus Date: Wed, 3 Apr 2024 17:59:24 +0200 Subject: [PATCH] feat: Add FailedExampleIterator for retrieval of failed examples and adapt user_journey.ipynb Task IL-367 --- src/examples/user_journey.ipynb | 90 +++++++------------ .../use_cases/classify/classify.py | 44 +++++++++ 2 files changed, 77 insertions(+), 57 deletions(-) diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb index 5f428d3f4..099f53740 100644 --- a/src/examples/user_journey.ipynb +++ b/src/examples/user_journey.ipynb @@ -259,7 +259,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, let's aggregate all individual evaluations to get seom eval statistics." + "As you can see, there are quite some warnings telling us that some expected label is missing from the input labels for the evaluation.\n", + "We will have to take care of those cases at some point, but first let's look at some of the failed examples:" ] }, { @@ -268,20 +269,20 @@ "metadata": {}, "outputs": [], "source": [ - "aggregation_overview = aggregator.aggregate_evaluation(eval_overview.id)\n", - "aggregation_overview" + "from intelligence_layer.use_cases.classify.classify import FailedExampleIterator\n", + "\n", + "failed_example_iterator = FailedExampleIterator(\n", + " dataset_repository, run_repository, evaluation_repository\n", + ")\n", + "list(failed_example_iterator.get_examples(eval_overview.id))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It looks like we only predicted around 25% of classes correctly.\n", - "\n", - "However, a closer look at the overview suggests that we have a bunch of incorrect labels in our test dataset.\n", - "We will fix this later.\n", - "\n", - "First, let's have a look at a few failed examples in detail." + "As you can see, for some of the examples the expected output is missing from the set of labels.\n", + "Let's aggregate our results to get some more statistics:" ] }, { @@ -290,53 +291,24 @@ "metadata": {}, "outputs": [], "source": [ - "from intelligence_layer.use_cases import (\n", - " SingleLabelClassifyOutput,\n", - " SingleLabelClassifyEvaluation,\n", - ")\n", - "\n", - "\n", - "def get_failed_examples(run_id: str, eval_id: str, dataset_id: str, first_n: int):\n", - " overview = [\n", - " {\n", - " \"input\": example.input,\n", - " \"expected_output\": example.expected_output,\n", - " \"result\": sorted(\n", - " list(\n", - " next(\n", - " e\n", - " for e in run_repository.example_outputs(\n", - " run_id, SingleLabelClassifyOutput\n", - " )\n", - " if e.example_id == example.id\n", - " ).output.scores.items()\n", - " ),\n", - " key=lambda i: i[1],\n", - " reverse=True,\n", - " )[0],\n", - " \"eval\": evaluation_repository.example_evaluation(\n", - " evaluation_id=eval_id,\n", - " example_id=example.id,\n", - " evaluation_type=SingleLabelClassifyEvaluation,\n", - " ).result,\n", - " }\n", - " for example in dataset_repository.examples(\n", - " dataset_id=dataset_id, input_type=ClassifyInput, expected_output_type=str\n", - " )\n", - " ]\n", - " return [e for e in overview if not e[\"eval\"].correct][:first_n]\n", - "\n", - "\n", - "get_failed_examples(run_overview.id, eval_overview.id, dataset_id, 3)" + "aggregation_overview = aggregator.aggregate_evaluation(eval_overview.id)\n", + "aggregation_overview" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This confirms it: some expected labels are missing. 
Let's try fixing this.\n", - "\n", - "We can do this two ways: Adjust our set of labels or adjust the eval set. In this case, we'll do the latter.\n" + "It looks like we only predicted around 30% of classes correctly.\n", + "And again, we get a warning about missing labels. We can even see those labels in the 'missing_labels' field of the 'statistics' section of the aggregation_overview.\n", + "So let's fix this. We can do this two ways: Adjust our set of labels or adjust the eval set. In this case, we'll do the latter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n" ] }, { @@ -447,7 +419,9 @@ "source": [ "Cool, this already got us up to 62%!\n", "\n", - "Notice, how we don't actually tell our classification task, what each class means; we only supply it with all the labels.\n" + "Notice, how we don't actually tell our classification task, what each class means; we only supply it with all the labels.\n", + "\n", + "Another look at the remaining failed examples confirms that the failures are no longer because of the missing labels:" ] }, { @@ -456,12 +430,7 @@ "metadata": {}, "outputs": [], "source": [ - "get_failed_examples(\n", - " run_overview_prompt_adjusted.id,\n", - " eval_overview_prompt_adjusted.id,\n", - " cleaned_dataset_id,\n", - " 3,\n", - ")" + "list(failed_example_iterator.get_examples(eval_overview_prompt_adjusted.id))" ] }, { @@ -518,6 +487,13 @@ "source": [ "aggregation_overview_with_extended" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py index a2493c4ea..b359cafaf 100644 --- a/src/intelligence_layer/use_cases/classify/classify.py +++ b/src/intelligence_layer/use_cases/classify/classify.py @@ -7,10 +7,15 @@ from intelligence_layer.core import TextChunk from intelligence_layer.evaluation import ( AggregationLogic, + DatasetRepository, + EvaluationRepository, Example, MeanAccumulator, + RepositoryNavigator, + RunRepository, SingleOutputEvaluationLogic, ) +from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation Probability = NewType("Probability", float) @@ -111,6 +116,11 @@ def aggregate( confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1 by_label[evaluation.predicted]["predicted"] += 1 by_label[evaluation.expected]["expected"] += 1 + + if len(missing_labels) > 0: + warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`." 
+ warnings.warn(warn_message, RuntimeWarning) + return AggregatedSingleLabelClassifyEvaluation( percentage_correct=acc.extract(), confusion_matrix=confusion_matrix, @@ -158,6 +168,40 @@ def do_evaluate_single_output( ) +class FailedExampleIterator: + def __init__( + self, + dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, + ): + self.repository_navigator = RepositoryNavigator( + dataset_repository, run_repository, evaluation_repository + ) + + # TODO: Add test + def get_examples( + self, evaluation_overview_id: str, first_n: int = 0 + ) -> Iterable[Example[ClassifyInput, str]]: + evaluation_lineages = self.repository_navigator.evaluation_lineages( + evaluation_id=evaluation_overview_id, + input_type=ClassifyInput, + expected_output_type=str, + output_type=SingleLabelClassifyOutput, + evaluation_type=SingleLabelClassifyEvaluation, + ) + count_yielded = 0 + for lineage in evaluation_lineages: + if first_n != 0 and count_yielded >= first_n: + break + if ( + isinstance(lineage.evaluation.result, FailedExampleEvaluation) + or not lineage.evaluation.result.correct + ): + count_yielded += 1 + yield lineage.example + + class MultiLabelClassifyEvaluation(BaseModel): """The evaluation of a single multi-label classification example.
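
Usage sketch for reviewers (not part of the patch): the new `FailedExampleIterator` is meant to be driven exactly as in the notebook cells added above. This is a minimal sketch, assuming the `dataset_repository`, `run_repository`, `evaluation_repository`, and `eval_overview` objects are already set up as earlier in `src/examples/user_journey.ipynb`; `first_three_failures` is an illustrative variable name, not something introduced by the patch.

    # Mirrors the usage added to user_journey.ipynb in this patch; the repository
    # objects and eval_overview are assumed to exist as in that notebook.
    from intelligence_layer.use_cases.classify.classify import FailedExampleIterator

    failed_example_iterator = FailedExampleIterator(
        dataset_repository, run_repository, evaluation_repository
    )

    # first_n=0 (the default) yields every failed example; a positive value stops
    # after that many failures have been yielded.
    first_three_failures = list(
        failed_example_iterator.get_examples(eval_overview.id, first_n=3)
    )

`get_examples` is a generator, so wrapping it in `list(...)` (as the notebook cells do) is only needed when all failed examples should be materialized at once.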