diff --git a/src/examples/user_journey.ipynb b/src/examples/user_journey.ipynb index 6516a1aac..cd07e5a22 100644 --- a/src/examples/user_journey.ipynb +++ b/src/examples/user_journey.ipynb @@ -386,9 +386,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This confirms it: some expected labels are missing. Let's try fixing this.\n", - "\n", - "We can do this two ways: Adjust our set of labels or adjust the eval set. In this case, we'll do the latter.\n" + "\n" ] }, { diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py index 9f3522b8c..7e28e0271 100644 --- a/src/intelligence_layer/use_cases/classify/classify.py +++ b/src/intelligence_layer/use_cases/classify/classify.py @@ -7,10 +7,15 @@ from intelligence_layer.core import TextChunk from intelligence_layer.evaluation import ( AggregationLogic, + DatasetRepository, + EvaluationRepository, Example, MeanAccumulator, + RepositoryNavigator, + RunRepository, SingleOutputEvaluationLogic, ) +from intelligence_layer.evaluation.evaluation.domain import FailedExampleEvaluation Probability = NewType("Probability", float) @@ -111,6 +116,11 @@ def aggregate( confusion_matrix[(evaluation.predicted, evaluation.expected)] += 1 by_label[evaluation.predicted]["predicted"] += 1 by_label[evaluation.expected]["expected"] += 1 + + if len(missing_labels) > 0: + warn_message = "[WARNING] There were examples with expected labels missing in the evaluation inputs. For a detailed list, see the 'statistics.missing_labels' field of the returned `AggregationOverview`." + warnings.warn(warn_message, RuntimeWarning) + return AggregatedSingleLabelClassifyEvaluation( percentage_correct=acc.extract(), confusion_matrix=confusion_matrix, @@ -158,6 +168,40 @@ def do_evaluate_single_output( ) +class FailedExampleIterator: + def __init__( + self, + dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, + ): + self.repository_navigator = RepositoryNavigator( + dataset_repository, run_repository, evaluation_repository + ) + + # TODO: Add test + def get_examples( + self, evaluation_overview_id: str, first_n: int = 0 + ) -> Iterable[Example[ClassifyInput, str]]: + evaluation_lineages = self.repository_navigator.evaluation_lineages( + evaluation_id=evaluation_overview_id, + input_type=ClassifyInput, + expected_output_type=str, + output_type=SingleLabelClassifyOutput, + evaluation_type=SingleLabelClassifyEvaluation, + ) + count_yielded = 0 + for lineage in evaluation_lineages: + if first_n != 0 and count_yielded >= first_n: + break + if ( + isinstance(lineage.evaluation.result, FailedExampleEvaluation) + or not lineage.evaluation.result.correct + ): + count_yielded += 1 + yield lineage.example + + class MultiLabelClassifyEvaluation(BaseModel): """The evaluation of a single multi-label classification example.