diff --git a/src/intelligence_layer/evaluation/base_logic.py b/src/intelligence_layer/evaluation/base_logic.py
index 6adac8c8f..f505ce36a 100644
--- a/src/intelligence_layer/evaluation/base_logic.py
+++ b/src/intelligence_layer/evaluation/base_logic.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Generic, Iterable
+from typing import Generic, Iterable, final
 
 from intelligence_layer.core import Input, Output
 from intelligence_layer.evaluation.domain import (
@@ -48,3 +48,22 @@ def do_evaluate(
             The metrics that come from the evaluated :class:`Task`.
         """
         pass
+
+
+class SingleOutputEvaluationLogic(
+    EvaluationLogic[Input, Output, ExpectedOutput, Evaluation]
+):
+    @final
+    def do_evaluate(
+        self,
+        example: Example[Input, ExpectedOutput],
+        *output: SuccessfulExampleOutput[Output],
+    ) -> Evaluation:
+        assert len(output) == 1
+        return self.do_evaluate_single_output(example, output[0].output)
+
+    @abstractmethod
+    def do_evaluate_single_output(
+        self, example: Example[Input, ExpectedOutput], output: Output
+    ) -> Evaluation:
+        pass
diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py
index a6f28b921..4e5cd4a8c 100644
--- a/src/intelligence_layer/use_cases/classify/classify.py
+++ b/src/intelligence_layer/use_cases/classify/classify.py
@@ -4,12 +4,11 @@
 from pydantic import BaseModel
 
 from intelligence_layer.core import Chunk
-from intelligence_layer.evaluation import (
-    Example,
-    MeanAccumulator,
-    SuccessfulExampleOutput,
+from intelligence_layer.evaluation import Example, MeanAccumulator
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
 
 Probability = NewType("Probability", float)
 
@@ -87,21 +86,20 @@
 
 
 class SingleLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         SingleLabelClassifyOutput,
         Sequence[str],
         SingleLabelClassifyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[SingleLabelClassifyOutput],
+        output: SingleLabelClassifyOutput,
     ) -> SingleLabelClassifyEvaluation:
-        assert len(output) == 1
         sorted_classes = sorted(
-            output[0].output.scores.items(), key=lambda item: item[1], reverse=True
+            output.scores.items(), key=lambda item: item[1], reverse=True
         )
         if sorted_classes[0][0] in example.expected_output:
             correct = True
@@ -236,7 +234,7 @@ def aggregate(
 
 
 class MultiLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         MultiLabelClassifyOutput,
         Sequence[str],
@@ -250,17 +248,13 @@ def __init__(
         self,
         threshold: float = 0.55,
     ) -> None:
         super().__init__()
         self.threshold = threshold
 
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[MultiLabelClassifyOutput],
+        output: MultiLabelClassifyOutput,
     ) -> MultiLabelClassifyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         predicted_classes = frozenset(
-            label
-            for label, score in single_output.scores.items()
-            if score > self.threshold
+            label for label, score in output.scores.items() if score > self.threshold
         )
         expected_classes = frozenset(example.expected_output)
         tp = predicted_classes & expected_classes
diff --git a/src/intelligence_layer/use_cases/summarize/summarize.py b/src/intelligence_layer/use_cases/summarize/summarize.py
index cc0c1036a..6edf3544c 100644
--- a/src/intelligence_layer/use_cases/summarize/summarize.py
+++ b/src/intelligence_layer/use_cases/summarize/summarize.py
@@ -8,9 +8,11 @@
     Example,
     MeanAccumulator,
     RougeGrader,
-    SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
+)
 
 
 class LongContextSummarizeInput(BaseModel):
@@ -109,7 +111,7 @@ def aggregate(
 
 
 class SingleChunkSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         SingleChunkSummarizeInput,
         SummarizeOutput,
         str,
@@ -121,22 +123,20 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()
 
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[SingleChunkSummarizeInput, str],
-        *output: SuccessfulExampleOutput[SummarizeOutput],
+        output: SummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         bleu_score = self.bleu_grader.calculate_bleu(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
        )
         rouge_score = self.rouge_grader.calculate_rouge(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
         )
 
         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )
 
 
@@ -150,7 +150,7 @@
 
 
 class LongContextSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         LongContextSummarizeInput,
         LongContextSummarizeOutput,
         str,
@@ -162,16 +162,13 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()
 
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[LongContextSummarizeInput, str],
-        *output: SuccessfulExampleOutput[LongContextSummarizeOutput],
+        output: LongContextSummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         joint_summary = " ".join(
-            partial_summary.summary
-            for partial_summary in single_output.partial_summaries
+            partial_summary.summary for partial_summary in output.partial_summaries
         )
         bleu_score = self.bleu_grader.calculate_bleu(
             joint_summary, example.expected_output
         )
@@ -181,7 +178,7 @@ def do_evaluate(
         )
 
         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )
 
 
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index 11b53d419..4c00cdf69 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -16,7 +16,11 @@
     Runner,
     SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    EvaluationLogic,
+    SingleOutputEvaluationLogic,
+)
 from intelligence_layer.evaluation.data_storage.aggregation_repository import (
     InMemoryAggregationRepository,
 )
@@ -38,20 +42,19 @@ def aggregate(
 
 
 class DummyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         str,
         str,
         None,
         DummyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[str, None],
-        *output: SuccessfulExampleOutput[str],
+        output: str,
     ) -> DummyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
+        single_output = output
         if single_output == FAIL_IN_EVAL_INPUT:
             raise RuntimeError(output)
         return DummyEvaluation(result="pass")
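
Note: with this change, an evaluation logic that only ever inspects a single successful output can subclass SingleOutputEvaluationLogic and implement do_evaluate_single_output; the @final do_evaluate wrapper asserts that exactly one SuccessfulExampleOutput was produced and passes its unwrapped .output through. A minimal sketch of a downstream subclass (ExactMatchEvaluation and ExactMatchEvaluationLogic are illustrative names, not part of this diff; import paths follow the modules shown above):

from typing import Sequence

from pydantic import BaseModel

from intelligence_layer.evaluation import Example
from intelligence_layer.evaluation.base_logic import SingleOutputEvaluationLogic


class ExactMatchEvaluation(BaseModel):
    # Illustrative evaluation result: did the generated string match an expected string?
    correct: bool


class ExactMatchEvaluationLogic(
    # Type parameters: Input=str, Output=str, ExpectedOutput=Sequence[str], Evaluation=ExactMatchEvaluation
    SingleOutputEvaluationLogic[str, str, Sequence[str], ExactMatchEvaluation]
):
    def do_evaluate_single_output(
        self, example: Example[str, Sequence[str]], output: str
    ) -> ExactMatchEvaluation:
        # Receives the single unwrapped output; the base class's final do_evaluate
        # has already asserted that exactly one SuccessfulExampleOutput exists.
        return ExactMatchEvaluation(correct=output in example.expected_output)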