IL-259 fix output logic
Merlin Kallenborn committed Feb 21, 2024
1 parent f313de6 commit 00588cc
Showing 4 changed files with 56 additions and 42 deletions.
21 changes: 20 additions & 1 deletion src/intelligence_layer/evaluation/base_logic.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Generic, Iterable
+from typing import Generic, Iterable, final

 from intelligence_layer.core import Input, Output
 from intelligence_layer.evaluation.domain import (
@@ -48,3 +48,22 @@ def do_evaluate(
             The metrics that come from the evaluated :class:`Task`.
         """
         pass
+
+
+class SingleOutputEvaluationLogic(
+    EvaluationLogic[Input, Output, ExpectedOutput, Evaluation]
+):
+    @final
+    def do_evaluate(
+        self,
+        example: Example[Input, ExpectedOutput],
+        *output: SuccessfulExampleOutput[Output],
+    ) -> Evaluation:
+        assert len(output) == 1
+        return self.do_evaluate_single_output(example, output[0].output)
+
+    @abstractmethod
+    def do_evaluate_single_output(
+        self, example: Example[Input, ExpectedOutput], output: Output
+    ) -> Evaluation:
+        pass
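
With this base class in place, an evaluation logic that only ever inspects one output per example can subclass SingleOutputEvaluationLogic and implement do_evaluate_single_output alone; the final do_evaluate asserts on and unpacks the single SuccessfulExampleOutput. A minimal sketch of a hypothetical exact-match logic built on the new class (ExactMatchEvaluation and all other names here are illustrative, not part of this commit):

from pydantic import BaseModel

from intelligence_layer.evaluation import Example
from intelligence_layer.evaluation.base_logic import SingleOutputEvaluationLogic


class ExactMatchEvaluation(BaseModel):
    # Result model for this sketch; evaluations elsewhere in this diff
    # (e.g. DummyEvaluation) are pydantic models, so the same is assumed here.
    correct: bool


class ExactMatchEvaluationLogic(
    # Type parameters: Input, Output, ExpectedOutput, Evaluation.
    SingleOutputEvaluationLogic[str, str, str, ExactMatchEvaluation]
):
    def do_evaluate_single_output(
        self, example: Example[str, str], output: str
    ) -> ExactMatchEvaluation:
        # Only the unpacked single output arrives here; no *output tuple to assert on.
        return ExactMatchEvaluation(correct=output == example.expected_output)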
30 changes: 12 additions & 18 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -4,12 +4,11 @@
 from pydantic import BaseModel

 from intelligence_layer.core import Chunk
-from intelligence_layer.evaluation import (
-    Example,
-    MeanAccumulator,
-    SuccessfulExampleOutput,
+from intelligence_layer.evaluation import Example, MeanAccumulator
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic

 Probability = NewType("Probability", float)

@@ -87,21 +86,20 @@ def aggregate(


 class SingleLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         SingleLabelClassifyOutput,
         Sequence[str],
         SingleLabelClassifyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[SingleLabelClassifyOutput],
+        output: SingleLabelClassifyOutput,
     ) -> SingleLabelClassifyEvaluation:
-        assert len(output) == 1
         sorted_classes = sorted(
-            output[0].output.scores.items(), key=lambda item: item[1], reverse=True
+            output.scores.items(), key=lambda item: item[1], reverse=True
         )
         if sorted_classes[0][0] in example.expected_output:
             correct = True
@@ -236,7 +234,7 @@ def aggregate(


 class MultiLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         MultiLabelClassifyOutput,
         Sequence[str],
@@ -250,17 +248,13 @@ def __init__(
         super().__init__()
         self.threshold = threshold

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[MultiLabelClassifyOutput],
+        output: MultiLabelClassifyOutput,
     ) -> MultiLabelClassifyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         predicted_classes = frozenset(
-            label
-            for label, score in single_output.scores.items()
-            if score > self.threshold
+            label for label, score in output.scores.items() if score > self.threshold
         )
         expected_classes = frozenset(example.expected_output)
         tp = predicted_classes & expected_classes
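
To make the threshold filter in this hunk concrete, a small worked example of the set arithmetic (scores and labels are illustrative, not from the repository):

scores = {"news": 0.9, "sports": 0.2, "tech": 0.7}  # hypothetical label scores
threshold = 0.5
predicted_classes = frozenset(
    label for label, score in scores.items() if score > threshold
)  # frozenset({'news', 'tech'})
expected_classes = frozenset(["news", "finance"])
tp = predicted_classes & expected_classes  # true positives: frozenset({'news'})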
33 changes: 15 additions & 18 deletions src/intelligence_layer/use_cases/summarize/summarize.py
@@ -8,9 +8,11 @@
     Example,
     MeanAccumulator,
     RougeGrader,
-    SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
+)


 class LongContextSummarizeInput(BaseModel):
@@ -109,7 +111,7 @@ def aggregate(


 class SingleChunkSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         SingleChunkSummarizeInput,
         SummarizeOutput,
         str,
@@ -121,22 +123,20 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[SingleChunkSummarizeInput, str],
-        *output: SuccessfulExampleOutput[SummarizeOutput],
+        output: SummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         bleu_score = self.bleu_grader.calculate_bleu(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
         )
         rouge_score = self.rouge_grader.calculate_rouge(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
         )

         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )

@@ -150,7 +150,7 @@ def aggregate(


 class LongContextSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         LongContextSummarizeInput,
         LongContextSummarizeOutput,
         str,
@@ -162,16 +162,13 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[LongContextSummarizeInput, str],
-        *output: SuccessfulExampleOutput[LongContextSummarizeOutput],
+        output: LongContextSummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         joint_summary = " ".join(
-            partial_summary.summary
-            for partial_summary in single_output.partial_summaries
+            partial_summary.summary for partial_summary in output.partial_summaries
         )
         bleu_score = self.bleu_grader.calculate_bleu(
             joint_summary, example.expected_output
@@ -181,7 +178,7 @@ def do_evaluate(
         )

         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )

14 changes: 9 additions & 5 deletions tests/evaluation/test_evaluator.py
@@ -16,7 +16,11 @@
     Runner,
     SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    EvaluationLogic,
+    SingleOutputEvaluationLogic,
+)
 from intelligence_layer.evaluation.data_storage.aggregation_repository import (
     InMemoryAggregationRepository,
 )
@@ -38,20 +42,20 @@ def aggregate(


 class DummyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         str,
         str,
         None,
         DummyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[str, None],
-        *output: SuccessfulExampleOutput[str],
+        output: str,
     ) -> DummyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
+        single_output = output
         if single_output == FAIL_IN_EVAL_INPUT:
             raise RuntimeError(output)
         return DummyEvaluation(result="pass")
