IL-259 fix output logic
Merlin Kallenborn committed Feb 21, 2024
1 parent f313de6 commit 00588cc
Showing 4 changed files with 56 additions and 42 deletions.
21 changes: 20 additions & 1 deletion src/intelligence_layer/evaluation/base_logic.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Generic, Iterable
+from typing import Generic, Iterable, final

 from intelligence_layer.core import Input, Output
 from intelligence_layer.evaluation.domain import (
@@ -48,3 +48,22 @@ def do_evaluate(
             The metrics that come from the evaluated :class:`Task`.
         """
         pass
+
+
+class SingleOutputEvaluationLogic(
+    EvaluationLogic[Input, Output, ExpectedOutput, Evaluation]
+):
+    @final
+    def do_evaluate(
+        self,
+        example: Example[Input, ExpectedOutput],
+        *output: SuccessfulExampleOutput[Output],
+    ) -> Evaluation:
+        assert len(output) == 1
+        return self.do_evaluate_single_output(example, output[0].output)
+
+    @abstractmethod
+    def do_evaluate_single_output(
+        self, example: Example[Input, ExpectedOutput], output: Output
+    ) -> Evaluation:
+        pass
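
With this base class in place, an evaluation logic that only ever inspects one output per example can subclass SingleOutputEvaluationLogic and implement do_evaluate_single_output alone; the final do_evaluate asserts on and unpacks the single SuccessfulExampleOutput. A minimal sketch of a hypothetical exact-match logic built on the new class (ExactMatchEvaluation and all other names here are illustrative, not part of this commit):

from pydantic import BaseModel

from intelligence_layer.evaluation import Example
from intelligence_layer.evaluation.base_logic import SingleOutputEvaluationLogic


class ExactMatchEvaluation(BaseModel):
    # Result model for this sketch; evaluations elsewhere in this diff
    # (e.g. DummyEvaluation) are pydantic models, so the same is assumed here.
    correct: bool


class ExactMatchEvaluationLogic(
    # Type parameters: Input, Output, ExpectedOutput, Evaluation.
    SingleOutputEvaluationLogic[str, str, str, ExactMatchEvaluation]
):
    def do_evaluate_single_output(
        self, example: Example[str, str], output: str
    ) -> ExactMatchEvaluation:
        # Only the unpacked single output arrives here; no *output tuple to assert on.
        return ExactMatchEvaluation(correct=output == example.expected_output)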
30 changes: 12 additions & 18 deletions src/intelligence_layer/use_cases/classify/classify.py
@@ -4,12 +4,11 @@
 from pydantic import BaseModel

 from intelligence_layer.core import Chunk
-from intelligence_layer.evaluation import (
-    Example,
-    MeanAccumulator,
-    SuccessfulExampleOutput,
+from intelligence_layer.evaluation import Example, MeanAccumulator
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic

 Probability = NewType("Probability", float)

@@ -87,21 +86,20 @@ def aggregate(


 class SingleLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         SingleLabelClassifyOutput,
         Sequence[str],
         SingleLabelClassifyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[SingleLabelClassifyOutput],
+        output: SingleLabelClassifyOutput,
     ) -> SingleLabelClassifyEvaluation:
-        assert len(output) == 1
         sorted_classes = sorted(
-            output[0].output.scores.items(), key=lambda item: item[1], reverse=True
+            output.scores.items(), key=lambda item: item[1], reverse=True
         )
         if sorted_classes[0][0] in example.expected_output:
             correct = True
@@ -236,7 +234,7 @@ def aggregate(


 class MultiLabelClassifyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         ClassifyInput,
         MultiLabelClassifyOutput,
         Sequence[str],
@@ -250,17 +248,13 @@ def __init__(
         super().__init__()
         self.threshold = threshold

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[ClassifyInput, Sequence[str]],
-        *output: SuccessfulExampleOutput[MultiLabelClassifyOutput],
+        output: MultiLabelClassifyOutput,
     ) -> MultiLabelClassifyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         predicted_classes = frozenset(
-            label
-            for label, score in single_output.scores.items()
-            if score > self.threshold
+            label for label, score in output.scores.items() if score > self.threshold
         )
         expected_classes = frozenset(example.expected_output)
         tp = predicted_classes & expected_classes
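
To make the threshold filter in this hunk concrete, a small worked example of the set arithmetic (scores and labels are illustrative, not from the repository):

scores = {"news": 0.9, "sports": 0.2, "tech": 0.7}  # hypothetical label scores
threshold = 0.5
predicted_classes = frozenset(
    label for label, score in scores.items() if score > threshold
)  # frozenset({'news', 'tech'})
expected_classes = frozenset(["news", "finance"])
tp = predicted_classes & expected_classes  # true positives: frozenset({'news'})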
33 changes: 15 additions & 18 deletions src/intelligence_layer/use_cases/summarize/summarize.py
@@ -8,9 +8,11 @@
     Example,
     MeanAccumulator,
     RougeGrader,
-    SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    SingleOutputEvaluationLogic,
+)


 class LongContextSummarizeInput(BaseModel):
@@ -109,7 +111,7 @@ def aggregate(


 class SingleChunkSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         SingleChunkSummarizeInput,
         SummarizeOutput,
         str,
@@ -121,22 +123,20 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[SingleChunkSummarizeInput, str],
-        *output: SuccessfulExampleOutput[SummarizeOutput],
+        output: SummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         bleu_score = self.bleu_grader.calculate_bleu(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
         )
         rouge_score = self.rouge_grader.calculate_rouge(
-            single_output.summary, example.expected_output
+            output.summary, example.expected_output
         )

         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )

@@ -150,7 +150,7 @@ def aggregate(


 class LongContextSummarizeEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         LongContextSummarizeInput,
         LongContextSummarizeOutput,
         str,
@@ -162,16 +162,13 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[LongContextSummarizeInput, str],
-        *output: SuccessfulExampleOutput[LongContextSummarizeOutput],
+        output: LongContextSummarizeOutput,
     ) -> SummarizeEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
         joint_summary = " ".join(
-            partial_summary.summary
-            for partial_summary in single_output.partial_summaries
+            partial_summary.summary for partial_summary in output.partial_summaries
         )
         bleu_score = self.bleu_grader.calculate_bleu(
             joint_summary, example.expected_output
@@ -181,7 +178,7 @@ def do_evaluate(
         )

         return SummarizeEvaluation(
-            bleu=bleu_score, rouge=rouge_score.recall, output=single_output
+            bleu=bleu_score, rouge=rouge_score.recall, output=output
         )

14 changes: 9 additions & 5 deletions tests/evaluation/test_evaluator.py
@@ -16,7 +16,11 @@
     Runner,
     SuccessfulExampleOutput,
 )
-from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
+from intelligence_layer.evaluation.base_logic import (
+    AggregationLogic,
+    EvaluationLogic,
+    SingleOutputEvaluationLogic,
+)
 from intelligence_layer.evaluation.data_storage.aggregation_repository import (
     InMemoryAggregationRepository,
 )
@@ -38,20 +42,20 @@ def aggregate(


 class DummyEvaluationLogic(
-    EvaluationLogic[
+    SingleOutputEvaluationLogic[
         str,
         str,
         None,
         DummyEvaluation,
     ]
 ):
-    def do_evaluate(
+    def do_evaluate_single_output(
         self,
         example: Example[str, None],
-        *output: SuccessfulExampleOutput[str],
+        output: str,
     ) -> DummyEvaluation:
-        assert len(output) == 1
-        single_output = output[0].output
+        single_output = output
         if single_output == FAIL_IN_EVAL_INPUT:
             raise RuntimeError(output)
         return DummyEvaluation(result="pass")
