
Commit

IL-259 fix TODOs
Merlin Kallenborn committed Feb 21, 2024
1 parent ce43c27 commit f313de6
Showing 4 changed files with 30 additions and 19 deletions.
18 changes: 14 additions & 4 deletions src/intelligence_layer/evaluation/argilla.py
@@ -80,11 +80,21 @@ class ArgillaEvaluator(
     This evaluator runs a dataset and sends the input and output to Argilla to be evaluated.
     After they have been evaluated, you can fetch the results by using the `aggregate_evaluation` method.
-    Args:
+    Arguments:
         dataset_repository: The repository with the examples that will be taken for the evaluation.
         run_repository: The repository of the runs to evaluate.
         evaluation_repository: The repository that will be used to store evaluation results.
-        dataset_repository: The repository with the examples that will be taken for the evaluation
-        description: human-readable description for the evaluator
-        # TODO: docstrings
+        aggregation_repository: The repository that will be used to store aggregation results.
+        description: Human-readable description for the evaluator.
+        evaluation_logic: The logic to use for evaluation.
+        aggregation_logic: The logic to aggregate the evaluations.
+    Generics:
+        Input: Interface to be passed to the :class:`Task` that shall be evaluated.
+        Output: Type of the output of the :class:`Task` to be evaluated.
+        ExpectedOutput: Output that is expected from the run with the supplied input.
+        ArgillaEvaluation: Interface of the metrics that come from the Argilla task.
+        AggregatedEvaluation: The aggregated results of an evaluation run with a :class:`Dataset`.
     """

     def __init__(
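For orientation, the pieces named in this docstring might be wired up roughly as follows. This is an illustrative sketch, not code from the commit: the import path, the exact constructor signature, and the helper name build_argilla_evaluator are assumptions; only the keyword names and the `aggregate_evaluation` method come from the docstring above.

from intelligence_layer.evaluation.argilla import ArgillaEvaluator  # assumed import path


def build_argilla_evaluator(
    dataset_repository,      # examples whose inputs/outputs get sent to Argilla
    run_repository,          # task runs whose outputs are to be annotated
    evaluation_repository,   # where the ArgillaEvaluation results end up
    aggregation_repository,  # where aggregated results are stored
    evaluation_logic,        # builds the records shown to annotators
    aggregation_logic,       # turns ArgillaEvaluation records into an aggregate
) -> ArgillaEvaluator:
    # Keyword names mirror the argument list documented above.
    return ArgillaEvaluator(
        dataset_repository=dataset_repository,
        run_repository=run_repository,
        evaluation_repository=evaluation_repository,
        aggregation_repository=aggregation_repository,
        description="human feedback collected via Argilla",
        evaluation_logic=evaluation_logic,
        aggregation_logic=aggregation_logic,
    )


# Once annotators have labelled the records in Argilla, results are fetched with
# the `aggregate_evaluation` method mentioned in the docstring; its parameters
# are not shown in this commit, so they are omitted here.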
7 changes: 3 additions & 4 deletions src/intelligence_layer/evaluation/base_logic.py
@@ -35,15 +35,14 @@ def do_evaluate(
         example: Example[Input, ExpectedOutput],
         *output: SuccessfulExampleOutput[Output],
     ) -> Evaluation:
-        """Executes the evaluation for this use-case.
+        """Executes the evaluation for this specific example.
         Responsible for comparing the input & expected output of a task to the
         actually generated output.
         Args:
-            TODO: find a better way to describe this
-            example: The data example data whose input was passed to the :class:`Task` to produce the output.
-            output: Output of the :class:`Task` that shall be evaluated.
+            example: Input data of :class:`Task` to produce the output.
+            output: Output of the :class:`Task`.
         Returns:
             The metrics that come from the evaluated :class:`Task`.
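To make the hook above concrete, here is a minimal sketch of an evaluation logic for a string-matching use case, modelled on the summarize example later in this commit. The import path, the generic parameter order of EvaluationLogic, and the ExactMatch* names are assumptions.

from pydantic import BaseModel

# Assumed import path; these classes live under src/intelligence_layer/evaluation.
from intelligence_layer.evaluation import EvaluationLogic, Example, SuccessfulExampleOutput


class ExactMatchEvaluation(BaseModel):
    """Hypothetical per-example result."""

    correct: bool


class ExactMatchEvaluationLogic(EvaluationLogic[str, str, str, ExactMatchEvaluation]):
    # Generic order assumed to be Input, Output, ExpectedOutput, Evaluation.
    def do_evaluate(
        self,
        example: Example[str, str],
        *output: SuccessfulExampleOutput[str],
    ) -> ExactMatchEvaluation:
        # Compare the task's actual output against the expected output stored
        # with the example, as described in the docstring above.
        assert len(output) == 1
        return ExactMatchEvaluation(correct=output[0].output == example.expected_output)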
12 changes: 8 additions & 4 deletions src/intelligence_layer/evaluation/evaluator.py
@@ -83,13 +83,17 @@ class Evaluator(
     """Evaluator that can handle automatic evaluation scenarios.
     This evaluator should be used for automatic eval. A user still has to implement
-    :func:`BaseEvaluator.do_evaluate` and :func:`BaseEvaluator.aggregate`.
+    :class:`EvaluationLogic` and :class:`AggregationLogic`.
-    #TODO fix arguments
     Arguments:
         dataset_repository: The repository with the examples that will be taken for the evaluation.
         run_repository: The repository of the runs to evaluate.
         evaluation_repository: The repository that will be used to store evaluation results.
-        dataset_repository: The repository with the examples that will be taken for the evaluation
-        description: human-readable description for the evaluator
+        aggregation_repository: The repository that will be used to store aggregation results.
+        description: Human-readable description for the evaluator.
+        evaluation_logic: The logic to use for evaluation.
+        aggregation_logic: The logic to aggregate the evaluations.
     Generics:
         Input: Interface to be passed to the :class:`Task` that shall be evaluated.
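Putting the argument list above together, constructing an automatic evaluator might look roughly like this. It is a sketch only: the InMemory* repository class names and the exact constructor signature are assumptions, and the logic arguments stand for user-defined subclasses of EvaluationLogic and AggregationLogic.

# Hypothetical wiring; the InMemory* class names are assumptions and may differ
# from what the package actually exports.
from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryAggregationRepository,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
)

evaluator = Evaluator(
    dataset_repository=InMemoryDatasetRepository(),
    run_repository=InMemoryRunRepository(),
    evaluation_repository=InMemoryEvaluationRepository(),
    aggregation_repository=InMemoryAggregationRepository(),
    description="exact-match evaluation of my task",
    evaluation_logic=ExactMatchEvaluationLogic(),  # from the sketch above
    aggregation_logic=my_aggregation_logic,        # stand-in for a user-defined AggregationLogic subclass
)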
12 changes: 5 additions & 7 deletions src/intelligence_layer/use_cases/summarize/summarize.py
@@ -93,7 +93,6 @@ class AggregatedSummarizeEvaluation(BaseModel):
     Attributes:
         aggregate_bleu: average over BLEU-scores
         aggregate_rouge: average over ROUGE-scores
-        evaluation: The actual evaluations
     """

     aggregate_bleu: float
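The two aggregate fields are plain means over the per-example scores. A minimal sketch of that aggregation step, assuming SummarizeEvaluation exposes per-example bleu and rouge fields (the code that constructs it is truncated below) and assuming the import path from the file location:

from statistics import mean
from typing import Iterable

from intelligence_layer.use_cases.summarize.summarize import (  # assumed import path
    AggregatedSummarizeEvaluation,
    SummarizeEvaluation,
)


def aggregate_summarize_evaluations(
    evaluations: Iterable[SummarizeEvaluation],
) -> AggregatedSummarizeEvaluation:
    scores = list(evaluations)
    # Field names `bleu` and `rouge` on SummarizeEvaluation are assumed.
    return AggregatedSummarizeEvaluation(
        aggregate_bleu=mean(e.bleu for e in scores),
        aggregate_rouge=mean(e.rouge for e in scores),
    )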
@@ -130,10 +129,10 @@ def do_evaluate(
         assert len(output) == 1
         single_output = output[0].output
         bleu_score = self.bleu_grader.calculate_bleu(
-            single_output.summary, example.expected_output  # TODO
+            single_output.summary, example.expected_output
         )
         rouge_score = self.rouge_grader.calculate_rouge(
-            single_output.summary, example.expected_output  # TODO
+            single_output.summary, example.expected_output
         )

         return SummarizeEvaluation(
@@ -163,14 +162,13 @@ def __init__(self) -> None:
         self.bleu_grader = BleuGrader()
         self.rouge_grader = RougeGrader()

-    # mypy expects *args where this method only uses one output
-    def do_evaluate(  # type: ignore
+    def do_evaluate(
         self,
         example: Example[LongContextSummarizeInput, str],
-        *output: LongContextSummarizeOutput,
+        *output: SuccessfulExampleOutput[LongContextSummarizeOutput],
     ) -> SummarizeEvaluation:
         assert len(output) == 1
-        single_output = output[0]
+        single_output = output[0].output
         joint_summary = " ".join(
             partial_summary.summary
             for partial_summary in single_output.partial_summaries
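The last hunk is the actual TODO fix for mypy: the override now takes SuccessfulExampleOutput[LongContextSummarizeOutput], matching the base hook in base_logic.py above, so the explanatory comment and the `# type: ignore` could be dropped. The wrapper carries the task output one level deeper, which is why the body changes to output[0].output. A small sketch of the resulting access pattern (attribute names other than `output` are taken from the diff itself):

# `output[0]` is a SuccessfulExampleOutput wrapping run metadata plus the task output;
# the LongContextSummarizeOutput itself now sits under `.output`.
single_output = output[0].output
joint_summary = " ".join(
    partial_summary.summary for partial_summary in single_output.partial_summaries
)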
