diff --git a/src/intelligence_layer/evaluation/argilla.py b/src/intelligence_layer/evaluation/argilla.py index 2ecef69b5..22851f362 100644 --- a/src/intelligence_layer/evaluation/argilla.py +++ b/src/intelligence_layer/evaluation/argilla.py @@ -14,20 +14,34 @@ RecordDataSequence, ) from intelligence_layer.core import Input, InstructInput, Output, PromptOutput -from intelligence_layer.evaluation.data_storage.run_repository import RunRepository -from intelligence_layer.evaluation.data_storage.dataset_repository import DatasetRepository -from intelligence_layer.evaluation.evaluator import Evaluator - +from intelligence_layer.evaluation.accumulator import MeanAccumulator from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic +from intelligence_layer.evaluation.data_storage.aggregation_repository import ( + AggregationRepository, +) +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + DatasetRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + ArgillaEvaluationRepository, + EvaluationRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( AggregatedEvaluation, - SuccessfulExampleOutput, ExpectedOutput, Example, + Example, + ExpectedOutput, + SuccessfulExampleOutput, ) from intelligence_layer.evaluation.elo import ( AutomatedEloComparison, + EloCalculator, EloComparison, - build_tournaments, PlayerScore, + PlayerScore, + WinRateCalculator, + build_tournaments, ) +from intelligence_layer.evaluation.evaluator import Evaluator class ArgillaEvaluationLogic( diff --git a/src/intelligence_layer/evaluation/base_logic.py b/src/intelligence_layer/evaluation/base_logic.py index 905d87c2b..8598c6ef9 100644 --- a/src/intelligence_layer/evaluation/base_logic.py +++ b/src/intelligence_layer/evaluation/base_logic.py @@ -2,9 +2,11 @@ from typing import Generic, Iterable from intelligence_layer.core import Input, Output -from intelligence_layer.evaluation import Evaluation, Example, ExpectedOutput from intelligence_layer.evaluation.domain import ( AggregatedEvaluation, + Evaluation, + Example, + ExpectedOutput, SuccessfulExampleOutput, ) diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index 7a5042e7e..fbee3847f 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -199,7 +199,6 @@ def test_long_context_summarize_evaluator( long_context_summarize_runner: Runner[str, str], in_memory_dataset_repository: InMemoryDatasetRepository, long_text: str, - no_op_tracer: NoOpTracer, ) -> None: input = LongContextSummarizeInput(text=long_text, language=Language("en")) bad_example = Example(