diff --git a/src/intelligence_layer/core/__init__.py b/src/intelligence_layer/core/__init__.py
index 56ba0405b..c006045c9 100644
--- a/src/intelligence_layer/core/__init__.py
+++ b/src/intelligence_layer/core/__init__.py
@@ -19,6 +19,8 @@
 from .explain import Explain as Explain
 from .explain import ExplainInput as ExplainInput
 from .explain import ExplainOutput as ExplainOutput
+from .instruct import Instruct as Instruct
+from .instruct import InstructInput as InstructInput
 from .intelligence_app import (
     AuthenticatedIntelligenceApp as AuthenticatedIntelligenceApp,
 )
diff --git a/src/intelligence_layer/evaluation/argilla.py b/src/intelligence_layer/evaluation/argilla.py
index 41e288ea1..1ce8f436e 100644
--- a/src/intelligence_layer/evaluation/argilla.py
+++ b/src/intelligence_layer/evaluation/argilla.py
@@ -12,7 +12,7 @@
     Question,
     RecordData,
 )
-from intelligence_layer.core import Input, InstructInput, Output, PromptOutput
+from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output
 from intelligence_layer.evaluation import Aggregator
 from intelligence_layer.evaluation.accumulator import MeanAccumulator
 from intelligence_layer.evaluation.base_logic import AggregationLogic, EvaluationLogic
@@ -200,7 +200,7 @@ def aggregate(
 
 
 class InstructComparisonArgillaEvaluationLogic(
-    ArgillaEvaluationLogic[InstructInput, PromptOutput, None]
+    ArgillaEvaluationLogic[InstructInput, CompleteOutput, None]
 ):
     def __init__(
         self,
@@ -215,11 +215,11 @@ def __init__(
     def _to_record(
         self,
         example: Example[InstructInput, None],
-        *outputs: SuccessfulExampleOutput[PromptOutput],
+        *outputs: SuccessfulExampleOutput[CompleteOutput],
     ) -> RecordDataSequence:
         def create_record_data(
-            first: SuccessfulExampleOutput[PromptOutput],
-            second: SuccessfulExampleOutput[PromptOutput],
+            first: SuccessfulExampleOutput[CompleteOutput],
+            second: SuccessfulExampleOutput[CompleteOutput],
         ) -> RecordData:
             if random.choice([True, False]):
                 first, second = second, first
diff --git a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py b/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py
deleted file mode 100644
index 5c2ad9873..000000000
--- a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import random
-from itertools import combinations
-from typing import Iterable, Mapping, Optional, Sequence
-
-from pydantic import BaseModel
-
-from intelligence_layer.connectors import Field
-from intelligence_layer.connectors.argilla.argilla_client import (
-    ArgillaEvaluation,
-    Question,
-    RecordData,
-)
-from intelligence_layer.core.instruct import InstructInput
-from intelligence_layer.core.model import CompleteOutput
-from intelligence_layer.evaluation import (
-    ArgillaEvaluationRepository,
-    DatasetRepository,
-    Example,
-    MeanAccumulator,
-    RunRepository,
-    SuccessfulExampleOutput,
-)
-from intelligence_layer.evaluation.data_storage.aggregation_repository import (
-    AggregationRepository,
-)
-from intelligence_layer.evaluation.elo import (
-    AutomatedEloComparison,
-    EloCalculator,
-    EloComparison,
-    PlayerScore,
-    WinRateCalculator,
-    build_tournaments,
-)
-from intelligence_layer.evaluation.evaluator import ArgillaEvaluator
-
-
-class AggregatedInstructComparison(BaseModel):
-    scores: Mapping[str, PlayerScore]
-
-
-class InstructComparisonArgillaEvaluator(
-    ArgillaEvaluator[
-        InstructInput,
-        CompleteOutput,
-        None,
-        AggregatedInstructComparison,
-    ]
-):
-    KEY_INSTRUCTION = "instruction"
-    KEY_INPUT = "input"
-    KEY_RESPONSE_1 = "first"
-    KEY_RESPONSE_2 = "second"
-    KEY_QUESTION = "winner"
-    OPTIONS = [1, 2, 3]
-
-    def __init__(
-        self,
-        dataset_repository: DatasetRepository,
-        run_repository: RunRepository,
-        evaluation_repository: ArgillaEvaluationRepository,
-        aggregation_repository: AggregationRepository,
-        description: str,
-        workspace_id: str,
-        high_priority_runs: Optional[frozenset[str]] = None,
-    ) -> None:
-        fields = [
-            Field(name=self.KEY_INSTRUCTION, title="Instruction"),
-            Field(name=self.KEY_INPUT, title="Input"),
-            Field(name=self.KEY_RESPONSE_1, title="Response 1"),
-            Field(name=self.KEY_RESPONSE_2, title="Response 2"),
-        ]
-        questions = [
-            Question(
-                name=self.KEY_QUESTION,
-                title="Which response is better?",
-                description="1: The first completion is better.\n2: The second completion is better.\n3: They are both equally good.",
-                options=self.OPTIONS,
-            )
-        ]
-
-        super().__init__(
-            dataset_repository,
-            run_repository,
-            evaluation_repository,
-            aggregation_repository,
-            description,
-            workspace_id,
-            fields,
-            questions,
-        )
-        self._high_priority_runs = high_priority_runs
-
-    def _to_record(
-        self,
-        example: Example[InstructInput, None],
-        *example_outputs: SuccessfulExampleOutput[CompleteOutput],
-    ) -> Sequence[RecordData]:
-        def create_record_data(
-            first: SuccessfulExampleOutput[CompleteOutput],
-            second: SuccessfulExampleOutput[CompleteOutput],
-        ) -> RecordData:
-            if random.choice([True, False]):
-                first, second = second, first
-            return RecordData(
-                content={
-                    self.KEY_INSTRUCTION: example.input.instruction,
-                    self.KEY_INPUT: example.input.input or "",
-                    self.KEY_RESPONSE_1: first.output.completion,
-                    self.KEY_RESPONSE_2: second.output.completion,
-                },
-                example_id=example.id,
-                metadata={
-                    self.KEY_RESPONSE_1: first.run_id,
-                    self.KEY_RESPONSE_2: second.run_id,
-                },
-            )
-
-        pairs = combinations(example_outputs, 2)
-        return [
-            create_record_data(first, second)
-            for [first, second] in pairs
-            if self._high_priority_runs is None
-            or any(
-                run_id in self._high_priority_runs
-                for run_id in [first.run_id, second.run_id]
-            )
-        ]
-
-    def aggregate(
-        self, evaluations: Iterable[ArgillaEvaluation]
-    ) -> AggregatedInstructComparison:
-        elo_evaluations = [
-            AutomatedEloComparison(
-                outputs=[
-                    EloComparison(
-                        example_id=evaluation.example_id,
-                        winner=int(evaluation.responses["winner"]),
-                        first_run_id=evaluation.metadata["first"],
-                        second_run_id=evaluation.metadata["second"],
-                    )
-                ]
-            )
-            for evaluation in evaluations
-        ]
-        tournaments, players = build_tournaments(elo_evaluations)
-
-        accumulators = {p: MeanAccumulator() for p in players}
-        tournaments_list = list(tournaments.items())
-        for _ in range(100):
-            elo_calc = EloCalculator(players)
-            random.shuffle(tournaments_list)
-            for _, tournament in tournaments_list:
-                elo_calc.calculate_tournament(tournament)
-            for p in players:
-                accumulators[p].add(elo_calc.ratings[p])
-
-        win_rate_calc = WinRateCalculator(players)
-        win_rate = win_rate_calc.calculate(
-            [battle for tournament in tournaments.values() for battle in tournament]
-        )
-
-        return AggregatedInstructComparison(
-            scores={
-                p: PlayerScore(elo=acc.extract(), win_rate=win_rate[p])
-                for p, acc in accumulators.items()
-            },
-        )
diff --git a/src/intelligence_layer/use_cases/__init__.py b/src/intelligence_layer/use_cases/__init__.py
index c7c481641..2393f8c4b 100644
--- a/src/intelligence_layer/use_cases/__init__.py
+++ b/src/intelligence_layer/use_cases/__init__.py
@@ -63,7 +63,10 @@
     AggregatedSummarizeEvaluation as AggregatedSummarizeEvaluation,
 )
 from .summarize.summarize import (
-    LongContextSummarizeEvaluator as LongContextSummarizeEvaluator,
+    LongContextSummarizeAggregationLogic as LongContextSummarizeAggregationLogic,
+)
+from .summarize.summarize import (
+    LongContextSummarizeEvaluationLogic as LongContextSummarizeEvaluationLogic,
 )
 from .summarize.summarize import LongContextSummarizeInput as LongContextSummarizeInput
 from .summarize.summarize import (
@@ -71,7 +74,10 @@
 )
 from .summarize.summarize import PartialSummary as PartialSummary
 from .summarize.summarize import (
-    SingleChunkSummarizeEvaluator as SingleChunkSummarizeEvaluator,
+    SingleChunkSummarizeAggregationLogic as SingleChunkSummarizeAggregationLogic,
+)
+from .summarize.summarize import (
+    SingleChunkSummarizeEvaluationLogic as SingleChunkSummarizeEvaluationLogic,
 )
 from .summarize.summarize import SingleChunkSummarizeInput as SingleChunkSummarizeInput
 from .summarize.summarize import SummarizeEvaluation as SummarizeEvaluation
diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py
index 913bbd4f0..0bbc5a60d 100644
--- a/tests/use_cases/summarize/test_summarize.py
+++ b/tests/use_cases/summarize/test_summarize.py
@@ -5,6 +5,7 @@
     Aggregator,
     DatasetRepository,
     EvaluationRepository,
+    Evaluator,
     Example,
     InMemoryDatasetRepository,
     InMemoryEvaluationRepository,
@@ -16,7 +17,9 @@
     InMemoryAggregationRepository,
 )
 from intelligence_layer.use_cases import (
-    LongContextSummarizeEvaluator,
+    AggregatedSummarizeEvaluation,
+    LongContextSummarizeAggregationLogic,
+    LongContextSummarizeEvaluationLogic,
     LongContextSummarizeInput,
     LongContextSummarizeOutput,
     SingleChunkSummarizeAggregationLogic,
@@ -60,7 +63,7 @@ def single_chunk_summarize_aggregator(
     in_memory_evaluation_repository: InMemoryEvaluationRepository,
     in_memory_aggregation_repository: InMemoryAggregationRepository,
     single_chunk_summarize_aggregation_logic: SingleChunkSummarizeAggregationLogic,
-) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation,]:
+) -> Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation]:
     return Aggregator(
         in_memory_evaluation_repository,
         in_memory_aggregation_repository,
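
For orientation only (not part of the patch): the removed `InstructComparisonArgillaEvaluator` and the summarize `*Evaluator` classes give way to logic objects that are injected into the generic `Evaluator`/`Aggregator`. Below is a minimal sketch of that wiring, modelled on the `single_chunk_summarize_aggregator` fixture above; the description string and the trailing logic argument are assumptions, since the hunk truncates the constructor call, and the import paths simply mirror the ones visible in the test module.

```python
from intelligence_layer.evaluation import Aggregator, InMemoryEvaluationRepository
from intelligence_layer.evaluation.data_storage.aggregation_repository import (
    InMemoryAggregationRepository,
)
from intelligence_layer.use_cases import (
    AggregatedSummarizeEvaluation,
    SingleChunkSummarizeAggregationLogic,
    SummarizeEvaluation,
)

# Wire the new aggregation logic into the generic Aggregator, as the updated
# fixture does. The two repository arguments come straight from the fixture;
# the "single-chunk-summarize" description and the logic as final argument are
# assumptions, because the hunk above cuts off before the call is complete.
aggregator: Aggregator[SummarizeEvaluation, AggregatedSummarizeEvaluation] = Aggregator(
    InMemoryEvaluationRepository(),
    InMemoryAggregationRepository(),
    "single-chunk-summarize",  # assumed description
    SingleChunkSummarizeAggregationLogic(),  # assumed trailing argument
)
```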