From 7481983c7530160ca515776496bf79e1f862dc98 Mon Sep 17 00:00:00 2001
From: Merlin Kallenborn
Date: Wed, 15 May 2024 14:25:20 +0200
Subject: [PATCH] refactor: Change import structure for Elo Evaluation and
 Aggregation related types and fix elo qa eval notebook

TASK: IL-394
---
 src/documentation/elo_qa_eval.ipynb               | 26 ++++++++++---------
 .../evaluation/aggregation/elo_aggregation.py     |  7 +++--
 .../evaluation/evaluator/argilla_evaluator.py     |  5 +++-
 .../evaluation/evaluator/elo_evaluator.py         |  2 +-
 4 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb
index 467281ee6..714b3c628 100644
--- a/src/documentation/elo_qa_eval.ipynb
+++ b/src/documentation/elo_qa_eval.ipynb
@@ -33,7 +33,7 @@
     "from dotenv import load_dotenv\n",
     "\n",
     "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
-    "from intelligence_layer.evaluation.evaluation.elo_evaluator import Match\n",
+    "from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches\n",
     "\n",
     "load_dotenv()\n",
     "\n",
@@ -289,19 +289,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from intelligence_layer.core.model import Llama3InstructModel\n",
     "from intelligence_layer.evaluation import Evaluator\n",
-    "from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic\n",
+    "from intelligence_layer.examples.qa.elo_qa import EloQaEvaluationLogic\n",
     "\n",
-    "elo_evaluation_logic: EloEvaluationLogic[\n",
-    "    SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput\n",
-    "] = EloEvaluationLogic()\n",
+    "elo_qa_evaluation_logic = EloQaEvaluationLogic(\n",
+    "    model=Llama3InstructModel(name=\"llama-3-8b-instruct\")\n",
+    ")\n",
     "\n",
     "evaluator = Evaluator(\n",
     "    dataset_repository=dataset_repository,\n",
     "    run_repository=run_repository,\n",
     "    evaluation_repository=evaluation_repository,\n",
     "    description=\"ELO QA evaluation\", # this description will be used later to query for specific evaluations\n",
-    "    evaluation_logic=elo_evaluation_logic,\n",
+    "    evaluation_logic=elo_qa_evaluation_logic,\n",
     ")"
   ]
  },
@@ -321,11 +322,14 @@
    "outputs": [],
    "source": [
     "# ensure that for each example there are evaluated comparisons\n",
+    "\n",
+    "\n",
     "for example_evaluation in evaluation_repository.example_evaluations(\n",
-    "    evaluation_overview.id, Match\n",
+    "    evaluation_overview.id, Matches\n",
     "):\n",
+    "    assert isinstance(example_evaluation.result, Matches)\n",
     "    assert (\n",
-    "        len(example_evaluation.result.matches) > 0\n",
+    "        len(example_evaluation.result.comparison_evaluations) > 0\n",
     "    ), f\"There are no matches (comparisons) for example ID {example_evaluation.example_id}\""
   ]
  },
@@ -374,17 +378,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from intelligence_layer_experiments.use_cases.elo_usecase.elo_qa_aggregator import (\n",
-    "    EloQaAggregationLogic,\n",
-    ")\n",
     "\n",
     "from intelligence_layer.evaluation import Aggregator\n",
+    "from intelligence_layer.evaluation.aggregation.elo_aggregation import MatchesAggregationLogic\n",
     "\n",
     "aggregator = Aggregator(\n",
     "    evaluation_repository=evaluation_repository,\n",
     "    aggregation_repository=aggregation_repository,\n",
     "    description=\"ELO QA aggregation\",\n",
-    "    aggregation_logic=EloQaAggregationLogic(),\n",
+    "    aggregation_logic=MatchesAggregationLogic(),\n",
     ")\n",
     "\n",
     "aggregated_evaluation = aggregator.aggregate_evaluation(evaluation_overview.id)"
diff --git a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py
index 3877839e0..006beeeb3 100644
--- a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py
+++ b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py
@@ -5,10 +5,13 @@
 import numpy as np
 from pydantic import BaseModel
 
-from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
 from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator
 from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
-from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches
+from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+    ComparisonEvaluation,
+    Matches,
+    MatchOutcome,
+)
 
 
 class PlayerScore(BaseModel):
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
index e38a9ef85..efa2e02ab 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/argilla_evaluator.py
@@ -14,7 +14,6 @@
     RecordData,
 )
 from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output
-from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
 from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
 from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
 from intelligence_layer.evaluation.evaluation.domain import (
@@ -31,6 +30,10 @@
 from intelligence_layer.evaluation.evaluation.evaluator.base_evaluator import (
     EvaluationLogicBase,
 )
+from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+    ComparisonEvaluation,
+    MatchOutcome,
+)
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 from intelligence_layer.evaluation.run.run_repository import RunRepository
 
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py
index 9bd794cc7..68739cc8c 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py
@@ -6,8 +6,8 @@
 from pydantic import BaseModel
 
 from intelligence_layer.core import Input, Output
-from intelligence_layer.evaluation import EvaluationLogic
 from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 
 
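Reviewer note: taken together, the patch moves ComparisonEvaluation, Matches, and MatchOutcome into
intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator and swaps the experimental
EloQaAggregationLogic for MatchesAggregationLogic. The sketch below restates the post-patch notebook
flow in one place, purely as orientation; it only uses names that appear in the hunks above. The
repository objects (dataset_repository, run_repository, evaluation_repository,
aggregation_repository) and evaluation_overview are assumed to come from earlier notebook cells
that this patch does not touch.

    # Sketch of the post-patch usage, mirroring the updated notebook cells.
    # Assumption: dataset_repository, run_repository, evaluation_repository,
    # aggregation_repository, and evaluation_overview are created in earlier,
    # unmodified notebook cells.
    from intelligence_layer.core.model import Llama3InstructModel
    from intelligence_layer.evaluation import Aggregator, Evaluator
    from intelligence_layer.evaluation.aggregation.elo_aggregation import (
        MatchesAggregationLogic,
    )
    from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches
    from intelligence_layer.examples.qa.elo_qa import EloQaEvaluationLogic

    # The Elo logic is now constructed with the grading model directly,
    # instead of being parameterized over the QA input/output types.
    elo_qa_evaluation_logic = EloQaEvaluationLogic(
        model=Llama3InstructModel(name="llama-3-8b-instruct")
    )

    evaluator = Evaluator(
        dataset_repository=dataset_repository,
        run_repository=run_repository,
        evaluation_repository=evaluation_repository,
        description="ELO QA evaluation",
        evaluation_logic=elo_qa_evaluation_logic,
    )

    # Per-example results are stored as Matches; the individual comparisons
    # moved from `.matches` to `.comparison_evaluations`.
    for example_evaluation in evaluation_repository.example_evaluations(
        evaluation_overview.id, Matches
    ):
        assert isinstance(example_evaluation.result, Matches)
        assert len(example_evaluation.result.comparison_evaluations) > 0

    aggregator = Aggregator(
        evaluation_repository=evaluation_repository,
        aggregation_repository=aggregation_repository,
        description="ELO QA aggregation",
        aggregation_logic=MatchesAggregationLogic(),
    )
    aggregated_evaluation = aggregator.aggregate_evaluation(evaluation_overview.id)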