refactor: Change import structure for Elo Evaluation and Aggregation related types and fix elo qa eval notebook

TASK: IL-394
MerlinKallenbornAA committed May 15, 2024
1 parent c306a29 commit 7481983
Showing 4 changed files with 24 additions and 16 deletions.
26 changes: 14 additions & 12 deletions src/documentation/elo_qa_eval.ipynb
@@ -33,7 +33,7 @@
  "from dotenv import load_dotenv\n",
  "\n",
  "from intelligence_layer.connectors import LimitedConcurrencyClient\n",
- "from intelligence_layer.evaluation.evaluation.elo_evaluator import Match\n",
+ "from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches\n",
  "\n",
  "load_dotenv()\n",
  "\n",
@@ -289,19 +289,20 @@
  "metadata": {},
  "outputs": [],
  "source": [
+ "from intelligence_layer.core.model import Llama3InstructModel\n",
  "from intelligence_layer.evaluation import Evaluator\n",
- "from intelligence_layer.evaluation.evaluation.elo_evaluator import EloEvaluationLogic\n",
+ "from intelligence_layer.examples.qa.elo_qa import EloQaEvaluationLogic\n",
  "\n",
- "elo_evaluation_logic: EloEvaluationLogic[\n",
- "    SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput\n",
- "] = EloEvaluationLogic()\n",
+ "elo_qa_evaluation_logic = EloQaEvaluationLogic(\n",
+ "    model=Llama3InstructModel(name=\"llama-3-8b-instruct\")\n",
+ ")\n",
  "\n",
  "evaluator = Evaluator(\n",
  "    dataset_repository=dataset_repository,\n",
  "    run_repository=run_repository,\n",
  "    evaluation_repository=evaluation_repository,\n",
  "    description=\"ELO QA evaluation\",  # this description will be used later to query for specific evaluations\n",
- "    evaluation_logic=elo_evaluation_logic,\n",
+ "    evaluation_logic=elo_qa_evaluation_logic,\n",
  ")"
 ]
},
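For context on the hunk above: the notebook now builds an EloQaEvaluationLogic with the grading model passed in explicitly, instead of a bare generic EloEvaluationLogic. A minimal sketch of the rewired cell as a standalone script, assuming the in-memory repositories and the evaluate_runs method exported by intelligence_layer.evaluation behave as in the other documentation notebooks:

from intelligence_layer.core.model import Llama3InstructModel
from intelligence_layer.evaluation import (
    Evaluator,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
)
from intelligence_layer.examples.qa.elo_qa import EloQaEvaluationLogic

# The evaluation logic is now constructed with the grader model up front.
evaluator = Evaluator(
    dataset_repository=InMemoryDatasetRepository(),
    run_repository=InMemoryRunRepository(),
    evaluation_repository=InMemoryEvaluationRepository(),
    description="ELO QA evaluation",
    evaluation_logic=EloQaEvaluationLogic(
        model=Llama3InstructModel(name="llama-3-8b-instruct")
    ),
)

# evaluation_overview = evaluator.evaluate_runs(*run_ids)  # assumed API, as used later in the notebook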
@@ -321,11 +322,14 @@
  "outputs": [],
  "source": [
  "# ensure that for each example there are evaluated comparisons\n",
+ "\n",
+ "\n",
  "for example_evaluation in evaluation_repository.example_evaluations(\n",
- "    evaluation_overview.id, Match\n",
+ "    evaluation_overview.id, Matches\n",
  "):\n",
+ "    assert isinstance(example_evaluation.result, Matches)\n",
  "    assert (\n",
- "        len(example_evaluation.result.matches) > 0\n",
+ "        len(example_evaluation.result.comparison_evaluations) > 0\n",
  "    ), f\"There are no matches (comparisons) for example ID {example_evaluation.example_id}\""
 ]
},
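The renamed result type drives this change: each example evaluation now yields one Matches object whose comparison_evaluations field holds an entry per pairwise comparison, rather than a Match with a matches field. A rough sketch of the shapes this diff implies; only comparison_evaluations is actually visible here, the remaining names are assumptions:

from enum import Enum
from typing import Sequence

from pydantic import BaseModel


class MatchOutcome(str, Enum):  # variant names are assumptions
    A_WINS = "a_wins"
    DRAW = "draw"
    B_WINS = "b_wins"


class ComparisonEvaluation(BaseModel):  # field names are assumptions
    first_player: str
    second_player: str
    outcome: MatchOutcome


class Matches(BaseModel):
    # the field the notebook's assertion reads
    comparison_evaluations: Sequence[ComparisonEvaluation]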
@@ -374,17 +378,15 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from intelligence_layer_experiments.use_cases.elo_usecase.elo_qa_aggregator import (\n",
- "    EloQaAggregationLogic,\n",
- ")\n",
  "\n",
  "from intelligence_layer.evaluation import Aggregator\n",
+ "from intelligence_layer.evaluation.aggregation.elo_aggregation import MatchesAggregationLogic\n",
  "\n",
  "aggregator = Aggregator(\n",
  "    evaluation_repository=evaluation_repository,\n",
  "    aggregation_repository=aggregation_repository,\n",
  "    description=\"ELO QA aggregation\",\n",
- "    aggregation_logic=EloQaAggregationLogic(),\n",
+ "    aggregation_logic=MatchesAggregationLogic(),\n",
  ")\n",
  "\n",
  "aggregated_evaluation = aggregator.aggregate_evaluation(evaluation_overview.id)"
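The swapped-in MatchesAggregationLogic ships with the library itself (see the aggregation-module hunk below) instead of the external intelligence_layer_experiments package. For orientation, a hypothetical, simplified stand-in illustrating the aggregation-logic shape, assuming AggregationLogic is generic over the evaluation and aggregate types and exposes a single aggregate hook:

from collections import Counter
from typing import Iterable

from pydantic import BaseModel

from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches


class WinCounts(BaseModel):
    outcomes: dict[str, int]


class WinCountAggregationLogic(AggregationLogic[Matches, WinCounts]):
    def aggregate(self, evaluations: Iterable[Matches]) -> WinCounts:
        # Tally outcomes across all per-example comparison lists.
        counts: Counter[str] = Counter()
        for matches in evaluations:
            for comparison in matches.comparison_evaluations:
                counts[str(comparison.outcome)] += 1  # outcome field assumed
        return WinCounts(outcomes=dict(counts))

The real MatchesAggregationLogic computes Elo scores rather than raw win counts; this stand-in only shows where such logic plugs into the Aggregator.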
7 changes: 5 additions & 2 deletions
@@ -5,10 +5,13 @@
 import numpy as np
 from pydantic import BaseModel
 
-from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
 from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator
 from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
-from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches
+from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+    ComparisonEvaluation,
+    Matches,
+    MatchOutcome,
+)
 
 
 class PlayerScore(BaseModel):
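Background on the module this hunk touches: judging by its imports, it folds the collected pairwise outcomes into per-player scores (PlayerScore) using numpy and a MeanAccumulator. The underlying rule is the standard Elo update; a self-contained sketch with an illustrative K-factor, not this module's actual constants:

def expected_win_rate(player: float, opponent: float) -> float:
    # Standard Elo expectation: probability that `player` beats `opponent`.
    return 1.0 / (1.0 + 10.0 ** ((opponent - player) / 400.0))


def elo_update(player: float, opponent: float, actual: float, k: float = 20.0) -> float:
    # One rating step; `actual` is 1.0 for a win, 0.5 for a draw, 0.0 for a loss.
    return player + k * (actual - expected_win_rate(player, opponent))


# Equal ratings, player wins: expectation is 0.5, so the rating rises by k/2.
assert round(elo_update(1500.0, 1500.0, 1.0), 1) == 1510.0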
5 changes: 4 additions & 1 deletion
@@ -14,7 +14,6 @@
     RecordData,
 )
 from intelligence_layer.core import CompleteOutput, Input, InstructInput, Output
-from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome
 from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
 from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
 from intelligence_layer.evaluation.evaluation.domain import (
@@ -31,6 +30,10 @@
 from intelligence_layer.evaluation.evaluation.evaluator.base_evaluator import (
     EvaluationLogicBase,
 )
+from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+    ComparisonEvaluation,
+    MatchOutcome,
+)
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
 from intelligence_layer.evaluation.run.run_repository import RunRepository
2 changes: 1 addition & 1 deletion
@@ -6,8 +6,8 @@
 from pydantic import BaseModel
 
 from intelligence_layer.core import Input, Output
-from intelligence_layer.evaluation import EvaluationLogic
 from intelligence_layer.evaluation.dataset.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic
 from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
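Taken together, the four files converge on a single canonical home for the Elo-related types. After this commit, downstream code imports them exactly as in the added lines above:

from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
    ComparisonEvaluation,
    Matches,
    MatchOutcome,
)
from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic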
