From ee15ddfb2587bd1d25359de7be477d15df387e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= <155443293+NiklasKoehneckeTNG@users.noreply.github.com> Date: Thu, 15 Feb 2024 12:21:51 +0100 Subject: [PATCH] Il 258 Split up EvaluationRepository (#503) --- src/examples/classification.ipynb | 10 +- src/examples/document_index.ipynb | 10 +- src/examples/evaluation.ipynb | 20 +- src/examples/human_evaluation.ipynb | 22 +- src/examples/qa.ipynb | 10 +- src/examples/quickstart_task.ipynb | 21 +- src/examples/summarize.ipynb | 10 +- src/intelligence_layer/core/__init__.py | 6 + src/intelligence_layer/evaluation/__init__.py | 31 +- .../evaluation/data_storage/__init__.py | 0 .../{ => data_storage}/dataset_repository.py | 40 +- .../evaluation_repository.py | 298 +++++++++------ .../evaluation/data_storage/run_repository.py | 267 +++++++++++++ .../evaluation/data_storage/utils.py | 15 + src/intelligence_layer/evaluation/domain.py | 2 +- .../evaluation/evaluator.py | 358 ++---------------- .../evaluation/hugging_face.py | 4 +- .../instruct_comparison_argilla_evaluator.py | 15 +- src/intelligence_layer/evaluation/run.py | 22 +- src/intelligence_layer/evaluation/runner.py | 24 +- .../use_cases/classify/classify.py | 16 +- .../use_cases/summarize/summarize.py | 15 +- tests/conftest.py | 6 + tests/evaluation/conftest.py | 14 +- tests/evaluation/test_argilla_evaluator.py | 13 +- .../evaluation/test_evaluation_repository.py | 81 +--- tests/evaluation/test_evaluator.py | 31 +- ...t_instruct_comparison_argilla_evaluator.py | 25 +- tests/evaluation/test_run.py | 10 - tests/evaluation/test_run_repository.py | 89 +++++ tests/evaluation/test_runner.py | 18 +- tests/use_cases/classify/test_classify.py | 13 +- .../classify/test_prompt_based_classify.py | 12 +- tests/use_cases/summarize/test_summarize.py | 22 +- 34 files changed, 860 insertions(+), 690 deletions(-) create mode 100644 src/intelligence_layer/evaluation/data_storage/__init__.py rename src/intelligence_layer/evaluation/{ => data_storage}/dataset_repository.py (85%) rename src/intelligence_layer/evaluation/{ => data_storage}/evaluation_repository.py (60%) create mode 100644 src/intelligence_layer/evaluation/data_storage/run_repository.py create mode 100644 src/intelligence_layer/evaluation/data_storage/utils.py create mode 100644 tests/evaluation/test_run_repository.py diff --git a/src/examples/classification.ipynb b/src/examples/classification.ipynb index 6c6568a77..db9b0c178 100644 --- a/src/examples/classification.ipynb +++ b/src/examples/classification.ipynb @@ -48,6 +48,10 @@ "from intelligence_layer.use_cases import ClassifyInput, PromptBasedClassify\n", "from intelligence_layer.core import Chunk, InMemoryTracer\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "text_to_classify = Chunk(\"In the distant future, a space exploration party embarked on a thrilling journey to the uncharted regions of the galaxy. \\n\\\n", "With excitement in their hearts and the cosmos as their canvas, they ventured into the unknown, discovering breathtaking celestial wonders. 
\\n\\\n", "As they gazed upon distant stars and nebulas, they forged unforgettable memories that would forever bind them as pioneers of the cosmos.\")\n", @@ -425,7 +429,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -439,9 +443,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/document_index.ipynb b/src/examples/document_index.ipynb index 7a32bfc20..454d219ed 100644 --- a/src/examples/document_index.ipynb +++ b/src/examples/document_index.ipynb @@ -49,6 +49,10 @@ "\n", "from intelligence_layer.connectors import DocumentIndexClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "\n", "document_index = DocumentIndexClient(token=getenv(\"AA_TOKEN\"), base_document_index_url = \"https://document-index.aleph-alpha.com\")\n", "?document_index" @@ -282,7 +286,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-jSYEeheU-py3.10", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -296,9 +300,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/evaluation.ipynb b/src/examples/evaluation.ipynb index 55295fa6a..2dd5f388e 100644 --- a/src/examples/evaluation.ipynb +++ b/src/examples/evaluation.ipynb @@ -44,19 +44,21 @@ "from dotenv import load_dotenv\n", "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", - "from intelligence_layer.evaluation import InMemoryEvaluationRepository, InMemoryDatasetRepository, Runner\n", + "from intelligence_layer.evaluation import InMemoryEvaluationRepository, InMemoryRunRepository, InMemoryDatasetRepository, Runner\n", "from intelligence_layer.use_cases import SingleLabelClassifyEvaluator, PromptBasedClassify\n", "\n", "load_dotenv()\n", "\n", "client = LimitedConcurrencyClient.from_token(os.getenv(\"AA_TOKEN\"))\n", "task = PromptBasedClassify(client)\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "dataset_repository = InMemoryDatasetRepository()\n", + "run_repository = InMemoryRunRepository()\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "\n", "\n", "\n", - "evaluator = SingleLabelClassifyEvaluator(evaluation_repository, dataset_repository, \"singel-label-classify\")\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"prompt-based-classify\")\n" + "evaluator = SingleLabelClassifyEvaluator(dataset_repository, run_repository, evaluation_repository, \"singel-label-classify\")\n", + "runner = Runner(task, dataset_repository, run_repository, \"prompt-based-classify\")\n" ] }, { @@ -257,8 +259,8 @@ " ]\n", ")\n", "\n", - "embedding_based_classify_evaluator = MultiLabelClassifyEvaluator(evaluation_repository, dataset_repository, \"multi-label-classify\", threshold=0.6)\n", - "embedding_based_classify_runner = Runner(embedding_based_classify, evaluation_repository, dataset_repository, \"embedding-based-classify\")\n" + "embedding_based_classify_evaluator = MultiLabelClassifyEvaluator(dataset_repository, run_repository, evaluation_repository, \"multi-label-classify\", threshold=0.6)\n", + "embedding_based_classify_runner = 
Runner(embedding_based_classify,dataset_repository, run_repository, \"embedding-based-classify\")\n" ] }, { @@ -305,7 +307,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-tfT-HG2V-py3.11", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -319,9 +321,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb index 13be43884..26e76e211 100644 --- a/src/examples/human_evaluation.ipynb +++ b/src/examples/human_evaluation.ipynb @@ -62,6 +62,7 @@ " Example,\n", " InMemoryDatasetRepository,\n", " InMemoryEvaluationRepository,\n", + " InMemoryRunRepository,\n", " Runner,\n", " SuccessfulExampleOutput\n", ")\n", @@ -171,8 +172,8 @@ "outputs": [], "source": [ "task = Instruct(client, model=\"luminous-base-control\")\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"Instruct\")\n", + "run_repository = InMemoryRunRepository()\n", + "runner = Runner(task,dataset_repository, run_repository, \"Instruct\")\n", "run_overview = runner.run_dataset(dataset_id)" ] }, @@ -321,10 +322,12 @@ " \n", "argilla_client = DefaultArgillaClient()\n", "workspace_id = argilla_client.create_workspace(\"test\")\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", "\n", "evaluator = InstructArgillaEvaluator(\n", - " ArgillaEvaluationRepository(evaluation_repository, argilla_client),\n", " dataset_repository,\n", + " run_repository,\n", + " ArgillaEvaluationRepository(evaluation_repository, argilla_client),\n", " \"instruct\",\n", " workspace_id,\n", " fields,\n", @@ -371,11 +374,18 @@ " output = evaluator.aggregate_evaluation(eval_overview.id)\n", " print(output.statistics)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-WXd7Z3vu-py3.11", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -389,9 +399,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/qa.ipynb b/src/examples/qa.ipynb index ace60726a..fa9e15bae 100644 --- a/src/examples/qa.ipynb +++ b/src/examples/qa.ipynb @@ -25,6 +25,10 @@ "load_dotenv()\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "client = LimitedConcurrencyClient.from_token(getenv(\"AA_TOKEN\"))" ] }, @@ -338,7 +342,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -352,9 +356,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb index ab9e44fdb..e5669b47f 100644 --- a/src/examples/quickstart_task.ipynb +++ b/src/examples/quickstart_task.ipynb @@ -289,13 +289,6 @@ " KeywordExtractionAggregatedEvaluation,\n", " ]\n", "):\n", - " def 
__init__(\n", - " self, evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, description: str\n", - " ) -> None:\n", - " \"\"\"We recommend adding the task to the init method of the evaluator\n", - "\n", - " This allows for easy comparing of different implementations of the same task.\"\"\"\n", - " super().__init__(evaluation_repository, dataset_repository, description)\n", "\n", " def do_evaluate(\n", " self,\n", @@ -342,12 +335,14 @@ "outputs": [], "source": [ "from intelligence_layer.core import NoOpTracer\n", - "from intelligence_layer.evaluation import InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, Example\n", + "from intelligence_layer.evaluation import InMemoryDatasetRepository, InMemoryEvaluationRepository, InMemoryRunRepository, Runner, Example\n", "\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", "dataset_repository = InMemoryDatasetRepository()\n", - "evaluator = KeywordExtractionEvaluator(evaluation_repository, dataset_repository, \"keyword-extraction\")\n", - "runner = Runner(task, evaluation_repository, dataset_repository, \"keyword-extraction\")\n", + "run_repository = InMemoryRunRepository()\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "\n", + "evaluator = KeywordExtractionEvaluator(dataset_repository, run_repository, evaluation_repository, \"keyword-extraction\")\n", + "runner = Runner(task, dataset_repository, run_repository, \"keyword-extraction\")\n", "\n", "input = KeywordExtractionInput(text=\"This is a text about dolphins and sharks.\")\n", "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", @@ -428,7 +423,7 @@ "outputs": [], "source": [ "examples = list(dataset_repository.examples_by_id(dataset_id, evaluator.input_type(), evaluator.expected_output_type()))\n", - "last_example_result = evaluation_repository.example_trace(\n", + "last_example_result = run_repository.example_trace(\n", " next(iter(evaluation.run_overviews)).id, examples[-1].id\n", ")\n", "last_example_result.trace\n" @@ -498,7 +493,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/examples/summarize.ipynb b/src/examples/summarize.ipynb index ba1d32950..aa2313a1b 100644 --- a/src/examples/summarize.ipynb +++ b/src/examples/summarize.ipynb @@ -26,6 +26,10 @@ "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", "client = LimitedConcurrencyClient.from_token(getenv(\"AA_TOKEN\"))\n" ] }, @@ -194,7 +198,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10-intelligence", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -208,9 +212,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/intelligence_layer/core/__init__.py b/src/intelligence_layer/core/__init__.py index 90ad8ede1..64484b59d 100644 --- a/src/intelligence_layer/core/__init__.py +++ b/src/intelligence_layer/core/__init__.py @@ -1,3 +1,9 @@ +from intelligence_layer.core.intelligence_app import ( + AuthenticatedIntelligenceApp as AuthenticatedIntelligenceApp, +) +from intelligence_layer.core.intelligence_app import AuthService as AuthService +from intelligence_layer.core.intelligence_app import 
IntelligenceApp as IntelligenceApp + from .chunk import Chunk as Chunk from .chunk import ChunkInput as ChunkInput from .chunk import ChunkOutput as ChunkOutput diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index 68f592d23..6ed399603 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -1,6 +1,26 @@ from .accumulator import MeanAccumulator as MeanAccumulator -from .dataset_repository import FileDatasetRepository as FileDatasetRepository -from .dataset_repository import InMemoryDatasetRepository as InMemoryDatasetRepository +from .data_storage.dataset_repository import DatasetRepository as DatasetRepository +from .data_storage.dataset_repository import ( + FileDatasetRepository as FileDatasetRepository, +) +from .data_storage.dataset_repository import ( + InMemoryDatasetRepository as InMemoryDatasetRepository, +) +from .data_storage.evaluation_repository import ( + ArgillaEvaluationRepository as ArgillaEvaluationRepository, +) +from .data_storage.evaluation_repository import ( + EvaluationRepository as EvaluationRepository, +) +from .data_storage.evaluation_repository import ( + FileEvaluationRepository as FileEvaluationRepository, +) +from .data_storage.evaluation_repository import ( + InMemoryEvaluationRepository as InMemoryEvaluationRepository, +) +from .data_storage.run_repository import FileRunRepository as FileRunRepository +from .data_storage.run_repository import InMemoryRunRepository as InMemoryRunRepository +from .data_storage.run_repository import RunRepository as RunRepository from .domain import Evaluation as Evaluation from .domain import EvaluationFailed as EvaluationFailed from .domain import EvaluationOverview as EvaluationOverview @@ -20,15 +40,8 @@ from .elo import PayoffMatrix as PayoffMatrix from .elo import PlayerScore as PlayerScore from .elo import WinRateCalculator as WinRateCalculator -from .evaluation_repository import FileEvaluationRepository as FileEvaluationRepository -from .evaluation_repository import ( - InMemoryEvaluationRepository as InMemoryEvaluationRepository, -) -from .evaluator import ArgillaEvaluationRepository as ArgillaEvaluationRepository from .evaluator import ArgillaEvaluator as ArgillaEvaluator from .evaluator import BaseEvaluator as BaseEvaluator -from .evaluator import DatasetRepository as DatasetRepository -from .evaluator import EvaluationRepository as EvaluationRepository from .evaluator import Evaluator as Evaluator from .graders import BleuGrader as BleuGrader from .graders import RougeGrader as RougeGrader diff --git a/src/intelligence_layer/evaluation/data_storage/__init__.py b/src/intelligence_layer/evaluation/data_storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/intelligence_layer/evaluation/dataset_repository.py b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py similarity index 85% rename from src/intelligence_layer/evaluation/dataset_repository.py rename to src/intelligence_layer/evaluation/data_storage/dataset_repository.py index b1e08c780..2857e14b0 100644 --- a/src/intelligence_layer/evaluation/dataset_repository.py +++ b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, Iterable, Optional, Sequence, cast from uuid import uuid4 @@ -5,10 +6,45 @@ from fsspec import AbstractFileSystem # type: ignore from fsspec.implementations.local 
import LocalFileSystem # type: ignore -from intelligence_layer.core.task import Input +from intelligence_layer.core import Input from intelligence_layer.core.tracer import JsonSerializer, PydanticSerializable from intelligence_layer.evaluation.domain import Example, ExpectedOutput -from intelligence_layer.evaluation.evaluator import DatasetRepository + + +class DatasetRepository(ABC): + @abstractmethod + def create_dataset( + self, + examples: Iterable[Example[Input, ExpectedOutput]], + ) -> str: + ... + + @abstractmethod + def examples_by_id( + self, + dataset_id: str, + input_type: type[Input], + expected_output_type: type[ExpectedOutput], + ) -> Optional[Iterable[Example[Input, ExpectedOutput]]]: + ... + + @abstractmethod + def example( + self, + dataset_id: str, + example_id: str, + input_type: type[Input], + expected_output_type: type[ExpectedOutput], + ) -> Optional[Example[Input, ExpectedOutput]]: + ... + + @abstractmethod + def delete_dataset(self, dataset_id: str) -> None: + ... + + @abstractmethod + def list_datasets(self) -> Iterable[str]: + ... class FileSystemDatasetRepository(DatasetRepository): diff --git a/src/intelligence_layer/evaluation/evaluation_repository.py b/src/intelligence_layer/evaluation/data_storage/evaluation_repository.py similarity index 60% rename from src/intelligence_layer/evaluation/evaluation_repository.py rename to src/intelligence_layer/evaluation/data_storage/evaluation_repository.py index cc59c3ead..d07d681a6 100644 --- a/src/intelligence_layer/evaluation/evaluation_repository.py +++ b/src/intelligence_layer/evaluation/data_storage/evaluation_repository.py @@ -1,40 +1,29 @@ +from abc import ABC, abstractmethod from collections import defaultdict from pathlib import Path -from typing import Iterable, Optional, Sequence, cast +from typing import Optional, Sequence, TypeVar, cast from pydantic import BaseModel, ValidationError -from intelligence_layer.core.task import Output -from intelligence_layer.core.tracer import ( - FileTracer, - InMemoryTaskSpan, - InMemoryTracer, - JsonSerializer, - PydanticSerializable, - Tracer, +from intelligence_layer.connectors.argilla.argilla_client import ( + ArgillaClient, + ArgillaEvaluation, ) +from intelligence_layer.core import Output +from intelligence_layer.core.tracer import FileTracer, JsonSerializer, Tracer +from intelligence_layer.evaluation.data_storage.utils import read_utf8, write_utf8 from intelligence_layer.evaluation.domain import ( Evaluation, ExampleEvaluation, ExampleOutput, - ExampleTrace, FailedExampleEvaluation, IndividualEvaluationOverview, RunOverview, - TaskSpanTrace, ) -from intelligence_layer.evaluation.evaluator import ( - EvaluationOverviewType, - EvaluationRepository, -) - - -def write_utf8(path: Path, content: str) -> None: - path.write_text(content, encoding="utf-8") - -def read_utf8(path: Path) -> str: - return path.read_text(encoding="utf-8") +EvaluationOverviewType = TypeVar( + "EvaluationOverviewType", bound=IndividualEvaluationOverview +) class SerializedExampleEvaluation(BaseModel): @@ -80,6 +69,110 @@ def to_example_result( ) +class EvaluationRepository(ABC): + """Base evaluation repository interface. + + Provides methods to store and load evaluation results for individual examples + of a run and the aggregated evaluation of said run. + """ + + @abstractmethod + def eval_ids(self) -> Sequence[str]: + """Returns the ids of all stored evaluation runs. + + Having the id of an evaluation run, its overview can be retrieved with + :meth:`EvaluationRepository.evaluation_run_overview`. 
+ + Returns: + The ids of all stored evaluation runs. + """ + ... + + @abstractmethod + def example_evaluation( + self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] + ) -> Optional[ExampleEvaluation[Evaluation]]: + """Returns an :class:`ExampleEvaluation` of a given run by its id. + + Args: + eval_id: Identifier of the run to obtain the results for. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + evaluation_type: Type of evaluations that the `Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + :class:`ExampleEvaluation` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_example_evaluation(self, result: ExampleEvaluation[Evaluation]) -> None: + """Stores an :class:`ExampleEvaluation` for a run in the repository. + + Args: + eval_id: Identifier of the eval run. + result: The result to be persisted. + """ + ... + + @abstractmethod + def example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + """Returns all :class:`ExampleResult` instances of a given run + + Args: + eval_id: Identifier of the eval run to obtain the results for. + evaluation_type: Type of evaluations that the :class:`Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + All :class:`ExampleResult` of the run. Will return an empty list if there's none. + """ + ... + + @abstractmethod + def failed_example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + """Returns all failed :class:`ExampleResult` instances of a given run + + Args: + eval_id: Identifier of the eval run to obtain the results for. + evaluation_type: Type of evaluations that the :class:`Evaluator` returned + in :func:`Evaluator.do_evaluate` + + Returns: + All failed :class:`ExampleResult` of the run. Will return an empty list if there's none. + """ + ... + + @abstractmethod + def evaluation_overview( + self, eval_id: str, overview_type: type[EvaluationOverviewType] + ) -> EvaluationOverviewType | None: + """Returns an :class:`EvaluationOverview` of a given run by its id. + + Args: + eval_id: Identifier of the eval run to obtain the overview for. + aggregation_type: Type of aggregations that the :class:`Evaluator` returned + in :func:`Evaluator.aggregate` + + Returns: + :class:`EvaluationOverview` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: + """Stores an :class:`EvaluationRunOverview` in the repository. + + Args: + overview: The overview to be persisted. + """ + ... + + class FileEvaluationRepository(EvaluationRepository): """An :class:`EvaluationRepository` that stores evaluation results in json-files. 
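A minimal wiring sketch of the split, mirroring the updated evaluation.ipynb cells in this patch: each concern now gets its own repository, and evaluators and runners take them in (dataset, run, evaluation) order. The dataset id is assumed to come from dataset_repository.create_dataset(...) as in the notebook.

import os

from intelligence_layer.connectors import LimitedConcurrencyClient
from intelligence_layer.evaluation import (
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)
from intelligence_layer.use_cases import PromptBasedClassify, SingleLabelClassifyEvaluator

client = LimitedConcurrencyClient.from_token(os.getenv("AA_TOKEN"))
task = PromptBasedClassify(client)

# One repository per concern after the split.
dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

# New argument order: dataset, run, evaluation.
evaluator = SingleLabelClassifyEvaluator(
    dataset_repository, run_repository, evaluation_repository, "single-label-classify"
)
runner = Runner(task, dataset_repository, run_repository, "prompt-based-classify")

# dataset_id is assumed to have been created via dataset_repository.create_dataset(...).
run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_dataset(run_overview.id)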
@@ -144,38 +237,6 @@ def store_example_output(self, example_output: ExampleOutput[Output]) -> None: serialized_result.model_dump_json(indent=2), ) - def example_output( - self, run_id: str, example_id: str, output_type: type[Output] - ) -> Optional[ExampleOutput[Output]]: - file_path = self._example_output_path(run_id, example_id) - if not file_path.exists(): - return None - content = read_utf8(file_path) - # Mypy does not accept dynamic types - return ExampleOutput[output_type].model_validate_json(json_data=content) # type: ignore - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - def load_example_output( - path: Path, - ) -> Optional[ExampleOutput[Output]]: - id = path.with_suffix("").name - return self.example_output(run_id, id, output_type) - - path = self._output_directory(run_id) - output_files = path.glob("*.json") - return ( - example_output - for example_output in sorted( - (load_example_output(file) for file in output_files), - key=lambda example_output: example_output.example_id - if example_output - else "", - ) - if example_output - ) - def example_evaluations( self, eval_id: str, evaluation_type: type[Evaluation] ) -> Sequence[ExampleEvaluation[Evaluation]]: @@ -209,16 +270,6 @@ def example_evaluation( serialized_example = SerializedExampleEvaluation.model_validate_json(content) return serialized_example.to_example_result(evaluation_type) - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - file_path = self._example_trace_path(run_id, example_id) - if not file_path.exists(): - return None - in_memory_tracer = _parse_log(file_path) - trace = TaskSpanTrace.from_task_span( - cast(InMemoryTaskSpan, in_memory_tracer.entries[0]) - ) - return ExampleTrace(run_id=run_id, example_id=example_id, trace=trace) - def example_tracer(self, run_id: str, example_id: str) -> Tracer: file_path = self._example_trace_path(run_id, example_id) return FileTracer(file_path) @@ -245,23 +296,11 @@ def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> N overview.model_dump_json(indent=2), ) - def run_overview(self, run_id: str) -> RunOverview | None: - file_path = self._run_overview_path(run_id) - if not file_path.exists(): - return None - content = read_utf8(file_path) - return RunOverview.model_validate_json(content) - def store_run_overview(self, overview: RunOverview) -> None: write_utf8( self._run_overview_path(overview.id), overview.model_dump_json(indent=2) ) - def run_ids(self) -> Sequence[str]: - return [ - path.parent.name for path in self._run_root_directory().glob("*/output") - ] - def eval_ids( self, overview_type: type[EvaluationOverviewType] | None = None ) -> Sequence[str]: @@ -282,10 +321,6 @@ def evaluation_overview( return [overview.id for overview in overviews if overview is not None] -def _parse_log(log_path: Path) -> InMemoryTracer: - return FileTracer(log_path).trace() - - class InMemoryEvaluationRepository(EvaluationRepository): """An :class:`EvaluationRepository` that stores evaluation results in memory. 
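After the split, this repository answers only evaluation-side questions; run outputs and traces move to the RunRepository added below. A short sketch of reading stored results back, assuming a file-based repository path and with MyEvaluation standing in for whatever the evaluator's do_evaluate returns.

from pathlib import Path

from intelligence_layer.evaluation import FileEvaluationRepository
from intelligence_layer.evaluation.domain import IndividualEvaluationOverview

evaluation_repository = FileEvaluationRepository(Path("./evaluations"))

for eval_id in evaluation_repository.eval_ids():
    overview = evaluation_repository.evaluation_overview(eval_id, IndividualEvaluationOverview)
    # MyEvaluation is a placeholder for the evaluator's concrete Evaluation type.
    failed = evaluation_repository.failed_example_evaluations(eval_id, MyEvaluation)
    print(eval_id, overview, len(failed))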
@@ -293,56 +328,14 @@ class InMemoryEvaluationRepository(EvaluationRepository): """ def __init__(self) -> None: - self._example_outputs: dict[ - str, list[ExampleOutput[PydanticSerializable]] - ] = defaultdict(list) self._example_evaluations: dict[ str, list[ExampleEvaluation[BaseModel]] ] = defaultdict(list) - self._example_traces: dict[str, InMemoryTracer] = dict() self._evaluation_run_overviews: dict[str, IndividualEvaluationOverview] = dict() - self._run_overviews: dict[str, RunOverview] = dict() - - def run_ids(self) -> Sequence[str]: - return list(self._example_outputs.keys()) def eval_ids(self) -> Sequence[str]: return list(self._evaluation_run_overviews.keys()) - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - self._example_outputs[example_output.run_id].append( - cast(ExampleOutput[PydanticSerializable], example_output) - ) - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - return ( - cast(ExampleOutput[Output], example_output) - for example_output in sorted( - self._example_outputs[run_id], - key=lambda example_output: example_output.example_id, - ) - ) - - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - tracer = self._example_traces.get(f"{run_id}/{example_id}") - if tracer is None: - return None - assert tracer - return ExampleTrace( - run_id=run_id, - example_id=example_id, - trace=TaskSpanTrace.from_task_span( - cast(InMemoryTaskSpan, tracer.entries[0]) - ), - ) - - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - tracer = InMemoryTracer() - self._example_traces[f"{run_id}/{example_id}"] = tracer - return tracer - def example_evaluation( self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] ) -> ExampleEvaluation[Evaluation] | None: @@ -382,8 +375,61 @@ def evaluation_overview( def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: self._evaluation_run_overviews[overview.id] = overview - def run_overview(self, run_id: str) -> RunOverview | None: - return self._run_overviews.get(run_id) - def store_run_overview(self, overview: RunOverview) -> None: - self._run_overviews[overview.id] = overview +class ArgillaEvaluationRepository(EvaluationRepository): + """Evaluation repository used for the :class:`ArgillaEvaluator`. + + Wraps an :class:`Evaluator`. + Does not support storing evaluations, since the ArgillaEvaluator does not do automated evaluations. + + Args: + evaluation_repository: repository to wrap. + argilla_client: client used to connect to Argilla. + """ + + def __init__( + self, evaluation_repository: EvaluationRepository, argilla_client: ArgillaClient + ) -> None: + super().__init__() + self._evaluation_repository = evaluation_repository + self._client = argilla_client + + def eval_ids(self) -> Sequence[str]: + return self._evaluation_repository.eval_ids() + + def example_evaluation( + self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] + ) -> Optional[ExampleEvaluation[Evaluation]]: + return self._evaluation_repository.example_evaluation( + eval_id, example_id, evaluation_type + ) + + def store_example_evaluation(self, _: ExampleEvaluation[Evaluation]) -> None: + raise TypeError( + "ArgillaEvaluationRepository does not support storing evaluations." 
+ ) + + def example_evaluations( + self, eval_id: str, eval_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + assert eval_type == ArgillaEvaluation + # Mypy does not derive that the return type is always ExampleEvaluation with ArgillaEvaluation + return [ + ExampleEvaluation(eval_id=eval_id, example_id=e.example_id, result=e) # type: ignore + for e in self._client.evaluations(eval_id) + ] + + def failed_example_evaluations( + self, eval_id: str, evaluation_type: type[Evaluation] + ) -> Sequence[ExampleEvaluation[Evaluation]]: + return self._evaluation_repository.failed_example_evaluations( + eval_id, evaluation_type + ) + + def evaluation_overview( + self, eval_id: str, overview_type: type[EvaluationOverviewType] + ) -> EvaluationOverviewType | None: + return self._evaluation_repository.evaluation_overview(eval_id, overview_type) + + def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: + return self._evaluation_repository.store_evaluation_overview(overview) diff --git a/src/intelligence_layer/evaluation/data_storage/run_repository.py b/src/intelligence_layer/evaluation/data_storage/run_repository.py new file mode 100644 index 000000000..47ff27529 --- /dev/null +++ b/src/intelligence_layer/evaluation/data_storage/run_repository.py @@ -0,0 +1,267 @@ +from abc import ABC, abstractmethod +from collections import defaultdict +from pathlib import Path +from typing import Iterable, Optional, Sequence, cast + +from intelligence_layer.core.task import Output +from intelligence_layer.core.tracer import ( + FileTracer, + InMemoryTaskSpan, + InMemoryTracer, + JsonSerializer, + PydanticSerializable, + Tracer, +) +from intelligence_layer.evaluation.data_storage.utils import ( + _parse_log, + read_utf8, + write_utf8, +) +from intelligence_layer.evaluation.domain import ( + ExampleOutput, + ExampleTrace, + RunOverview, + TaskSpanTrace, +) + + +class RunRepository(ABC): + @abstractmethod + def run_ids(self) -> Sequence[str]: + """Returns the ids of all stored runs. + + Having the id of a run, its outputs can be retrieved with + :meth:`EvaluationRepository.example_outputs`. + + Returns: + The ids of all stored runs. + """ + ... + + @abstractmethod + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + """Returns all :class:`ExampleOutput` for a given run. + + Args: + run_id: The unique identifier of the run. + output_type: Type of output that the `Task` returned + in :func:`Task.do_run` + + Returns: + Iterable over all outputs. + """ + ... + + @abstractmethod + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + """Stores an individual :class:`ExampleOutput`. + + Args: + example_output: The actual output. + """ + ... + + @abstractmethod + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + """Returns an :class:`ExampleTrace` for an example in a run. + + Args: + run_id: The unique identifier of the run. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + example_output: The actual output. + """ + ... + + @abstractmethod + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + """Returns a :class:`Tracer` to trace an individual example run. + + Args: + run_id: The unique identifier of the run. + example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. + """ + ... 
+ + @abstractmethod + def run_overview(self, run_id: str) -> RunOverview | None: + """Returns an :class:`RunOverview` of a given run by its id. + + Args: + run_id: Identifier of the eval run to obtain the overview for. + + Returns: + :class:`RunOverview` if one was found, `None` otherwise. + """ + ... + + @abstractmethod + def store_run_overview(self, overview: RunOverview) -> None: + """Stores an :class:`RunOverview` in the repository. + + Args: + overview: The overview to be persisted. + """ + ... + + +class FileRunRepository(RunRepository): + def __init__(self, root_directory: Path) -> None: + root_directory.mkdir(parents=True, exist_ok=True) + self._root_directory = root_directory + + def _example_trace_path(self, run_id: str, example_id: str) -> Path: + return (self._trace_directory(run_id) / example_id).with_suffix(".jsonl") + + def _run_root_directory(self) -> Path: + path = self._root_directory / "runs" + path.mkdir(exist_ok=True) + return path + + def _run_directory(self, run_id: str) -> Path: + path = self._run_root_directory() / run_id + path.mkdir(exist_ok=True) + return path + + def _trace_directory(self, run_id: str) -> Path: + path = self._run_directory(run_id) / "trace" + + path.mkdir(exist_ok=True) + return path + + def _run_overview_path(self, run_id: str) -> Path: + return self._run_directory(run_id).with_suffix(".json") + + def _output_directory(self, run_id: str) -> Path: + path = self._run_directory(run_id) / "output" + + path.mkdir(exist_ok=True) + return path + + def _example_output_path(self, run_id: str, example_id: str) -> Path: + return (self._output_directory(run_id) / example_id).with_suffix(".json") + + def run_overview(self, run_id: str) -> RunOverview | None: + file_path = self._run_overview_path(run_id) + if not file_path.exists(): + return None + content = read_utf8(file_path) + return RunOverview.model_validate_json(content) + + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + file_path = self._example_trace_path(run_id, example_id) + if not file_path.exists(): + return None + in_memory_tracer = _parse_log(file_path) + trace = TaskSpanTrace.from_task_span( + cast(InMemoryTaskSpan, in_memory_tracer.entries[0]) + ) + return ExampleTrace(run_id=run_id, example_id=example_id, trace=trace) + + def example_output( + self, run_id: str, example_id: str, output_type: type[Output] + ) -> Optional[ExampleOutput[Output]]: + file_path = self._example_output_path(run_id, example_id) + if not file_path.exists(): + return None + content = read_utf8(file_path) + # Mypy does not accept dynamic types + return ExampleOutput[output_type].model_validate_json( # type: ignore + json_data=content + ) + + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + def load_example_output( + path: Path, + ) -> Optional[ExampleOutput[Output]]: + id = path.with_suffix("").name + return self.example_output(run_id, id, output_type) + + path = self._output_directory(run_id) + output_files = path.glob("*.json") + return ( + example_output + for example_output in sorted( + (load_example_output(file) for file in output_files), + key=lambda example_output: example_output.example_id + if example_output + else "", + ) + if example_output + ) + + def run_ids(self) -> Sequence[str]: + return [ + path.parent.name for path in self._run_root_directory().glob("*/output") + ] + + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + file_path = self._example_trace_path(run_id, example_id) + return 
FileTracer(file_path) + + def store_run_overview(self, overview: RunOverview) -> None: + write_utf8( + self._run_overview_path(overview.id), overview.model_dump_json(indent=2) + ) + + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + serialized_result = JsonSerializer(root=example_output) + write_utf8( + self._example_output_path(example_output.run_id, example_output.example_id), + serialized_result.model_dump_json(indent=2), + ) + + +class InMemoryRunRepository(RunRepository): + def __init__(self) -> None: + self._example_outputs: dict[ + str, list[ExampleOutput[PydanticSerializable]] + ] = defaultdict(list) + self._example_traces: dict[str, InMemoryTracer] = dict() + self._run_overviews: dict[str, RunOverview] = dict() + + def run_ids(self) -> Sequence[str]: + return list(self._example_outputs.keys()) + + def store_example_output(self, example_output: ExampleOutput[Output]) -> None: + self._example_outputs[example_output.run_id].append( + cast(ExampleOutput[PydanticSerializable], example_output) + ) + + def example_outputs( + self, run_id: str, output_type: type[Output] + ) -> Iterable[ExampleOutput[Output]]: + return ( + cast(ExampleOutput[Output], example_output) + for example_output in sorted( + self._example_outputs[run_id], + key=lambda example_output: example_output.example_id, + ) + ) + + def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: + tracer = self._example_traces.get(f"{run_id}/{example_id}") + if tracer is None: + return None + assert tracer + return ExampleTrace( + run_id=run_id, + example_id=example_id, + trace=TaskSpanTrace.from_task_span( + cast(InMemoryTaskSpan, tracer.entries[0]) + ), + ) + + def example_tracer(self, run_id: str, example_id: str) -> Tracer: + tracer = InMemoryTracer() + self._example_traces[f"{run_id}/{example_id}"] = tracer + return tracer + + def run_overview(self, run_id: str) -> RunOverview | None: + return self._run_overviews.get(run_id) + + def store_run_overview(self, overview: RunOverview) -> None: + self._run_overviews[overview.id] = overview diff --git a/src/intelligence_layer/evaluation/data_storage/utils.py b/src/intelligence_layer/evaluation/data_storage/utils.py new file mode 100644 index 000000000..6dfc8328c --- /dev/null +++ b/src/intelligence_layer/evaluation/data_storage/utils.py @@ -0,0 +1,15 @@ +from pathlib import Path + +from intelligence_layer.core.tracer import FileTracer, InMemoryTracer + + +def write_utf8(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + + +def read_utf8(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def _parse_log(log_path: Path) -> InMemoryTracer: + return FileTracer(log_path).trace() diff --git a/src/intelligence_layer/evaluation/domain.py b/src/intelligence_layer/evaluation/domain.py index b18fff17e..8a6c3752a 100644 --- a/src/intelligence_layer/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/domain.py @@ -274,7 +274,7 @@ class IndividualEvaluationOverview(BaseModel, frozen=True): """Overview of the unaggregated results of evaluating a :class:`Task` on a dataset. Attributes: - run_overview: Overview of the run that was evaluated. + run_overviews: Overviews of the runs that were evaluated. id: The unique identifier of this evaluation. 
start: The time when the evaluation run was started description: human-readable for the evaluator that created the evaluation diff --git a/src/intelligence_layer/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluator.py index 52a26623f..90804bbf0 100644 --- a/src/intelligence_layer/evaluation/evaluator.py +++ b/src/intelligence_layer/evaluation/evaluator.py @@ -20,14 +20,22 @@ from tqdm import tqdm -from intelligence_layer.connectors import ArgillaClient, Field +from intelligence_layer.connectors import Field from intelligence_layer.connectors.argilla.argilla_client import ( ArgillaEvaluation, Question, RecordData, ) from intelligence_layer.core.task import Input, Output -from intelligence_layer.core.tracer import Tracer, utc_now +from intelligence_layer.core.tracer import utc_now +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + DatasetRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + ArgillaEvaluationRepository, + EvaluationRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( AggregatedEvaluation, Evaluation, @@ -35,7 +43,6 @@ Example, ExampleEvaluation, ExampleOutput, - ExampleTrace, ExpectedOutput, FailedExampleEvaluation, FailedExampleRun, @@ -44,230 +51,6 @@ SuccessfulExampleOutput, ) -EvaluationOverviewType = TypeVar( - "EvaluationOverviewType", bound=IndividualEvaluationOverview -) - - -class EvaluationRepository(ABC): - """Base evaluation repository interface. - - Provides methods to store and load evaluation results for individual examples - of a run and the aggregated evaluation of said run. - """ - - @abstractmethod - def run_ids(self) -> Sequence[str]: - """Returns the ids of all stored runs. - - Having the id of a run, its outputs can be retrieved with - :meth:`EvaluationRepository.example_outputs`. - - Returns: - The ids of all stored runs. - """ - ... - - @abstractmethod - def eval_ids(self) -> Sequence[str]: - """Returns the ids of all stored evaluation runs. - - Having the id of an evaluation run, its overview can be retrieved with - :meth:`EvaluationRepository.evaluation_run_overview`. - - Returns: - The ids of all stored evaluation runs. - """ - ... - - @abstractmethod - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - """Returns all :class:`ExampleOutput` for a given run. - - Args: - run_id: The unique identifier of the run. - output_type: Type of output that the `Task` returned - in :func:`Task.do_run` - - Returns: - Iterable over all outputs. - """ - ... - - @abstractmethod - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - """Stores an individual :class:`ExampleOutput`. - - Args: - example_output: The actual output. - """ - ... - - @abstractmethod - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - """Returns an :class:`ExampleTrace` for an example in a run. - - Args: - run_id: The unique identifier of the run. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - example_output: The actual output. - """ - ... - - @abstractmethod - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - """Returns a :class:`Tracer` to trace an individual example run. - - Args: - run_id: The unique identifier of the run. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - """ - ... 
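The run-related methods removed from this interface are not gone; run artifacts (outputs, traces, run overviews) are now read from the RunRepository, as the updated quickstart notebook does. A sketch of the consumer side, with MyOutput standing in for the evaluated task's output type.

from intelligence_layer.evaluation import InMemoryRunRepository

run_repository = InMemoryRunRepository()
# A Runner configured with this run_repository stores outputs via run_dataset().

for run_id in run_repository.run_ids():
    overview = run_repository.run_overview(run_id)
    # MyOutput is a placeholder for the task's Output type.
    outputs = list(run_repository.example_outputs(run_id, MyOutput))
    if outputs:
        trace = run_repository.example_trace(run_id, outputs[0].example_id)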
- - @abstractmethod - def example_evaluation( - self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] - ) -> Optional[ExampleEvaluation[Evaluation]]: - """Returns an :class:`ExampleEvaluation` of a given run by its id. - - Args: - eval_id: Identifier of the run to obtain the results for. - example_id: Example identifier, will match :class:`ExampleEvaluation` identifier. - evaluation_type: Type of evaluations that the `Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - :class:`ExampleEvaluation` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_example_evaluation(self, result: ExampleEvaluation[Evaluation]) -> None: - """Stores an :class:`ExampleEvaluation` for a run in the repository. - - Args: - eval_id: Identifier of the eval run. - result: The result to be persisted. - """ - ... - - @abstractmethod - def example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - """Returns all :class:`ExampleResult` instances of a given run - - Args: - eval_id: Identifier of the eval run to obtain the results for. - evaluation_type: Type of evaluations that the :class:`Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - All :class:`ExampleResult` of the run. Will return an empty list if there's none. - """ - ... - - @abstractmethod - def failed_example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - """Returns all failed :class:`ExampleResult` instances of a given run - - Args: - eval_id: Identifier of the eval run to obtain the results for. - evaluation_type: Type of evaluations that the :class:`Evaluator` returned - in :func:`Evaluator.do_evaluate` - - Returns: - All failed :class:`ExampleResult` of the run. Will return an empty list if there's none. - """ - ... - - @abstractmethod - def evaluation_overview( - self, eval_id: str, overview_type: type[EvaluationOverviewType] - ) -> EvaluationOverviewType | None: - """Returns an :class:`EvaluationOverview` of a given run by its id. - - Args: - eval_id: Identifier of the eval run to obtain the overview for. - aggregation_type: Type of aggregations that the :class:`Evaluator` returned - in :func:`Evaluator.aggregate` - - Returns: - :class:`EvaluationOverview` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: - """Stores an :class:`EvaluationRunOverview` in the repository. - - Args: - overview: The overview to be persisted. - """ - ... - - @abstractmethod - def run_overview(self, run_id: str) -> RunOverview | None: - """Returns an :class:`RunOverview` of a given run by its id. - - Args: - run_id: Identifier of the eval run to obtain the overview for. - - Returns: - :class:`RunOverview` if one was found, `None` otherwise. - """ - ... - - @abstractmethod - def store_run_overview(self, overview: RunOverview) -> None: - """Stores an :class:`RunOverview` in the repository. - - Args: - overview: The overview to be persisted. - """ - ... - - -class DatasetRepository(ABC): - @abstractmethod - def create_dataset( - self, - examples: Iterable[Example[Input, ExpectedOutput]], - ) -> str: - ... - - @abstractmethod - def examples_by_id( - self, - dataset_id: str, - input_type: type[Input], - expected_output_type: type[ExpectedOutput], - ) -> Optional[Iterable[Example[Input, ExpectedOutput]]]: - ... 
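The DatasetRepository interface being removed here now lives in data_storage/dataset_repository.py (added earlier in this patch). A sketch of creating and reading a dataset, where my_input, my_expected_output, MyInput, and MyExpectedOutput are placeholders for the evaluated task's types.

from intelligence_layer.evaluation import Example, InMemoryDatasetRepository

dataset_repository = InMemoryDatasetRepository()
# my_input / my_expected_output stand in for the task's Input / ExpectedOutput values.
dataset_id = dataset_repository.create_dataset(
    examples=[Example(input=my_input, expected_output=my_expected_output)]
)

# examples_by_id returns None for unknown dataset ids, hence the fallback.
examples = dataset_repository.examples_by_id(dataset_id, MyInput, MyExpectedOutput)
for example in examples or []:
    print(example.id, example.input)

print(list(dataset_repository.list_datasets()))
dataset_repository.delete_dataset(dataset_id)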
- - @abstractmethod - def example( - self, - dataset_id: str, - example_id: str, - input_type: type[Input], - expected_output_type: type[ExpectedOutput], - ) -> Optional[Example[Input, ExpectedOutput]]: - ... - - @abstractmethod - def delete_dataset(self, dataset_id: str) -> None: - ... - - @abstractmethod - def list_datasets(self) -> Iterable[str]: - ... - - T = TypeVar("T") @@ -304,8 +87,9 @@ class BaseEvaluator( """Base evaluator interface. Arguments: - evaluation_repository: The repository that will be used to store evaluation results. dataset_repository: The repository with the examples that will be taken for the evaluation + run_repository: The repository with the run output that will be taken for the evaluation + evaluation_repository: The repository that will be used to store evaluation results. description: human-readable description for the evaluator Generics: @@ -318,12 +102,14 @@ class BaseEvaluator( def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - self._evaluation_repository = evaluation_repository self._dataset_repository = dataset_repository + self._run_repository = run_repository + self._evaluation_repository = evaluation_repository self.description = description @lru_cache(maxsize=1) @@ -441,7 +227,7 @@ def aggregate(self, evaluations: Iterable[Evaluation]) -> AggregatedEvaluation: It should create an `AggregatedEvaluation` class and return it at the end. Args: - evalautions: The results from running `evaluate_dataset` with a :class:`Task`. + evaluations: The results from running `evaluate_dataset` with a :class:`Task`. Returns: The aggregated results of an evaluation run with a :class:`Dataset`. @@ -477,10 +263,7 @@ def evaluate_runs( and their tasks have the same output-type. For each example in the dataset referenced by the runs the outputs of all runs are collected and if all of them were successful they are passed on to the implementation - specific evaluation. For a simple evaluation only a single run_id is provided. - If the output of multiple runs are to be compared (for example to compare - the performance of different model on the same task), multiple run_ids are - passed accordingly. + specific evaluation. The method compares all run of the provided ids to each other. num_examples: The number of examples which should be evaluated from the given runs. Always the first n runs stored in the evaluation repository @@ -490,15 +273,15 @@ def evaluate_runs( __init__. 
""" - def load_overview(run_id: str) -> RunOverview: - run_overview = self._evaluation_repository.run_overview(run_id) + def load_run_overview(run_id: str) -> RunOverview: + run_overview = self._run_repository.run_overview(run_id) if not run_overview: raise ValueError(f"No RunOverview found for run-id: {run_id}") return run_overview if not run_ids: raise ValueError("At least one run-id needs to be provided") - run_overviews = frozenset(load_overview(run_id) for run_id in run_ids) + run_overviews = frozenset(load_run_overview(run_id) for run_id in run_ids) if not all( next(iter(run_overviews)).dataset_id == run_overview.dataset_id for run_overview in run_overviews @@ -519,7 +302,7 @@ def load_overview(run_id: str) -> RunOverview: examples_zipped: Iterable[tuple[ExampleOutput[Output], ...]] = zip( *( - self._evaluation_repository.example_outputs( + self._run_repository.example_outputs( run_overview.id, self.output_type() ) for run_overview in run_overviews @@ -616,7 +399,7 @@ def aggregate_evaluation( An overview of the aggregated evaluation. """ - def load_overview(eval_id: str) -> IndividualEvaluationOverview: + def load_eval_overview(eval_id: str) -> IndividualEvaluationOverview: evaluation_overview = self._evaluation_repository.evaluation_overview( eval_id, IndividualEvaluationOverview ) @@ -626,7 +409,7 @@ def load_overview(eval_id: str) -> IndividualEvaluationOverview: ) return evaluation_overview - evaluation_overviews = frozenset(load_overview(id) for id in set(eval_ids)) + evaluation_overviews = frozenset(load_eval_overview(id) for id in set(eval_ids)) nested_evaluations = [ self._evaluation_repository.example_evaluations( @@ -688,11 +471,14 @@ class Evaluator( def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) @abstractmethod def do_evaluate( @@ -757,85 +543,6 @@ def evaluate_dataset( return self.aggregate_evaluation(partial_evaluation_overview.id) -class ArgillaEvaluationRepository(EvaluationRepository): - """Evaluation repository used for the :class:`ArgillaEvaluator`. - - Wraps an :class:`Evaluator`. - Does not support storing evaluations, since the ArgillaEvaluator does not do automated evaluations. - - Args: - evaluation_repository: repository to wrap. - argilla_client: client used to connect to Argilla. 
- """ - - def __init__( - self, evaluation_repository: EvaluationRepository, argilla_client: ArgillaClient - ) -> None: - super().__init__() - self._evaluation_repository = evaluation_repository - self._client = argilla_client - - def run_ids(self) -> Sequence[str]: - return self._evaluation_repository.run_ids() - - def eval_ids(self) -> Sequence[str]: - return self._evaluation_repository.eval_ids() - - def example_outputs( - self, run_id: str, output_type: type[Output] - ) -> Iterable[ExampleOutput[Output]]: - return self._evaluation_repository.example_outputs(run_id, output_type) - - def store_example_output(self, example_output: ExampleOutput[Output]) -> None: - return self._evaluation_repository.store_example_output(example_output) - - def example_trace(self, run_id: str, example_id: str) -> Optional[ExampleTrace]: - return self._evaluation_repository.example_trace(run_id, example_id) - - def example_tracer(self, run_id: str, example_id: str) -> Tracer: - return self._evaluation_repository.example_tracer(run_id, example_id) - - def example_evaluation( - self, eval_id: str, example_id: str, evaluation_type: type[Evaluation] - ) -> Optional[ExampleEvaluation[Evaluation]]: - return self._evaluation_repository.example_evaluation( - eval_id, example_id, evaluation_type - ) - - def store_example_evaluation(self, _: ExampleEvaluation[Evaluation]) -> None: - raise TypeError( - "ArgillaEvaluationRepository does not support storing evaluations." - ) - - def example_evaluations( - self, eval_id: str, eval_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - assert eval_type == ArgillaEvaluation - # Mypy does not derive that the return type is always ExampleEvaluation with ArgillaEvaluation - return [ExampleEvaluation(eval_id=eval_id, example_id=e.example_id, result=e) for e in self._client.evaluations(eval_id)] # type: ignore - - def failed_example_evaluations( - self, eval_id: str, evaluation_type: type[Evaluation] - ) -> Sequence[ExampleEvaluation[Evaluation]]: - return self._evaluation_repository.failed_example_evaluations( - eval_id, evaluation_type - ) - - def evaluation_overview( - self, eval_id: str, overview_type: type[EvaluationOverviewType] - ) -> EvaluationOverviewType | None: - return self._evaluation_repository.evaluation_overview(eval_id, overview_type) - - def store_evaluation_overview(self, overview: IndividualEvaluationOverview) -> None: - return self._evaluation_repository.store_evaluation_overview(overview) - - def run_overview(self, run_id: str) -> RunOverview | None: - return self._evaluation_repository.run_overview(run_id) - - def store_run_overview(self, overview: RunOverview) -> None: - return self._evaluation_repository.store_run_overview(overview) - - class ArgillaEvaluator( BaseEvaluator[ Input, Output, ExpectedOutput, ArgillaEvaluation, AggregatedEvaluation @@ -859,14 +566,17 @@ class ArgillaEvaluator( def __init__( self, - evaluation_repository: ArgillaEvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: ArgillaEvaluationRepository, description: str, workspace_id: str, fields: Sequence[Field], questions: Sequence[Question], ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self._workspace_id = workspace_id self._fields = fields self._questions = questions diff --git a/src/intelligence_layer/evaluation/hugging_face.py 
b/src/intelligence_layer/evaluation/hugging_face.py index ff81f9469..c6573586d 100644 --- a/src/intelligence_layer/evaluation/hugging_face.py +++ b/src/intelligence_layer/evaluation/hugging_face.py @@ -1,7 +1,9 @@ import huggingface_hub # type: ignore from huggingface_hub import HfFileSystem, create_repo -from intelligence_layer.evaluation.dataset_repository import FileSystemDatasetRepository +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + FileSystemDatasetRepository, +) class HuggingFaceDatasetRepository(FileSystemDatasetRepository): diff --git a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py b/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py index 972941e65..91677b693 100644 --- a/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py +++ b/src/intelligence_layer/evaluation/instruct_comparison_argilla_evaluator.py @@ -12,8 +12,11 @@ ) from intelligence_layer.core.complete import InstructInput, PromptOutput from intelligence_layer.evaluation import ( + ArgillaEvaluationRepository, + DatasetRepository, Example, MeanAccumulator, + RunRepository, SuccessfulExampleOutput, ) from intelligence_layer.evaluation.elo import ( @@ -24,11 +27,7 @@ WinRateCalculator, build_tournaments, ) -from intelligence_layer.evaluation.evaluator import ( - ArgillaEvaluationRepository, - ArgillaEvaluator, - DatasetRepository, -) +from intelligence_layer.evaluation.evaluator import ArgillaEvaluator class AggregatedInstructComparison(BaseModel): @@ -52,8 +51,9 @@ class InstructComparisonArgillaEvaluator( def __init__( self, - evaluation_repository: ArgillaEvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: ArgillaEvaluationRepository, description: str, workspace_id: str, high_priority_runs: Optional[frozenset[str]] = None, @@ -74,8 +74,9 @@ def __init__( ] super().__init__( - evaluation_repository, dataset_repository, + run_repository, + evaluation_repository, description, workspace_id, fields, diff --git a/src/intelligence_layer/evaluation/run.py b/src/intelligence_layer/evaluation/run.py index e9c97ae3e..02db57b7d 100644 --- a/src/intelligence_layer/evaluation/run.py +++ b/src/intelligence_layer/evaluation/run.py @@ -9,8 +9,13 @@ from intelligence_layer.connectors.limited_concurrency_client import ( LimitedConcurrencyClient, ) -from intelligence_layer.evaluation.dataset_repository import FileDatasetRepository -from intelligence_layer.evaluation.evaluation_repository import FileEvaluationRepository +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + FileDatasetRepository, +) +from intelligence_layer.evaluation.data_storage.evaluation_repository import ( + FileEvaluationRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import FileRunRepository from intelligence_layer.evaluation.runner import Runner @@ -80,15 +85,18 @@ def parse_args(cli_args: Sequence[str]) -> Namespace: def main(cli_args: Sequence[str]) -> None: args = parse_args(cli_args) - evaluation_repository = FileEvaluationRepository(args.target_dir) dataset_repository = FileDatasetRepository(args.dataset_repository_path) + runner_repository = FileRunRepository(args.target_dir) + evaluation_repository = FileEvaluationRepository(args.target_dir) description = args.description task = create_task(args.task) - runner = Runner(task, evaluation_repository, dataset_repository, args.task.__name__) + runner = Runner(task, dataset_repository, 
runner_repository, args.task.__name__) dataset_id = args.dataset_id - run_overview = runner.run_dataset(dataset_id) - evaluator = args.evaluator(evaluation_repository, dataset_repository, description) - evaluator.evaluate_dataset(run_overview.id) + run_overview_id = runner.run_dataset(dataset_id).id + evaluator = args.evaluator( + dataset_repository, runner_repository, evaluation_repository, description + ) + evaluator.evaluate_dataset(run_overview_id) if __name__ == "__main__": diff --git a/src/intelligence_layer/evaluation/runner.py b/src/intelligence_layer/evaluation/runner.py index c4bde68fd..204aa07a4 100644 --- a/src/intelligence_layer/evaluation/runner.py +++ b/src/intelligence_layer/evaluation/runner.py @@ -9,6 +9,10 @@ from intelligence_layer.core.task import Input, Output, Task from intelligence_layer.core.tracer import CompositeTracer, Tracer, utc_now +from intelligence_layer.evaluation.data_storage.dataset_repository import ( + DatasetRepository, +) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository from intelligence_layer.evaluation.domain import ( Example, ExampleOutput, @@ -16,22 +20,18 @@ FailedExampleRun, RunOverview, ) -from intelligence_layer.evaluation.evaluator import ( - DatasetRepository, - EvaluationRepository, -) class Runner(Generic[Input, Output]): def __init__( self, task: Task[Input, Output], - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, description: str, ) -> None: self._task = task - self._evaluation_repository = evaluation_repository + self._run_repository = run_repository self._dataset_repository = dataset_repository self.description = description @@ -39,7 +39,7 @@ def output_type(self) -> type[Output]: """Returns the type of the evaluated task's output. This can be used to retrieve properly typed outputs of an evaluation run - from a :class:`EvaluationRepository` + from a :class:`RunRepository` Returns: the type of the evaluated task's output. @@ -82,15 +82,13 @@ def run_dataset( Returns: An overview of the run. Outputs will not be returned but instead stored in the - :class:`EvaluationRepository` provided in the __init__. + :class:`RunRepository` provided in the __init__. 
""" def run( example: Example[Input, ExpectedOutput] ) -> tuple[str, Output | FailedExampleRun]: - evaluate_tracer = self._evaluation_repository.example_tracer( - run_id, example.id - ) + evaluate_tracer = self._run_repository.example_tracer(run_id, example.id) if tracer: evaluate_tracer = CompositeTracer([evaluate_tracer, tracer]) try: @@ -119,7 +117,7 @@ def run( failed_count += 1 else: successful_count += 1 - self._evaluation_repository.store_example_output( + self._run_repository.store_example_output( ExampleOutput[Output]( run_id=run_id, example_id=example_id, output=output ), @@ -133,5 +131,5 @@ def run( successful_example_count=successful_count, description=self.description, ) - self._evaluation_repository.store_run_overview(run_overview) + self._run_repository.store_run_overview(run_overview) return run_overview diff --git a/src/intelligence_layer/use_cases/classify/classify.py b/src/intelligence_layer/use_cases/classify/classify.py index 2398d1f98..fa582b586 100644 --- a/src/intelligence_layer/use_cases/classify/classify.py +++ b/src/intelligence_layer/use_cases/classify/classify.py @@ -9,6 +9,7 @@ EvaluationRepository, Evaluator, MeanAccumulator, + RunRepository, ) Probability = NewType("Probability", float) @@ -81,14 +82,6 @@ class SingleLabelClassifyEvaluator( AggregatedSingleLabelClassifyEvaluation, ] ): - def __init__( - self, - evaluation_repository: EvaluationRepository, - dataset_respository: DatasetRepository, - description: str, - ): - super().__init__(evaluation_repository, dataset_respository, description) - # mypy expects *args where this method only uses one output def do_evaluate( # type: ignore self, @@ -170,12 +163,15 @@ class MultiLabelClassifyEvaluator( ): def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, threshold: float = 0.55, ): - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self.threshold = threshold # mypy expects *args where this method only uses one output diff --git a/src/intelligence_layer/use_cases/summarize/summarize.py b/src/intelligence_layer/use_cases/summarize/summarize.py index 4660eb2c8..0c73742e1 100644 --- a/src/intelligence_layer/use_cases/summarize/summarize.py +++ b/src/intelligence_layer/use_cases/summarize/summarize.py @@ -11,6 +11,7 @@ MeanAccumulator, RougeGrader, ) +from intelligence_layer.evaluation.data_storage.run_repository import RunRepository class LongContextSummarizeInput(BaseModel): @@ -111,11 +112,14 @@ class SingleChunkSummarizeEvaluator( ): def __init__( self, - repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(repository, dataset_repository, description) + super().__init__( + dataset_repository, run_repository, evaluation_repository, description + ) self.bleu_grader = BleuGrader() self.rouge_grader = RougeGrader() @@ -150,11 +154,14 @@ class LongContextSummarizeEvaluator( ): def __init__( self, - evaluation_repository: EvaluationRepository, dataset_repository: DatasetRepository, + run_repository: RunRepository, + evaluation_repository: EvaluationRepository, description: str, ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) + super().__init__( + dataset_repository, 
run_repository, evaluation_repository, description + ) self.bleu_grader = BleuGrader() self.rouge_grader = RougeGrader() diff --git a/tests/conftest.py b/tests/conftest.py index d0e4dea10..48743d975 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,6 +30,7 @@ from intelligence_layer.evaluation import ( InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, ) @@ -135,6 +136,11 @@ def in_memory_dataset_repository() -> InMemoryDatasetRepository: return InMemoryDatasetRepository() +@fixture +def in_memory_run_repository() -> InMemoryRunRepository: + return InMemoryRunRepository() + + @fixture def in_memory_evaluation_repository() -> InMemoryEvaluationRepository: return InMemoryEvaluationRepository() diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py index 68359946c..471c83113 100644 --- a/tests/evaluation/conftest.py +++ b/tests/evaluation/conftest.py @@ -14,11 +14,12 @@ ExampleEvaluation, FailedExampleEvaluation, FileEvaluationRepository, - InMemoryEvaluationRepository, + FileRunRepository, + InMemoryDatasetRepository, + InMemoryRunRepository, Runner, RunOverview, ) -from intelligence_layer.evaluation.dataset_repository import InMemoryDatasetRepository from tests.conftest import DummyStringInput, DummyStringOutput FAIL_IN_EVAL_INPUT = "fail in eval" @@ -76,6 +77,11 @@ def file_evaluation_repository(tmp_path: Path) -> FileEvaluationRepository: return FileEvaluationRepository(tmp_path) +@fixture +def file_run_repository(tmp_path: Path) -> FileRunRepository: + return FileRunRepository(tmp_path) + + @fixture def string_dataset_id( dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], @@ -135,12 +141,12 @@ def dummy_string_examples( @fixture def dummy_runner( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[str, str]: return Runner( DummyTask(), - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "dummy-runner", ) diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/test_argilla_evaluator.py index df762dfdb..81968a272 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/test_argilla_evaluator.py @@ -19,6 +19,9 @@ Runner, SuccessfulExampleOutput, ) +from intelligence_layer.evaluation.data_storage.run_repository import ( + InMemoryRunRepository, +) from tests.conftest import DummyStringInput, DummyStringOutput, DummyStringTask from tests.evaluation.conftest import DummyAggregatedEvaluation @@ -110,8 +113,9 @@ def stub_argilla_client() -> StubArgillaClient: @fixture def string_argilla_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, # noqa: w0404 in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, stub_argilla_client: StubArgillaClient, ) -> DummyStringTaskArgillaEvaluator: stub_argilla_client._expected_workspace_id = "workspace-id" @@ -128,10 +132,11 @@ def string_argilla_evaluator( Field(name="input", title="Input"), ] evaluator = DummyStringTaskArgillaEvaluator( + in_memory_dataset_repository, + in_memory_run_repository, ArgillaEvaluationRepository( in_memory_evaluation_repository, stub_argilla_client ), - in_memory_dataset_repository, "dummy-string-task", stub_argilla_client._expected_workspace_id, fields, @@ -145,13 +150,13 @@ def 
string_argilla_evaluator( @fixture def string_argilla_runner( dummy_string_task: DummyStringTask, - in_memory_evaluation_repository: InMemoryEvaluationRepository, # noqa: w0404 in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[DummyStringInput, DummyStringOutput]: return Runner( dummy_string_task, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "dummy-task", ) diff --git a/tests/evaluation/test_evaluation_repository.py b/tests/evaluation/test_evaluation_repository.py index c1eb7de19..4f6829d04 100644 --- a/tests/evaluation/test_evaluation_repository.py +++ b/tests/evaluation/test_evaluation_repository.py @@ -1,23 +1,18 @@ from datetime import datetime -from typing import Sequence, cast +from typing import Sequence from pydantic import BaseModel from pytest import fixture -from intelligence_layer.core import InMemoryTaskSpan -from intelligence_layer.core.tracer import CompositeTracer, InMemoryTracer from intelligence_layer.evaluation import ( EvaluationOverview, - EvaluationRepository, ExampleEvaluation, - ExampleOutput, ExampleTrace, FailedExampleEvaluation, FileEvaluationRepository, InMemoryEvaluationRepository, TaskSpanTrace, ) -from tests.conftest import DummyStringInput from tests.evaluation.conftest import DummyAggregatedEvaluation, DummyEvaluation @@ -44,26 +39,6 @@ def example_trace( ) -def test_can_store_example_evaluation_traces_in_file( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - run_id = "run_id" - example_id = "example_id" - now = datetime.now() - - tracer = file_evaluation_repository.example_tracer(run_id, example_id) - expected = InMemoryTracer() - CompositeTracer([tracer, expected]).task_span( - "task", DummyStringInput(input="input"), now - ) - - assert file_evaluation_repository.example_trace(run_id, example_id) == ExampleTrace( - run_id=run_id, - example_id=example_id, - trace=TaskSpanTrace.from_task_span(cast(InMemoryTaskSpan, expected.entries[0])), - ) - - def test_can_store_example_results_in_file( file_evaluation_repository: FileEvaluationRepository, successful_example_result: ExampleEvaluation[DummyEvaluation], @@ -203,57 +178,3 @@ def test_file_repository_returns_none_for_nonexisting_overview( ) is None ) - - -def test_file_repository_run_id_returns_run_ids( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - run_id = "id" - - file_evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id, example_id="example_id", output=None) - ) - - assert file_evaluation_repository.run_ids() == [run_id] - - -def evaluation_repository_returns_examples_in_same_order_for_two_runs( - evaluation_repository: EvaluationRepository, -) -> None: - run_id_1 = "id_1" - run_id_2 = "id_2" - num_examples = 20 - - for example_id in range(num_examples): - evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id_1, example_id=str(example_id), output=None), - ) - - for example_id in reversed(range(num_examples)): - evaluation_repository.store_example_output( - ExampleOutput(run_id=run_id_2, example_id=str(example_id), output=None), - ) - - assert list( - (output.example_id, output.output) - for output in evaluation_repository.example_outputs(run_id_1, type(None)) - ) == list( - (output.example_id, output.output) - for output in evaluation_repository.example_outputs(run_id_2, type(None)) - ) - - -def test_in_memory_evaluation_repository_returns_examples_in_same_order_for_two_runs( - 
in_memory_evaluation_repository: InMemoryEvaluationRepository, -) -> None: - evaluation_repository_returns_examples_in_same_order_for_two_runs( - in_memory_evaluation_repository - ) - - -def test_file_evaluation_repository_returns_examples_in_same_order_for_two_runs( - file_evaluation_repository: FileEvaluationRepository, -) -> None: - evaluation_repository_returns_examples_in_same_order_for_two_runs( - file_evaluation_repository - ) diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py index ec9fead9e..07f3b49d2 100644 --- a/tests/evaluation/test_evaluator.py +++ b/tests/evaluation/test_evaluator.py @@ -15,6 +15,7 @@ FailedExampleEvaluation, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, MeanAccumulator, Runner, SuccessfulExampleOutput, @@ -118,11 +119,15 @@ def sequence_good_examples() -> Iterable[Example[str, None]]: @fixture def dummy_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> DummyEvaluator: return DummyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, "dummy-evaluator" + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy-evaluator", ) @@ -144,12 +149,14 @@ def good_dataset_id( @fixture def comparing_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> ComparingEvaluator: return ComparingEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "comparing-evaluator", ) @@ -234,7 +241,7 @@ def test_evaluate_dataset_stores_example_traces( dataset_id: str, dummy_runner: Runner[str, str], ) -> None: - evaluation_repository = dummy_evaluator._evaluation_repository + run_repository = dummy_evaluator._run_repository dataset_repository = dummy_evaluator._dataset_repository dataset: Optional[Iterable[Example[str, None]]] = dataset_repository.examples_by_id( dataset_id, str, type(None) @@ -244,13 +251,13 @@ def test_evaluate_dataset_stores_example_traces( run_overview = dummy_runner.run_dataset(dataset_id) evaluation_run_overview = dummy_evaluator.evaluate_dataset(run_overview.id) examples = list(dataset) - success_result = evaluation_repository.example_trace( + success_result = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[0].id ) - failure_result_task = evaluation_repository.example_trace( + failure_result_task = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[1].id ) - failure_result_eval = evaluation_repository.example_trace( + failure_result_eval = run_repository.example_trace( evaluation_run_overview.run_ids[0], examples[2].id ) @@ -317,8 +324,9 @@ def test_aggregate_evaluation_can_aggregate_multiple_evals( def test_base_evaluator_type_magic_works( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> None: class EvaluationType(BaseModel): pass @@ -364,7 +372,10 @@ class GreatGrandChildEvaluator( pass timmy = 
GreatGrandChildEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, "dummy" + in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, + "dummy", ) who_is_timmy = timmy._get_types() diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index 66b14cf47..b06cf16f5 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -25,6 +25,7 @@ ExampleOutput, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, InstructComparisonArgillaEvaluator, Payoff, PayoffMatrix, @@ -75,6 +76,7 @@ def argilla_fake() -> ArgillaClient: @fixture def evaluator( in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, in_memory_evaluation_repository: InMemoryEvaluationRepository, argilla_fake: ArgillaClient, ) -> InstructComparisonArgillaEvaluator: @@ -82,7 +84,11 @@ def evaluator( in_memory_evaluation_repository, argilla_fake ) return InstructComparisonArgillaEvaluator( - eval_repository, in_memory_dataset_repository, "instruct-evaluator", "workspace" + in_memory_dataset_repository, + in_memory_run_repository, + eval_repository, + "instruct-evaluator", + "workspace", ) @@ -102,8 +108,9 @@ def any_instruct_output() -> PromptOutput: def test_evaluate_run_submits_pairwise_comparison_records( evaluator: InstructComparisonArgillaEvaluator, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, any_instruct_output: PromptOutput, argilla_fake: ArgillaFake, ) -> None: @@ -122,12 +129,12 @@ def test_evaluate_run_submits_pairwise_comparison_records( ] ) for run_id in run_ids: - in_memory_evaluation_repository.store_example_output( + in_memory_run_repository.store_example_output( example_output=ExampleOutput( run_id=run_id, example_id="example_id", output=any_instruct_output ) ) - in_memory_evaluation_repository.store_run_overview( + in_memory_run_repository.store_run_overview( RunOverview( dataset_id=dataset_id, id=run_id, @@ -156,8 +163,9 @@ def test_evaluate_run_submits_pairwise_comparison_records( def test_evaluate_run_only_evaluates_high_priority( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, any_instruct_output: PromptOutput, argilla_fake: ArgillaFake, ) -> None: @@ -166,8 +174,9 @@ def test_evaluate_run_only_evaluates_high_priority( ) relevant_ids = frozenset({"1", "2"}) evaluator = InstructComparisonArgillaEvaluator( - eval_repository, in_memory_dataset_repository, + in_memory_run_repository, + eval_repository, "instruct-evaluator", "workspace", relevant_ids, @@ -188,12 +197,12 @@ def test_evaluate_run_only_evaluates_high_priority( ] ) for run_id in run_ids: - in_memory_evaluation_repository.store_example_output( + in_memory_run_repository.store_example_output( example_output=ExampleOutput( run_id=run_id, example_id="example_id", output=any_instruct_output ) ) - in_memory_evaluation_repository.store_run_overview( + in_memory_run_repository.store_run_overview( RunOverview( dataset_id=dataset_id, id=run_id, diff --git 
a/tests/evaluation/test_run.py b/tests/evaluation/test_run.py index 49f1af711..362dabef0 100644 --- a/tests/evaluation/test_run.py +++ b/tests/evaluation/test_run.py @@ -8,9 +8,7 @@ from intelligence_layer.connectors import AlephAlphaClientProtocol from intelligence_layer.core import Task, TaskSpan from intelligence_layer.evaluation import ( - DatasetRepository, EvaluationOverview, - EvaluationRepository, Evaluator, Example, FileDatasetRepository, @@ -48,14 +46,6 @@ def __init__(self, client: AlephAlphaClientProtocol) -> None: class DummyEvaluator(Evaluator[None, None, None, DummyEvaluation, DummyAggregation]): - def __init__( - self, - evaluation_repository: EvaluationRepository, - dataset_repository: DatasetRepository, - description: str, - ) -> None: - super().__init__(evaluation_repository, dataset_repository, description) - # mypy expects *args where this method only uses one output def do_evaluate( # type: ignore self, input: None, expected_output: None, output: None diff --git a/tests/evaluation/test_run_repository.py b/tests/evaluation/test_run_repository.py new file mode 100644 index 000000000..b58d9084e --- /dev/null +++ b/tests/evaluation/test_run_repository.py @@ -0,0 +1,89 @@ +from datetime import datetime +from typing import cast + +from intelligence_layer.core.tracer import ( + CompositeTracer, + InMemoryTaskSpan, + InMemoryTracer, +) +from intelligence_layer.evaluation import ExampleTrace, TaskSpanTrace +from intelligence_layer.evaluation.data_storage.run_repository import ( + FileRunRepository, + RunRepository, +) +from intelligence_layer.evaluation.domain import ExampleOutput +from tests.conftest import DummyStringInput + + +def test_can_store_example_evaluation_traces_in_file( + file_run_repository: FileRunRepository, +) -> None: + run_id = "run_id" + example_id = "example_id" + now = datetime.now() + + tracer = file_run_repository.example_tracer(run_id, example_id) + expected = InMemoryTracer() + CompositeTracer([tracer, expected]).task_span( + "task", DummyStringInput(input="input"), now + ) + + assert file_run_repository.example_trace(run_id, example_id) == ExampleTrace( + run_id=run_id, + example_id=example_id, + trace=TaskSpanTrace.from_task_span(cast(InMemoryTaskSpan, expected.entries[0])), + ) + + +def test_file_repository_run_id_returns_run_ids( + file_run_repository: FileRunRepository, +) -> None: + run_id = "id" + + file_run_repository.store_example_output( + ExampleOutput(run_id=run_id, example_id="example_id", output=None) + ) + + assert file_run_repository.run_ids() == [run_id] + + +# def test_in_memory_evaluation_repository_returns_examples_in_same_order_for_two_runs( +# in_memory_evaluation_repository: InMemoryEvaluationRepository, +# ) -> None: +# evaluation_repository_returns_examples_in_same_order_for_two_runs( +# in_memory_evaluation_repository +# ) + + +def test_file_evaluation_repository_returns_examples_in_same_order_for_two_runs( + file_run_repository: FileRunRepository, +) -> None: + evaluation_repository_returns_examples_in_same_order_for_two_runs( + file_run_repository + ) + + +def evaluation_repository_returns_examples_in_same_order_for_two_runs( + run_repository: RunRepository, +) -> None: + run_id_1 = "id_1" + run_id_2 = "id_2" + num_examples = 20 + + for example_id in range(num_examples): + run_repository.store_example_output( + ExampleOutput(run_id=run_id_1, example_id=str(example_id), output=None), + ) + + for example_id in reversed(range(num_examples)): + run_repository.store_example_output( + ExampleOutput(run_id=run_id_2, 
example_id=str(example_id), output=None), + ) + + assert list( + (output.example_id, output.output) + for output in run_repository.example_outputs(run_id_1, type(None)) + ) == list( + (output.example_id, output.output) + for output in run_repository.example_outputs(run_id_2, type(None)) + ) diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py index d37138d8d..737008348 100644 --- a/tests/evaluation/test_runner.py +++ b/tests/evaluation/test_runner.py @@ -2,22 +2,19 @@ from intelligence_layer.evaluation import ( Example, InMemoryDatasetRepository, - InMemoryEvaluationRepository, + InMemoryRunRepository, Runner, ) from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask def test_runner_runs_dataset( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> None: task = DummyTask() runner = Runner( - task, - in_memory_evaluation_repository, - in_memory_dataset_repository, - "dummy-runner", + task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner" ) examples = [ Example(input="success", expected_output=None), @@ -28,7 +25,7 @@ def test_runner_runs_dataset( dataset_id = in_memory_dataset_repository.create_dataset(examples=examples) overview = runner.run_dataset(dataset_id) outputs = list( - in_memory_evaluation_repository.example_outputs( + in_memory_run_repository.example_outputs( overview.id, output_type=runner.output_type() ) ) @@ -39,16 +36,13 @@ def test_runner_runs_dataset( def test_runner_runs_n_examples( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> None: task = DummyTask() tracer = InMemoryTracer() runner = Runner( - task, - in_memory_evaluation_repository, - in_memory_dataset_repository, - "dummy-runner", + task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner" ) examples = [ Example(input="success", expected_output=None), diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py index 85fd9586c..12d02478f 100644 --- a/tests/use_cases/classify/test_classify.py +++ b/tests/use_cases/classify/test_classify.py @@ -10,6 +10,7 @@ InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.classify.classify import ( ClassifyInput, @@ -126,12 +127,14 @@ def multiple_entries_dataset_name( @fixture def classify_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> MultiLabelClassifyEvaluator: return MultiLabelClassifyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "multi-label-classify", ) @@ -139,13 +142,13 @@ def classify_evaluator( @fixture def classify_runner( embedding_based_classify: Task[ClassifyInput, MultiLabelClassifyOutput], - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, ) -> Runner[ClassifyInput, MultiLabelClassifyOutput]: return Runner( embedding_based_classify, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, 
"multi-label-classify", ) @@ -158,7 +161,7 @@ def test_multi_label_classify_evaluator_single_example( run_overview = classify_runner.run_dataset(single_entry_dataset_name) evaluation_overview = classify_evaluator.evaluate_dataset(run_overview.id) - evaluation = classify_runner._evaluation_repository.example_evaluations( + evaluation = classify_evaluator._evaluation_repository.example_evaluations( evaluation_overview.individual_evaluation_overviews[0].id, MultiLabelClassifyEvaluation, )[0].result diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py index b00b1e5ca..571525526 100644 --- a/tests/use_cases/classify/test_prompt_based_classify.py +++ b/tests/use_cases/classify/test_prompt_based_classify.py @@ -8,11 +8,11 @@ from intelligence_layer.core import Chunk, InMemoryTracer, NoOpTracer from intelligence_layer.evaluation import ( DatasetRepository, - EvaluationRepository, Example, InMemoryDatasetRepository, InMemoryEvaluationRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.classify.classify import ( ClassifyInput, @@ -32,12 +32,14 @@ def prompt_based_classify(client: AlephAlphaClientProtocol) -> PromptBasedClassi @fixture def classify_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> SingleLabelClassifyEvaluator: return SingleLabelClassifyEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "single-label-classify", ) @@ -45,13 +47,13 @@ def classify_evaluator( @fixture def classify_runner( prompt_based_classify: PromptBasedClassify, - in_memory_evaluation_repository: EvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, ) -> Runner[ClassifyInput, SingleLabelClassifyOutput]: return Runner( prompt_based_classify, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "prompt-based-classify", ) diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index 9e1888fc9..ccae8abcf 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -7,7 +7,9 @@ Example, InMemoryDatasetRepository, InMemoryEvaluationRepository, + InMemoryRunRepository, Runner, + RunRepository, ) from intelligence_layer.use_cases.summarize.long_context_high_compression_summarize import ( LongContextHighCompressionSummarize, @@ -28,12 +30,14 @@ @fixture def single_chunk_summarize_evaluator( - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: InMemoryRunRepository, + in_memory_evaluation_repository: InMemoryEvaluationRepository, ) -> SingleChunkSummarizeEvaluator: return SingleChunkSummarizeEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "single-chunk-summarize", ) @@ -41,25 +45,27 @@ def single_chunk_summarize_evaluator( @fixture def single_chunk_summarize_runner( single_chunk_few_shot_summarize: SingleChunkFewShotSummarize, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: InMemoryDatasetRepository, + in_memory_run_repository: 
InMemoryRunRepository, ) -> Runner[SingleChunkSummarizeInput, SummarizeOutput]: return Runner( single_chunk_few_shot_summarize, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "single-chunk-summarize", ) @fixture def long_context_summarize_evaluator( - in_memory_evaluation_repository: EvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: RunRepository, + in_memory_evaluation_repository: EvaluationRepository, ) -> LongContextSummarizeEvaluator: return LongContextSummarizeEvaluator( - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, + in_memory_evaluation_repository, "long-context-summarize", ) @@ -67,13 +73,13 @@ def long_context_summarize_evaluator( @fixture def long_context_summarize_runner( long_context_high_compression_summarize: LongContextHighCompressionSummarize, - in_memory_evaluation_repository: InMemoryEvaluationRepository, in_memory_dataset_repository: DatasetRepository, + in_memory_run_repository: InMemoryRunRepository, ) -> Runner[LongContextSummarizeInput, LongContextSummarizeOutput]: return Runner( long_context_high_compression_summarize, - in_memory_evaluation_repository, in_memory_dataset_repository, + in_memory_run_repository, "long-context-summarize", )
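
For orientation, a minimal sketch of how the pieces fit together after this split: the DatasetRepository holds examples, the new RunRepository holds task outputs and traces, and the EvaluationRepository now holds only evaluation results. The sketch uses the in-memory implementations and the new constructor order (dataset, run, evaluation repository) introduced by this patch; UppercaseTask and MatchEvaluator are illustrative stand-ins rather than code from this repository, and the Task.do_run / Evaluator.aggregate hook names are assumed from the core API, not shown in this diff.

from typing import Iterable

from pydantic import BaseModel

from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.evaluation import (
    Evaluator,
    Example,
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
    Runner,
)


class UppercaseTask(Task[str, str]):
    # Illustrative task; `do_run` is the assumed Task hook in the core package.
    def do_run(self, input: str, task_span: TaskSpan) -> str:
        return input.upper()


class MatchEvaluation(BaseModel):
    correct: bool


class MatchAggregation(BaseModel):
    percentage_correct: float


class MatchEvaluator(Evaluator[str, str, str, MatchEvaluation, MatchAggregation]):
    # mypy expects *args where this method only uses one output
    def do_evaluate(  # type: ignore
        self, input: str, expected_output: str, output: str
    ) -> MatchEvaluation:
        return MatchEvaluation(correct=output == expected_output)

    def aggregate(self, evaluations: Iterable[MatchEvaluation]) -> MatchAggregation:
        evals = list(evaluations)
        return MatchAggregation(
            percentage_correct=sum(e.correct for e in evals) / len(evals)
        )


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
evaluation_repository = InMemoryEvaluationRepository()

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="hello", expected_output="HELLO")]
)

# New argument order: dataset repository, run repository, evaluation repository.
runner = Runner(UppercaseTask(), dataset_repository, run_repository, "uppercase")
evaluator = MatchEvaluator(
    dataset_repository, run_repository, evaluation_repository, "uppercase-eval"
)

run_overview = runner.run_dataset(dataset_id)
evaluation_overview = evaluator.evaluate_dataset(run_overview.id)

For callers, the visible change is the extra repository argument and the new ordering; output storage (Runner) and evaluation storage (Evaluator) are now served by separate repositories instead of one combined EvaluationRepository.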
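
As the updated runner.py docstrings note, run_dataset does not return outputs; it stores them in the RunRepository under the run id. A short continuation of the sketch above (same assumed names) reads them back as properly typed outputs:

# Outputs are stored, not returned: read them back from the RunRepository.
stored_outputs = list(
    run_repository.example_outputs(run_overview.id, output_type=runner.output_type())
)
print(run_repository.run_ids())            # e.g. [run_overview.id]
print([o.output for o in stored_outputs])  # e.g. ["HELLO"]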