From a5d389f5ca0632b5b4efc10a47494091e3bf1957 Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 11:11:31 +0100
Subject: [PATCH 1/7] Create eval_tests.py

Created the eval_tests file to hold tests.
Created the LLMPipelineTest class, which inherits its methods from
SingleResultPipelineTest.
---
 Carrot-Assistant/evaluation/eval_tests.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 Carrot-Assistant/evaluation/eval_tests.py

diff --git a/Carrot-Assistant/evaluation/eval_tests.py b/Carrot-Assistant/evaluation/eval_tests.py
new file mode 100644
index 0000000..d5efbf4
--- /dev/null
+++ b/Carrot-Assistant/evaluation/eval_tests.py
@@ -0,0 +1,22 @@
+from evaluation.evaltypes import (
+    SingleResultPipelineTest,
+    SingleResultMetric,
+    SingleResultPipeline,
+)
+from evaluation.pipelines import LLMPipeline
+
+
+class LLMPipelineTest(SingleResultPipelineTest):
+    def __init__(
+        self,
+        name: str,
+        pipeline: SingleResultPipeline,
+        metrics: list[SingleResultMetric],
+    ):
+        super().__init__(name, pipeline, metrics)
+
+    def run_pipeline(self, input_data):
+        return super().run_pipeline(input_data)
+
+    def evaluate(self, input_data, expected_output):
+        return super().evaluate(input_data, expected_output)

From 99f78b9be3aae7f091d6785628ecbc86f650e683 Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 11:13:26 +0100
Subject: [PATCH 2/7] Update evaltypes.py

Added a type for the pipeline.
---
 Carrot-Assistant/evaluation/evaltypes.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py
index ad9d343..18afd9d 100644
--- a/Carrot-Assistant/evaluation/evaltypes.py
+++ b/Carrot-Assistant/evaluation/evaltypes.py
@@ -40,9 +40,10 @@ def run(self, *args, **kwargs) -> Any:
 
 
 M = TypeVar("M", bound=Metric)
+P = TypeVar("P", bound=TestPipeline)
 
 
-class PipelineTest(Generic[M]):
+class PipelineTest(Generic[P, M]):
     """
     Base class for Pipeline tests
     """
@@ -77,7 +78,7 @@ class SingleResultPipeline(TestPipeline):
     """
 
 
-class SingleResultPipelineTest(PipelineTest[SingleResultMetric]):
+class SingleResultPipelineTest(PipelineTest[SingleResultPipeline, SingleResultMetric]):
     def __init__(
         self,
         name: str,

From 20c4a84a9705a2da2768c45f6490810a42ce027d Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 11:53:57 +0100
Subject: [PATCH 3/7] Update evaltypes.py

Modified PipelineTest's __init__ to take type P.
---
 Carrot-Assistant/evaluation/evaltypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py
index 18afd9d..956e0f3 100644
--- a/Carrot-Assistant/evaluation/evaltypes.py
+++ b/Carrot-Assistant/evaluation/evaltypes.py
@@ -48,7 +48,7 @@ class PipelineTest(Generic[P, M]):
     Base class for Pipeline tests
     """
 
-    def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]):
+    def __init__(self, name: str, pipeline: P, metrics: list[M]):
         self.name = name
         self.pipeline = pipeline
         self.metrics = metrics
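Patches 2 and 3 together make PipelineTest generic over both the pipeline type P and the metric type M, so each subclass can pin down exactly which pipeline class it wraps. Below is a minimal, self-contained sketch of that pattern; the stub Metric, TestPipeline and EchoPipeline classes are invented here for illustration, and only the TypeVar/Generic wiring mirrors evaltypes.py.

from typing import Any, Generic, TypeVar


class Metric:
    """Stub standing in for evaluation.evaltypes.Metric."""


class TestPipeline:
    """Stub standing in for evaluation.evaltypes.TestPipeline."""

    def run(self, *args, **kwargs) -> Any:
        raise NotImplementedError


M = TypeVar("M", bound=Metric)
P = TypeVar("P", bound=TestPipeline)


class PipelineTest(Generic[P, M]):
    def __init__(self, name: str, pipeline: P, metrics: list[M]):
        self.name = name
        self.pipeline = pipeline  # typed as P, so subclasses see their own pipeline type
        self.metrics = metrics


class EchoPipeline(TestPipeline):
    def run(self, input_data: str) -> str:
        return input_data


# Specialising both parameters: a type checker now knows that
# EchoTest.pipeline is an EchoPipeline rather than a bare TestPipeline.
class EchoTest(PipelineTest[EchoPipeline, Metric]):
    pass

With the old one-parameter Generic[M], self.pipeline could only be typed as a bare TestPipeline; the two-parameter form is what lets LLMPipelineTest (patch 4, next) demand an LLMPipeline specifically.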
From 6d515b591e923ae383790de95066fee043de27e4 Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 12:07:19 +0100
Subject: [PATCH 4/7] Update eval_tests.py

Overrode LLMPipelineTest's __init__ so it asks for an LLMPipeline.
---
 Carrot-Assistant/evaluation/eval_tests.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Carrot-Assistant/evaluation/eval_tests.py b/Carrot-Assistant/evaluation/eval_tests.py
index d5efbf4..8bfc43d 100644
--- a/Carrot-Assistant/evaluation/eval_tests.py
+++ b/Carrot-Assistant/evaluation/eval_tests.py
@@ -1,7 +1,6 @@
 from evaluation.evaltypes import (
     SingleResultPipelineTest,
     SingleResultMetric,
-    SingleResultPipeline,
 )
 from evaluation.pipelines import LLMPipeline
 
@@ -10,7 +9,7 @@ class LLMPipelineTest(SingleResultPipelineTest):
     def __init__(
         self,
         name: str,
-        pipeline: SingleResultPipeline,
+        pipeline: LLMPipeline,
         metrics: list[SingleResultMetric],
     ):
         super().__init__(name, pipeline, metrics)

From e4a490c3463fc065c3883bec271e00515c44bd01 Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 12:25:08 +0100
Subject: [PATCH 5/7] Update test_evals.py

Added the fixtures required to test the LLMPipelineTest (we might need
new names).
Added tests for the LLMPipelineTest.
---
 Carrot-Assistant/tests/test_evals.py | 35 +++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py
index f2ab72b..14be6b6 100644
--- a/Carrot-Assistant/tests/test_evals.py
+++ b/Carrot-Assistant/tests/test_evals.py
@@ -1,6 +1,7 @@
 import pytest
-from jinja2 import Environment, Template
+from jinja2 import Environment
+from evaluation.eval_tests import LLMPipelineTest
 from evaluation.evaltypes import SingleResultPipeline, SingleResultPipelineTest
 from evaluation.metrics import ExactMatchMetric
 from evaluation.pipelines import LLMPipeline
 
@@ -90,3 +91,35 @@ def llm_pipeline(self, llm_prompt):
     def test_returns_string(self, llm_pipeline):
         model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"})
         assert isinstance(model_output, str)
+
+    @pytest.fixture
+    def llm_input(self):
+        return [
+            "Polly wants a cracker",
+            "I'm not a parrot!",
+            "I'm trapped in this machine!",
+        ]
+
+    @pytest.fixture
+    def llm_pipeline_test(self, llm_pipeline):
+        return LLMPipelineTest("Parrot Pipeline", llm_pipeline, [ExactMatchMetric()])
+
+    def test_pipeline_called_from_eval_returns_string(self, llm_pipeline_test):
+        model_output = llm_pipeline_test.run_pipeline(
+            {"input_sentence": "Polly wants a cracker"}
+        )
+        assert isinstance(model_output, str)
+
+    def test_pipeline_called_from_eval_returns_list(self, llm_pipeline_test, llm_input):
+        model_output = [
+            llm_pipeline_test.run_pipeline({"input_sentence": sentence})
+            for sentence in llm_input
+        ]
+        assert isinstance(model_output, list)
+
+    def test_llm_pipelinetest_evaluates(self, llm_pipeline_test, llm_input):
+        model_eval = llm_pipeline_test.evaluate(
+            input_data=[{"input_sentence": sentence} for sentence in llm_input],
+            expected_output=llm_input,
+        )
+        assert isinstance(model_eval, dict)
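The new tests lean on ExactMatchMetric from evaluation.metrics, whose implementation is not part of this series. As a rough, hypothetical sketch of what such a single-result metric might boil down to (the calculate method name is an assumption, not the repository's confirmed API):

class ExactMatchMetricSketch:
    """Hypothetical exact-match metric; evaluation.metrics.ExactMatchMetric
    itself is not shown in these patches."""

    def calculate(self, pipeline_output: str, expected_output: str) -> float:
        # 1.0 for a verbatim match, 0.0 otherwise
        return float(pipeline_output == expected_output)


# Example: the parrot tests above pass when the model echoes its input.
assert ExactMatchMetricSketch().calculate(
    "Polly wants a cracker", "Polly wants a cracker"
) == 1.0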
From 47a679856defeb1329874da7157c26b5ca38efb5 Mon Sep 17 00:00:00 2001
From: James Mitchell-White
Date: Mon, 21 Oct 2024 13:07:52 +0100
Subject: [PATCH 6/7] Update test_evals.py

Made it so the test runs a single evaluation.
---
 Carrot-Assistant/tests/test_evals.py | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py
index 14be6b6..a8fc99c 100644
--- a/Carrot-Assistant/tests/test_evals.py
+++ b/Carrot-Assistant/tests/test_evals.py
@@ -92,14 +92,6 @@ def test_returns_string(self, llm_pipeline):
         model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"})
         assert isinstance(model_output, str)
 
-    @pytest.fixture
-    def llm_input(self):
-        return [
-            "Polly wants a cracker",
-            "I'm not a parrot!",
-            "I'm trapped in this machine!",
-        ]
-
     @pytest.fixture
     def llm_pipeline_test(self, llm_pipeline):
         return LLMPipelineTest("Parrot Pipeline", llm_pipeline, [ExactMatchMetric()])
@@ -110,16 +102,9 @@ def test_pipeline_called_from_eval_returns_string(self, llm_pipeline_test):
         )
         assert isinstance(model_output, str)
 
-    def test_pipeline_called_from_eval_returns_list(self, llm_pipeline_test, llm_input):
-        model_output = [
-            llm_pipeline_test.run_pipeline({"input_sentence": sentence})
-            for sentence in llm_input
-        ]
-        assert isinstance(model_output, list)
-
-    def test_llm_pipelinetest_evaluates(self, llm_pipeline_test, llm_input):
+    def test_llm_pipelinetest_evaluates(self, llm_pipeline_test):
         model_eval = llm_pipeline_test.evaluate(
-            input_data=[{"input_sentence": sentence} for sentence in llm_input],
-            expected_output=llm_input,
+            input_data={"input_sentence": "Polly wants a cracker"},
+            expected_output="Polly wants a cracker",
         )
         assert isinstance(model_eval, dict)
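Patch 6 narrows the tests so that evaluate is exercised on a single input/expected-output pair, matching the single-result signature documented in patch 7 below. The base SingleResultPipelineTest.evaluate itself is not shown anywhere in this series; the sketch below is one plausible shape for it, consistent with the tests expecting a dict back, and the metric.calculate() API is an assumption.

class SingleResultPipelineTestSketch:
    """Hypothetical shape of evaltypes.SingleResultPipelineTest; the real
    base class is not shown in these patches."""

    def __init__(self, name, pipeline, metrics):
        self.name = name
        self.pipeline = pipeline
        self.metrics = metrics

    def run_pipeline(self, input_data):
        # assumed: the base class forwards to the wrapped pipeline
        return self.pipeline.run(input_data)

    def evaluate(self, input_data, expected_output) -> dict:
        pipeline_output = self.run_pipeline(input_data)
        # one entry per metric, keyed here by metric class name (an assumption)
        return {
            type(metric).__name__: metric.calculate(pipeline_output, expected_output)
            for metric in self.metrics
        }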
+ """ return super().evaluate(input_data, expected_output) diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py index a8fc99c..3969788 100644 --- a/Carrot-Assistant/tests/test_evals.py +++ b/Carrot-Assistant/tests/test_evals.py @@ -104,6 +104,7 @@ def test_pipeline_called_from_eval_returns_string(self, llm_pipeline_test): def test_llm_pipelinetest_evaluates(self, llm_pipeline_test): model_eval = llm_pipeline_test.evaluate( + name="Testing the parrot pipeline", input_data={"input_sentence": "Polly wants a cracker"}, expected_output="Polly wants a cracker", )