From eccc8c5801fcd9873222b680ee32941b86701083 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:02:03 +0100 Subject: [PATCH 01/10] Create pipelines.py --- Carrot-Assistant/evaluation/pipelines.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Carrot-Assistant/evaluation/pipelines.py diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py new file mode 100644 index 0000000..e69de29 From 1657e1ec65ef65f3b038bc933f99039a688aabd3 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:08:55 +0100 Subject: [PATCH 02/10] Update pipelines.py Created LLMPipeline class, which initialises with an LLM and prompt_template --- Carrot-Assistant/evaluation/pipelines.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index e69de29..24d8dd5 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -0,0 +1,12 @@ +from evaltypes import SingleResultPipeline +from options.pipeline_options import LLMModel +from jinja2 import Template + + +class LLMPipeline(SingleResultPipeline): + def __init__(self, llm: LLMModel, prompt_template: Template) -> None: + self.llm = (LLMModel,) + self.prompt_template = prompt_template + + def run(self, input) -> str: + pass From e12d3709f1674b72ea4379906acf46ae12185301 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:16:49 +0100 Subject: [PATCH 03/10] Update pipelines.py implemented LLMPipeline.run() to take an input (which has to be a dict so jinja2 can use as a template) --- Carrot-Assistant/evaluation/pipelines.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 24d8dd5..71fa8b3 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -1,12 +1,20 @@ +from typing import Dict from evaltypes import SingleResultPipeline from options.pipeline_options import LLMModel +from components.models import local_models from jinja2 import Template +from llama_cpp import Llama +from huggingface_hub import hf_hub_download class LLMPipeline(SingleResultPipeline): def __init__(self, llm: LLMModel, prompt_template: Template) -> None: - self.llm = (LLMModel,) + self.llm = llm self.prompt_template = prompt_template + self._model = Llama(hf_hub_download(**local_models[self.llm])) - def run(self, input) -> str: - pass + def run(self, input: Dict[str, str]) -> str: + prompt = self.prompt_template.render(input) + return self._model.create_chat_completion( + messages=[{"role": "user", "content": prompt}] + )["choices"][0]["message"] From 232e8ab557bf95926c574dcf6ace510054600f78 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:06:09 +0100 Subject: [PATCH 04/10] black formatting deleted an import from components/pipeline.py, black formatting on save passed the value of the LLMModel enum to the evaluation LLMPipeline --- Carrot-Assistant/components/pipeline.py | 74 +++++++++++++----------- Carrot-Assistant/evaluation/pipelines.py | 2 +- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/Carrot-Assistant/components/pipeline.py b/Carrot-Assistant/components/pipeline.py index 29e8249..7c35f95 100644 --- a/Carrot-Assistant/components/pipeline.py +++ b/Carrot-Assistant/components/pipeline.py @@ -9,7 +9,6 @@ from 
components.embeddings import Embeddings from components.models import get_model from components.prompt import Prompts -from tests.test_prompt_build import mock_rag_results class llm_pipeline: @@ -52,10 +51,12 @@ def get_simple_assistant(self) -> Pipeline: self._logger.info(f"Pipeline initialized in {time.time()-start} seconds") start = time.time() - pipeline.add_component("prompt", Prompts( - model_name=self._model_name, - eot_token=self._eot_token - ).get_prompt()) + pipeline.add_component( + "prompt", + Prompts( + model_name=self._model_name, eot_token=self._eot_token + ).get_prompt(), + ) self._logger.info(f"Prompt added to pipeline in {time.time()-start} seconds") start = time.time() @@ -72,6 +73,7 @@ def get_simple_assistant(self) -> Pipeline: self._logger.info(f"Pipeline connected in {time.time()-start} seconds") return pipeline + def get_rag_assistant(self) -> Pipeline: """ Get an assistant that uses vector search to populate a prompt for an LLM @@ -85,46 +87,50 @@ def get_rag_assistant(self) -> Pipeline: pipeline = Pipeline() self._logger.info(f"Pipeline initialized in {time.time()-start} seconds") start = time.time() - - + vec_search = Embeddings( - embeddings_path=self._opt.embeddings_path, - force_rebuild=self._opt.force_rebuild, - embed_vocab=self._opt.embed_vocab, - model_name=self._opt.embedding_model, - search_kwargs=self._opt.embedding_search_kwargs - ) - + embeddings_path=self._opt.embeddings_path, + force_rebuild=self._opt.force_rebuild, + embed_vocab=self._opt.embed_vocab, + model_name=self._opt.embedding_model, + search_kwargs=self._opt.embedding_search_kwargs, + ) + vec_embedder = vec_search.get_embedder() vec_retriever = vec_search.get_retriever() - router = ConditionalRouter(routes=[ - { - "condition": "{{vec_results[0].score > 0.95}}", - "output": "{{vec_results}}", - "output_name": "exact_match", - "output_type": List[Dict], - }, - { - "condition": "{{vec_results[0].score <=0.95}}", - "output": "{{vec_results}}", - "output_name": "no_exact_match", - "output_type": List[Dict] - } - ]) + router = ConditionalRouter( + routes=[ + { + "condition": "{{vec_results[0].score > 0.95}}", + "output": "{{vec_results}}", + "output_name": "exact_match", + "output_type": List[Dict], + }, + { + "condition": "{{vec_results[0].score <=0.95}}", + "output": "{{vec_results}}", + "output_name": "no_exact_match", + "output_type": List[Dict], + }, + ] + ) llm = get_model( model_name=self._model_name, temperature=self._opt.temperature, logger=self._logger, ) - + pipeline.add_component("query_embedder", vec_embedder) pipeline.add_component("retriever", vec_retriever) pipeline.add_component("router", router) - pipeline.add_component("prompt", Prompts( - model_name=self._model_name, - prompt_type="top_n_RAG", - eot_token=self._eot_token - ).get_prompt()) + pipeline.add_component( + "prompt", + Prompts( + model_name=self._model_name, + prompt_type="top_n_RAG", + eot_token=self._eot_token, + ).get_prompt(), + ) pipeline.add_component("llm", llm) pipeline.connect("query_embedder.embedding", "retriever.query_embedding") diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 71fa8b3..f30eb46 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -11,7 +11,7 @@ class LLMPipeline(SingleResultPipeline): def __init__(self, llm: LLMModel, prompt_template: Template) -> None: self.llm = llm self.prompt_template = prompt_template - self._model = Llama(hf_hub_download(**local_models[self.llm])) + self._model = 
Llama(hf_hub_download(**local_models[self.llm.value])) def run(self, input: Dict[str, str]) -> str: prompt = self.prompt_template.render(input) From 7907b79470f0f787f55dbd1d7d7b5eba75714a47 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:08:07 +0100 Subject: [PATCH 05/10] Update evaltypes.py Changed TestPipeline .run abstract method for compatibility of return types --- Carrot-Assistant/evaluation/evaltypes.py | 77 ++++++++++++++++-------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py index bc0ec7b..52798aa 100644 --- a/Carrot-Assistant/evaluation/evaltypes.py +++ b/Carrot-Assistant/evaluation/evaltypes.py @@ -1,18 +1,23 @@ from abc import ABC, abstractmethod from typing import TypeVar, Generic + class EvaluationFramework: - def __init__(self, results_file='results.json'): + def __init__(self, results_file="results.json"): self.results_file = results_file + def run_evaluations(self): # Run some tests self._save_evaluations + def _save_evaluations(self): - # Append to 'results.json' + # Append to 'results.json' pass + class Metric(ABC): """Base class for all metrics.""" + @abstractmethod def calculate(self, *args, **kwargs) -> float: """ @@ -20,23 +25,28 @@ def calculate(self, *args, **kwargs) -> float: """ pass + class TestPipeline(ABC): - """ - Base class for Pipeline runs - """ - @abstractmethod - def run(self, *args, **kwargs): - """ - Run the pipeline - """ - pass - -M = TypeVar('M', bound=Metric) + """ + Base class for Pipeline runs + """ + + @abstractmethod + def run(self, *args, **kwargs): + """ + Run the pipeline + """ + ... + + +M = TypeVar("M", bound=Metric) + class PipelineTest(Generic[M]): """ Base class for Pipeline tests """ + def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): self.pipeline = pipeline self.metrics = metrics @@ -44,50 +54,65 @@ def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): @abstractmethod def run_pipeline(self, *args, **kwargs): pass + @abstractmethod def evaluate(self, *args, **kwargs) -> dict[str, float]: - pass + pass + class SingleResultMetric(Metric): """Metric for evaluating pipelines that return a single result.""" + class InformationRetrievalMetric(Metric): """Metric for evaluating information retrieval pipelines.""" + pass + class SingleResultPipeline(TestPipeline): - """ - Base class for pipelines returning a single result - """ + """ + Base class for pipelines returning a single result + """ + class SingleResultPipelineTest(PipelineTest[SingleResultMetric]): - def __init__(self, name: str, pipeline: SingleResultPipeline, metrics: list[SingleResultMetric]): + def __init__( + self, + name: str, + pipeline: SingleResultPipeline, + metrics: list[SingleResultMetric], + ): super().__init__(name, pipeline, metrics) - + def run_pipeline(self, input_data): """ Run the pipeline with the given input data. - + Args: input_data: The input data for the pipeline. - + Returns: The result of running the pipeline on the input data. """ return self.pipeline.run(input_data) - + def evaluate(self, input_data, expected_output): """ Evaluate the pipeline by running it on the input data and comparing the result to the expected output using all metrics. - + Args: input_data: The input data for the pipeline. expected_output: The expected output to compare against. - + Returns: A dictionary mapping metric names to their calculated values. 
""" pipeline_output = self.run_pipeline(input_data) - return {metric.__class__.__name__: metric.calculate(pipeline_output, expected_output) - for metric in self.metrics} + return { + metric.__class__.__name__: metric.calculate( + pipeline_output, expected_output + ) + for metric in self.metrics + } From 13269d68b5d300da86892b20bf61a1a9b8cb9edc Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:26:37 +0100 Subject: [PATCH 06/10] Added type test minor change to type signature of TestPipeline.run abstract method Added a test that the LLMPipeline.run should return a string --- Carrot-Assistant/evaluation/evaltypes.py | 5 ++- Carrot-Assistant/tests/test_evals.py | 57 +++++++++++++++++++++--- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py index 52798aa..ad9d343 100644 --- a/Carrot-Assistant/evaluation/evaltypes.py +++ b/Carrot-Assistant/evaluation/evaltypes.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import TypeVar, Generic +from typing import TypeVar, Generic, Any class EvaluationFramework: @@ -32,7 +32,7 @@ class TestPipeline(ABC): """ @abstractmethod - def run(self, *args, **kwargs): + def run(self, *args, **kwargs) -> Any: """ Run the pipeline """ @@ -48,6 +48,7 @@ class PipelineTest(Generic[M]): """ def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): + self.name = name self.pipeline = pipeline self.metrics = metrics diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py index f7f4c9c..a823fd3 100644 --- a/Carrot-Assistant/tests/test_evals.py +++ b/Carrot-Assistant/tests/test_evals.py @@ -1,18 +1,26 @@ import pytest +from jinja2 import Environment, Template + from evaluation.evaltypes import SingleResultPipeline, SingleResultPipelineTest from evaluation.metrics import ExactMatchMetric +from evaluation.pipelines import LLMPipeline + +from options.pipeline_options import LLMModel + class IdentityPipeline(SingleResultPipeline): def run(self, input_data): return input_data + class ExactMatchTest(SingleResultPipelineTest): def __init__(self, name: str, pipeline: SingleResultPipeline): super().__init__(name, pipeline, [ExactMatchMetric()]) - + def run_pipeline(self, input_data): return self.pipeline.run(input_data) - + + class TestExactMatch: @pytest.fixture def identity_pipeline(self): @@ -20,7 +28,9 @@ def identity_pipeline(self): @pytest.fixture def exact_match_test(self, identity_pipeline): - return SingleResultPipelineTest("Exact Match Test", identity_pipeline, [ExactMatchMetric()]) + return SingleResultPipelineTest( + "Exact Match Test", identity_pipeline, [ExactMatchMetric()] + ) @pytest.fixture def all_match_dataset(self): @@ -32,12 +42,20 @@ def no_match_dataset(self): @pytest.fixture def half_match_dataset(self): - return [("input1", "input1"), ("input2", "output2"), ("input3", "input3"), ("input4", "output4")] + return [ + ("input1", "input1"), + ("input2", "output2"), + ("input3", "input3"), + ("input4", "output4"), + ] def run_test(self, test, dataset): - results = [test.evaluate(input_data, expected_output) for input_data, expected_output in dataset] - exact_match_results = [result['ExactMatchMetric'] for result in results] - return sum(exact_match_results) / len(exact_match_results) + results = [ + test.evaluate(input_data, expected_output) + for input_data, expected_output in dataset + ] + exact_match_results = [result["ExactMatchMetric"] for result in results] + return 
sum(exact_match_results) / len(exact_match_results) def test_all_match(self, exact_match_test, all_match_dataset): assert self.run_test(exact_match_test, all_match_dataset) == 1.0 @@ -47,3 +65,28 @@ def test_no_match(self, exact_match_test, no_match_dataset): def test_half_match(self, exact_match_test, half_match_dataset): assert self.run_test(exact_match_test, half_match_dataset) == 0.5 + + +# LLM pipeline tests + + +class TestBasicLLM: + @pytest.fixture + def llm_prompt(self): + env = Environment() + template = env.from_string( + """ + You are a parrot that repeats whatever is said to you, with no explanation. You will be given a sentence as input, repeat it. + + Sentence: {{input_sentence}} + """ + ) + return template + + @pytest.fixture + def llm_pipeline(self, llm_prompt): + return LLMPipeline(LLMModel["llama-3.1-8b"], llm_prompt) + + def test_returns_string(self, llm_pipeline): + model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"}) + assert isinstance(model_output, str) From 9f76c55596011c8040842902f5818c78419de001 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:28:17 +0100 Subject: [PATCH 07/10] Update pipelines.py fixed import for testing --- Carrot-Assistant/evaluation/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index f30eb46..62d228a 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -1,5 +1,5 @@ from typing import Dict -from evaltypes import SingleResultPipeline +from evaluation.evaltypes import SingleResultPipeline from options.pipeline_options import LLMModel from components.models import local_models from jinja2 import Template From 127dba32d703d030dfba56ce41ac2dd6703a9a01 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:30:40 +0100 Subject: [PATCH 08/10] bugfixes shouldn't keep test logs forgot how to use enums --- .gitignore | 1 + Carrot-Assistant/tests/test_evals.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3969475..500b88f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ RAG/tmp.py Carrot-Assistant/omop_tmp.py RAG/.cache/ *.qdrant +/Carrot-Assistant/tests/log diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py index a823fd3..f2ab72b 100644 --- a/Carrot-Assistant/tests/test_evals.py +++ b/Carrot-Assistant/tests/test_evals.py @@ -85,7 +85,7 @@ def llm_prompt(self): @pytest.fixture def llm_pipeline(self, llm_prompt): - return LLMPipeline(LLMModel["llama-3.1-8b"], llm_prompt) + return LLMPipeline(LLMModel.LLAMA_3_1_8B, llm_prompt) def test_returns_string(self, llm_pipeline): model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"}) From a5b4a3131df3319ed2cdc1f1cb63ee95a1fbc517 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:36:40 +0100 Subject: [PATCH 09/10] Update pipelines.py Modified to pass tests - now returns a string --- Carrot-Assistant/evaluation/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 62d228a..50b066c 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -17,4 +17,4 @@ def run(self, input: Dict[str, str]) -> str: prompt = self.prompt_template.render(input) return 
self._model.create_chat_completion( messages=[{"role": "user", "content": prompt}] - )["choices"][0]["message"] + )["choices"][0]["message"]["content"] From 2b4d1ee475d37b5f6568348743ecec915c50ce43 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:44:08 +0100 Subject: [PATCH 10/10] Update pipelines.py Documentation --- Carrot-Assistant/evaluation/pipelines.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 50b066c..30f3066 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -8,12 +8,39 @@ class LLMPipeline(SingleResultPipeline): + """ + This class runs a simple LLM-only pipeline on provided input + """ + def __init__(self, llm: LLMModel, prompt_template: Template) -> None: + """ + Initialises the LLMPipeline class + + Parameters + ---------- + llm: LLMModel + One of the model options in the LLMModel enum + prompt_template: Template + A jinja2 template for a prompt + """ self.llm = llm self.prompt_template = prompt_template self._model = Llama(hf_hub_download(**local_models[self.llm.value])) def run(self, input: Dict[str, str]) -> str: + """ + Runs the LLMPipeline on a given input + + Parameters + ---------- + input: Dict[str, str] + The input is rendered into a prompt string by the .render method of the prompt template, so needs to be a dictionary of the template's parameters + + Returns + ------- + str + The output of running the prompt through the given model + """ prompt = self.prompt_template.render(input) return self._model.create_chat_completion( messages=[{"role": "user", "content": prompt}]
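
Usage sketch (not part of the patches): the snippet below shows how the classes introduced across this series are intended to fit together, mirroring the fixtures in tests/test_evals.py. It is illustrative only. It assumes that evaluation.metrics.ExactMatchMetric.calculate() compares the pipeline output string with the expected string, and that the llama-3.1-8b entry in components.models.local_models resolves to weights that hf_hub_download can fetch; neither detail is defined in these patches.

# Illustrative sketch only: wires together the classes added in this patch series.
# Assumptions (not shown in the patches): ExactMatchMetric compares two strings for
# equality, and the llama-3.1-8b entry in local_models points at downloadable weights.
from jinja2 import Environment

from evaluation.evaltypes import SingleResultPipelineTest
from evaluation.metrics import ExactMatchMetric
from evaluation.pipelines import LLMPipeline
from options.pipeline_options import LLMModel

# The "parrot" prompt used in tests/test_evals.py
template = Environment().from_string(
    "You are a parrot that repeats whatever is said to you, with no explanation. "
    "You will be given a sentence as input, repeat it.\n\nSentence: {{input_sentence}}"
)

# LLMPipeline fetches the model via hf_hub_download and wraps it in llama_cpp.Llama
pipeline = LLMPipeline(llm=LLMModel.LLAMA_3_1_8B, prompt_template=template)

# Wrap the pipeline in a test that scores its output with exact match
test = SingleResultPipelineTest(
    name="Parrot exact match",
    pipeline=pipeline,
    metrics=[ExactMatchMetric()],
)

# evaluate() renders the prompt, runs the model, and returns {metric class name: score},
# e.g. {"ExactMatchMetric": 1.0}
scores = test.evaluate(
    {"input_sentence": "Polly wants a cracker"},
    "Polly wants a cracker",
)
print(scores)

Because evaluate() keys its results by metric class name, further SingleResultMetric implementations can be appended to the metrics list and will appear as extra entries in the returned dictionary without any change to the test class.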