diff --git a/.gitignore b/.gitignore
index 3969475..500b88f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ RAG/tmp.py
 Carrot-Assistant/omop_tmp.py
 RAG/.cache/
 *.qdrant
+/Carrot-Assistant/tests/log
diff --git a/Carrot-Assistant/components/pipeline.py b/Carrot-Assistant/components/pipeline.py
index 29e8249..7c35f95 100644
--- a/Carrot-Assistant/components/pipeline.py
+++ b/Carrot-Assistant/components/pipeline.py
@@ -9,7 +9,6 @@
 from components.embeddings import Embeddings
 from components.models import get_model
 from components.prompt import Prompts
-from tests.test_prompt_build import mock_rag_results
 
 
 class llm_pipeline:
@@ -52,10 +51,12 @@ def get_simple_assistant(self) -> Pipeline:
         self._logger.info(f"Pipeline initialized in {time.time()-start} seconds")
 
         start = time.time()
-        pipeline.add_component("prompt", Prompts(
-            model_name=self._model_name,
-            eot_token=self._eot_token
-            ).get_prompt())
+        pipeline.add_component(
+            "prompt",
+            Prompts(
+                model_name=self._model_name, eot_token=self._eot_token
+            ).get_prompt(),
+        )
 
         self._logger.info(f"Prompt added to pipeline in {time.time()-start} seconds")
         start = time.time()
@@ -72,6 +73,7 @@ def get_simple_assistant(self) -> Pipeline:
         self._logger.info(f"Pipeline connected in {time.time()-start} seconds")
         return pipeline
+
     def get_rag_assistant(self) -> Pipeline:
         """
         Get an assistant that uses vector search to populate a prompt for an LLM
 
@@ -85,46 +87,50 @@ def get_rag_assistant(self) -> Pipeline:
         pipeline = Pipeline()
         self._logger.info(f"Pipeline initialized in {time.time()-start} seconds")
         start = time.time()
-
-
+
         vec_search = Embeddings(
-                embeddings_path=self._opt.embeddings_path,
-                force_rebuild=self._opt.force_rebuild,
-                embed_vocab=self._opt.embed_vocab,
-                model_name=self._opt.embedding_model,
-                search_kwargs=self._opt.embedding_search_kwargs
-                )
-
+            embeddings_path=self._opt.embeddings_path,
+            force_rebuild=self._opt.force_rebuild,
+            embed_vocab=self._opt.embed_vocab,
+            model_name=self._opt.embedding_model,
+            search_kwargs=self._opt.embedding_search_kwargs,
+        )
+
         vec_embedder = vec_search.get_embedder()
         vec_retriever = vec_search.get_retriever()
-        router = ConditionalRouter(routes=[
-            {
-                "condition": "{{vec_results[0].score > 0.95}}",
-                "output": "{{vec_results}}",
-                "output_name": "exact_match",
-                "output_type": List[Dict],
-            },
-            {
-                "condition": "{{vec_results[0].score <=0.95}}",
-                "output": "{{vec_results}}",
-                "output_name": "no_exact_match",
-                "output_type": List[Dict]
-            }
-        ])
+        router = ConditionalRouter(
+            routes=[
+                {
+                    "condition": "{{vec_results[0].score > 0.95}}",
+                    "output": "{{vec_results}}",
+                    "output_name": "exact_match",
+                    "output_type": List[Dict],
+                },
+                {
+                    "condition": "{{vec_results[0].score <=0.95}}",
+                    "output": "{{vec_results}}",
+                    "output_name": "no_exact_match",
+                    "output_type": List[Dict],
+                },
+            ]
+        )
         llm = get_model(
             model_name=self._model_name,
             temperature=self._opt.temperature,
            logger=self._logger,
         )
-
+
         pipeline.add_component("query_embedder", vec_embedder)
         pipeline.add_component("retriever", vec_retriever)
         pipeline.add_component("router", router)
-        pipeline.add_component("prompt", Prompts(
-            model_name=self._model_name,
-            prompt_type="top_n_RAG",
-            eot_token=self._eot_token
-            ).get_prompt())
+        pipeline.add_component(
+            "prompt",
+            Prompts(
+                model_name=self._model_name,
+                prompt_type="top_n_RAG",
+                eot_token=self._eot_token,
+            ).get_prompt(),
+        )
         pipeline.add_component("llm", llm)
 
         pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py
index bc0ec7b..ad9d343 100644
--- a/Carrot-Assistant/evaluation/evaltypes.py
+++ b/Carrot-Assistant/evaluation/evaltypes.py
@@ -1,18 +1,23 @@
 from abc import ABC, abstractmethod
-from typing import TypeVar, Generic
+from typing import TypeVar, Generic, Any
+
 
 class EvaluationFramework:
-    def __init__(self, results_file='results.json'):
+    def __init__(self, results_file="results.json"):
         self.results_file = results_file
+
     def run_evaluations(self):
         # Run some tests
         self._save_evaluations
+
     def _save_evaluations(self):
-        # Append to 'results.json' 
+        # Append to 'results.json'
         pass
 
+
 class Metric(ABC):
     """Base class for all metrics."""
+
     @abstractmethod
     def calculate(self, *args, **kwargs) -> float:
         """
@@ -20,74 +25,95 @@ def calculate(self, *args, **kwargs) -> float:
         """
         pass
 
+
 class TestPipeline(ABC):
-    """
-    Base class for Pipeline runs
-    """
-    @abstractmethod
-    def run(self, *args, **kwargs):
-        """
-        Run the pipeline
-        """
-        pass
-
-M = TypeVar('M', bound=Metric)
+    """
+    Base class for Pipeline runs
+    """
+
+    @abstractmethod
+    def run(self, *args, **kwargs) -> Any:
+        """
+        Run the pipeline
+        """
+        ...
+
+
+M = TypeVar("M", bound=Metric)
+
 
 class PipelineTest(Generic[M]):
     """
     Base class for Pipeline tests
     """
+
     def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]):
+        self.name = name
         self.pipeline = pipeline
         self.metrics = metrics
 
     @abstractmethod
     def run_pipeline(self, *args, **kwargs):
         pass
+
     @abstractmethod
     def evaluate(self, *args, **kwargs) -> dict[str, float]:
-        pass 
+        pass
 
+
 class SingleResultMetric(Metric):
     """Metric for evaluating pipelines that return a single result."""
 
+
 class InformationRetrievalMetric(Metric):
     """Metric for evaluating information retrieval pipelines."""
+
     pass
 
+
 class SingleResultPipeline(TestPipeline):
-    """
-    Base class for pipelines returning a single result
-    """
+    """
+    Base class for pipelines returning a single result
+    """
+
 
 class SingleResultPipelineTest(PipelineTest[SingleResultMetric]):
-    def __init__(self, name: str, pipeline: SingleResultPipeline, metrics: list[SingleResultMetric]):
+    def __init__(
+        self,
+        name: str,
+        pipeline: SingleResultPipeline,
+        metrics: list[SingleResultMetric],
+    ):
         super().__init__(name, pipeline, metrics)
-
+
     def run_pipeline(self, input_data):
         """
         Run the pipeline with the given input data.
-
+
         Args:
            input_data: The input data for the pipeline.
-
+
         Returns:
             The result of running the pipeline on the input data.
         """
         return self.pipeline.run(input_data)
-
+
     def evaluate(self, input_data, expected_output):
         """
         Evaluate the pipeline by running it on the input data and comparing
         the result to the expected output using all metrics.
-
+
         Args:
             input_data: The input data for the pipeline.
             expected_output: The expected output to compare against.
-
+
         Returns:
             A dictionary mapping metric names to their calculated values.
         """
         pipeline_output = self.run_pipeline(input_data)
-        return {metric.__class__.__name__: metric.calculate(pipeline_output, expected_output)
-                for metric in self.metrics}
+        return {
+            metric.__class__.__name__: metric.calculate(
+                pipeline_output, expected_output
+            )
+            for metric in self.metrics
+        }
diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py
new file mode 100644
index 0000000..30f3066
--- /dev/null
+++ b/Carrot-Assistant/evaluation/pipelines.py
@@ -0,0 +1,47 @@
+from typing import Dict
+from evaluation.evaltypes import SingleResultPipeline
+from options.pipeline_options import LLMModel
+from components.models import local_models
+from jinja2 import Template
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+
+
+class LLMPipeline(SingleResultPipeline):
+    """
+    This class runs a simple LLM-only pipeline on provided input
+    """
+
+    def __init__(self, llm: LLMModel, prompt_template: Template) -> None:
+        """
+        Initialises the LLMPipeline class
+
+        Parameters
+        ----------
+        llm: LLMModel
+            One of the model options in the LLMModel enum
+        prompt_template: Template
+            A jinja2 template for a prompt
+        """
+        self.llm = llm
+        self.prompt_template = prompt_template
+        self._model = Llama(hf_hub_download(**local_models[self.llm.value]))
+
+    def run(self, input: Dict[str, str]) -> str:
+        """
+        Runs the LLMPipeline on a given input
+
+        Parameters
+        ----------
+        input: Dict[str, str]
+            The input is rendered into a prompt string by the .render method of the prompt template, so needs to be a dictionary of the template's parameters
+
+        Returns
+        -------
+        str
+            The output of running the prompt through the given model
+        """
+        prompt = self.prompt_template.render(input)
+        return self._model.create_chat_completion(
+            messages=[{"role": "user", "content": prompt}]
+        )["choices"][0]["message"]["content"]
diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py
index f7f4c9c..f2ab72b 100644
--- a/Carrot-Assistant/tests/test_evals.py
+++ b/Carrot-Assistant/tests/test_evals.py
@@ -1,18 +1,26 @@
 import pytest
+from jinja2 import Environment, Template
+
 from evaluation.evaltypes import SingleResultPipeline, SingleResultPipelineTest
 from evaluation.metrics import ExactMatchMetric
+from evaluation.pipelines import LLMPipeline
+
+from options.pipeline_options import LLMModel
+
+
 
 class IdentityPipeline(SingleResultPipeline):
     def run(self, input_data):
         return input_data
 
+
 class ExactMatchTest(SingleResultPipelineTest):
     def __init__(self, name: str, pipeline: SingleResultPipeline):
         super().__init__(name, pipeline, [ExactMatchMetric()])
-
     def run_pipeline(self, input_data):
         return self.pipeline.run(input_data)
-
+
+
 class TestExactMatch:
     @pytest.fixture
     def identity_pipeline(self):
@@ -20,7 +28,9 @@ def identity_pipeline(self):
 
     @pytest.fixture
     def exact_match_test(self, identity_pipeline):
-        return SingleResultPipelineTest("Exact Match Test", identity_pipeline, [ExactMatchMetric()])
+        return SingleResultPipelineTest(
+            "Exact Match Test", identity_pipeline, [ExactMatchMetric()]
+        )
 
     @pytest.fixture
     def all_match_dataset(self):
@@ -32,12 +42,20 @@ def no_match_dataset(self):
 
     @pytest.fixture
     def half_match_dataset(self):
-        return [("input1", "input1"), ("input2", "output2"), ("input3", "input3"), ("input4", "output4")]
+        return [
+            ("input1", "input1"),
+            ("input2", "output2"),
+            ("input3", "input3"),
+            ("input4", "output4"),
+        ]
 
     def run_test(self, test, dataset):
-        results = [test.evaluate(input_data, expected_output) for input_data, expected_output in dataset]
-        exact_match_results = [result['ExactMatchMetric'] for result in results]
-        return sum(exact_match_results) / len(exact_match_results)
+        results = [
+            test.evaluate(input_data, expected_output)
+            for input_data, expected_output in dataset
+        ]
+        exact_match_results = [result["ExactMatchMetric"] for result in results]
+        return sum(exact_match_results) / len(exact_match_results)
 
     def test_all_match(self, exact_match_test, all_match_dataset):
         assert self.run_test(exact_match_test, all_match_dataset) == 1.0
@@ -47,3 +65,28 @@ def test_no_match(self, exact_match_test, no_match_dataset):
 
     def test_half_match(self, exact_match_test, half_match_dataset):
         assert self.run_test(exact_match_test, half_match_dataset) == 0.5
+
+
+# LLM pipeline tests
+
+
+class TestBasicLLM:
+    @pytest.fixture
+    def llm_prompt(self):
+        env = Environment()
+        template = env.from_string(
+            """
+            You are a parrot that repeats whatever is said to you, with no explanation. You will be given a sentence as input, repeat it.
+
+            Sentence: {{input_sentence}}
+            """
+        )
+        return template
+
+    @pytest.fixture
+    def llm_pipeline(self, llm_prompt):
+        return LLMPipeline(LLMModel.LLAMA_3_1_8B, llm_prompt)
+
+    def test_returns_string(self, llm_pipeline):
+        model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"})
+        assert isinstance(model_output, str)
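
For reference, a minimal usage sketch (illustrative only, not part of the patch) of how the pieces introduced here compose: an LLMPipeline built from a jinja2 template, wrapped in a SingleResultPipelineTest and scored with ExactMatchMetric. It assumes ExactMatchMetric.calculate returns 1.0 for identical strings and that the weights behind LLMModel.LLAMA_3_1_8B are downloadable via hf_hub_download.

# Illustrative sketch only -- follows the interfaces added in this diff;
# ExactMatchMetric, LLMModel and the model weights come from the existing
# codebase and are assumed to be available.
from jinja2 import Environment

from evaluation.evaltypes import SingleResultPipelineTest
from evaluation.metrics import ExactMatchMetric
from evaluation.pipelines import LLMPipeline
from options.pipeline_options import LLMModel

# The template's variables become the keys of the dict passed to run().
template = Environment().from_string(
    "Repeat the following sentence exactly.\n\nSentence: {{input_sentence}}"
)

# Constructing the pipeline downloads the GGUF weights via hf_hub_download.
pipeline = LLMPipeline(LLMModel.LLAMA_3_1_8B, template)

# Wrap the pipeline in a test; evaluate() runs it and applies every metric.
test = SingleResultPipelineTest("parrot test", pipeline, [ExactMatchMetric()])
scores = test.evaluate(
    {"input_sentence": "Polly wants a cracker"},
    "Polly wants a cracker",
)
print(scores)  # e.g. {"ExactMatchMetric": 1.0} if the model repeats verbatim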