From eccc8c5801fcd9873222b680ee32941b86701083 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:02:03 +0100 Subject: [PATCH 01/10] Create pipelines.py --- Carrot-Assistant/evaluation/pipelines.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 Carrot-Assistant/evaluation/pipelines.py diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py new file mode 100644 index 0000000..e69de29 From 1657e1ec65ef65f3b038bc933f99039a688aabd3 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:08:55 +0100 Subject: [PATCH 02/10] Update pipelines.py Created LLMPipeline class, which initialises with an LLM and prompt_template --- Carrot-Assistant/evaluation/pipelines.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index e69de29..24d8dd5 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -0,0 +1,12 @@ +from evaltypes import SingleResultPipeline +from options.pipeline_options import LLMModel +from jinja2 import Template + + +class LLMPipeline(SingleResultPipeline): + def __init__(self, llm: LLMModel, prompt_template: Template) -> None: + self.llm = (LLMModel,) + self.prompt_template = prompt_template + + def run(self, input) -> str: + pass From e12d3709f1674b72ea4379906acf46ae12185301 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 15:16:49 +0100 Subject: [PATCH 03/10] Update pipelines.py implemented LLMPipeline.run() to take an input (which has to be a dict so jinja2 can use as a template) --- Carrot-Assistant/evaluation/pipelines.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 24d8dd5..71fa8b3 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -1,12 +1,20 @@ +from typing import Dict from evaltypes import SingleResultPipeline from options.pipeline_options import LLMModel +from components.models import local_models from jinja2 import Template +from llama_cpp import Llama +from huggingface_hub import hf_hub_download class LLMPipeline(SingleResultPipeline): def __init__(self, llm: LLMModel, prompt_template: Template) -> None: - self.llm = (LLMModel,) + self.llm = llm self.prompt_template = prompt_template + self._model = Llama(hf_hub_download(**local_models[self.llm])) - def run(self, input) -> str: - pass + def run(self, input: Dict[str, str]) -> str: + prompt = self.prompt_template.render(input) + return self._model.create_chat_completion( + messages=[{"role": "user", "content": prompt}] + )["choices"][0]["message"] From 232e8ab557bf95926c574dcf6ace510054600f78 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:06:09 +0100 Subject: [PATCH 04/10] black formatting deleted an import from components/pipeline.py, black formatting on save passed the value of the LLMModel enum to the evaluation LLMPipeline --- Carrot-Assistant/components/pipeline.py | 74 +++++++++++++----------- Carrot-Assistant/evaluation/pipelines.py | 2 +- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/Carrot-Assistant/components/pipeline.py b/Carrot-Assistant/components/pipeline.py index 29e8249..7c35f95 100644 --- a/Carrot-Assistant/components/pipeline.py +++ b/Carrot-Assistant/components/pipeline.py @@ -9,7 +9,6 @@ from 
components.embeddings import Embeddings from components.models import get_model from components.prompt import Prompts -from tests.test_prompt_build import mock_rag_results class llm_pipeline: @@ -52,10 +51,12 @@ def get_simple_assistant(self) -> Pipeline: self._logger.info(f"Pipeline initialized in {time.time()-start} seconds") start = time.time() - pipeline.add_component("prompt", Prompts( - model_name=self._model_name, - eot_token=self._eot_token - ).get_prompt()) + pipeline.add_component( + "prompt", + Prompts( + model_name=self._model_name, eot_token=self._eot_token + ).get_prompt(), + ) self._logger.info(f"Prompt added to pipeline in {time.time()-start} seconds") start = time.time() @@ -72,6 +73,7 @@ def get_simple_assistant(self) -> Pipeline: self._logger.info(f"Pipeline connected in {time.time()-start} seconds") return pipeline + def get_rag_assistant(self) -> Pipeline: """ Get an assistant that uses vector search to populate a prompt for an LLM @@ -85,46 +87,50 @@ def get_rag_assistant(self) -> Pipeline: pipeline = Pipeline() self._logger.info(f"Pipeline initialized in {time.time()-start} seconds") start = time.time() - - + vec_search = Embeddings( - embeddings_path=self._opt.embeddings_path, - force_rebuild=self._opt.force_rebuild, - embed_vocab=self._opt.embed_vocab, - model_name=self._opt.embedding_model, - search_kwargs=self._opt.embedding_search_kwargs - ) - + embeddings_path=self._opt.embeddings_path, + force_rebuild=self._opt.force_rebuild, + embed_vocab=self._opt.embed_vocab, + model_name=self._opt.embedding_model, + search_kwargs=self._opt.embedding_search_kwargs, + ) + vec_embedder = vec_search.get_embedder() vec_retriever = vec_search.get_retriever() - router = ConditionalRouter(routes=[ - { - "condition": "{{vec_results[0].score > 0.95}}", - "output": "{{vec_results}}", - "output_name": "exact_match", - "output_type": List[Dict], - }, - { - "condition": "{{vec_results[0].score <=0.95}}", - "output": "{{vec_results}}", - "output_name": "no_exact_match", - "output_type": List[Dict] - } - ]) + router = ConditionalRouter( + routes=[ + { + "condition": "{{vec_results[0].score > 0.95}}", + "output": "{{vec_results}}", + "output_name": "exact_match", + "output_type": List[Dict], + }, + { + "condition": "{{vec_results[0].score <=0.95}}", + "output": "{{vec_results}}", + "output_name": "no_exact_match", + "output_type": List[Dict], + }, + ] + ) llm = get_model( model_name=self._model_name, temperature=self._opt.temperature, logger=self._logger, ) - + pipeline.add_component("query_embedder", vec_embedder) pipeline.add_component("retriever", vec_retriever) pipeline.add_component("router", router) - pipeline.add_component("prompt", Prompts( - model_name=self._model_name, - prompt_type="top_n_RAG", - eot_token=self._eot_token - ).get_prompt()) + pipeline.add_component( + "prompt", + Prompts( + model_name=self._model_name, + prompt_type="top_n_RAG", + eot_token=self._eot_token, + ).get_prompt(), + ) pipeline.add_component("llm", llm) pipeline.connect("query_embedder.embedding", "retriever.query_embedding") diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 71fa8b3..f30eb46 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -11,7 +11,7 @@ class LLMPipeline(SingleResultPipeline): def __init__(self, llm: LLMModel, prompt_template: Template) -> None: self.llm = llm self.prompt_template = prompt_template - self._model = Llama(hf_hub_download(**local_models[self.llm])) + self._model = 
Llama(hf_hub_download(**local_models[self.llm.value])) def run(self, input: Dict[str, str]) -> str: prompt = self.prompt_template.render(input) From 7907b79470f0f787f55dbd1d7d7b5eba75714a47 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:08:07 +0100 Subject: [PATCH 05/10] Update evaltypes.py Changed TestPipeline .run abstract method for compatibility of return types --- Carrot-Assistant/evaluation/evaltypes.py | 77 ++++++++++++++++-------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py index bc0ec7b..52798aa 100644 --- a/Carrot-Assistant/evaluation/evaltypes.py +++ b/Carrot-Assistant/evaluation/evaltypes.py @@ -1,18 +1,23 @@ from abc import ABC, abstractmethod from typing import TypeVar, Generic + class EvaluationFramework: - def __init__(self, results_file='results.json'): + def __init__(self, results_file="results.json"): self.results_file = results_file + def run_evaluations(self): # Run some tests self._save_evaluations + def _save_evaluations(self): - # Append to 'results.json' + # Append to 'results.json' pass + class Metric(ABC): """Base class for all metrics.""" + @abstractmethod def calculate(self, *args, **kwargs) -> float: """ @@ -20,23 +25,28 @@ def calculate(self, *args, **kwargs) -> float: """ pass + class TestPipeline(ABC): - """ - Base class for Pipeline runs - """ - @abstractmethod - def run(self, *args, **kwargs): - """ - Run the pipeline - """ - pass - -M = TypeVar('M', bound=Metric) + """ + Base class for Pipeline runs + """ + + @abstractmethod + def run(self, *args, **kwargs): + """ + Run the pipeline + """ + ... + + +M = TypeVar("M", bound=Metric) + class PipelineTest(Generic[M]): """ Base class for Pipeline tests """ + def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): self.pipeline = pipeline self.metrics = metrics @@ -44,50 +54,65 @@ def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): @abstractmethod def run_pipeline(self, *args, **kwargs): pass + @abstractmethod def evaluate(self, *args, **kwargs) -> dict[str, float]: - pass + pass + class SingleResultMetric(Metric): """Metric for evaluating pipelines that return a single result.""" + class InformationRetrievalMetric(Metric): """Metric for evaluating information retrieval pipelines.""" + pass + class SingleResultPipeline(TestPipeline): - """ - Base class for pipelines returning a single result - """ + """ + Base class for pipelines returning a single result + """ + class SingleResultPipelineTest(PipelineTest[SingleResultMetric]): - def __init__(self, name: str, pipeline: SingleResultPipeline, metrics: list[SingleResultMetric]): + def __init__( + self, + name: str, + pipeline: SingleResultPipeline, + metrics: list[SingleResultMetric], + ): super().__init__(name, pipeline, metrics) - + def run_pipeline(self, input_data): """ Run the pipeline with the given input data. - + Args: input_data: The input data for the pipeline. - + Returns: The result of running the pipeline on the input data. """ return self.pipeline.run(input_data) - + def evaluate(self, input_data, expected_output): """ Evaluate the pipeline by running it on the input data and comparing the result to the expected output using all metrics. - + Args: input_data: The input data for the pipeline. expected_output: The expected output to compare against. - + Returns: A dictionary mapping metric names to their calculated values. 
""" pipeline_output = self.run_pipeline(input_data) - return {metric.__class__.__name__: metric.calculate(pipeline_output, expected_output) - for metric in self.metrics} + return { + metric.__class__.__name__: metric.calculate( + pipeline_output, expected_output + ) + for metric in self.metrics + } From 13269d68b5d300da86892b20bf61a1a9b8cb9edc Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:26:37 +0100 Subject: [PATCH 06/10] Added type test minor change to type signature of TestPipeline.run abstract method Added a test that the LLMPipeline.run should return a string --- Carrot-Assistant/evaluation/evaltypes.py | 5 ++- Carrot-Assistant/tests/test_evals.py | 57 +++++++++++++++++++++--- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/Carrot-Assistant/evaluation/evaltypes.py b/Carrot-Assistant/evaluation/evaltypes.py index 52798aa..ad9d343 100644 --- a/Carrot-Assistant/evaluation/evaltypes.py +++ b/Carrot-Assistant/evaluation/evaltypes.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import TypeVar, Generic +from typing import TypeVar, Generic, Any class EvaluationFramework: @@ -32,7 +32,7 @@ class TestPipeline(ABC): """ @abstractmethod - def run(self, *args, **kwargs): + def run(self, *args, **kwargs) -> Any: """ Run the pipeline """ @@ -48,6 +48,7 @@ class PipelineTest(Generic[M]): """ def __init__(self, name: str, pipeline: TestPipeline, metrics: list[M]): + self.name = name self.pipeline = pipeline self.metrics = metrics diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py index f7f4c9c..a823fd3 100644 --- a/Carrot-Assistant/tests/test_evals.py +++ b/Carrot-Assistant/tests/test_evals.py @@ -1,18 +1,26 @@ import pytest +from jinja2 import Environment, Template + from evaluation.evaltypes import SingleResultPipeline, SingleResultPipelineTest from evaluation.metrics import ExactMatchMetric +from evaluation.pipelines import LLMPipeline + +from options.pipeline_options import LLMModel + class IdentityPipeline(SingleResultPipeline): def run(self, input_data): return input_data + class ExactMatchTest(SingleResultPipelineTest): def __init__(self, name: str, pipeline: SingleResultPipeline): super().__init__(name, pipeline, [ExactMatchMetric()]) - + def run_pipeline(self, input_data): return self.pipeline.run(input_data) - + + class TestExactMatch: @pytest.fixture def identity_pipeline(self): @@ -20,7 +28,9 @@ def identity_pipeline(self): @pytest.fixture def exact_match_test(self, identity_pipeline): - return SingleResultPipelineTest("Exact Match Test", identity_pipeline, [ExactMatchMetric()]) + return SingleResultPipelineTest( + "Exact Match Test", identity_pipeline, [ExactMatchMetric()] + ) @pytest.fixture def all_match_dataset(self): @@ -32,12 +42,20 @@ def no_match_dataset(self): @pytest.fixture def half_match_dataset(self): - return [("input1", "input1"), ("input2", "output2"), ("input3", "input3"), ("input4", "output4")] + return [ + ("input1", "input1"), + ("input2", "output2"), + ("input3", "input3"), + ("input4", "output4"), + ] def run_test(self, test, dataset): - results = [test.evaluate(input_data, expected_output) for input_data, expected_output in dataset] - exact_match_results = [result['ExactMatchMetric'] for result in results] - return sum(exact_match_results) / len(exact_match_results) + results = [ + test.evaluate(input_data, expected_output) + for input_data, expected_output in dataset + ] + exact_match_results = [result["ExactMatchMetric"] for result in results] + return 
sum(exact_match_results) / len(exact_match_results) def test_all_match(self, exact_match_test, all_match_dataset): assert self.run_test(exact_match_test, all_match_dataset) == 1.0 @@ -47,3 +65,28 @@ def test_no_match(self, exact_match_test, no_match_dataset): def test_half_match(self, exact_match_test, half_match_dataset): assert self.run_test(exact_match_test, half_match_dataset) == 0.5 + + +# LLM pipeline tests + + +class TestBasicLLM: + @pytest.fixture + def llm_prompt(self): + env = Environment() + template = env.from_string( + """ + You are a parrot that repeats whatever is said to you, with no explanation. You will be given a sentence as input, repeat it. + + Sentence: {{input_sentence}} + """ + ) + return template + + @pytest.fixture + def llm_pipeline(self, llm_prompt): + return LLMPipeline(LLMModel["llama-3.1-8b"], llm_prompt) + + def test_returns_string(self, llm_pipeline): + model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"}) + assert isinstance(model_output, str) From 9f76c55596011c8040842902f5818c78419de001 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:28:17 +0100 Subject: [PATCH 07/10] Update pipelines.py fixed import for testing --- Carrot-Assistant/evaluation/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index f30eb46..62d228a 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -1,5 +1,5 @@ from typing import Dict -from evaltypes import SingleResultPipeline +from evaluation.evaltypes import SingleResultPipeline from options.pipeline_options import LLMModel from components.models import local_models from jinja2 import Template From 127dba32d703d030dfba56ce41ac2dd6703a9a01 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:30:40 +0100 Subject: [PATCH 08/10] bugfixes shouldn't keep test logs forgot how to use enums --- .gitignore | 1 + Carrot-Assistant/tests/test_evals.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3969475..500b88f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ RAG/tmp.py Carrot-Assistant/omop_tmp.py RAG/.cache/ *.qdrant +/Carrot-Assistant/tests/log diff --git a/Carrot-Assistant/tests/test_evals.py b/Carrot-Assistant/tests/test_evals.py index a823fd3..f2ab72b 100644 --- a/Carrot-Assistant/tests/test_evals.py +++ b/Carrot-Assistant/tests/test_evals.py @@ -85,7 +85,7 @@ def llm_prompt(self): @pytest.fixture def llm_pipeline(self, llm_prompt): - return LLMPipeline(LLMModel["llama-3.1-8b"], llm_prompt) + return LLMPipeline(LLMModel.LLAMA_3_1_8B, llm_prompt) def test_returns_string(self, llm_pipeline): model_output = llm_pipeline.run({"input_sentence": "Polly wants a cracker"}) From a5b4a3131df3319ed2cdc1f1cb63ee95a1fbc517 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:36:40 +0100 Subject: [PATCH 09/10] Update pipelines.py Modified to pass tests - now returns a string --- Carrot-Assistant/evaluation/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 62d228a..50b066c 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -17,4 +17,4 @@ def run(self, input: Dict[str, str]) -> str: prompt = self.prompt_template.render(input) return 
self._model.create_chat_completion( messages=[{"role": "user", "content": prompt}] - )["choices"][0]["message"] + )["choices"][0]["message"]["content"] From 2b4d1ee475d37b5f6568348743ecec915c50ce43 Mon Sep 17 00:00:00 2001 From: James Mitchell-White Date: Fri, 18 Oct 2024 16:44:08 +0100 Subject: [PATCH 10/10] Update pipelines.py Documentation --- Carrot-Assistant/evaluation/pipelines.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Carrot-Assistant/evaluation/pipelines.py b/Carrot-Assistant/evaluation/pipelines.py index 50b066c..30f3066 100644 --- a/Carrot-Assistant/evaluation/pipelines.py +++ b/Carrot-Assistant/evaluation/pipelines.py @@ -8,12 +8,39 @@ class LLMPipeline(SingleResultPipeline): + """ + This class runs a simple LLM-only pipeline on provided input + """ + def __init__(self, llm: LLMModel, prompt_template: Template) -> None: + """ + Initialises the LLMPipeline class + + Parameters + ---------- + llm: LLMModel + One of the model options in the LLMModel enum + prompt_template: Template + A jinja2 template for a prompt + """ self.llm = llm self.prompt_template = prompt_template self._model = Llama(hf_hub_download(**local_models[self.llm.value])) def run(self, input: Dict[str, str]) -> str: + """ + Runs the LLMPipeline on a given input + + Parameters + ---------- + input: Dict[str, str] + The input is rendered into a prompt string by the .render method of the prompt template, so needs to be a dictionary of the template's parameters + + Returns + ------- + str + The output of running the prompt through the given model + """ prompt = self.prompt_template.render(input) return self._model.create_chat_completion( messages=[{"role": "user", "content": prompt}]
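
Usage sketch (not part of the patches): the snippet below shows how the classes introduced across this series are intended to fit together, mirroring the fixtures in tests/test_evals.py. It is illustrative only. It assumes that evaluation.metrics.ExactMatchMetric.calculate() compares the pipeline output string with the expected string, and that the llama-3.1-8b entry in components.models.local_models resolves to weights that hf_hub_download can fetch; neither detail is defined in these patches.

# Illustrative sketch only: wires together the classes added in this patch series.
# Assumptions (not shown in the patches): ExactMatchMetric compares two strings for
# equality, and the llama-3.1-8b entry in local_models points at downloadable weights.
from jinja2 import Environment

from evaluation.evaltypes import SingleResultPipelineTest
from evaluation.metrics import ExactMatchMetric
from evaluation.pipelines import LLMPipeline
from options.pipeline_options import LLMModel

# The "parrot" prompt used in tests/test_evals.py
template = Environment().from_string(
    "You are a parrot that repeats whatever is said to you, with no explanation. "
    "You will be given a sentence as input, repeat it.\n\nSentence: {{input_sentence}}"
)

# LLMPipeline fetches the model via hf_hub_download and wraps it in llama_cpp.Llama
pipeline = LLMPipeline(llm=LLMModel.LLAMA_3_1_8B, prompt_template=template)

# Wrap the pipeline in a test that scores its output with exact match
test = SingleResultPipelineTest(
    name="Parrot exact match",
    pipeline=pipeline,
    metrics=[ExactMatchMetric()],
)

# evaluate() renders the prompt, runs the model, and returns {metric class name: score},
# e.g. {"ExactMatchMetric": 1.0}
scores = test.evaluate(
    {"input_sentence": "Polly wants a cracker"},
    "Polly wants a cracker",
)
print(scores)

Because evaluate() keys its results by metric class name, further SingleResultMetric implementations can be appended to the metrics list and will appear as extra entries in the returned dictionary without any change to the test class.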