Deepeval evaluate folder #249

Open · wants to merge 4 commits into main
26 changes: 14 additions & 12 deletions packages/googlecloud/functions/getanswer/evaluate/README.MD
@@ -1,32 +1,34 @@
Using [confident-ai/deepeval](https://github.com/confident-ai/deepeval) LLM evaluation framework.


Requires installation of the deepeval library via pip:

'pip install -U deepeval'



For Windows:
'set OPENAI_API_KEY=xxx'

For macOS/Linux:
'export OPENAI_API_KEY=xxx'



To run tests:

test_evaluate_live.py:<br>
Reads a live test query from user input, gets the sawt response, and evaluates the response according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.

usage:
'deepeval test run test_evaluate_live.py'

test_evaluate_tsv.py:<br>
Reads test queries from a tsv file supplied by the user, gets the sawt responses, and evaluates the responses according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.
The file can be a gold data set with an 'expected response' feature, a list of queries without expected responses, or a mix of both (see the example below). Queries with specified expected responses will be evaluated on two more metrics than queries without expected responses.

usage:
'deepeval test run test_evaluate_tsv.py'
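An illustrative queries.tsv might look like the following. Columns are separated by a tab character; the header row is skipped by the script, the second column is optional, and the column names here follow the wording above but are not enforced by the script:

query	expected response
What did the city council decide about the short-term rental ordinance?	The council voted to approve the ordinance with two amendments.
Summarize the most recent budget discussion.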

Test results and the hyperparameters used by the current model are logged to deepeval and can be viewed after 'deepeval login'.
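If the deepeval CLI is not yet authenticated, a login step along these lines is needed first (assuming a Confident AI account; the exact prompts may vary by deepeval version):

'deepeval login'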
105 changes: 0 additions & 105 deletions packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py

This file was deleted.

packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
@@ -4,7 +4,7 @@
This will read a live test query from user input, get the sawt response, then evaluate the response according
to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.

Test results and hyperparameters used by the current model are logged to deepeval and can be viewed after 'deepeval login'.

"""
import pytest
118 changes: 118 additions & 0 deletions packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py
@@ -0,0 +1,118 @@
"""
usage: 'deepeval test run test_evaluate_gold_dataset.py'

Reads test queries from tsv file inputted by user, gets the sawt responses, evaluates the responses according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106

This file can contain be a gold data set with feature 'expected response', a list of queries without expected responses, or a mix of both.
Queries with specified expected responses will be eveluated on 2 more metrics than queries without expected responses.

Test results and hyperparameters used by current model are logged in deepeval login.

"""
import pytest
import deepeval
import logging
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "../"))

from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ContextualRelevancyMetric, FaithfulnessMetric, GEval, ContextualPrecisionMetric, ContextualRecallMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from inquirer import route_question
from helper import get_dbs
from inquirer import INDEPTH_RESPONSE_LLM, INDEPTH_RESPONSE_PROMPT_TEMPLATE, INDEPTH_RESPONSE_K
from api import RESPONSE_TYPE_DEPTH

logger = logging.getLogger(__name__)


# model to use for evaluating responses
MODEL = 'gpt-3.5-turbo-1106'


def get_test_cases():
    """
    Run sawt on all test queries and create an LLMTestCase for each.
    """
    test_cases = []
    db_fc, db_cj, db_pdf, db_pc, db_news, voting_roll_df = get_dbs()

    tsv_file_name = input("Enter the name or path of your tsv file of queries (ex: queries.tsv):")
    if not os.path.exists(tsv_file_name):
        print("\nThe file ", tsv_file_name, " doesn't exist, check the path or name.")
        sys.exit()
    logger.info('generating answers to all test queries...')

    with open(tsv_file_name) as file:
        next(file)  # skip the header row
        for row in file:
            row_obj = row.strip().split('\t')
            if len(row_obj) == 1:
                query = row_obj[0]
                expected_output = ''
            else:
                query, expected_output = row_obj

            actual_output, retrieval_context = route_question(
                voting_roll_df,
                db_fc,
                db_cj,
                db_pdf,
                db_pc,
                db_news,
                query,
                RESPONSE_TYPE_DEPTH,
                k=5,
                return_context=True,
            )
            # get single string for text response
            actual_output = ' '.join(i['response'] for i in actual_output['responses'])
            test_cases.append(LLMTestCase(input=query, actual_output=actual_output, expected_output=expected_output, retrieval_context=[retrieval_context]))

    return EvaluationDataset(test_cases=test_cases)


dataset = get_test_cases()


@pytest.mark.parametrize(
    "test_case",
    dataset,
)
def test_dataset(test_case: LLMTestCase):
    # require expected_output
    contextual_precision = ContextualPrecisionMetric(threshold=0.2, model=MODEL)
    contextual_recall = ContextualRecallMetric(threshold=0.2, model=MODEL)

    # don't require expected_output
    answer_relevancy = AnswerRelevancyMetric(threshold=0.2, model=MODEL)
    bias = BiasMetric(threshold=0.5, model=MODEL)
    contextual_relevancy = ContextualRelevancyMetric(threshold=0.7, include_reason=True, model=MODEL)
    faithfulness = FaithfulnessMetric(threshold=0.7, include_reason=True, model=MODEL)

    readability = GEval(name="Readability",
                        criteria="Determine whether the text in 'actual output' is easy to read for those with a high school reading level.",
                        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
                        model=MODEL)

    punctuation = GEval(name="Punctuation",
                        criteria="Determine whether the text in 'actual output' has proper punctuation.",
                        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
                        model=MODEL)

    opinions = GEval(name="Number of Opinions",
                     criteria="Determine whether the text in 'actual output' expresses more than one opinion on the topic of the query.",
                     evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
                     model=MODEL)

    assert_test(test_case, [contextual_recall, contextual_precision, answer_relevancy, bias, contextual_relevancy, faithfulness, readability, punctuation, opinions])


# Log hyperparameters so we can compare across different test runs in deepeval
@deepeval.log_hyperparameters(model=INDEPTH_RESPONSE_LLM.model_name, prompt_template=INDEPTH_RESPONSE_PROMPT_TEMPLATE.template)
def hyperparameters():
    return {'k': INDEPTH_RESPONSE_K}

10 changes: 5 additions & 5 deletions packages/googlecloud/functions/getanswer/helper.py
@@ -39,11 +39,11 @@ def get_dbs():
    faiss_news_index_path = dir.joinpath("cache/faiss_index_in_depth_news")

    # Loading new FAISS indices for each document type
    db_fc = FAISS.load_local(faiss_fc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
    db_cj = FAISS.load_local(faiss_cj_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
    db_pdf = FAISS.load_local(faiss_pdf_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
    db_pc = FAISS.load_local(faiss_pc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
    db_news = FAISS.load_local(faiss_news_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)

    voting_roll_df_path = dir.joinpath("cache/parsed_voting_rolls.csv")
    voting_roll_df = pd.read_csv(voting_roll_df_path)
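Context for the helper.py change (illustrative, not part of the PR): recent langchain/langchain-community releases refuse to unpickle a locally saved FAISS index unless the caller opts in explicitly, which is why allow_dangerous_deserialization=True is added to each FAISS.load_local call. A minimal sketch of the save/load round trip, assuming langchain_community's FAISS and OpenAIEmbeddings, faiss-cpu installed, and OPENAI_API_KEY set in the environment:

# Illustrative only: shows why loading a local FAISS index needs the opt-in flag.
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings()  # reads OPENAI_API_KEY from the environment

# Build a tiny index and save it locally (this writes a pickled docstore to disk).
db = FAISS.from_texts(["example council meeting excerpt"], embeddings)
db.save_local("cache/faiss_index_example")

# Loading it back unpickles that docstore, so langchain requires an explicit opt-in.
db = FAISS.load_local(
    "cache/faiss_index_example",
    embeddings,
    allow_dangerous_deserialization=True,
)
print(db.similarity_search("council meeting", k=1))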