diff --git a/packages/googlecloud/functions/getanswer/evaluate/README.MD b/packages/googlecloud/functions/getanswer/evaluate/README.MD
index 8dfd760d..29b04daa 100644
--- a/packages/googlecloud/functions/getanswer/evaluate/README.MD
+++ b/packages/googlecloud/functions/getanswer/evaluate/README.MD
@@ -1,32 +1,34 @@
-Using [confident-ai/deepeval](https://github.com/confident-ai/deepeval) LLM evaluation framework. 
+These tests use the [confident-ai/deepeval](https://github.com/confident-ai/deepeval) LLM evaluation framework.  
 
 
 Requires installation of deepeval library over pip:
 
-'pip install -U deepeval'
+'pip install -U deepeval'  
 
 
 
 For windows:
-'export OPENAI_API_KEY=xxx' 
+'set OPENAI_API_KEY=xxx'  
 
 For OS:
-'set OPENAI_API_KEY=xxx' 
+'export OPENAI_API_KEY=xxx'  
 
 
 
 To run tests:
 
-test_evaluate_live.py 
-    reads in a live test query from user input, gets the sawt response, evaluates the response according
-    to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
+test_evaluate_live.py:<br>
+    reads a live test query from user input, gets the sawt response, and evaluates the response according to several metrics implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.  
 
     usage: 
         'deepeval test run test_evaluate_live.py'
 
-test_evaluate_csv.py
-    reads test queries from file inputted by user, gets the sawt response, evaluates the responses according
-    to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
+test_evaluate_tsv.py:<br>
+    reads test queries from a TSV file supplied by the user, gets the sawt responses, and evaluates the responses according to several metrics implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.  
 
-        usage: 
-        'deepeval test run test_evaluate_csv.py'
+    This file can be a gold data set with an 'expected response' column, a list of queries without expected responses, or a mix of both. Queries with a specified expected response are evaluated on two additional metrics compared to queries without one (see the example below).  
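+
+    For illustration, a queries.tsv could look like the following (columns separated by a tab; the header row is skipped when the file is read, so the column names are only a suggestion, and the expected response may be left blank):  
+
+        query	expected response
+        <a query with a gold answer>	<the expected response for that query>
+        <a query without a gold answer>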
+
+    usage: 
+        'deepeval test run test_evaluate_tsv.py'  
+
+Test results and the hyperparameters used by the current model are logged to the Confident AI dashboard; run 'deepeval login' to connect an account and view them.
\ No newline at end of file
diff --git a/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx b/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx
deleted file mode 100644
index b315af85..00000000
Binary files a/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx and /dev/null differ
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py
deleted file mode 100644
index 432a5271..00000000
--- a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""
-usage: 'deepeval test run test_evaluate_csv.py
-
-
-This will read test queries from file inputted by user, then evaluate the responses according
-to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
-
-All hyperparameters used by current model are logged in deepeval login
-
-"""
-import pytest
-import deepeval
-import logging
-import sys
-import os
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
-
-from deepeval import assert_test
-from deepeval.dataset import EvaluationDataset
-from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ContextualRelevancyMetric, FaithfulnessMetric, GEval
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from inquirer import route_question
-from helper import get_dbs
-from inquirer import INDEPTH_RESPONSE_LLM, INDEPTH_RESPONSE_PROMPT_TEMPLATE, INDEPTH_RESPONSE_K
-from api import RESPONSE_TYPE_DEPTH
-
-logger = logging.getLogger(__name__)
-
-
-# model to use for evaluating responses
-MODEL = 'gpt-3.5-turbo-1106'
-
-
-def get_test_cases():
-    """
-    Run sawt on all test queries and create LLMTestCases for each.
-    """
-    test_cases = []
-    db_fc, db_cj, db_pdf, db_pc, db_news, voting_roll_df = get_dbs()
-
-    csv_file_name = input("Enter the name or path of your csv file of queries (ex: queries.csv):")
-    if not os.path.exists(csv_file_name):
-        print("\nThe file ", csv_file_name, " doesn't exist, check the path or name.")
-        sys.exit()
-    logger.info('generating answers to all test queries...')
-
-
-    for query in open(csv_file_name):
-        query = query.strip()
-        actual_output, retrieval_context = route_question(
-            voting_roll_df,
-            db_fc,
-            db_cj,
-            db_pdf,
-            db_pc,
-            db_news,
-            query,
-            RESPONSE_TYPE_DEPTH,
-            k=5,
-            return_context=True
-        )
-        # get single string for text response.
-        actual_output = ' '.join(i['response'] for i in actual_output['responses'])    
-        test_cases.append(LLMTestCase(input=query, actual_output=actual_output, retrieval_context=[retrieval_context]))
-
-    return EvaluationDataset(test_cases=test_cases)
-
-dataset = get_test_cases()
-
-
-@pytest.mark.parametrize(
-    "test_case",
-    dataset,
-)
-def test_dataset(test_case: LLMTestCase):
-    ansRel = AnswerRelevancyMetric(threshold=0.2, model=MODEL)
-    bias = BiasMetric(threshold=0.5, model=MODEL)
-    contRel = ContextualRelevancyMetric(threshold=0.7, include_reason=True, model=MODEL)
-    faithMet = FaithfulnessMetric(threshold=0.7, include_reason=True, model=MODEL)
-   
-    readability = GEval(name="Readability",
-        criteria="Determine whether the text in 'actual output' is easy to read for those with a high school reading level.",
-        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-        model=MODEL)
-
-
-    punctuation = GEval(name="Punctuation",
-        criteria="Determine whether the text in 'actual output' has proper punctuation.",
-        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-        model=MODEL)
-   
-    opinions = GEval(name="Number of Opinions",
-        criteria="Determine whether the text in 'actual output' expresses more than one opinion on the topic of the query.",
-        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-        model=MODEL)
-
-    assert_test(test_case, [ansRel, bias, contRel, faithMet,readability, punctuation, opinions])
-
-
-# Log hyperparameters so we can compare across different test runs in deepeval login
-@deepeval.log_hyperparameters(model=INDEPTH_RESPONSE_LLM.model_name, prompt_template=INDEPTH_RESPONSE_PROMPT_TEMPLATE.template)
-def hyperparameters():
-    return {'k': INDEPTH_RESPONSE_K}
-
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
index 363a59cd..d2ef73d2 100644
--- a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
+++ b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
@@ -4,7 +4,7 @@
 This will read a live test query from user input, get the sawt response, then evaluate the response according
 to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
 
-All hyperparameters used by current model are logged in deepeval login
+Test results and the hyperparameters used by the current model are logged to the Confident AI dashboard (connect with 'deepeval login').
 
 """
 import pytest
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py
new file mode 100644
index 00000000..e992d19d
--- /dev/null
+++ b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py
@@ -0,0 +1,118 @@
+"""
+usage: 'deepeval test run test_evaluate_tsv.py'
+
+Reads test queries from a TSV file supplied by the user, gets the sawt responses, and evaluates the responses according to several metrics implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.
+
+This file can be a gold data set with an 'expected response' column, a list of queries without expected responses, or a mix of both.
+Queries with a specified expected response are evaluated on two additional metrics compared to queries without one.
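+
+For example (columns separated by a tab; the header row is skipped when the file is read, so the column names below are only illustrative, and the expected response may be left blank):
+    query	expected response
+    <a query with a gold answer>	<the expected response for that query>
+    <a query without a gold answer>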
+
+Test results and hyperparameters used by current model are logged in deepeval login.
+
+"""
+import pytest
+import deepeval
+import logging
+import sys
+import os
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
+
+from deepeval import assert_test
+from deepeval.dataset import EvaluationDataset
+from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ContextualRelevancyMetric, FaithfulnessMetric, GEval, ContextualPrecisionMetric, ContextualRecallMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from inquirer import route_question
+from helper import get_dbs
+from inquirer import INDEPTH_RESPONSE_LLM, INDEPTH_RESPONSE_PROMPT_TEMPLATE, INDEPTH_RESPONSE_K
+from api import RESPONSE_TYPE_DEPTH
+
+logger = logging.getLogger(__name__)
+
+
+# model to use for evaluating responses
+MODEL = 'gpt-3.5-turbo-1106'
+
+
+def get_test_cases():
+    """
+    Run sawt on all test queries and create LLMTestCases for each.
+    """
+    test_cases = []
+    db_fc, db_cj, db_pdf, db_pc, db_news, voting_roll_df = get_dbs()
+
+    tsv_file_name = input("Enter the name or path of your tsv file of queries (ex: queries.tsv):")
+    if not os.path.exists(tsv_file_name):
+        print("\nThe file ", tsv_file_name, " doesn't exist, check the path or name.")
+        sys.exit()
+    logger.info('generating answers to all test queries...')
+
+    with open(tsv_file_name) as file:
+        next(file)  # skip the header row
+        for row in file:
+            row = row.strip()
+            if not row:
+                continue  # skip blank lines
+            row_obj = row.split('\t')
+            if len(row_obj) == 1:
+                # no expected response provided for this query
+                query = row_obj[0]
+                expected_output = ''
+            else:
+                query, expected_output = row_obj[0], row_obj[1]
+
+            actual_output, retrieval_context = route_question(
+                voting_roll_df,
+                db_fc,
+                db_cj,
+                db_pdf,
+                db_pc,
+                db_news,
+                query,
+                RESPONSE_TYPE_DEPTH,
+                k=5,
+                return_context=True
+            )
+            # get single string for text response
+            actual_output = ' '.join(i['response'] for i in actual_output['responses'])    
+            test_cases.append(LLMTestCase(input=query, actual_output=actual_output, expected_output=expected_output, retrieval_context=[retrieval_context]))
+
+    return EvaluationDataset(test_cases=test_cases)
+
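+# build the dataset once at module import (pytest collection time) so the test below can be parametrized over it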
+dataset = get_test_cases()
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    dataset,
+)
+def test_dataset(test_case: LLMTestCase):
+    # these metrics require an expected_output, so they are only applied to queries
+    # that have an expected response in the tsv file (see the metric selection below)
+    contextual_precision = ContextualPrecisionMetric(threshold=0.2, model=MODEL)
+    contextual_recall = ContextualRecallMetric(threshold=0.2, model=MODEL)
+
+    # these metrics don't require an expected_output
+    answer_relevancy = AnswerRelevancyMetric(threshold=0.2, model=MODEL)
+    bias = BiasMetric(threshold=0.5, model=MODEL)
+    contextual_relevancy = ContextualRelevancyMetric(threshold=0.7, include_reason=True, model=MODEL)
+    faithfulness = FaithfulnessMetric(threshold=0.7, include_reason=True, model=MODEL)
+   
+    readability = GEval(name="Readability",
+        criteria="Determine whether the text in 'actual output' is easy to read for those with a high school reading level.",
+        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+        model=MODEL)
+
+
+    punctuation = GEval(name="Punctuation",
+        criteria="Determine whether the text in 'actual output' has proper punctuation.",
+        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+        model=MODEL)
+   
+    opinions = GEval(name="Number of Opinions",
+        criteria="Determine whether the text in 'actual output' expresses more than one opinion on the topic of the query.",
+        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+        model=MODEL)
+
+    metrics = [answer_relevancy, bias, contextual_relevancy, faithfulness, readability, punctuation, opinions]
+    # only add the expected-output metrics for queries that have an expected response
+    if test_case.expected_output:
+        metrics = [contextual_precision, contextual_recall] + metrics
+
+    assert_test(test_case, metrics)
+
+
+# Log hyperparameters so we can compare across different test runs in deepeval login
+@deepeval.log_hyperparameters(model=INDEPTH_RESPONSE_LLM.model_name, prompt_template=INDEPTH_RESPONSE_PROMPT_TEMPLATE.template)
+def hyperparameters():
+    return {'k': INDEPTH_RESPONSE_K}
+
diff --git a/packages/googlecloud/functions/getanswer/helper.py b/packages/googlecloud/functions/getanswer/helper.py
index e97989b5..b1a4b70a 100644
--- a/packages/googlecloud/functions/getanswer/helper.py
+++ b/packages/googlecloud/functions/getanswer/helper.py
@@ -39,11 +39,11 @@ def get_dbs():
     faiss_news_index_path = dir.joinpath("cache/faiss_index_in_depth_news")
 
     # Loading new FAISS indices for each document type
-    db_fc = FAISS.load_local(faiss_fc_index_path, in_depth_embeddings)
-    db_cj = FAISS.load_local(faiss_cj_index_path, in_depth_embeddings)
-    db_pdf = FAISS.load_local(faiss_pdf_index_path, in_depth_embeddings)
-    db_pc = FAISS.load_local(faiss_pc_index_path, in_depth_embeddings)
-    db_news = FAISS.load_local(faiss_news_index_path, in_depth_embeddings)
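+# Loading these cached indices involves pickle deserialization, which newer langchain versions
+# require an explicit opt-in for; only enable this for locally generated, trusted caches.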
+    db_fc = FAISS.load_local(faiss_fc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_cj = FAISS.load_local(faiss_cj_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_pdf = FAISS.load_local(faiss_pdf_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_pc = FAISS.load_local(faiss_pc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_news = FAISS.load_local(faiss_news_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
 
     voting_roll_df_path = dir.joinpath("cache/parsed_voting_rolls.csv")
     voting_roll_df = pd.read_csv(voting_roll_df_path)