diff --git a/packages/googlecloud/functions/getanswer/evaluate/README.MD b/packages/googlecloud/functions/getanswer/evaluate/README.MD
index 8dfd760d..29b04daa 100644
--- a/packages/googlecloud/functions/getanswer/evaluate/README.MD
+++ b/packages/googlecloud/functions/getanswer/evaluate/README.MD
@@ -1,32 +1,34 @@
-Using [confident-ai/deepeval](https://github.com/confident-ai/deepeval) LLM evaluation framework.
+Using [confident-ai/deepeval](https://github.com/confident-ai/deepeval) LLM evaluation framework.
 
 Requires installation of deepeval library over pip:
-'pip install -U deepeval'
+'pip install -U deepeval'
 
 For windows:
-'export OPENAI_API_KEY=xxx'
+'set OPENAI_API_KEY=xxx'
 
 For OS:
-'set OPENAI_API_KEY=xxx'
+'export OPENAI_API_KEY=xxx'
 
 To run tests:
 
-test_evaluate_live.py
-    reads in a live test query from user input, gets the sawt response, evaluates the response according
-    to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
+test_evaluate_live.py:<br>
+    reads in a live test query from user input, gets the sawt response, and evaluates the response according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.
 
     usage:
     'deepeval test run test_evaluate_live.py'
 
-test_evaluate_csv.py
-    reads test queries from file inputted by user, gets the sawt response, evaluates the responses according
-    to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
+test_evaluate_tsv.py:<br>
+    reads test queries from a tsv file provided by the user, gets the sawt responses, and evaluates the responses according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.
 
-    usage:
-    'deepeval test run test_evaluate_csv.py'
+    This file can be a gold data set with an 'expected response' column, a list of queries without expected responses, or a mix of both (see the example at the end of this file). Queries with expected responses are evaluated on two additional metrics compared to queries without them.
+
+    usage:
+    'deepeval test run test_evaluate_tsv.py'
+
+Test results and the hyperparameters used by the current model are logged to the deepeval dashboard ('deepeval login').
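+
+Example queries.tsv (illustrative queries; the first row is a header and is skipped, columns are separated by a tab, shown here as <TAB>, and the expected response may be omitted for any row):
+
+    query<TAB>expected response
+    What did the council discuss about affordable housing?<TAB>A short gold-standard answer for this query.
+    What was said about road repairs?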
\ No newline at end of file
diff --git a/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx b/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx
deleted file mode 100644
index b315af85..00000000
Binary files a/packages/googlecloud/functions/getanswer/evaluate/gpt4.xlsx and /dev/null differ
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py
deleted file mode 100644
index 432a5271..00000000
--- a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_csv.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""
-usage: 'deepeval test run test_evaluate_csv.py
-
-
-This will read test queries from file inputted by user, then evaluate the responses according
-to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
-
-All hyperparameters used by current model are logged in deepeval login
-
-"""
-import pytest
-import deepeval
-import logging
-import sys
-import os
-
-sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
-
-from deepeval import assert_test
-from deepeval.dataset import EvaluationDataset
-from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ContextualRelevancyMetric, FaithfulnessMetric, GEval
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from inquirer import route_question
-from helper import get_dbs
-from inquirer import INDEPTH_RESPONSE_LLM, INDEPTH_RESPONSE_PROMPT_TEMPLATE, INDEPTH_RESPONSE_K
-from api import RESPONSE_TYPE_DEPTH
-
-logger = logging.getLogger(__name__)
-
-
-# model to use for evaluating responses
-MODEL = 'gpt-3.5-turbo-1106'
-
-
-def get_test_cases():
-    """
-    Run sawt on all test queries and create LLMTestCases for each.
-    """
-    test_cases = []
-    db_fc, db_cj, db_pdf, db_pc, db_news, voting_roll_df = get_dbs()
-
-    csv_file_name = input("Enter the name or path of your csv file of queries (ex: queries.csv):")
-    if not os.path.exists(csv_file_name):
-        print("\nThe file ", csv_file_name, " doesn't exist, check the path or name.")
-        sys.exit()
-    logger.info('generating answers to all test queries...')
-
-
-    for query in open(csv_file_name):
-        query = query.strip()
-        actual_output, retrieval_context = route_question(
-            voting_roll_df,
-            db_fc,
-            db_cj,
-            db_pdf,
-            db_pc,
-            db_news,
-            query,
-            RESPONSE_TYPE_DEPTH,
-            k=5,
-            return_context=True
-        )
-        # get single string for text response.
-        actual_output = ' '.join(i['response'] for i in actual_output['responses'])
-        test_cases.append(LLMTestCase(input=query, actual_output=actual_output, retrieval_context=[retrieval_context]))
-
-    return EvaluationDataset(test_cases=test_cases)
-
-dataset = get_test_cases()
-
-
-@pytest.mark.parametrize(
-    "test_case",
-    dataset,
-)
-def test_dataset(test_case: LLMTestCase):
-    ansRel = AnswerRelevancyMetric(threshold=0.2, model=MODEL)
-    bias = BiasMetric(threshold=0.5, model=MODEL)
-    contRel = ContextualRelevancyMetric(threshold=0.7, include_reason=True, model=MODEL)
-    faithMet = FaithfulnessMetric(threshold=0.7, include_reason=True, model=MODEL)
-
-    readability = GEval(name="Readability",
-            criteria="Determine whether the text in 'actual output' is easy to read for those with a high school reading level.",
-            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-            model=MODEL)
-
-
-    punctuation = GEval(name="Punctuation",
-            criteria="Determine whether the text in 'actual output' has proper punctuation.",
-            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-            model=MODEL)
-
-    opinions = GEval(name="Number of Opinions",
-            criteria="Determine whether the text in 'actual output' expresses more than one opinion on the topic of the query.",
-            evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
-            model=MODEL)
-
-    assert_test(test_case, [ansRel, bias, contRel, faithMet,readability, punctuation, opinions])
-
-
-# Log hyperparameters so we can compare across different test runs in deepeval login
-@deepeval.log_hyperparameters(model=INDEPTH_RESPONSE_LLM.model_name, prompt_template=INDEPTH_RESPONSE_PROMPT_TEMPLATE.template)
-def hyperparameters():
-    return {'k': INDEPTH_RESPONSE_K}
-
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
index 363a59cd..d2ef73d2 100644
--- a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
+++ b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_live.py
@@ -4,7 +4,7 @@
 This will read a live test query from user input, get the sawt response, then evaluate the response according
 to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106
 
-All hyperparameters used by current model are logged in deepeval login
+Test results and the hyperparameters used by the current model are logged to the deepeval dashboard ('deepeval login').
 
 """
 import pytest
diff --git a/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py
new file mode 100644
index 00000000..e992d19d
--- /dev/null
+++ b/packages/googlecloud/functions/getanswer/evaluate/test_evaluate_tsv.py
@@ -0,0 +1,118 @@
+"""
+usage: 'deepeval test run test_evaluate_tsv.py'
+
+Reads test queries from a tsv file provided by the user, gets the sawt responses, and evaluates the responses according to several metrics as implemented by the deepeval library <https://github.com/confident-ai/deepeval/> and gpt-3.5-turbo-1106.
+
+This file can be a gold data set with an 'expected response' column, a list of queries without expected responses, or a mix of both.
+Queries with expected responses are evaluated on two additional metrics (contextual precision and contextual recall) compared to queries without them.
+
+Test results and the hyperparameters used by the current model are logged to the deepeval dashboard ('deepeval login').
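+
+Expected tsv layout (illustrative; the first row is treated as a header and skipped, columns are tab-separated, shown here as <TAB>, and the expected response may be omitted for any row):
+
+    query<TAB>expected response
+    <a query with a gold answer><TAB><its expected response>
+    <a query without a gold answer>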
+ +""" +import pytest +import deepeval +import logging +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) + +from deepeval import assert_test +from deepeval.dataset import EvaluationDataset +from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ContextualRelevancyMetric, FaithfulnessMetric, GEval, ContextualPrecisionMetric, ContextualRecallMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from inquirer import route_question +from helper import get_dbs +from inquirer import INDEPTH_RESPONSE_LLM, INDEPTH_RESPONSE_PROMPT_TEMPLATE, INDEPTH_RESPONSE_K +from api import RESPONSE_TYPE_DEPTH + +logger = logging.getLogger(__name__) + + +# model to use for evaluating responses +MODEL = 'gpt-3.5-turbo-1106' + + +def get_test_cases(): + """ + Run sawt on all test queries and create LLMTestCases for each. + """ + test_cases = [] + db_fc, db_cj, db_pdf, db_pc, db_news, voting_roll_df = get_dbs() + + tsv_file_name = input("Enter the name or path of your tsv file of queries (ex: queries.tsv):") + if not os.path.exists(tsv_file_name): + print("\nThe file ", tsv_file_name, " doesn't exist, check the path or name.") + sys.exit() + logger.info('generating answers to all test queries...') + + with open(tsv_file_name) as file: + next(file) + for row in file: + row_obj = row.strip().split('\t') + if len(row_obj) == 1: + query = row_obj[0] + expected_output = '' + else: + query, expected_output = row_obj + + actual_output, retrieval_context = route_question( + voting_roll_df, + db_fc, + db_cj, + db_pdf, + db_pc, + db_news, + query, + RESPONSE_TYPE_DEPTH, + k=5, + return_context=True + ) + # get single string for text response + actual_output = ' '.join(i['response'] for i in actual_output['responses']) + test_cases.append(LLMTestCase(input=query, actual_output=actual_output, expected_output=expected_output, retrieval_context=[retrieval_context])) + + return EvaluationDataset(test_cases=test_cases) + +dataset = get_test_cases() + + +@pytest.mark.parametrize( + "test_case", + dataset, +) +def test_dataset(test_case: LLMTestCase): + #require expected_output + contextual_precision = ContextualPrecisionMetric(threshold=0.2, model=MODEL) + contextual_recall = ContextualRecallMetric(threshold=0.2, model=MODEL) + + #don't require expected_output + answer_relevancy = AnswerRelevancyMetric(threshold=0.2, model=MODEL) + bias = BiasMetric(threshold=0.5, model=MODEL) + contextual_relevancy = ContextualRelevancyMetric(threshold=0.7, include_reason=True, model=MODEL) + faithfulness = FaithfulnessMetric(threshold=0.7, include_reason=True, model=MODEL) + + readability = GEval(name="Readability", + criteria="Determine whether the text in 'actual output' is easy to read for those with a high school reading level.", + evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT], + model=MODEL) + + + punctuation = GEval(name="Punctuation", + criteria="Determine whether the text in 'actual output' has proper punctuation.", + evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT], + model=MODEL) + + opinions = GEval(name="Number of Opinions", + criteria="Determine whether the text in 'actual output' expresses more than one opinion on the topic of the query.", + evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT], + model=MODEL) + + assert_test(test_case, [contextual_recall, contextual_precision, answer_relevancy, bias, contextual_relevancy, faithfulness,readability, punctuation, opinions]) + + +# Log hyperparameters so we can compare across different test runs in deepeval login 
+@deepeval.log_hyperparameters(model=INDEPTH_RESPONSE_LLM.model_name, prompt_template=INDEPTH_RESPONSE_PROMPT_TEMPLATE.template)
+def hyperparameters():
+    return {'k': INDEPTH_RESPONSE_K}
+
diff --git a/packages/googlecloud/functions/getanswer/helper.py b/packages/googlecloud/functions/getanswer/helper.py
index e97989b5..b1a4b70a 100644
--- a/packages/googlecloud/functions/getanswer/helper.py
+++ b/packages/googlecloud/functions/getanswer/helper.py
@@ -39,11 +39,11 @@ def get_dbs():
     faiss_news_index_path = dir.joinpath("cache/faiss_index_in_depth_news")
 
     # Loading new FAISS indices for each document type
-    db_fc = FAISS.load_local(faiss_fc_index_path, in_depth_embeddings)
-    db_cj = FAISS.load_local(faiss_cj_index_path, in_depth_embeddings)
-    db_pdf = FAISS.load_local(faiss_pdf_index_path, in_depth_embeddings)
-    db_pc = FAISS.load_local(faiss_pc_index_path, in_depth_embeddings)
-    db_news = FAISS.load_local(faiss_news_index_path, in_depth_embeddings)
+    db_fc = FAISS.load_local(faiss_fc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_cj = FAISS.load_local(faiss_cj_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_pdf = FAISS.load_local(faiss_pdf_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_pc = FAISS.load_local(faiss_pc_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
+    db_news = FAISS.load_local(faiss_news_index_path, in_depth_embeddings, allow_dangerous_deserialization=True)
 
     voting_roll_df_path = dir.joinpath("cache/parsed_voting_rolls.csv")
     voting_roll_df = pd.read_csv(voting_roll_df_path)