From 47c3d39803cad758f88cac5dfe32ea400eadabf2 Mon Sep 17 00:00:00 2001
From: lkk12014402 <641553140@qq.com>
Date: Wed, 25 Sep 2024 14:40:45 +0000
Subject: [PATCH 1/6] add crud ragas evaluation.

---
 .../evaluation/rag_eval/examples/eval_crud.py | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index 4a4ac8e6..2f4297e1 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -10,6 +10,7 @@
 
 from evals.evaluation.rag_eval import Evaluator
 from evals.evaluation.rag_eval.template import CRUDTemplate
+from evals.metrics.ragas import RagasMetric
 
 
 class CRUD_Evaluator(Evaluator):
@@ -78,6 +79,41 @@ def get_template(self):
     def post_process(self, result):
         return result.split("")[-1].split("")[0].strip()
 
+    def get_ragas_metrics(self, results, arguments):
+        from langchain_huggingface import HuggingFaceEndpointEmbeddings
+
+        embeddings = HuggingFaceEndpointEmbeddings(model=arguments.tei_embedding_endpoint)
+
+        metric = RagasMetric(threshold=0.5, model=arguments.llm_endpoint, embeddings=embeddings,
+            metrics=["faithfulness", "answer_relevancy"])
+
+        all_answer_relevancy = 0
+        all_faithfulness = 0
+        ragas_inputs = {
+            "question": [],
+            "answer": [],
+            "ground_truth": [],
+            "contexts": [],
+        }
+
+        valid_results = self.remove_invalid(results)
+
+        for data in tqdm(valid_results):
+            data = data["original_data"]
+
+            query = self.get_query(data)
+            generated_text = data["generated_text"]
+            ground_truth = data["ground_truth_text"]
+            retrieved_documents = data["retrieved_documents"]
+
+            ragas_inputs["question"].append(query)
+            ragas_inputs["answer"].append(generated_text)
+            ragas_inputs["ground_truth"].append(ground_truth)
+            ragas_inputs["contexts"].append(retrieved_documents[:3])
+
+        ragas_metrics = metric.measure(ragas_inputs)
+        return ragas_metrics
+
 
 def args_parser():
     parser = argparse.ArgumentParser()
@@ -116,6 +152,13 @@ def args_parser():
     parser.add_argument(
         "--retrieval_endpoint", type=str, default="http://localhost:7000/v1/retrieval", help="Service URL address."
     )
+    parser.add_argument(
+        "--tei_embedding_endpoint",
+        type=str,
+        default="http://localhost:8090",
+        help="Service URL address of tei embedding.",
+    )
+    parser.add_argument("--ragas_metrics", action="store_true", help="Whether to compute ragas metrics.")
    parser.add_argument("--llm_endpoint", type=str, default=None, help="Service URL address.")
     parser.add_argument(
         "--show_progress_bar", action="store", default=True, type=bool, help="Whether to show a progress bar"
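
Note: get_ragas_metrics() above builds four parallel lists and hands them to
RagasMetric.measure(). (This first revision calls tqdm() without importing it;
patch 3/6 adds the import.) A minimal, self-contained sketch of the measure()
contract follows; the endpoint URLs and the sample record are placeholder
assumptions, not values fixed by this series:

    from langchain_huggingface import HuggingFaceEndpointEmbeddings

    from evals.metrics.ragas import RagasMetric

    # Assumed local deployments: a TEI embedding service and a judge LLM.
    embeddings = HuggingFaceEndpointEmbeddings(model="http://localhost:8090")
    metric = RagasMetric(
        threshold=0.5,
        model="http://localhost:8008",  # hypothetical --llm_endpoint value
        embeddings=embeddings,
        metrics=["faithfulness", "answer_relevancy"],
    )

    # measure() consumes parallel lists; "contexts" holds one list of passages
    # per question (get_ragas_metrics keeps the top-3 retrieved documents).
    ragas_inputs = {
        "question": ["What does the report conclude?"],
        "answer": ["It concludes that revenue grew."],
        "ground_truth": ["Revenue grew year over year."],
        "contexts": [["passage 1", "passage 2", "passage 3"]],
    }
    print(metric.measure(ragas_inputs))
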
From b9e741fcab55d065496dc99cfcd99290ce1b9183 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 25 Sep 2024 14:38:40 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/evaluation/rag_eval/examples/eval_crud.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index 2f4297e1..f3e7a655 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -84,8 +84,12 @@ def get_ragas_metrics(self, results, arguments):
 
         embeddings = HuggingFaceEndpointEmbeddings(model=arguments.tei_embedding_endpoint)
 
-        metric = RagasMetric(threshold=0.5, model=arguments.llm_endpoint, embeddings=embeddings,
-            metrics=["faithfulness", "answer_relevancy"])
+        metric = RagasMetric(
+            threshold=0.5,
+            model=arguments.llm_endpoint,
+            embeddings=embeddings,
+            metrics=["faithfulness", "answer_relevancy"],
+        )
 
         all_answer_relevancy = 0
         all_faithfulness = 0
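
Note: the two flags patch 1/6 adds to args_parser() behave as in this
stand-alone sketch; the defaults are copied from the diff, everything else is
illustrative:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tei_embedding_endpoint",
        type=str,
        default="http://localhost:8090",
        help="Service URL address of tei embedding.",
    )
    # store_true makes --ragas_metrics an opt-in boolean: False unless passed.
    parser.add_argument("--ragas_metrics", action="store_true", help="Whether to compute ragas metrics.")

    args = parser.parse_args(["--ragas_metrics"])
    assert args.ragas_metrics is True
    assert args.tei_embedding_endpoint == "http://localhost:8090"
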
---
 evals/evaluation/rag_eval/examples/eval_crud.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index f3e7a655..75a90849 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -11,6 +11,7 @@
 from evals.evaluation.rag_eval import Evaluator
 from evals.evaluation.rag_eval.template import CRUDTemplate
 from evals.metrics.ragas import RagasMetric
+from tqdm import tqdm
 
 
 class CRUD_Evaluator(Evaluator):
@@ -100,7 +101,7 @@ def get_ragas_metrics(self, results, arguments):
             "contexts": [],
         }
 
-        valid_results = self.remove_invalid(results)
+        valid_results = self.remove_invalid(results["results"])
 
         for data in tqdm(valid_results):
             data = data["original_data"]
@@ -200,6 +201,10 @@ def main():
     results = evaluator.evaluate(
         args, show_progress_bar=args.show_progress_bar, contain_original_data=args.contain_original_data
     )
+    print(results["overall"])
+    if args.ragas_metrics:
+        ragas_metrics = evaluator.get_ragas_metrics(results, args)
+        print(ragas_metrics)
 
     print(f"Evaluation results of task {task} saved to {output_save_path}.")

From 66c4b604dd567ffd11b0686f6c9f951b7e90118d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 01:49:15 +0000
Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/evaluation/rag_eval/examples/eval_crud.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index 75a90849..58694297 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -8,10 +8,11 @@
 import json
 import os
 
+from tqdm import tqdm
+
 from evals.evaluation.rag_eval import Evaluator
 from evals.evaluation.rag_eval.template import CRUDTemplate
 from evals.metrics.ragas import RagasMetric
-from tqdm import tqdm
 
 
 class CRUD_Evaluator(Evaluator):
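
Note: patch 3/6 passes results["results"] into remove_invalid() and reads each
record's "original_data", so the Ragas pass only yields data when evaluate()
runs with contain_original_data enabled. The shape get_ragas_metrics() assumes,
reconstructed from the field accesses in the diffs (only the keys are grounded
in the code; every value here is invented for illustration):

    results = {
        "overall": {},  # aggregate scores printed by main()
        "results": [
            {
                "original_data": {  # present when contain_original_data=True
                    # The query is read via self.get_query(data), so its key
                    # is not pinned down by these patches.
                    "generated_text": "model answer ...",
                    "ground_truth_text": "reference answer ...",
                    "retrieved_documents": ["chunk 1", "chunk 2", "chunk 3", "chunk 4"],
                },
            },
        ],
    }
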
From c4a325d29c5c28830493c1f583e3a9d15f405976 Mon Sep 17 00:00:00 2001
From: lkk12014402 <641553140@qq.com>
Date: Thu, 26 Sep 2024 02:50:21 +0000
Subject: [PATCH 5/6] remove.

---
 evals/evaluation/rag_eval/examples/eval_crud.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index 58694297..c7fcb050 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -195,7 +195,7 @@ def main():
     )
     output_save_path = os.path.join(args.output_dir, f"{task}.json")
     evaluator = CRUD_Evaluator(
-        dataset=dataset, output_path=output_save_path, task=task, llm_endpoint=args.llm_endpoint
+        dataset=dataset, output_path=output_save_path, task=task
     )
     if args.ingest_docs:
         CRUD_Evaluator.ingest_docs(args.docs_path, args.database_endpoint, args.chunk_size, args.chunk_overlap)

From 58423ddfad171afde3c324a42e2540bd64c4997a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 02:47:16 +0000
Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/evaluation/rag_eval/examples/eval_crud.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/evals/evaluation/rag_eval/examples/eval_crud.py b/evals/evaluation/rag_eval/examples/eval_crud.py
index c7fcb050..1cb3b247 100644
--- a/evals/evaluation/rag_eval/examples/eval_crud.py
+++ b/evals/evaluation/rag_eval/examples/eval_crud.py
@@ -194,9 +194,7 @@ def main():
         "summarization, question_answering, continuation and hallucinated_modified."
     )
     output_save_path = os.path.join(args.output_dir, f"{task}.json")
-    evaluator = CRUD_Evaluator(
-        dataset=dataset, output_path=output_save_path, task=task
-    )
+    evaluator = CRUD_Evaluator(dataset=dataset, output_path=output_save_path, task=task)
     if args.ingest_docs:
         CRUD_Evaluator.ingest_docs(args.docs_path, args.database_endpoint, args.chunk_size, args.chunk_overlap)
     results = evaluator.evaluate(
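
Note: after the full series, the tail of main() reads as below, assembled from
the context and added lines of patches 3/6 through 6/6. Patch 5/6 drops the
llm_endpoint keyword from the constructor, so the LLM endpoint now reaches the
Ragas pass only through args, as consumed by get_ragas_metrics():

    evaluator = CRUD_Evaluator(dataset=dataset, output_path=output_save_path, task=task)
    if args.ingest_docs:
        CRUD_Evaluator.ingest_docs(args.docs_path, args.database_endpoint, args.chunk_size, args.chunk_overlap)
    results = evaluator.evaluate(
        args, show_progress_bar=args.show_progress_bar, contain_original_data=args.contain_original_data
    )
    print(results["overall"])
    if args.ragas_metrics:
        ragas_metrics = evaluator.get_ragas_metrics(results, args)
        print(ragas_metrics)

    print(f"Evaluation results of task {task} saved to {output_save_path}.")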