diff --git a/evals/evaluation/rag_eval/README.md b/evals/evaluation/rag_eval/README.md
index 1186464a..f15c0e53 100644
--- a/evals/evaluation/rag_eval/README.md
+++ b/evals/evaluation/rag_eval/README.md
@@ -63,6 +63,9 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
 # please set your llm_port and hf_token
 docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
+
+# for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
+docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
 ```
 
 ### Prepare Dataset
@@ -138,6 +141,9 @@ If you are using docker compose to deploy RAG system, you can simply run the eva
 ```bash
 cd examples
 python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --ingest_docs
+
+# if you want to get ragas metrics
+python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --contain_original_data --llm_endpoint "http://{llm_as_judge_ip}:{llm_as_judge_port}" --ragas_metrics
 ```
 
 If you are using Kubernetes manifest/helm to deploy RAG system, you must specify more arguments as following:
diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index 9b0a1d3e..c80ff94e 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -4,12 +4,16 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 import os
+import re
 from typing import Dict, Optional, Union
 
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models import BaseLanguageModel
 from langchain_huggingface import HuggingFaceEndpoint
 
+# import * is only allowed at module level according to python syntax
+from ragas.metrics import *
+
 
 def format_ragas_metric_name(name: str):
     return f"{name} (ragas)"
@@ -29,16 +33,17 @@ def __init__(
         self.model = model
         self.embeddings = embeddings
         self.metrics = metrics
-        self.validated_list = [
-            "answer_correctness",
-            "answer_relevancy",
-            "answer_similarity",
-            "context_precision",
-            "context_recall",
-            "faithfulness",
-            "context_utilization",
-            # "reference_free_rubrics_score",
-        ]
+
+        # self.validated_list = [
+        #     "answer_correctness",
+        #     "answer_relevancy",
+        #     "answer_similarity",
+        #     "context_precision",
+        #     "context_recall",
+        #     "faithfulness",
+        #     "context_utilization",
+        #     # "reference_free_rubrics_score",
+        # ]
 
     async def a_measure(self, test_case: Dict):
         return self.measure(test_case)
@@ -47,37 +52,51 @@ def measure(self, test_case: Dict):
         # sends to server
         try:
             from ragas import evaluate
-            from ragas.metrics import (
-                answer_correctness,
-                answer_relevancy,
-                answer_similarity,
-                context_precision,
-                context_recall,
-                context_utilization,
-                faithfulness,
-            )
+            from ragas.metrics import ALL_METRICS
+
+            self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS]
+            self.metric_names = [re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() for name in self.metric_names]
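
For reference, below is a minimal standalone sketch (not part of the patch) of the name-normalization idiom the new `ragas.py` code relies on: take each metric class name from `ragas.metrics.ALL_METRICS` and convert CamelCase to snake_case with `re.sub`. The `AnswerCorrectness` and `ContextUtilization` classes here are illustrative stand-ins, not the real ragas classes.

```python
import re

# Illustrative stand-ins for ragas metric classes; the real code iterates
# over ragas.metrics.ALL_METRICS instead of defining these locally.
class AnswerCorrectness: ...
class ContextUtilization: ...

ALL_METRICS = [AnswerCorrectness(), ContextUtilization()]

# Take each metric instance's class name, then insert "_" before every
# interior capital letter and lowercase the result:
# "AnswerCorrectness" -> "answer_correctness"
metric_names = [metric.__class__.__name__ for metric in ALL_METRICS]
metric_names = [re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() for name in metric_names]

print(metric_names)  # ['answer_correctness', 'context_utilization']
```

The `(?<!^)` lookbehind skips the first character, so a leading capital does not produce a leading underscore.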