Update rag eval doc #153

Closed
wants to merge 18 commits into from
6 changes: 6 additions & 0 deletions evals/evaluation/rag_eval/README.md
@@ -63,6 +63,9 @@ To set up an LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-gaudi)
# please set your llm_port and hf_token

docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2

# For better performance, also set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `--max-batch-total-tokens`, and `--max-batch-prefill-tokens`
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
```
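
After the container starts, it can help to sanity-check the endpoint before launching any evaluation. Below is a minimal sketch using TGI's `/generate` REST route; the host, port, and prompt are placeholders rather than values from this PR:

```python
# Quick smoke test for a running tgi-gaudi service (endpoint and port are assumptions).
import requests


def check_tgi(endpoint: str) -> str:
    payload = {
        "inputs": "What is retrieval-augmented generation?",
        "parameters": {"max_new_tokens": 32},
    }
    resp = requests.post(f"{endpoint}/generate", json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["generated_text"]


print(check_tgi("http://localhost:8080"))  # replace 8080 with {your_llm_port}
```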

### Prepare Dataset
@@ -138,6 +141,9 @@ If you are using Docker Compose to deploy the RAG system, you can simply run the evaluation
```bash
cd examples
python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --ingest_docs

# To also compute ragas metrics, point --llm_endpoint at an LLM-as-judge service
python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --contain_original_data --llm_endpoint "http://{llm_as_judge_ip}:{llm_as_judge_port}" --ragas_metrics
```
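
The `--ragas_metrics` path exercises the same `RagasMetric` wrapper that is updated in `evals/metrics/ragas/ragas.py` below. As a rough sketch of calling it directly, patterned after `tests/test_ragas.py` later in this PR (the import path and the sample strings are assumptions):

```python
# Direct use of RagasMetric, mirroring tests/test_ragas.py (import path assumed).
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from evals.metrics.ragas import RagasMetric  # adjust if the actual module path differs

embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")

metric = RagasMetric(
    threshold=0.5,
    model="http://{llm_as_judge_ip}:{llm_as_judge_port}",  # LLM-as-judge endpoint
    embeddings=embeddings,
)

test_case = {
    "question": ["What if these shoes don't fit?"],
    "answer": ["You can return them within 30 days for a full refund."],
    "ground_truth": ["All customers are eligible for a 30 day full refund at no extra cost."],
    "contexts": [["All customers are eligible for a 30 day full refund at no extra cost."]],
}

metric.measure(test_case)
```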

If you are using Kubernetes manifests/Helm to deploy the RAG system, you must specify additional arguments as follows:
107 changes: 64 additions & 43 deletions evals/metrics/ragas/ragas.py
@@ -4,12 +4,16 @@
# SPDX-License-Identifier: Apache-2.0
#
import os
import re
from typing import Dict, Optional, Union

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel
from langchain_huggingface import HuggingFaceEndpoint

# import * is only allowed at module level according to python syntax
from ragas.metrics import *


def format_ragas_metric_name(name: str):
return f"{name} (ragas)"
@@ -29,16 +33,17 @@ def __init__(
self.model = model
self.embeddings = embeddings
self.metrics = metrics
self.validated_list = [
"answer_correctness",
"answer_relevancy",
"answer_similarity",
"context_precision",
"context_recall",
"faithfulness",
"context_utilization",
# "reference_free_rubrics_score",
]

# self.validated_list = [
# "answer_correctness",
# "answer_relevancy",
# "answer_similarity",
# "context_precision",
# "context_recall",
# "faithfulness",
# "context_utilization",
# # "reference_free_rubrics_score",
# ]

async def a_measure(self, test_case: Dict):
return self.measure(test_case)
@@ -47,37 +52,51 @@ def measure(self, test_case: Dict):
# sends to server
try:
from ragas import evaluate
from ragas.metrics import (
answer_correctness,
answer_relevancy,
answer_similarity,
context_precision,
context_recall,
context_utilization,
faithfulness,
)
from ragas.metrics import ALL_METRICS

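            # ALL_METRICS is a list of metric instances; convert their class names
            # (e.g. AnswerRelevancy) into snake_case ("answer_relevancy") to build the supported-metric names.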
self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS]
self.metric_names = [re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() for name in self.metric_names]
self.metric_names = list(set(self.metric_names))
            # Note: the summarization score metric does not work reliably with the best open-source LLMs,
            # which is why it is removed from the supported list for now.
self.metric_names.remove("summarization_score")
self.metric_instances = {}
for metric in self.metric_names:
                try:
                    # resolve each snake_case name to the metric object exported by ragas.metrics
                    self.metric_instances[metric] = eval(metric)
                except Exception:
                    # skip names that are not exposed as module-level metric instances
                    pass
# from ragas.metrics import (
# answer_correctness,
# answer_relevancy,
# answer_similarity,
# context_precision,
# context_recall,
# context_utilization,
# faithfulness,
# )
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")
try:
from datasets import Dataset
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install dataset")
self.metrics_instance = {
"answer_correctness": answer_correctness,
"answer_relevancy": answer_relevancy,
"answer_similarity": answer_similarity,
"context_precision": context_precision,
"context_recall": context_recall,
"faithfulness": faithfulness,
"context_utilization": context_utilization,
# "reference_free_rubrics_score": reference_free_rubrics_score,
}
# self.metrics_instance = {
# "answer_correctness": answer_correctness,
# "answer_relevancy": answer_relevancy,
# "answer_similarity": answer_similarity,
# "context_precision": context_precision,
# "context_recall": context_recall,
# "faithfulness": faithfulness,
# "context_utilization": context_utilization,
# # "reference_free_rubrics_score": reference_free_rubrics_score,
# }
# Set LLM model
openai_key = os.getenv("OPENAI_API_KEY", None)
if openai_key is not None:
print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.")
self.model = None
        if self.model is None:
            self.chat_model = None
elif isinstance(self.model, str):
print("LLM endpoint: ", self.model)
self.chat_model = HuggingFaceEndpoint(
endpoint_url=self.model,
@@ -92,36 +111,38 @@
tmp_metrics = []
# check supported list
for metric in self.metrics:
if metric not in self.validated_list:
if metric not in self.metric_names:
raise ValueError(
"metric should be in supported list {}. ".format(self.validated_list)
"metric should be in supported list {}. ".format(self.metric_names)
+ "ClientResponseError raised with LangchainLLM "
+ "when context_precision, context_recall ran. "
+ "Here are the related issues described in ragas "
"https://github.com/explodinggradients/ragas/issues/934, "
+ "https://github.com/explodinggradients/ragas/issues/664."
)
else:
if metric == "answer_relevancy" and self.embeddings is None:
raise ValueError("answer_relevancy metric need provide embeddings model.")
if metric == "AnswerRelevancy" and self.embeddings is None:
raise ValueError("AnswerRelevancy metric need provide embeddings model.")
                    tmp_metrics.append(self.metric_instances[metric])
self.metrics = tmp_metrics
else:
self.metrics = [
answer_relevancy,
faithfulness,
answer_correctness,
answer_similarity,
context_precision,
context_recall,
]
self.metrics = list(self.metric_instances.values())
# self.metrics = [
# answer_relevancy,
# faithfulness,
# answer_correctness,
# answer_similarity,
# context_precision,
# context_recall,
# ]
# Find necessary input fields using the given metrics
_required_columns = set()
column_map = { # this column maps new naming style in ragas to their old naming style
"user_input": "question",
"response": "answer",
"reference": "ground_truth",
"retrieved_contexts": "contexts",
"reference_contexts": "reference_contexts",
}
for metric in self.metrics:
if hasattr(metric, "_required_columns"):
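
A side note on the `column_map` above: it translates ragas' newer column names back to the older field names that the `test_case` dict (and `tests/test_ragas.py` below) still uses. A rough illustration of that translation, not the actual library code:

```python
# Illustrative only: map ragas' new column names to the old test_case field names.
column_map = {
    "user_input": "question",
    "response": "answer",
    "reference": "ground_truth",
    "retrieved_contexts": "contexts",
    "reference_contexts": "reference_contexts",
}


def required_old_columns(required_new_columns):
    """Translate the columns a metric requires into the old naming used by test_case."""
    return {column_map.get(col, col) for col in required_new_columns}


# A metric that requires {"user_input", "response", "reference"} ends up needing
# {"question", "answer", "ground_truth"} from the caller.
print(required_old_columns({"user_input", "response", "reference"}))
```
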
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -3,4 +3,4 @@ jieba
langchain_community
langchain_huggingface
lm-eval==0.4.3
ragas
ragas==0.1.19
3 changes: 3 additions & 0 deletions tests/test_ragas.py
@@ -25,15 +25,18 @@ def test_ragas(self):

# Replace this with the actual retrieved context from your RAG pipeline
retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]
reference_context = ["We can only process full refund upto 30 day after the purchase."]
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")

metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings)
test_case = {
"question": ["What if these shoes don't fit?"],
"answer": [actual_output],
"ground_truth": [expected_output],
"contexts": [retrieval_context],
"reference_contexts": [reference_context],
}

metric.measure(test_case)