Update rag eval doc #153

Closed
wants to merge 18 commits into from
6 changes: 6 additions & 0 deletions evals/evaluation/rag_eval/README.md
@@ -63,6 +63,9 @@ To set up an LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-gaudi)
# please set your llm_port and hf_token

docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2

# For better performance, also set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `--max-batch-total-tokens`, and `--max-batch-prefill-tokens`
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
```
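
After the container starts, it can help to sanity-check the endpoint before launching any evaluation. Below is a minimal sketch using TGI's `/generate` REST route; the host, port, and prompt are placeholders rather than values from this PR:

```python
# Quick smoke test for a running tgi-gaudi service (endpoint and port are assumptions).
import requests


def check_tgi(endpoint: str) -> str:
    payload = {
        "inputs": "What is retrieval-augmented generation?",
        "parameters": {"max_new_tokens": 32},
    }
    resp = requests.post(f"{endpoint}/generate", json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["generated_text"]


print(check_tgi("http://localhost:8080"))  # replace 8080 with {your_llm_port}
```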

### Prepare Dataset
@@ -138,6 +141,9 @@ If you are using Docker Compose to deploy the RAG system, you can simply run the evaluation
```bash
cd examples
python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --ingest_docs

# To also compute ragas metrics, point --llm_endpoint at an LLM-as-judge service
python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --contain_original_data --llm_endpoint "http://{llm_as_judge_ip}:{llm_as_judge_port}" --ragas_metrics
```
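
The `--ragas_metrics` path exercises the same `RagasMetric` wrapper that is updated in `evals/metrics/ragas/ragas.py` below. As a rough sketch of calling it directly, patterned after `tests/test_ragas.py` later in this PR (the import path and the sample strings are assumptions):

```python
# Direct use of RagasMetric, mirroring tests/test_ragas.py (import path assumed).
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from evals.metrics.ragas import RagasMetric  # adjust if the actual module path differs

embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")

metric = RagasMetric(
    threshold=0.5,
    model="http://{llm_as_judge_ip}:{llm_as_judge_port}",  # LLM-as-judge endpoint
    embeddings=embeddings,
)

test_case = {
    "question": ["What if these shoes don't fit?"],
    "answer": ["You can return them within 30 days for a full refund."],
    "ground_truth": ["All customers are eligible for a 30 day full refund at no extra cost."],
    "contexts": [["All customers are eligible for a 30 day full refund at no extra cost."]],
}

metric.measure(test_case)
```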

If you are using Kubernetes manifests/Helm to deploy the RAG system, you must specify additional arguments as follows:
107 changes: 64 additions & 43 deletions evals/metrics/ragas/ragas.py
@@ -4,12 +4,16 @@
# SPDX-License-Identifier: Apache-2.0
#
import os
import re
from typing import Dict, Optional, Union

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel
from langchain_huggingface import HuggingFaceEndpoint

# import * is only allowed at module level according to python syntax
from ragas.metrics import *


def format_ragas_metric_name(name: str):
return f"{name} (ragas)"
@@ -29,16 +33,17 @@ def __init__(
self.model = model
self.embeddings = embeddings
self.metrics = metrics
self.validated_list = [
"answer_correctness",
"answer_relevancy",
"answer_similarity",
"context_precision",
"context_recall",
"faithfulness",
"context_utilization",
# "reference_free_rubrics_score",
]

# self.validated_list = [
# "answer_correctness",
# "answer_relevancy",
# "answer_similarity",
# "context_precision",
# "context_recall",
# "faithfulness",
# "context_utilization",
# # "reference_free_rubrics_score",
# ]

async def a_measure(self, test_case: Dict):
return self.measure(test_case)
@@ -47,37 +52,51 @@ def measure(self, test_case: Dict):
# sends to server
try:
from ragas import evaluate
from ragas.metrics import (
answer_correctness,
answer_relevancy,
answer_similarity,
context_precision,
context_recall,
context_utilization,
faithfulness,
)
from ragas.metrics import ALL_METRICS

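            # ALL_METRICS is a list of metric instances; convert their class names
            # (e.g. AnswerRelevancy) into snake_case ("answer_relevancy") to build the supported-metric names.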
self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS]
self.metric_names = [re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() for name in self.metric_names]
self.metric_names = list(set(self.metric_names))
            # Note: the summarization score metric does not work reliably with the best open-source LLMs,
            # which is why it is removed from the supported list for now.
self.metric_names.remove("summarization_score")
self.metric_instances = {}
for metric in self.metric_names:
                try:
                    # resolve each snake_case name to the metric object exported by ragas.metrics
                    self.metric_instances[metric] = eval(metric)
                except Exception:
                    # skip names that are not exposed as module-level metric instances
                    pass
# from ragas.metrics import (
# answer_correctness,
# answer_relevancy,
# answer_similarity,
# context_precision,
# context_recall,
# context_utilization,
# faithfulness,
# )
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")
try:
from datasets import Dataset
except ModuleNotFoundError:
raise ModuleNotFoundError("Please install dataset")
self.metrics_instance = {
"answer_correctness": answer_correctness,
"answer_relevancy": answer_relevancy,
"answer_similarity": answer_similarity,
"context_precision": context_precision,
"context_recall": context_recall,
"faithfulness": faithfulness,
"context_utilization": context_utilization,
# "reference_free_rubrics_score": reference_free_rubrics_score,
}
# self.metrics_instance = {
# "answer_correctness": answer_correctness,
# "answer_relevancy": answer_relevancy,
# "answer_similarity": answer_similarity,
# "context_precision": context_precision,
# "context_recall": context_recall,
# "faithfulness": faithfulness,
# "context_utilization": context_utilization,
# # "reference_free_rubrics_score": reference_free_rubrics_score,
# }
# Set LLM model
openai_key = os.getenv("OPENAI_API_KEY", None)
if openai_key is not None:
print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.")
self.model = None
        if self.model is None:
            self.chat_model = None
elif isinstance(self.model, str):
print("LLM endpoint: ", self.model)
self.chat_model = HuggingFaceEndpoint(
endpoint_url=self.model,
@@ -92,36 +111,38 @@
tmp_metrics = []
# check supported list
for metric in self.metrics:
if metric not in self.validated_list:
if metric not in self.metric_names:
raise ValueError(
"metric should be in supported list {}. ".format(self.validated_list)
"metric should be in supported list {}. ".format(self.metric_names)
+ "ClientResponseError raised with LangchainLLM "
+ "when context_precision, context_recall ran. "
+ "Here are the related issues described in ragas "
"https://github.com/explodinggradients/ragas/issues/934, "
+ "https://github.com/explodinggradients/ragas/issues/664."
)
else:
if metric == "answer_relevancy" and self.embeddings is None:
raise ValueError("answer_relevancy metric need provide embeddings model.")
if metric == "AnswerRelevancy" and self.embeddings is None:
raise ValueError("AnswerRelevancy metric need provide embeddings model.")
                    tmp_metrics.append(self.metric_instances[metric])
self.metrics = tmp_metrics
else:
self.metrics = [
answer_relevancy,
faithfulness,
answer_correctness,
answer_similarity,
context_precision,
context_recall,
]
self.metrics = list(self.metric_instances.values())
# self.metrics = [
# answer_relevancy,
# faithfulness,
# answer_correctness,
# answer_similarity,
# context_precision,
# context_recall,
# ]
# Find necessary input fields using the given metrics
_required_columns = set()
column_map = { # this column maps new naming style in ragas to their old naming style
"user_input": "question",
"response": "answer",
"reference": "ground_truth",
"retrieved_contexts": "contexts",
"reference_contexts": "reference_contexts",
}
for metric in self.metrics:
if hasattr(metric, "_required_columns"):
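
A side note on the `column_map` above: it translates ragas' newer column names back to the older field names that the `test_case` dict (and `tests/test_ragas.py` below) still uses. A rough illustration of that translation, not the actual library code:

```python
# Illustrative only: map ragas' new column names to the old test_case field names.
column_map = {
    "user_input": "question",
    "response": "answer",
    "reference": "ground_truth",
    "retrieved_contexts": "contexts",
    "reference_contexts": "reference_contexts",
}


def required_old_columns(required_new_columns):
    """Translate the columns a metric requires into the old naming used by test_case."""
    return {column_map.get(col, col) for col in required_new_columns}


# A metric that requires {"user_input", "response", "reference"} ends up needing
# {"question", "answer", "ground_truth"} from the caller.
print(required_old_columns({"user_input", "response", "reference"}))
```
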
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -3,4 +3,4 @@ jieba
langchain_community
langchain_huggingface
lm-eval==0.4.3
ragas
ragas==0.1.19
3 changes: 3 additions & 0 deletions tests/test_ragas.py
@@ -25,15 +25,18 @@ def test_ragas(self):

# Replace this with the actual retrieved context from your RAG pipeline
retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]
reference_context = ["We can only process full refund upto 30 day after the purchase."]
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")

metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings)
test_case = {
"question": ["What if these shoes don't fit?"],
"answer": [actual_output],
"ground_truth": [expected_output],
"contexts": [retrieval_context],
"reference_contexts": [reference_context],
}

metric.measure(test_case)