From 669997a6baf0979f9414e1f45bc8363ad07035c1 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 19 Sep 2024 04:33:00 +0000 Subject: [PATCH 01/21] minimized required fields/columns in user data Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 80 +++++++++++++++--------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 35449c08..672ac39e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -1,24 +1,23 @@ + #!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - # import os from typing import Dict, Optional, Union - +from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel -from langchain_huggingface import HuggingFaceEndpoint +import sys +sys.path.append('/home/akakne/miniforge3/envs/recsys/bin') def format_ragas_metric_name(name: str): return f"{name} (ragas)" - class RagasMetric: """This metric checks if the output is more than 3 letters.""" - def __init__( self, threshold: float = 0.3, @@ -26,7 +25,6 @@ def __init__( embeddings: Optional[Embeddings] = None, metrics: Optional[list[str]] = None, ): - self.threshold = threshold self.model = model self.embeddings = embeddings @@ -39,10 +37,14 @@ def __init__( "context_recall", "faithfulness", "context_utilization", - "reference_free_rubrics_score", + # "reference_free_rubrics_score", ] - + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + def measure(self, test_case: Dict): + # sends to server try: + from ragas import evaluate from ragas.metrics import ( answer_correctness, answer_relevancy, @@ -51,16 +53,14 @@ def __init__( context_recall, context_utilization, faithfulness, - reference_free_rubrics_score, + # reference_free_rubrics_score, ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") - try: from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - self.metrics_instance = { "answer_correctness": answer_correctness, "answer_relevancy": answer_relevancy, @@ -69,26 +69,24 @@ def __init__( "context_recall": context_recall, "faithfulness": faithfulness, "context_utilization": context_utilization, - "reference_free_rubrics_score": reference_free_rubrics_score, + # "reference_free_rubrics_score": reference_free_rubrics_score, } - # Set LLM model openai_key = os.getenv("OPENAI_API_KEY", None) if openai_key is not None: print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): - print("LLM endpoint: ", self.model) - self.chat_model = HuggingFaceEndpoint( + print("Loading a HuggingFace Endpoint") + chat_model = HuggingFaceEndpoint( endpoint_url=self.model, - task="text-generation", - max_new_tokens=1024, - do_sample=False, + timeout=600, ) else: - self.chat_model = self.model - - # initialize metrics + print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.") + chat_model = self.model + # Create a dataset from the test case + # Convert the Dict to a format compatible with Dataset if self.metrics is not None: tmp_metrics = [] # check supported list @@ -106,10 +104,8 @@ def __init__( if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(self.metrics_instance[metric]) - self.metrics = tmp_metrics - - else: # default metrics + else: self.metrics = [ answer_relevancy, faithfulness, @@ -118,39 +114,31 @@ def __init__( context_precision, context_recall, ] - - async def a_measure(self, test_case: Dict): - return self.measure(test_case) - - def measure(self, test_case: Dict): - from ragas import evaluate - - try: - from datasets import Dataset - except ModuleNotFoundError: - raise ModuleNotFoundError("Please install dataset") - - # Create a dataset from the test case - # Convert the Dict to a format compatible with Dataset - data = { - "question": test_case["question"], - "contexts": test_case["contexts"], - "answer": test_case["answer"], - "ground_truth": test_case["ground_truth"], + # Find necessary input fields using the given metrics + _required_columns = set() + for metric in self.metrics: + for column in list(metric._required_columns.values())[0]: + _required_columns.add(column) + column2field = { + "user_input" : "question", + "response" : "answer", + "reference" : "ground_truth", + "retrieved_contexts" : "contexts" } + _required_fields = [column2field[column] for column in _required_columns] + data = {field : test_case[field] for field in _required_fields} dataset = Dataset.from_dict(data) + # evaluate self.score = evaluate( dataset, metrics=self.metrics, - llm=self.chat_model, + llm=chat_model, embeddings=self.embeddings, ) return self.score - def is_successful(self): return self.success - @property def __name__(self): return "RAGAS" From 80e21609655529e8cd40354a64935417d4b70feb Mon Sep 17 00:00:00 2001 From: Ying Chun Guo Date: Thu, 19 Sep 2024 14:33:11 +0800 Subject: [PATCH 02/21] add bench-target as the prefix of output folder (#133) Signed-off-by: Yingchun Guo Signed-off-by: aasavari --- evals/benchmark/stresscli/commands/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/evals/benchmark/stresscli/commands/load_test.py 
b/evals/benchmark/stresscli/commands/load_test.py index fba1c5ee..8895e5ab 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -46,16 +46,17 @@ def locust_runtests(kubeconfig, profile): with open(profile, "r") as file: profile_data = yaml.safe_load(file) + global_settings = profile_data["profile"]["global-settings"] + runs = profile_data["profile"]["runs"] + # create test log folder hostpath = profile_data["profile"]["storage"]["hostpath"] timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_folder = os.path.join(hostpath, f"{timestamp}") + testtarget = global_settings.get("bench-target", locust_defaults["bench-target"]) + base_folder = os.path.join(hostpath, f"{testtarget}_{timestamp}") os.makedirs(base_folder, exist_ok=True) # Extract storage path and run details from profile - global_settings = profile_data["profile"]["global-settings"] - runs = profile_data["profile"]["runs"] - index = 1 for run in runs: print(f"===Starting test: {run['name']}") From eb98d2e32f73b1612d1d84e92e15a2c8b8505a2b Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:11:01 +0800 Subject: [PATCH 03/21] remove examples. (#135) Co-authored-by: root Signed-off-by: aasavari --- examples/AudioQnA/README.md | 48 --------------- examples/AudioQnA/local_eval.py | 35 ----------- examples/AudioQnA/online_eval.py | 56 ------------------ examples/AudioQnA/requirements.txt | 8 --- examples/CodeGen/README.md | 92 ----------------------------- examples/FaqGen/README.md | 63 -------------------- examples/FaqGen/evaluate.py | 45 -------------- examples/FaqGen/generate_FAQ.py | 28 --------- examples/FaqGen/get_context.py | 17 ------ examples/FaqGen/launch_tgi.sh | 28 --------- examples/FaqGen/post_process_FAQ.py | 27 --------- 11 files changed, 447 deletions(-) delete mode 100644 examples/AudioQnA/README.md delete mode 100644 examples/AudioQnA/local_eval.py delete mode 100644 examples/AudioQnA/online_eval.py delete mode 100644 examples/AudioQnA/requirements.txt delete mode 100644 examples/CodeGen/README.md delete mode 100644 examples/FaqGen/README.md delete mode 100644 examples/FaqGen/evaluate.py delete mode 100644 examples/FaqGen/generate_FAQ.py delete mode 100644 examples/FaqGen/get_context.py delete mode 100644 examples/FaqGen/launch_tgi.sh delete mode 100644 examples/FaqGen/post_process_FAQ.py diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md deleted file mode 100644 index 918a7997..00000000 --- a/examples/AudioQnA/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# AudioQnA accuracy Evaluation - -## Dataset - - -We evaluate the ASR accuracy on the test set of librispeech [dataset](andreagasparini/librispeech_test_only), which contains 2620 records of audio and texts. - -## Metrics - -We evaluate the WER (Word Error Rate) metric of the ASR microservice. - -## Evaluation - -### Launch ASR microservice - -Launch the ASR microserice with the following commands. For more details please refer to [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr). - -```bash -git clone https://github.com/opea-project/GenAIComps -cd GenAIComps -docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . 
-# change the name of model by editing model_name_or_path you want to evaluate -docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny" -``` - -### Evaluate - -Install dependencies: - -``` -pip install -r requirements.txt -``` - -Evaluate the performance with the LLM: -```py -# validate the offline model -# python offline_evaluate.py -# validate the online asr microservice accuracy -python online_evaluate.py -``` - -### Performance Result -Here is the tested result for your reference -|| WER | -| --- | ---- | -|whisper-large-v2| 2.87| -|whisper-large| 2.7 | -|whisper-medium| 3.45 | diff --git a/examples/AudioQnA/local_eval.py b/examples/AudioQnA/local_eval.py deleted file mode 100644 index 1ef7b6df..00000000 --- a/examples/AudioQnA/local_eval.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import torch -from datasets import load_dataset -from evaluate import load -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -device = "cuda" if torch.cuda.is_available() else "cpu" - -MODEL_NAME = "openai/whisper-large-v2" - -librispeech_test_clean = load_dataset( - "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True -) -processor = WhisperProcessor.from_pretrained(MODEL_NAME) -model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device) - - -def map_to_pred(batch): - audio = batch["audio"] - input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features - batch["reference"] = processor.tokenizer._normalize(batch["text"]) - - with torch.no_grad(): - predicted_ids = model.generate(input_features.to(device))[0] - transcription = processor.decode(predicted_ids) - batch["prediction"] = processor.tokenizer._normalize(transcription) - return batch - - -result = librispeech_test_clean.map(map_to_pred) - -wer = load("wer") -print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/online_eval.py b/examples/AudioQnA/online_eval.py deleted file mode 100644 index a7854c95..00000000 --- a/examples/AudioQnA/online_eval.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import base64 -import json - -import requests -import torch -from datasets import load_dataset -from evaluate import load -from pydub import AudioSegment -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -MODEL_NAME = "openai/whisper-large-v2" -processor = WhisperProcessor.from_pretrained(MODEL_NAME) - -librispeech_test_clean = load_dataset( - "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True -) - - -def map_to_pred(batch): - batch["reference"] = processor.tokenizer._normalize(batch["text"]) - - file_path = batch["file"] - # process the file_path - pidx = file_path.rfind("/") - sidx = file_path.rfind(".") - - file_path_prefix = file_path[: pidx + 1] - file_path_suffix = file_path[sidx:] - file_path_mid = file_path[pidx + 1 : sidx] - splits = file_path_mid.split("-") - file_path_mid = f"LibriSpeech/test-clean/{splits[0]}/{splits[1]}/{file_path_mid}" - - file_path = file_path_prefix + file_path_mid + file_path_suffix - - audio = AudioSegment.from_file(file_path) - audio.export("tmp.wav") - with open("tmp.wav", "rb") as f: - test_audio_base64_str = 
base64.b64encode(f.read()).decode("utf-8") - - inputs = {"audio": test_audio_base64_str} - endpoint = "http://localhost:7066/v1/asr" - response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) - - result_str = response.json()["asr_result"] - - batch["prediction"] = processor.tokenizer._normalize(result_str) - return batch - - -result = librispeech_test_clean.map(map_to_pred) - -wer = load("wer") -print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/requirements.txt b/examples/AudioQnA/requirements.txt deleted file mode 100644 index c3f6c51a..00000000 --- a/examples/AudioQnA/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -datasets -evaluate -jiwer -librosa -pydub -soundfile -torch -transformers diff --git a/examples/CodeGen/README.md b/examples/CodeGen/README.md deleted file mode 100644 index 5d118967..00000000 --- a/examples/CodeGen/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# CodeGen accuracy Evaluation - -## Evaluation Framework -We evaluate accuracy by [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness). It is a framework for the evaluation of code generation models. - - -## Evaluation FAQs - -### Launch CodeGen microservice -Please refer to [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen), follow the guide to deploy CodeGen megeservice. - -Use cURL command to test codegen service and ensure that it has started properly -```bash -export CODEGEN_ENDPOINT = "http://${your_ip}:7778/v1/codegen" -curl $CODEGEN_ENDPOINT \ - -H "Content-Type: application/json" \ - -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}' - -``` - - -### Generation and Evaluation - -For evaluating the models on coding tasks or specifically coding LLMs, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide the command line usage and function call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode are available. -#### command line usage - -```shell -cd evals/evaluation/bigcode_evaluation_harness/examples -python main.py --model Qwen/CodeQwen1.5-7B-Chat \ - --tasks humaneval \ - --codegen_url $CODEGEN_ENDPOINT \ - --max_length_generation 2048 \ - --batch_size 1 \ - --save_generations \ - --save_references \ - --allow_code_execution -``` - -***Note:*** Currently, our framework is designed to execute tasks in full. To ensure the accuracy of results, we advise against using the 'limit' or 'limit_start' parameters to restrict the number of test samples. 
- - -### accuracy Result -Here is the tested result for your reference -```json -{ - "humaneval": { - "pass@1": 0.7195121951219512 - }, - "config": { - "prefix": "", - "do_sample": true, - "temperature": 0.2, - "top_k": 0, - "top_p": 0.95, - "n_samples": 1, - "eos": "<|endoftext|>", - "seed": 0, - "model": "Qwen/CodeQwen1.5-7B-Chat", - "modeltype": "causal", - "peft_model": null, - "revision": null, - "use_auth_token": false, - "trust_remote_code": false, - "tasks": "humaneval", - "instruction_tokens": null, - "batch_size": 1, - "max_length_generation": 2048, - "precision": "fp32", - "load_in_8bit": false, - "load_in_4bit": false, - "left_padding": false, - "limit": null, - "limit_start": 0, - "save_every_k_tasks": -1, - "postprocess": true, - "allow_code_execution": true, - "generation_only": false, - "load_generations_path": null, - "load_data_path": null, - "metric_output_path": "evaluation_results.json", - "save_generations": true, - "load_generations_intermediate_paths": null, - "save_generations_path": "generations.json", - "save_references": true, - "save_references_path": "references.json", - "prompt": "prompt", - "max_memory_per_gpu": null, - "check_references": false, - "codegen_url": "http://192.168.123.104:31234/v1/codegen" - } -} -``` diff --git a/examples/FaqGen/README.md b/examples/FaqGen/README.md deleted file mode 100644 index 70d66744..00000000 --- a/examples/FaqGen/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# FaqGen Performance Evaluation - -## Dataset -We evaluate performance on QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). Generate FAQs on "context" columns in validation dataset, which contains 1204 unique records. - -First download dataset and put at "./data". - -Extract unique "context" columns, which will be save to 'data/sqv2_context.json': -``` -python get_context.py -``` - -## Generate FAQs - -### Launch FaQGen microservice -Please refer to [FaQGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi), set up an microservice endpoint. -``` -export FAQ_ENDPOINT = "http://${your_ip}:9000/v1/faqgen" -``` - -### Generate FAQs with microservice -Use the microservice endpoint to generate FAQs for dataset. -``` -python generate_FAQ.py -``` - -Post-process the output to get the right data, which will be save to 'data/sqv2_faq.json'. -``` -python post_process_FAQ.py -``` - -## Evaluate with Ragas - -### Launch TGI service -We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as LLM referee to evaluate the model. First we need to launch a LLM endpoint on Gaudi. 
-``` -export HUGGING_FACE_HUB_TOKEN="your_huggingface_token" -bash launch_tgi.sh -``` -Get the endpoint: -``` -export LLM_ENDPOINT = "http://${ip_address}:8082" -``` - -Verify the service: -```bash -curl http://${ip_address}:8082/generate \ - -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \ - -H 'Content-Type: application/json' -``` - -### Evaluate -evaluate the performance with the LLM: -``` -python evaluate.py -``` - -### Performance Result -Here is the tested result for your reference -| answer_relevancy | faithfulness | context_utilization | reference_free_rubrics_score | -| ---- | ---- |---- |---- | -| 0.7191 | 0.9681 | 0.8964 | 4.4125| diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py deleted file mode 100644 index a082d093..00000000 --- a/examples/FaqGen/evaluate.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -from langchain_community.embeddings import HuggingFaceBgeEmbeddings - -from evals.metrics.ragas import RagasMetric - -llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082") - -f = open("data/sqv2_context.json", "r") -sqv2_context = json.load(f) - -f = open("data/sqv2_faq.json", "r") -sqv2_faq = json.load(f) - -templ = """Create a concise FAQs (frequently asked questions and answers) for following text: - TEXT: {text} - Do not use any prefix or suffix to the FAQ. - """ - -number = 1204 -question = [] -answer = [] -ground_truth = ["None"] * number -contexts = [] -for i in range(number): - inputs = sqv2_context[str(i)] - inputs_faq = templ.format_map({"text": inputs}) - actual_output = sqv2_faq[str(i)] - - question.append(inputs_faq) - answer.append(actual_output) - contexts.append([inputs_faq]) - -embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") -metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"] -metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq) - -test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts} - -metric.measure(test_case) -print(metric.score) diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py deleted file mode 100644 index 2ed70b9e..00000000 --- a/examples/FaqGen/generate_FAQ.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os -import time - -import requests - -llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen") - -f = open("data/sqv2_context.json", "r") -sqv2_context = json.load(f) - -start_time = time.time() -headers = {"Content-Type": "application/json"} -for i in range(1204): - start_time_tmp = time.time() - print(i) - inputs = sqv2_context[str(i)] - data = {"query": inputs, "max_new_tokens": 128} - response = requests.post(llm_endpoint, json=data, headers=headers) - f = open(f"data/result/sqv2_faq_{i}", "w") - f.write(inputs) - f.write(str(response.content, encoding="utf-8")) - f.close() - print(f"Cost {time.time()-start_time_tmp} seconds") -print(f"\n Finished! 
\n Totally Cost {time.time()-start_time} seconds\n") diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py deleted file mode 100644 index 8cb73a05..00000000 --- a/examples/FaqGen/get_context.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -import pandas as pd - -data_path = "./data" -data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet")) -sq_context = list(data["context"].unique()) -sq_context_d = dict() -for i in range(len(sq_context)): - sq_context_d[i] = sq_context[i] - -with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: - json.dump(sq_context_d, outfile) diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh deleted file mode 100644 index b3e04bbb..00000000 --- a/examples/FaqGen/launch_tgi.sh +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -max_input_tokens=3072 -max_total_tokens=4096 -port_number=8082 -model_name="mistralai/Mixtral-8x7B-Instruct-v0.1" -volume="./data" -docker run -it --rm \ - --name="tgi_Mixtral" \ - -p $port_number:80 \ - -v $volume:/data \ - --runtime=habana \ - --restart always \ - -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ - -e HABANA_VISIBLE_DEVICES=all \ - -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ - -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ - --cap-add=sys_nice \ - --ipc=host \ - -e HTTPS_PROXY=$https_proxy \ - -e HTTP_PROXY=$https_proxy \ - ghcr.io/huggingface/tgi-gaudi:2.0.1 \ - --model-id $model_name \ - --max-input-tokens $max_input_tokens \ - --max-total-tokens $max_total_tokens \ - --sharded true \ - --num-shard 2 diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py deleted file mode 100644 index 83e6b835..00000000 --- a/examples/FaqGen/post_process_FAQ.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json - -faq_dict = {} -fails = [] -for i in range(1204): - data = open(f"data/result/sqv2_faq_{i}", "r").readlines() - result = data[-6][6:] - # print(result) - if "LLMChain/final_output" not in result: - print(f"error1: fail for {i}") - fails.append(i) - continue - try: - result2 = json.loads(result) - result3 = result2["ops"][0]["value"]["text"] - faq_dict[str(i)] = result3 - except: - print(f"error2: fail for {i}") - fails.append(i) - continue -with open("data/sqv2_faq.json", "w") as outfile: - json.dump(faq_dict, outfile) -print("Failure index:") -print(fails) From ad58bd8d8ed898b750eb8044b640185d2f032a6b Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 20 Sep 2024 02:43:05 +0000 Subject: [PATCH 04/21] minor naming correction to maintain consistency Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 672ac39e..2acd86d8 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -10,9 +10,6 @@ from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel -import sys -sys.path.append('/home/akakne/miniforge3/envs/recsys/bin') - def format_ragas_metric_name(name: str): return f"{name} (ragas)" @@ -77,14 +74,14 @@ def measure(self, test_case: Dict): print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if 
isinstance(self.model, str): - print("Loading a HuggingFace Endpoint") - chat_model = HuggingFaceEndpoint( + print("LLM endpoint: ", self.model) + self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, timeout=600, ) else: print("Accepting user-initialized model as we could not detect OpenAI key or HuggingFace Endpoint URL.") - chat_model = self.model + self.chat_model = self.model # Create a dataset from the test case # Convert the Dict to a format compatible with Dataset if self.metrics is not None: @@ -133,7 +130,7 @@ def measure(self, test_case: Dict): self.score = evaluate( dataset, metrics=self.metrics, - llm=chat_model, + llm=self.chat_model, embeddings=self.embeddings, ) return self.score From c49ea8406031dd59cc7a921a33343fc75690cc53 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 02:46:19 +0000 Subject: [PATCH 05/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 2acd86d8..da093e2c 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright (C) 2024 Intel Corporation @@ -6,15 +5,19 @@ # import os from typing import Dict, Optional, Union -from langchain_huggingface import HuggingFaceEndpoint + from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel +from langchain_huggingface import HuggingFaceEndpoint + def format_ragas_metric_name(name: str): return f"{name} (ragas)" + class RagasMetric: """This metric checks if the output is more than 3 letters.""" + def __init__( self, threshold: float = 0.3, @@ -36,13 +39,15 @@ def __init__( "context_utilization", # "reference_free_rubrics_score", ] + async def a_measure(self, test_case: Dict): return self.measure(test_case) + def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( + from ragas.metrics import ( # reference_free_rubrics_score, answer_correctness, answer_relevancy, answer_similarity, @@ -50,7 +55,6 @@ def measure(self, test_case: Dict): context_recall, context_utilization, faithfulness, - # reference_free_rubrics_score, ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -117,13 +121,13 @@ def measure(self, test_case: Dict): for column in list(metric._required_columns.values())[0]: _required_columns.add(column) column2field = { - "user_input" : "question", - "response" : "answer", - "reference" : "ground_truth", - "retrieved_contexts" : "contexts" + "user_input": "question", + "response": "answer", + "reference": "ground_truth", + "retrieved_contexts": "contexts", } _required_fields = [column2field[column] for column in _required_columns] - data = {field : test_case[field] for field in _required_fields} + data = {field: test_case[field] for field in _required_fields} dataset = Dataset.from_dict(data) # evaluate @@ -134,8 +138,10 @@ def measure(self, test_case: Dict): embeddings=self.embeddings, ) return self.score + def is_successful(self): return self.success + @property def __name__(self): return "RAGAS" From 50d41670266883343b5e3ca56b461f6077b67598 Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Fri, 20 Sep 2024 10:51:32 +0800 Subject: [PATCH 06/21] Add hyperlinks and paths validation. (#132) Signed-off-by: ZePan110 Signed-off-by: aasavari --- .github/workflows/pr-path-detection.yml | 123 ++++++++++++++++++ README.md | 2 +- doc/platform-optimization/README.md | 2 +- evals/evaluation/autorag/evaluation/README.md | 4 +- evals/evaluation/rag_eval/README.md | 2 +- examples/AudioQnA/README.md | 48 +++++++ 6 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pr-path-detection.yml create mode 100644 examples/AudioQnA/README.md diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml new file mode 100644 index 00000000..2bfb3969 --- /dev/null +++ b/.github/workflows/pr-path-detection.yml @@ -0,0 +1,123 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Check Paths and Hyperlinks + +on: + pull_request: + branches: [main] + types: [opened, reopened, ready_for_review, synchronize] + +jobs: + check-the-validity-of-hyperlinks-in-README: + runs-on: ubuntu-latest + steps: + - name: Clean Up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Check the Validity of Hyperlinks + run: | + cd ${{github.workspace}} + fail="FALSE" + url_lines=$(grep -Eo '\]\(http[s]?://[^)]+\)' --include='*.md' -r .|grep -Ev 'GenAIEval/blob/main') + if [ -n "$url_lines" ]; then + for url_line in $url_lines; do + url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') + path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid link from ${{github.workspace}}/$path: $url" + fail="TRUE" + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." 
+ fi + shell: bash + + check-the-validity-of-relative-path: + runs-on: ubuntu-latest + steps: + - name: Clean up Working Directory + run: sudo rm -rf ${{github.workspace}}/* + + - name: Checkout Repo GenAIEval + uses: actions/checkout@v4 + + - name: Checking Relative Path Validity + run: | + cd ${{github.workspace}} + fail="FALSE" + repo_name=${{ github.event.pull_request.head.repo.full_name }} + if [ "$(echo "$repo_name"|cut -d'/' -f1)" != "opea-project" ]; then + owner=$(echo "${{ github.event.pull_request.head.repo.full_name }}" |cut -d'/' -f1) + branch="https://github.com/$owner/GenAIEval/tree/${{ github.event.pull_request.head.ref }}" + else + branch="https://github.com/opea-project/GenAIEval/blob/${{ github.event.pull_request.head.ref }}" + fi + link_head="https://github.com/opea-project/GenAIEval/blob/main" + png_lines=$(grep -Eo '\]\([^)]+\)' --include='*.md' -r .|grep -Ev 'http') + if [ -n "$png_lines" ]; then + for png_line in $png_lines; do + refer_path=$(echo "$png_line"|cut -d':' -f1 | cut -d'/' -f2-) + png_path=$(echo "$png_line"|cut -d '(' -f2 | cut -d ')' -f1) + if [[ "${png_path:0:1}" == "/" ]]; then + check_path=${{github.workspace}}$png_path + elif [[ "${png_path:0:1}" == "#" ]]; then + check_path=${{github.workspace}}/$refer_path$png_path + else + check_path=${{github.workspace}}/$(dirname "$refer_path")/$png_path + fi + real_path=$(realpath $check_path) + if [ $? -ne 0 ]; then + echo "Path $png_path in file ${{github.workspace}}/$refer_path does not exist" + fail="TRUE" + else + url=$link_head$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Retry failed. Check branch ${{ github.event.pull_request.head.ref }}" + url_dev=$branch$(echo "$real_path" | sed 's|.*/GenAIEval||') + response=$(curl -I -L -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response" -ne 200 ]; then + echo "**********Validation failed, try again**********" + response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url_dev") + if [ "$response_retry" -eq 200 ]; then + echo "*****Retry successfully*****" + else + echo "Invalid path from ${{github.workspace}}/$refer_path: $png_path" + fail="TRUE" + fi + else + echo "Check branch ${{ github.event.pull_request.head.ref }} successfully." + fi + fi + fi + fi + done + fi + + if [[ "$fail" == "TRUE" ]]; then + exit 1 + else + echo "All hyperlinks are valid." + fi + shell: bash diff --git a/README.md b/README.md index 8734f83a..3d6b6d6e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ results = evaluate(args) #### remote service usage -1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/lm-eval) +1. setup a separate server with [GenAIComps](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/utils/lm-eval) ``` # build cpu docker diff --git a/doc/platform-optimization/README.md b/doc/platform-optimization/README.md index ae74765d..8b98a21c 100644 --- a/doc/platform-optimization/README.md +++ b/doc/platform-optimization/README.md @@ -98,7 +98,7 @@ Let us consider isolating AI inference and reranking containers in application's Gaudi accelerated pipeline. 
In the -[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/manifests/gaudi/chatqna.yaml) +[manifest](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml) there are "tgi", "tei" and "teirerank" containers in "chatqna-tgi" and "chatqna-tei" and "chatqna-teirerank" deployments that will need a lot of CPUs. They implement text-generation-interface and diff --git a/evals/evaluation/autorag/evaluation/README.md b/evals/evaluation/autorag/evaluation/README.md index 8068d58b..99a623d1 100644 --- a/evals/evaluation/autorag/evaluation/README.md +++ b/evals/evaluation/autorag/evaluation/README.md @@ -1,6 +1,6 @@ # AutoRAG to evaluate the RAG system performance -AutoRAG is help to end-to-end evaluate the performance of the whole system. Currently, we support to evaluate the performance from 4 perspectives, answer_relevancy, faithfulness, context_recall, context_precision. Before using this service, the use should firstly prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer. +AutoRAG is help to end-to-end evaluate the performance of the whole system. Currently, we support to evaluate the performance from 4 perspectives, answer_relevancy, faithfulness, context_recall, context_precision. Before using this service, the use should firstly prepare the groundtruth dataset in the [standard format](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/ground_truth.jsonl). We also provide a [script](https://github.com/opea-project/GenAIEval/blob/main/evals/evaluation/autorag/data_generation/gen_eval_dataset.py) to automatically generate the groundtruth query and answer. ## Service preparation The evaluation for the RAG system is based on the set up of the RAG services. Please follow [the steps](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/README.md) to set up your RAG services. @@ -12,7 +12,7 @@ At this moment, we provide a solution that test the single group of parameters a python -u ragas_evaluation_benchmark.py --ground_truth_file ground_truth.jsonl --search_type mmr --k 1 --fetch_k 5 --score_threshold 0.3 --top_n 1 --temperature 0.01 --top_k 5 --top_p 0.95 --repetition_penalty 1.1 --use_openai_key True ``` -For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/run_rag_benchmark.py). +For evaluating multiple groups of parameters, please use [this script](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/ragas/run_rag_benchmark.py). 
```bash python -u run_rag_benchmark.py --config config.yaml ``` diff --git a/evals/evaluation/rag_eval/README.md b/evals/evaluation/rag_eval/README.md index 59f7dd2f..1186464a 100644 --- a/evals/evaluation/rag_eval/README.md +++ b/evals/evaluation/rag_eval/README.md @@ -7,7 +7,7 @@ - [Prerequisites](#prerequisites) - [MultiHop (English dataset)](#multihop) - [Launch Service of RAG System](#launch-service-of-rag-system) - - [Launch Service of LLM-as-a-Judge](launch-service-of-llm) + - [Launch Service of LLM-as-a-Judge](#launch-service-of-llm-as-a-judge) - [Prepare Dataset](#prepare-dataset) - [Evaluation](#evaluation) - [CRUD (Chinese dataset)](#crud) diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md new file mode 100644 index 00000000..45290620 --- /dev/null +++ b/examples/AudioQnA/README.md @@ -0,0 +1,48 @@ +# AudioQnA accuracy Evaluation + +## Dataset + + +We evaluate the ASR accuracy on the test set of librispeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 records of audio and texts. + +## Metrics + +We evaluate the WER (Word Error Rate) metric of the ASR microservice. + +## Evaluation + +### Launch ASR microservice + +Launch the ASR microserice with the following commands. For more details please refer to [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr). + +```bash +git clone https://github.com/opea-project/GenAIComps +cd GenAIComps +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +# change the name of model by editing model_name_or_path you want to evaluate +docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny" +``` + +### Evaluate + +Install dependencies: + +``` +pip install -r requirements.txt +``` + +Evaluate the performance with the LLM: +```py +# validate the offline model +# python offline_evaluate.py +# validate the online asr microservice accuracy +python online_evaluate.py +``` + +### Performance Result +Here is the tested result for your reference +|| WER | +| --- | ---- | +|whisper-large-v2| 2.87| +|whisper-large| 2.7 | +|whisper-medium| 3.45 | From ecbe0d169fef829927a1252fa3301f61cdd8abca Mon Sep 17 00:00:00 2001 From: aasavari Date: Tue, 24 Sep 2024 07:05:13 +0000 Subject: [PATCH 07/21] added support for older version of ragas Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 29 ++++++++++++++++++++--------- tests/requirements.txt | 1 + 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index da093e2c..8782cf96 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -37,7 +37,7 @@ def __init__( "context_recall", "faithfulness", "context_utilization", - # "reference_free_rubrics_score", + "reference_free_rubrics_score", ] async def a_measure(self, test_case: Dict): @@ -47,7 +47,7 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( # reference_free_rubrics_score, + from ragas.metrics import ( reference_free_rubrics_score, answer_correctness, answer_relevancy, answer_similarity, @@ -70,7 +70,7 @@ def measure(self, test_case: Dict): "context_recall": context_recall, "faithfulness": faithfulness, "context_utilization": context_utilization, - # "reference_free_rubrics_score": reference_free_rubrics_score, + 
"reference_free_rubrics_score": reference_free_rubrics_score, } # Set LLM model openai_key = os.getenv("OPENAI_API_KEY", None) @@ -117,17 +117,28 @@ def measure(self, test_case: Dict): ] # Find necessary input fields using the given metrics _required_columns = set() - for metric in self.metrics: - for column in list(metric._required_columns.values())[0]: - _required_columns.add(column) - column2field = { + is_latest = faithfulness + column_map = { # this column maps new naming style in ragas to their old naming style "user_input": "question", "response": "answer", "reference": "ground_truth", "retrieved_contexts": "contexts", } - _required_fields = [column2field[column] for column in _required_columns] - data = {field: test_case[field] for field in _required_fields} + for metric in self.metrics: + if hasattr(metric, "_required_columns"): + for column in list(metric._required_columns.values())[0]: + _required_columns.add(column_map[column]) + elif hasattr(metric, "evaluation_mode"): + from ragas.metrics.base import get_required_columns + for column in get_required_columns(metric.evaluation_mode): + _required_columns.add(column) + else: + print("metric has no attribute denoting required columns") + + print("Required columns for given list of metrics are = {}".format(_required_columns)) + + # get only neccessary columns from test case + data = {column: test_case[column] for column in _required_columns} dataset = Dataset.from_dict(data) # evaluate diff --git a/tests/requirements.txt b/tests/requirements.txt index cf468d39..d2cd20b0 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,5 @@ bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118 +jieba langchain_community langchain_huggingface lm-eval==0.4.3 From 2db5af55077176e6e1c9e90fe5c322744fc644f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Sep 2024 07:03:49 +0000 Subject: [PATCH 08/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragas/ragas.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 8782cf96..f8d76664 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -47,7 +47,7 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( reference_free_rubrics_score, + from ragas.metrics import ( answer_correctness, answer_relevancy, answer_similarity, @@ -55,6 +55,7 @@ def measure(self, test_case: Dict): context_recall, context_utilization, faithfulness, + reference_free_rubrics_score, ) except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -118,7 +119,7 @@ def measure(self, test_case: Dict): # Find necessary input fields using the given metrics _required_columns = set() is_latest = faithfulness - column_map = { # this column maps new naming style in ragas to their old naming style + column_map = { # this column maps new naming style in ragas to their old naming style "user_input": "question", "response": "answer", "reference": "ground_truth", @@ -130,14 +131,15 @@ def measure(self, test_case: Dict): _required_columns.add(column_map[column]) elif hasattr(metric, "evaluation_mode"): from ragas.metrics.base import get_required_columns + for column in get_required_columns(metric.evaluation_mode): _required_columns.add(column) else: print("metric has no attribute denoting required columns") - + print("Required columns for given list of metrics are = {}".format(_required_columns)) - # get only neccessary columns from test case + # get only necessary columns from test case data = {column: test_case[column] for column in _required_columns} dataset = Dataset.from_dict(data) From 1047133da8a6024fb9e8b4578a30a7a8ac470595 Mon Sep 17 00:00:00 2001 From: aasavari Date: Wed, 25 Sep 2024 22:55:59 +0000 Subject: [PATCH 09/21] testing automatic validation of ragas metrics Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 103 ++++++++++++++++++++--------------- tests/test_ragas.py | 7 ++- 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 9b0a1d3e..c9ff9660 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -4,12 +4,16 @@ # SPDX-License-Identifier: Apache-2.0 # import os +import re from typing import Dict, Optional, Union from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel from langchain_huggingface import HuggingFaceEndpoint +# import * is only allowed at module level according to python syntax +from ragas.metrics import * + def format_ragas_metric_name(name: str): return f"{name} (ragas)" @@ -29,16 +33,17 @@ def __init__( self.model = model self.embeddings = embeddings self.metrics = metrics - self.validated_list = [ - "answer_correctness", - "answer_relevancy", - "answer_similarity", - "context_precision", - "context_recall", - "faithfulness", - "context_utilization", - # "reference_free_rubrics_score", - ] + + # self.validated_list = [ + # "answer_correctness", + # "answer_relevancy", + # "answer_similarity", + # "context_precision", + # "context_recall", + # "faithfulness", + # "context_utilization", + # # "reference_free_rubrics_score", + # ] async def a_measure(self, test_case: Dict): return self.measure(test_case) @@ -47,37 +52,46 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import ( - answer_correctness, - answer_relevancy, - answer_similarity, - context_precision, - context_recall, - context_utilization, - faithfulness, - ) + from ragas.metrics import ALL_METRICS + self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] + self.metric_names = [re.sub(r'(? 
Date: Thu, 26 Sep 2024 22:31:09 +0000 Subject: [PATCH 10/21] removing summarization_score metric Signed-off-by: aasavari --- evals/metrics/ragas/ragas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index c9ff9660..89cc8b0b 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -55,6 +55,10 @@ def measure(self, test_case: Dict): from ragas.metrics import ALL_METRICS self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] self.metric_names = [re.sub(r'(? Date: Thu, 26 Sep 2024 22:36:36 +0000 Subject: [PATCH 11/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragas/ragas.py | 12 +++++++----- tests/test_ragas.py | 6 ++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 89cc8b0b..c80ff94e 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -33,7 +33,7 @@ def __init__( self.model = model self.embeddings = embeddings self.metrics = metrics - + # self.validated_list = [ # "answer_correctness", # "answer_relevancy", @@ -53,11 +53,12 @@ def measure(self, test_case: Dict): try: from ragas import evaluate from ragas.metrics import ALL_METRICS - self.metric_names = [metric.__class__.__name__ for metric in ALL_METRICS] - self.metric_names = [re.sub(r'(? Date: Thu, 26 Sep 2024 23:03:35 +0000 Subject: [PATCH 12/21] upgrading ragas from 0.1.16 to 0.1.19 Signed-off-by: aasavari --- tests/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index d2cd20b0..9c6ead5c 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,4 +3,4 @@ jieba langchain_community langchain_huggingface lm-eval==0.4.3 -ragas +ragas==0.1.19 From 1c0a0f406f82a6acabe4dee283f2cbd36484b95b Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 22:47:05 -0700 Subject: [PATCH 13/21] adding annotation free RAG assessment Signed-off-by: aasavari --- evals/metrics/ragaaf/README.md | 67 +++++++++++ evals/metrics/ragaaf/__init__.py | 10 ++ evals/metrics/ragaaf/prompt_engineering.py | 77 +++++++++++++ .../ragaaf/prompt_templates/__init__.py | 21 ++++ .../ragaaf/prompt_templates/correctness.py | 13 +++ .../ragaaf/prompt_templates/factualness.py | 13 +++ .../ragaaf/prompt_templates/opening_prompt.py | 21 ++++ .../ragaaf/prompt_templates/readability.py | 13 +++ .../ragaaf/prompt_templates/relevance.py | 13 +++ evals/metrics/ragaaf/rag_dataset.py | 86 ++++++++++++++ evals/metrics/ragaaf/run_eval.py | 105 ++++++++++++++++++ evals/metrics/ragaaf/utils/__init__.py | 7 ++ evals/metrics/ragaaf/utils/helper.py | 88 +++++++++++++++ evals/metrics/ragaaf/utils/model.py | 84 ++++++++++++++ evals/metrics/ragaaf/utils/retry.py | 38 +++++++ tests/requirements.txt | 1 + tests/test_ragaaf.py | 56 ++++++++++ 17 files changed, 713 insertions(+) create mode 100644 evals/metrics/ragaaf/README.md create mode 100644 evals/metrics/ragaaf/__init__.py create mode 100644 evals/metrics/ragaaf/prompt_engineering.py create mode 100644 evals/metrics/ragaaf/prompt_templates/__init__.py create mode 100644 evals/metrics/ragaaf/prompt_templates/correctness.py create mode 100644 evals/metrics/ragaaf/prompt_templates/factualness.py create mode 100644 evals/metrics/ragaaf/prompt_templates/opening_prompt.py create mode 100644 evals/metrics/ragaaf/prompt_templates/readability.py create mode 100644 
evals/metrics/ragaaf/prompt_templates/relevance.py create mode 100644 evals/metrics/ragaaf/rag_dataset.py create mode 100644 evals/metrics/ragaaf/run_eval.py create mode 100644 evals/metrics/ragaaf/utils/__init__.py create mode 100644 evals/metrics/ragaaf/utils/helper.py create mode 100644 evals/metrics/ragaaf/utils/model.py create mode 100644 evals/metrics/ragaaf/utils/retry.py create mode 100644 tests/test_ragaaf.py diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md new file mode 100644 index 00000000..691f668f --- /dev/null +++ b/evals/metrics/ragaaf/README.md @@ -0,0 +1,67 @@ +# RAGAAF (RAG aasessment - Annotation Free) + +We provide easy-to-use, flexible, opensource and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelator chips. + +## Overview +### Data +AutoEval is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query". +> Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`. +### Model +AutoEval can run in 3 evaluation modes - +1. `evaluation_mode="endpoint"` uses HuggingFace endpoint. +- We recommend launching a HuggingFace endpoint on Gaudi AI accelerator machines to ensure maximum usage and performance. +- To launch HF endpoint on Gaudi2, please follow the 2-step instructions here - [tgi-gaudi](https://github.com/huggingface/tgi-gaudi). +- Pass your endpoint url as `model_name` argument. +2. `evaluation_mode="openai"` uses openai backend. +- Please set your `openai_key` and your choice of model as `model_name` argument. +3. `evaluation_mode="local"` uses your local hardware. +- Set `hf_token` argument and set your favourite open-source model in `model_name` argument. +- GPU usage will be prioritized after checking it's availability. If GPU is unavailable, the model will run on CPU. +## Metrics +AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales. Don't forget to add your metric to `evaluation_metrics` argument. +## Generation configuration +Please set generation parameters as per your requirement in `GENERATION_CONFIG` in `run_eval.py`. 
+ +## Run using HF endpoint +```python3 + +# step 1 : choose your dataset -- local or benchmarking +dataset = "explodinggradients/ragas-wikiqa" +data_mode = "benchmarking" +field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + +# step 2 - choose your favourite LLM and hardware + +# evaluation_mode = "openai" +# model_name = "gpt-4o" +# openai_key = "" + +# evaluation_mode = "endpoint" +# model_name = f"http://{host_ip}:{port}" + +evaluation_mode = "local" +model_name = "meta-llama/Llama-3.2-1B-Instruct" +hf_token = "" + +# step 3 - choose metrics of your choice, you can also add custom metrics +evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] + +# step 4 - run evaluation +evaluator = AnnotationFreeEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + # openai_key=openai_key, + hf_token=hf_token, + debug_mode=True, +) + +responses = evaluator.measure() + +for response in responses: + print(response) +``` +That's it! For troubleshooting, please submit an issue and we will get right on it. diff --git a/evals/metrics/ragaaf/__init__.py b/evals/metrics/ragaaf/__init__.py new file mode 100644 index 00000000..45d0b075 --- /dev/null +++ b/evals/metrics/ragaaf/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# + +from .run_eval import AnnotationFreeEvaluate + +__all__ = [AnnotationFreeEvaluate] diff --git a/evals/metrics/ragaaf/prompt_engineering.py b/evals/metrics/ragaaf/prompt_engineering.py new file mode 100644 index 00000000..3ab6e7e1 --- /dev/null +++ b/evals/metrics/ragaaf/prompt_engineering.py @@ -0,0 +1,77 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from jinja2 import Template + +from .prompt_templates import * +from .prompt_templates import NAME2METRIC + + +class Prompt: + """Class to customize prompt template using user-defined list of metrics.""" + + def __init__(self, metrics, input_fields): + self.metrics = metrics + self.input_fields = input_fields + self.template = self.load_prompt_template() + + def create_grading_format(self): + grading_format = ( + "You must ALWAYS provide every single one of the scores and reasonings in the following JSON format:" + ) + grading_format += "\n" + "{" + "\n" + content = [] + reasoning_prompt = "Reasoning for {}: [your one line step by step reasoning about the {} of the answer]" + scoring_prompt = "Score for {}: [your score number for the {} of the answer]" + for metric in self.metrics: + reasoning = reasoning_prompt.format(metric, metric) + score = scoring_prompt.format(metric, metric) + content += (reasoning + "\n" + score,) + grading_format += "\n\n".join(content) + grading_format += "\n" + "}" + return grading_format + + def create_closing_prompt(self): + closing_prompt = ["Let's begin!"] + for f in self.input_fields: + closing_prompt += ("Provided {}:".format(f) + "\n" + "{{" + f + "}}",) + return "\n\n".join(closing_prompt) + + def load_prompt_template(self): + content = [] + for metric_name in ["opening_prompt"] + self.metrics: + metric_instance = NAME2METRIC[metric_name] + content += (metric_instance.template,) + content += (self.create_grading_format(),) + content += (self.create_closing_prompt(),) + return Template("\n\n".join(content)) + + def render_prompt(self, **kwargs) -> str: + text = 
self.template.render(**kwargs) + return text + + +if __name__ == "__main__": + + """Here, we test implementation of Prompt class.""" + + # step 0 - user input + metrics = ["factualness", "relevance", "correctness", "readability"] + input_fields = ["question", "answer", "context"] + + # step 1 - load prompt using Prompt class + prompt = Prompt(metrics=metrics, input_fields=input_fields) + + example = { + "question": "Who is wife of Barak Obama", + "context": "Michelle Obama, wife of Barak Obama (former President of the United States of America) is an attorney. Barak and Michelle Obama have 2 daughters - Malia and Sasha", + "answer": "Michelle Obama", + "ground_truth": "Wife of Barak Obama is Michelle Obama", + } + + # step 2 - render prompt with given inputs + rendered_prompt = prompt.render_prompt( + question=example["question"], answer=example["answer"], context=example["context"] + ) + + print(rendered_prompt) diff --git a/evals/metrics/ragaaf/prompt_templates/__init__.py b/evals/metrics/ragaaf/prompt_templates/__init__.py new file mode 100644 index 00000000..2b3979ba --- /dev/null +++ b/evals/metrics/ragaaf/prompt_templates/__init__.py @@ -0,0 +1,21 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .opening_prompt import OpeningPrompt + +from .correctness import Correctness +from .factualness import Factualness +from .relevance import Relevance +from .readability import Readability + +__all__ = ["opening_prompt", "correctness", "factualness", "relevance", "readability"] + +NAME2METRIC = {} + + +def snake2camel(s): + return "".join(x.capitalize() or "_" for x in s.split("_")) + + +for name in __all__: + NAME2METRIC[name] = eval(snake2camel(name)) diff --git a/evals/metrics/ragaaf/prompt_templates/correctness.py b/evals/metrics/ragaaf/prompt_templates/correctness.py new file mode 100644 index 00000000..a328d3d2 --- /dev/null +++ b/evals/metrics/ragaaf/prompt_templates/correctness.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +class Correctness: + name = "correctness" + required_columns = ["answer", "context", "question"] + template = """- Correctness: correctness measures how accurately and comprehensively does the answer resolve problem posed in the question. + - Score 1: If the answer is empty string or something like "I do not know the answer", the correctness score is 1. + - Score 2: If the answer only addresses a small part of the question correctly or it is missing many critical steps/aspects of the answer or the answer is too short to fully answer the question or is missing many steps causing the answer to not fully address the problem described in the question, then the correctness score is 2. + - Score 3: The answer mostly addresses the question but one critical aspect/step is missing or is incorrect. + - Score 4: the answer mostly answer the question and covers all critical/main aspects of the question, but it’s missing important/necessary details about one or more aspects. + - Score 5: the answer correctly and completely addresses the query. 
It also covers important details about each step."""
diff --git a/evals/metrics/ragaaf/prompt_templates/factualness.py b/evals/metrics/ragaaf/prompt_templates/factualness.py
new file mode 100644
index 00000000..7fa6dfee
--- /dev/null
+++ b/evals/metrics/ragaaf/prompt_templates/factualness.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Factualness:
+    name = "factualness"
+    required_columns = ["answer", "context"]
+    template = """- Factualness: Factualness assesses how much of the provided answer is contained within the provided context. A higher score indicates that a higher proportion of claims present in the answer are present in, or can be derived from, the provided context.
+    - Score 1: the answer is completely hallucinated, i.e. not contained in the context at all, or there is no answer.
+    - Score 2: only a small part of the answer is contained in the context, but most of it is imaginary/hallucinated or the meaning is completely changed from what is represented in the context.
+    - Score 3: Only about half of the answer is contained in the context. The rest of the answer is hallucinated or imaginary.
+    - Score 4: Most of the claims in the answer can be inferred from the provided context, with very little information that is not directly supported by the provided context.
+    - Score 5: All of the claims in the answer are directly supported by the provided context, demonstrating high faithfulness to the provided context."""
diff --git a/evals/metrics/ragaaf/prompt_templates/opening_prompt.py b/evals/metrics/ragaaf/prompt_templates/opening_prompt.py
new file mode 100644
index 00000000..441f371e
--- /dev/null
+++ b/evals/metrics/ragaaf/prompt_templates/opening_prompt.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class OpeningPrompt:
+    name = "opening_prompt"
+    required_columns = []
+
+    template = """Consider yourself a helpful, truthful and impartial judge.
+
+Your task:
+You will be given an input consisting of a question, an answer and a context. Your task is to act as an impartial judge and provide a numerical score between 1 and 5 for each of the following metrics for the given answer.
+
+Important rules for you while completing this task:
+1. You MUST ALWAYS provide a score for every metric mentioned below.
+2. Make sure to understand the definition of every metric fully before completing your task. Every metric is provided with a grading scale and rubric. You MUST use this grading scale and rubric to determine your score.
+3. Ensure that your scores and reasoning for every metric are independent of each other, e.g., the score for factualness should not impact the score for correctness and vice versa.
+4. Base your grading decision only on the given inputs and do not speculate or hallucinate.
+5. You must also provide reasoning for your score in a single sentence.
+
+Your metric definitions along with grading scale and rubric:"""
diff --git a/evals/metrics/ragaaf/prompt_templates/readability.py b/evals/metrics/ragaaf/prompt_templates/readability.py
new file mode 100644
index 00000000..4c03e6e7
--- /dev/null
+++ b/evals/metrics/ragaaf/prompt_templates/readability.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Readability:
+    name = "readability"
+    required_columns = ["answer"]
+    template = """- Readability: Readability measures the clarity and lucidity of the answer. 
Readability is measured solely based on the answer and does not consider the question or the context.
+    - Score 1: if the answer is empty or "I do not know the answer" or completely unreadable, or no meaningful information can be extracted from the answer, then the score is 1.
+    - Score 2: the answer is only slightly readable; there are irrelevant symbols, HTML tags or repeated words, but it can roughly form a meaningful sentence that covers some aspects of the answer.
+    - Score 3: the answer can be read, but there are grammatical mistakes in it.
+    - Score 4: the answer is readable, but the readability and style can be improved to better appeal to the reader.
+    - Score 5: the answer is reader-friendly and well written."""
diff --git a/evals/metrics/ragaaf/prompt_templates/relevance.py b/evals/metrics/ragaaf/prompt_templates/relevance.py
new file mode 100644
index 00000000..33743ecc
--- /dev/null
+++ b/evals/metrics/ragaaf/prompt_templates/relevance.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Relevance:
+    name = "relevance"
+    required_columns = ["question", "answer"]
+    template = """- Relevance: Relevance measures how well the answer relates to the question.
+    - Score 1: The answer doesn't mention anything about the question or is completely irrelevant to the question.
+    - Score 2: The answer only identifies the domain (e.g. cnvrg) mentioned in the question and provides information from the correct domain. But the answer does not address the question itself, and the point of the question is completely missed.
+    - Score 3: The answer correctly identifies the domain and essence of the question, but the details in the answer are not relevant to the focus of the question.
+    - Score 4: The answer correctly identifies the domain mentioned in the question and the essence of the question, and stays consistent with both of them. But some part of the answer is not relevant to the question, its topic or its essence, and this irrelevant part damages the overall relevance of the answer.
+    - Score 5: The answer is completely relevant to the question and the details do not deviate from the essence of the question. 
There are no parts of the answer that are irrelevant or unnecessary for the given question.""" diff --git a/evals/metrics/ragaaf/rag_dataset.py b/evals/metrics/ragaaf/rag_dataset.py new file mode 100644 index 00000000..a955eae6 --- /dev/null +++ b/evals/metrics/ragaaf/rag_dataset.py @@ -0,0 +1,86 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import jsonlines +from datasets import Dataset, load_dataset + + +class RAGDataset: + """Dataset class to store data in HF datasets API format.""" + + def __init__(self, dataset, field_map, mode): + self.dataset = dataset + self.field_map = field_map + assert mode in ["local", "benchmarking"], "mode can be either local or benchmarking" + self.mode = mode + self.data = self.load_data() + self.validate_dataset() + + def load_data(self): + if self.mode == "local": + assert os.path.exists(self.dataset), "There is no such file - {}".format(self.dataset) + with jsonlines.open(self.dataset) as reader: + data = [] + for obj in reader: + ex = {} + for out_field, in_field in self.field_map.items(): + if type(obj[in_field]) == list: + ex[out_field] = "\n".join(obj[in_field]) + else: + ex[out_field] = obj[in_field] + data.append(ex) + return Dataset.from_list(data) + else: + data = [] + for obj in load_dataset(self.dataset)["train"]: + ex = {} + for out_field, in_field in self.field_map.items(): + if type(obj[in_field]) == list: + ex[out_field] = "\n".join(obj[in_field]) + else: + ex[out_field] = obj[in_field] + data.append(ex) + return Dataset.from_list(data) + + def validate_dataset(self): + for i, example in enumerate(self.data): + for out_field in self.field_map: + assert out_field in example, "Example {} does not have {} field".format(i + 1, out_field) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + + +if __name__ == "__main__": + + dataset_path = "../../benchmark/ragas/ground_truth.jsonl" + field_map = { + "question": "question", + "ground_truth": "ground_truth", + "context": "context", + } + + ds = RAGDataset(dataset=dataset_path, field_map=field_map, mode="local") + + for i, ex in enumerate(ds): + assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) + + dataset = "explodinggradients/ragas-wikiqa" + field_map = { + "question": "question", + "answer": "generated_with_rag", + "context": "context", + "ground_truth": "correct_answer", + } + ds = RAGDataset(dataset=dataset, field_map=field_map, mode="benchmarking") + + for i, ex in enumerate(ds): + assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) diff --git a/evals/metrics/ragaaf/run_eval.py b/evals/metrics/ragaaf/run_eval.py new file mode 100644 index 00000000..8c170313 --- /dev/null +++ b/evals/metrics/ragaaf/run_eval.py @@ -0,0 +1,105 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import time + +from huggingface_hub import login + +from .prompt_engineering import Prompt +from .rag_dataset import RAGDataset +from .utils.helper import * +from .utils.model import * + + +class AnnotationFreeEvaluate: + + def __init__( + self, + dataset, + data_mode, + field_map, + evaluation_mode, + model_name, + evaluation_metrics, + hf_token=None, + openai_key=None, + debug_mode=None, + ): + self.GENERATION_CONFIG = { + "openai": {"temperature": 0.1}, + "endpoint": {"max_tokens": 500}, + "local": {"max_new_tokens": 500}, + } + self.data = 
RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) + self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token) + self.prompt_template = self.get_template(evaluation_metrics, field_map) + self.debug_mode = debug_mode + self.generation_config = self.GENERATION_CONFIG[evaluation_mode] + + def get_evaluator(self, evaluation_mode, model_name, openai_key=None, hf_token=None): + if evaluation_mode == "openai": + print("Using {} openai key".format(openai_key)) + evaluator = OAIEvaluator(openai_key, model_name) + elif evaluation_mode == "endpoint": + print("Loading HF endpoint at {}".format(model_name)) + evaluator = EndpointEvaluator(model_name) + else: + assert evaluation_mode == "local", "evaluation mode must be openai / endpoint / local" + print("Loading {} model locally".format(model_name)) + login(token=hf_token, add_to_git_credential=True) + evaluator = HFEvaluator(model_name) + return evaluator + + def get_template(self, evaluation_metrics, field_map): + prompt = Prompt(metrics=evaluation_metrics, input_fields=field_map) + return prompt.template + + def measure(self): + n_samples = 1 if self.debug_mode else len(self.data) + responses = [""] * n_samples + start = time.time() + for i in range(n_samples): + prompt = render_prompt( + self.prompt_template, + query=self.data[i]["question"], + answer=self.data[i]["answer"], + context=self.data[i]["context"], + ) + messages = [{"role": "user", "content": prompt}] + response = self.evaluator.generate(messages, **self.generation_config) + responses[i] = response + end = time.time() + print("Generation of scores and reasoning took {:.2f} seconds for {:,} examples".format(end - start, n_samples)) + return responses + + +if __name__ == "__main__": + + dataset = "explodinggradients/ragas-wikiqa" + data_mode = "benchmarking" + field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + + # evaluation_mode = "endpoint" + # model_name = f"http://{host_ip}:{port}" + + evaluation_mode = "openai" + openai_key = "" + model_name = "gpt-4o" + + evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] + + evaluator = AnnotationFreeEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + openai_key=openai_key, + debug_mode=True, + ) + + responses = evaluator.measure() + + for response in responses: + print(response) diff --git a/evals/metrics/ragaaf/utils/__init__.py b/evals/metrics/ragaaf/utils/__init__.py new file mode 100644 index 00000000..c3d7e5cf --- /dev/null +++ b/evals/metrics/ragaaf/utils/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import sys +import os + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) diff --git a/evals/metrics/ragaaf/utils/helper.py b/evals/metrics/ragaaf/utils/helper.py new file mode 100644 index 00000000..71fdef65 --- /dev/null +++ b/evals/metrics/ragaaf/utils/helper.py @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import re + +import numpy as np +import pandas as pd +import yaml +from jinja2 import Template +from scipy.stats import pearsonr +from sklearn.metrics import mean_squared_error + + +def load_jsonl(data_path): + result = [] + with open(data_path, "r") as f: + for line in f: + data = json.loads(line) + result.append(data) + return result + + +def 
load_config(config_path): + + with open(config_path, "r") as file: + config = yaml.safe_load(file) + + return config + + +def compute_mse(x, y): + return mean_squared_error(x, y) + + +def compute_pearson(x, y): + corr, _ = pearsonr(x, y) + return corr + + +def extract_delay_from_rate_limit_error_msg(text): + import re + + pattern = r"retry after (\d+)" + match = re.search(pattern, text) + if match: + retry_time_from_message = match.group(1) + return float(retry_time_from_message) + else: + return 5 + + +def render_prompt(template: Template, **kwargs) -> str: + text = template.render(**kwargs) + return text + + +def extract_score(pattern: str, text: str): + match = re.search(pattern, text.lower()) + + if match: + score = int(match.group(1)) + else: + score = 1 + + return score + + +def compute_metric_wise_assessment(metrics, groundtruth, prediction): + fine_grained_evaluation = pd.DataFrame(index=metrics) + for i, metric in enumerate(metrics): + fine_grained_evaluation.loc[metric, "MSE"] = compute_mse(groundtruth[i], prediction[i]) + abs_diff = [abs(g - p) for g, p in zip(groundtruth[i], prediction[i])] + for diff in [0, 1, 2]: + fine_grained_evaluation.loc[metric, "|label - score| <= {}".format(diff)] = sum( + val <= diff for val in abs_diff + ) + return fine_grained_evaluation + + +def compute_weighted_assessment(weights, groundtruth, prediction): + weights, groundtruth, prediction = np.array(weights), np.array(groundtruth), np.array(prediction) + weighted_labels = np.sum(weights[:, np.newaxis] * groundtruth, axis=0) + weighted_scores = np.sum(weights[:, np.newaxis] * prediction, axis=0) + mse = compute_mse(weighted_labels, weighted_scores) + pearson_correlation = compute_pearson(weighted_labels, weighted_scores) + return mse, pearson_correlation diff --git a/evals/metrics/ragaaf/utils/model.py b/evals/metrics/ragaaf/utils/model.py new file mode 100644 index 00000000..1c46d959 --- /dev/null +++ b/evals/metrics/ragaaf/utils/model.py @@ -0,0 +1,84 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +import openai +import torch +from huggingface_hub import InferenceClient +from transformers import AutoTokenizer, pipeline + +from .helper import extract_delay_from_rate_limit_error_msg +from .retry import retry_and_handle_exceptions + + +class EndpointEvaluator: + def __init__(self, model_name): + self.client = InferenceClient(base_url="{}/v1/chat/completions".format(model_name)) + + def generate(self, messages, **kwargs): + output = self.client.chat.completions.create( + model="tgi", + messages=messages, + stream=True, + **kwargs, + ) + response = [chunk.choices[0].delta.content for chunk in output] + response = [content for content in response if content] + response = " ".join(response) + return response + + +class HFEvaluator: + def __init__(self, model_name): + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + device_map = "auto" if torch.cuda.is_available() else "cpu" + if device_map == "cpu": + self.pipe = pipeline( + "text-generation", + model=model_name, + tokenizer=self.tokenizer, + torch_dtype=torch.bfloat16, + device_map="cpu", + ) + else: + self.pipe = pipeline( + "text-generation", + model=model_name, + tokenizer=self.tokenizer, + torch_dtype=torch.float16, + device_map="auto", + ) + + def generate(self, messages, **kwargs) -> List[float]: + + prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + outputs = self.pipe(prompt, **kwargs, 
return_full_text=False) + result = outputs[0]["generated_text"] + return result + + +class OAIEvaluator: + def __init__(self, openai_key, model_name): + openai.api_key = openai_key + self.model_name = model_name + + @retry_and_handle_exceptions( + exception_to_check=( + openai.RateLimitError, + openai.APIError, + KeyError, + ), + max_retries=5, + extract_delay_from_error_message=extract_delay_from_rate_limit_error_msg, + ) + def generate(self, messages: list, **kwargs) -> List[float]: + return ( + openai.chat.completions.create( + model=self.model_name, + messages=messages, + **kwargs, + ) + .choices[0] + .message.content + ) diff --git a/evals/metrics/ragaaf/utils/retry.py b/evals/metrics/ragaaf/utils/retry.py new file mode 100644 index 00000000..bde26409 --- /dev/null +++ b/evals/metrics/ragaaf/utils/retry.py @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import functools +import random +import time +from typing import Optional, Tuple, Union + + +def retry_and_handle_exceptions( + exception_to_check: Union[Exception, Tuple[Exception]], + max_retries: int = 3, + initial_delay: float = 1, + exponential_base: float = 2, + jitter: bool = False, + extract_delay_from_error_message: Optional[any] = None, +): + def deco_retry(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + delay = initial_delay + for i in range(max_retries): + try: + return func(*args, **kwargs) + except exception_to_check as e: + if i == max_retries - 1: + raise Exception("Func execution failed after {0} retries: {1}".format(max_retries, e)) + delay *= exponential_base * (1 + jitter * random.random()) + delay_from_error_message = None + if extract_delay_from_error_message is not None: + delay_from_error_message = extract_delay_from_error_message(str(e)) + final_delay = delay_from_error_message if delay_from_error_message else delay + print("Func execution failed. 
Retrying in {0} seconds: {1}".format(final_delay, e)) + time.sleep(final_delay) + + return wrapper + + return deco_retry diff --git a/tests/requirements.txt b/tests/requirements.txt index 9c6ead5c..6fabbdc3 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -4,3 +4,4 @@ langchain_community langchain_huggingface lm-eval==0.4.3 ragas==0.1.19 +openai \ No newline at end of file diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py new file mode 100644 index 00000000..123ccc3e --- /dev/null +++ b/tests/test_ragaaf.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import unittest + +from evals.metrics.ragaaf import AnnotationFreeEvaluate + +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "8008") + + +class TestRagasMetric(unittest.TestCase): + + # @unittest.skip("need pass localhost id") + def test_ragas(self): + + dataset = "explodinggradients/ragas-wikiqa" + data_mode = "benchmarking" + field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + + # evaluation_mode = "openai" + # model_name = "gpt-4o" + # openai_key = "" + + evaluation_mode = "endpoint" + model_name = f"http://{host_ip}:{port}" + + # evaluation_mode = "local" + # model_name = "meta-llama/Llama-3.2-1B-Instruct" + # hf_token = "" + + evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] + + evaluator = AnnotationFreeEvaluate( + dataset=dataset, + data_mode=data_mode, + field_map=field_map, + evaluation_mode=evaluation_mode, + model_name=model_name, + evaluation_metrics=evaluation_metrics, + # openai_key=openai_key, + # hf_token=hf_token, + debug_mode=True, + ) + + responses = evaluator.measure() + + for response in responses: + print(response) + + +if __name__ == "__main__": + unittest.main() From 0cfcebbf07ede5282b8d19daf6ecde3170cd840e Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 22:55:22 -0700 Subject: [PATCH 14/21] improved README Signed-off-by: aasavari --- evals/metrics/ragaaf/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md index 691f668f..919ebb15 100644 --- a/evals/metrics/ragaaf/README.md +++ b/evals/metrics/ragaaf/README.md @@ -1,10 +1,10 @@ -# RAGAAF (RAG aasessment - Annotation Free) +# RAGAAF (RAG assessment - Annotation Free) -We provide easy-to-use, flexible, opensource and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelator chips. +We introduce - RAGAAF, Intel's easy-to-use, flexible, opensource and annotation-free RAG evaluation tool using LLM-as-a-judge while benefitting from Intel's Gaudi2 AI accelator chips. ## Overview ### Data -AutoEval is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query". +RAGAAF is best suited for Long Form Question Answering (LFQA) datasets where you want to gauge quality and factualness of the answer via LLM's intelligence. Here, you can use benchmarking datasets or bring your own custom datasets. 
Please make sure to set `field_map` to map AutoEval fields such as "question" to your dataset's corresponding field like "query". > Note : To use benchmarking datasets, set argument `data_mode=benchmarking`. Similarly, to use custom datasets, set `data_mode=local`. ### Model AutoEval can run in 3 evaluation modes - @@ -20,7 +20,7 @@ AutoEval can run in 3 evaluation modes - ## Metrics AutoEval provides 4 metrics - factualness, correctness, relevance and readability. You can also bring your own metrics and grading scales. Don't forget to add your metric to `evaluation_metrics` argument. ## Generation configuration -Please set generation parameters as per your requirement in `GENERATION_CONFIG` in `run_eval.py`. +We provide recommended generation parameters after experimenting with different LLMs. If you'd like to edit them to your requirement, please set generation parameters in `GENERATION_CONFIG` in `run_eval.py`. ## Run using HF endpoint ```python3 From 8c86ae97638dc73ca16b29858dc85956e2545ee4 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 22:57:17 -0700 Subject: [PATCH 15/21] adding jsonlines to requirements Signed-off-by: aasavari --- tests/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/requirements.txt b/tests/requirements.txt index 6fabbdc3..a0b5b6a4 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,6 @@ bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@e5c2f31625223431d7987f43b70b75b9d26ba118 jieba +jsonlines langchain_community langchain_huggingface lm-eval==0.4.3 From e070c79f42793b959eee4c342b799643dd910549 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 06:08:18 +0000 Subject: [PATCH 16/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragaaf/README.md | 3 +-- tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/evals/metrics/ragaaf/README.md b/evals/metrics/ragaaf/README.md index 919ebb15..ac8d3b85 100644 --- a/evals/metrics/ragaaf/README.md +++ b/evals/metrics/ragaaf/README.md @@ -24,7 +24,6 @@ We provide recommended generation parameters after experimenting with different ## Run using HF endpoint ```python3 - # step 1 : choose your dataset -- local or benchmarking dataset = "explodinggradients/ragas-wikiqa" data_mode = "benchmarking" @@ -46,7 +45,7 @@ hf_token = "" # step 3 - choose metrics of your choice, you can also add custom metrics evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] -# step 4 - run evaluation +# step 4 - run evaluation evaluator = AnnotationFreeEvaluate( dataset=dataset, data_mode=data_mode, diff --git a/tests/requirements.txt b/tests/requirements.txt index a0b5b6a4..f0b7a773 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -4,5 +4,5 @@ jsonlines langchain_community langchain_huggingface lm-eval==0.4.3 +openai ragas==0.1.19 -openai \ No newline at end of file From 052431dd4c2c498dfb40f0957fc536abb0dd7ea9 Mon Sep 17 00:00:00 2001 From: aasavari Date: Thu, 10 Oct 2024 23:53:40 -0700 Subject: [PATCH 17/21] fulfilled feature request - allow unit test case for RAGAAF Signed-off-by: aasavari --- evals/metrics/ragaaf/rag_dataset.py | 88 +++++++++++------------------ evals/metrics/ragaaf/run_eval.py | 3 +- tests/requirements.txt | 2 +- tests/test_ragaaf.py | 23 ++++---- 4 files changed, 48 insertions(+), 68 deletions(-) diff 
--git a/evals/metrics/ragaaf/rag_dataset.py b/evals/metrics/ragaaf/rag_dataset.py index a955eae6..27ca8776 100644 --- a/evals/metrics/ragaaf/rag_dataset.py +++ b/evals/metrics/ragaaf/rag_dataset.py @@ -10,40 +10,47 @@ class RAGDataset: """Dataset class to store data in HF datasets API format.""" - def __init__(self, dataset, field_map, mode): + def __init__(self, dataset, field_map, mode, examples): self.dataset = dataset self.field_map = field_map - assert mode in ["local", "benchmarking"], "mode can be either local or benchmarking" + assert mode in ["unit", "local", "benchmarking"], "mode can be either unit or local or benchmarking" self.mode = mode - self.data = self.load_data() + self.data = self.load_data(examples) self.validate_dataset() - def load_data(self): + def load_example(self, obj): + ex = {} + for out_field, in_field in self.field_map.items(): + if type(obj[in_field]) == list: + ex[out_field] = "\n".join(obj[in_field]) + else: + ex[out_field] = obj[in_field] + return ex + + def load_local_data(self): + assert os.path.exists(self.dataset), "There is no such file - {}".format(self.dataset) + with jsonlines.open(self.dataset) as reader: + data = [self.load_example(obj) for obj in reader] + return Dataset.from_list(data) + + def load_unit_data(self, examples): + assert len(examples) >= 1, "Please provide atleast one example" + data = [self.load_example(obj) for obj in examples] + return Dataset.from_list(data) + + def load_benchmarking_data(self): + dataset = load_dataset(self.dataset)["train"] + data = [self.load_example(obj) for obj in dataset] + return Dataset.from_list(data) + + def load_data(self, examples): if self.mode == "local": - assert os.path.exists(self.dataset), "There is no such file - {}".format(self.dataset) - with jsonlines.open(self.dataset) as reader: - data = [] - for obj in reader: - ex = {} - for out_field, in_field in self.field_map.items(): - if type(obj[in_field]) == list: - ex[out_field] = "\n".join(obj[in_field]) - else: - ex[out_field] = obj[in_field] - data.append(ex) - return Dataset.from_list(data) + return self.load_local_data() + elif self.mode == "unit": + return self.load_unit_data(examples) else: - data = [] - for obj in load_dataset(self.dataset)["train"]: - ex = {} - for out_field, in_field in self.field_map.items(): - if type(obj[in_field]) == list: - ex[out_field] = "\n".join(obj[in_field]) - else: - ex[out_field] = obj[in_field] - data.append(ex) - return Dataset.from_list(data) - + return self.load_benchmarking_data() + def validate_dataset(self): for i, example in enumerate(self.data): for out_field in self.field_map: @@ -57,30 +64,3 @@ def __len__(self): def __iter__(self): return iter(self.data) - - -if __name__ == "__main__": - - dataset_path = "../../benchmark/ragas/ground_truth.jsonl" - field_map = { - "question": "question", - "ground_truth": "ground_truth", - "context": "context", - } - - ds = RAGDataset(dataset=dataset_path, field_map=field_map, mode="local") - - for i, ex in enumerate(ds): - assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) - - dataset = "explodinggradients/ragas-wikiqa" - field_map = { - "question": "question", - "answer": "generated_with_rag", - "context": "context", - "ground_truth": "correct_answer", - } - ds = RAGDataset(dataset=dataset, field_map=field_map, mode="benchmarking") - - for i, ex in enumerate(ds): - assert ex["question"] == ds[i]["question"], "index {} does not have correct query".format(i) diff --git a/evals/metrics/ragaaf/run_eval.py 
b/evals/metrics/ragaaf/run_eval.py index 8c170313..02914d1a 100644 --- a/evals/metrics/ragaaf/run_eval.py +++ b/evals/metrics/ragaaf/run_eval.py @@ -21,6 +21,7 @@ def __init__( evaluation_mode, model_name, evaluation_metrics, + examples=None, hf_token=None, openai_key=None, debug_mode=None, @@ -30,7 +31,7 @@ def __init__( "endpoint": {"max_tokens": 500}, "local": {"max_new_tokens": 500}, } - self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode) + self.data = RAGDataset(dataset=dataset, field_map=field_map, mode=data_mode, examples=examples) self.evaluator = self.get_evaluator(evaluation_mode, model_name, openai_key, hf_token) self.prompt_template = self.get_template(evaluation_metrics, field_map) self.debug_mode = debug_mode diff --git a/tests/requirements.txt b/tests/requirements.txt index f0b7a773..72ca037b 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -5,4 +5,4 @@ langchain_community langchain_huggingface lm-eval==0.4.3 openai -ragas==0.1.19 +ragas==0.1.19 \ No newline at end of file diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py index 123ccc3e..72948c43 100644 --- a/tests/test_ragaaf.py +++ b/tests/test_ragaaf.py @@ -15,23 +15,21 @@ class TestRagasMetric(unittest.TestCase): # @unittest.skip("need pass localhost id") - def test_ragas(self): + def test_ragaaf(self): - dataset = "explodinggradients/ragas-wikiqa" - data_mode = "benchmarking" - field_map = {"question": "question", "answer": "generated_with_rag", "context": "context"} + dataset = "sample data" + data_mode = "unit" + field_map = {"question": "question", "answer": "actual_output", "context": "contexts"} - # evaluation_mode = "openai" - # model_name = "gpt-4o" - # openai_key = "" + question = "What if these shoes don't fit?" + actual_output = "We offer a 30-day full refund at no extra cost." 
+ contexts = ["All customers are eligible for a 30 day full refund at no extra cost.", + "We can only process full refund upto 30 day after the purchase."] + examples = [{"question" : question, "actual_output" : actual_output, "contexts" : contexts}] evaluation_mode = "endpoint" model_name = f"http://{host_ip}:{port}" - # evaluation_mode = "local" - # model_name = "meta-llama/Llama-3.2-1B-Instruct" - # hf_token = "" - evaluation_metrics = ["factualness", "relevance", "correctness", "readability"] evaluator = AnnotationFreeEvaluate( @@ -41,7 +39,8 @@ def test_ragas(self): evaluation_mode=evaluation_mode, model_name=model_name, evaluation_metrics=evaluation_metrics, - # openai_key=openai_key, + examples=examples, + openai_key=openai_key, # hf_token=hf_token, debug_mode=True, ) From ccc864b8f47582e307bc62c39e46f20b05d94c21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Oct 2024 06:58:45 +0000 Subject: [PATCH 18/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragaaf/rag_dataset.py | 6 +++--- tests/requirements.txt | 2 +- tests/test_ragaaf.py | 8 +++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/evals/metrics/ragaaf/rag_dataset.py b/evals/metrics/ragaaf/rag_dataset.py index 27ca8776..8b3824f4 100644 --- a/evals/metrics/ragaaf/rag_dataset.py +++ b/evals/metrics/ragaaf/rag_dataset.py @@ -34,10 +34,10 @@ def load_local_data(self): return Dataset.from_list(data) def load_unit_data(self, examples): - assert len(examples) >= 1, "Please provide atleast one example" + assert len(examples) >= 1, "Please provide at least one example" data = [self.load_example(obj) for obj in examples] return Dataset.from_list(data) - + def load_benchmarking_data(self): dataset = load_dataset(self.dataset)["train"] data = [self.load_example(obj) for obj in dataset] @@ -50,7 +50,7 @@ def load_data(self, examples): return self.load_unit_data(examples) else: return self.load_benchmarking_data() - + def validate_dataset(self): for i, example in enumerate(self.data): for out_field in self.field_map: diff --git a/tests/requirements.txt b/tests/requirements.txt index 72ca037b..f0b7a773 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -5,4 +5,4 @@ langchain_community langchain_huggingface lm-eval==0.4.3 openai -ragas==0.1.19 \ No newline at end of file +ragas==0.1.19 diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py index 72948c43..821e9b0b 100644 --- a/tests/test_ragaaf.py +++ b/tests/test_ragaaf.py @@ -23,9 +23,11 @@ def test_ragaaf(self): question = "What if these shoes don't fit?" actual_output = "We offer a 30-day full refund at no extra cost." 
- contexts = ["All customers are eligible for a 30 day full refund at no extra cost.", - "We can only process full refund upto 30 day after the purchase."] - examples = [{"question" : question, "actual_output" : actual_output, "contexts" : contexts}] + contexts = [ + "All customers are eligible for a 30 day full refund at no extra cost.", + "We can only process full refund upto 30 day after the purchase.", + ] + examples = [{"question": question, "actual_output": actual_output, "contexts": contexts}] evaluation_mode = "endpoint" model_name = f"http://{host_ip}:{port}" From 045eed690fad4580b0cde92b1274ec973d3a876a Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 11 Oct 2024 00:04:01 -0700 Subject: [PATCH 19/21] removing extra inputs from evaluation Signed-off-by: aasavari --- tests/test_ragaaf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py index 72948c43..fdfef23d 100644 --- a/tests/test_ragaaf.py +++ b/tests/test_ragaaf.py @@ -40,7 +40,7 @@ def test_ragaaf(self): model_name=model_name, evaluation_metrics=evaluation_metrics, examples=examples, - openai_key=openai_key, + # openai_key=openai_key, # hf_token=hf_token, debug_mode=True, ) From 64493a418a80b233e92e252ad8c1c3b0d1ba498d Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 11 Oct 2024 00:17:27 -0700 Subject: [PATCH 20/21] correcting class name for unit test Signed-off-by: aasavari --- tests/test_ragaaf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py index 08b3ba4b..26f41524 100644 --- a/tests/test_ragaaf.py +++ b/tests/test_ragaaf.py @@ -12,7 +12,7 @@ port = os.getenv("port", "8008") -class TestRagasMetric(unittest.TestCase): +class TestRagaafMetric(unittest.TestCase): # @unittest.skip("need pass localhost id") def test_ragaaf(self): @@ -42,8 +42,6 @@ def test_ragaaf(self): model_name=model_name, evaluation_metrics=evaluation_metrics, examples=examples, - # openai_key=openai_key, - # hf_token=hf_token, debug_mode=True, ) From c58f377fa730a4956fed861e31049cbc324056dd Mon Sep 17 00:00:00 2001 From: aasavari Date: Fri, 11 Oct 2024 20:01:01 +0000 Subject: [PATCH 21/21] test needs local ID Signed-off-by: aasavari --- tests/test_ragaaf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ragaaf.py b/tests/test_ragaaf.py index 26f41524..b718bce1 100644 --- a/tests/test_ragaaf.py +++ b/tests/test_ragaaf.py @@ -14,7 +14,7 @@ class TestRagaafMetric(unittest.TestCase): - # @unittest.skip("need pass localhost id") + @unittest.skip("need pass localhost id") def test_ragaaf(self): dataset = "sample data"
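Usage note (not part of the patch series above): the "unit" data mode introduced in these commits can also be exercised directly, outside `unittest`. The sketch below mirrors `tests/test_ragaaf.py` and only uses arguments defined in the patch; it assumes the package is installed and a TGI-style endpoint is reachable at `http://{host_ip}:{port}` (defaults `localhost:8008`, as in the test).
```python3
# Minimal sketch: run RAGAAF in "unit" data mode against a hosted model endpoint.
# Assumes an LLM endpoint is already serving at http://{host_ip}:{port}.
import os

from evals.metrics.ragaaf import AnnotationFreeEvaluate

host_ip = os.getenv("host_ip", "localhost")
port = os.getenv("port", "8008")

# In-memory examples; field_map maps RAGAAF fields to the example keys.
examples = [
    {
        "question": "What if these shoes don't fit?",
        "actual_output": "We offer a 30-day full refund at no extra cost.",
        "contexts": [
            "All customers are eligible for a 30 day full refund at no extra cost.",
            "We can only process full refund upto 30 day after the purchase.",
        ],
    }
]

evaluator = AnnotationFreeEvaluate(
    dataset="sample data",  # placeholder; unused in "unit" mode
    data_mode="unit",
    field_map={"question": "question", "answer": "actual_output", "context": "contexts"},
    evaluation_mode="endpoint",
    model_name=f"http://{host_ip}:{port}",
    evaluation_metrics=["factualness", "relevance", "correctness", "readability"],
    examples=examples,
    debug_mode=True,  # score only the first example
)

# Each response contains the judge LLM's per-metric scores and reasoning.
for response in evaluator.measure():
    print(response)
```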