From ced68e1834d6d5c82356a7b16daa63e7e9410cc5 Mon Sep 17 00:00:00 2001 From: Yi Yao Date: Sun, 3 Nov 2024 12:41:02 +0800 Subject: [PATCH 01/12] Add performance benchmark scripts for 4 use cases. (#1052) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CodeGen/benchmark/performance/README.md | 77 +++++++++++++++ CodeGen/benchmark/performance/benchmark.sh | 99 +++++++++++++++++++ CodeGen/benchmark/performance/benchmark.yaml | 47 +++++++++ CodeTrans/benchmark/performance/README.md | 77 +++++++++++++++ CodeTrans/benchmark/performance/benchmark.sh | 99 +++++++++++++++++++ .../benchmark/performance/benchmark.yaml | 47 +++++++++ FaqGen/benchmark/performance/README.md | 77 +++++++++++++++ FaqGen/benchmark/performance/benchmark.sh | 99 +++++++++++++++++++ FaqGen/benchmark/performance/benchmark.yaml | 47 +++++++++ VisualQnA/benchmark/performance/README.md | 77 +++++++++++++++ VisualQnA/benchmark/performance/benchmark.sh | 99 +++++++++++++++++++ .../benchmark/performance/benchmark.yaml | 47 +++++++++ 12 files changed, 892 insertions(+) create mode 100644 CodeGen/benchmark/performance/README.md create mode 100644 CodeGen/benchmark/performance/benchmark.sh create mode 100644 CodeGen/benchmark/performance/benchmark.yaml create mode 100644 CodeTrans/benchmark/performance/README.md create mode 100644 CodeTrans/benchmark/performance/benchmark.sh create mode 100644 CodeTrans/benchmark/performance/benchmark.yaml create mode 100644 FaqGen/benchmark/performance/README.md create mode 100644 FaqGen/benchmark/performance/benchmark.sh create mode 100644 FaqGen/benchmark/performance/benchmark.yaml create mode 100644 VisualQnA/benchmark/performance/README.md create mode 100644 VisualQnA/benchmark/performance/benchmark.sh create mode 100644 VisualQnA/benchmark/performance/benchmark.yaml diff --git a/CodeGen/benchmark/performance/README.md b/CodeGen/benchmark/performance/README.md new file mode 100644 index 0000000000..04bd383142 --- /dev/null +++ b/CodeGen/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# CodeGen Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading llms, serving frameworks etc. 
+ +## Metrics + +The benchmark will report the below metrics, including: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as CSV file named `1_testspec.yaml`. + +## Getting Started + +We recommend using Kubernetes to deploy the CodeGen service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the the number of requests. + +```bash +# The way to modify the containered ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy CodeGen service before benchmarking. + +##### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and test output directory by: + +```bash +export USER_QUERIES="[128, 128, 128, 128]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +##### 4. Data collection + +All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/CodeGen/benchmark/performance/benchmark.sh b/CodeGen/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..e1ab2dae86 --- /dev/null +++ b/CodeGen/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=7778 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? 
) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." + setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." + fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." + fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/CodeGen/benchmark/performance/benchmark.yaml b/CodeGen/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..90d74d02bf --- /dev/null +++ b/CodeGen/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["chatqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. 
+ random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "Qwen/CodeQwen1.5-7B-Chat" # The LLM model used for the test + test_output_dir: "/tmp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. + +test_cases: + codegen: + llm: + run_test: true + service_name: "llm-dependency-svc" # Replace with your service name + parameters: + model_name: "Qwen/CodeQwen1.5-7B-Chat" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: true + service_name: "llm-svc" # Replace with your service name + e2e: + run_test: true + service_name: "codegen-backend-svc" # Replace with your service name diff --git a/CodeTrans/benchmark/performance/README.md b/CodeTrans/benchmark/performance/README.md new file mode 100644 index 0000000000..5e447d1eee --- /dev/null +++ b/CodeTrans/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# CodeTrans Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading llms, serving frameworks etc. + +## Metrics + +The benchmark will report the below metrics, including: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as CSV file named `1_testspec.yaml`. 
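Once a run finishes, a quick way to eyeball what was written out is sketched below. This is only an illustration: the exact file names depend on the GenAIEval version installed, and `TEST_OUTPUT_DIR` is the output directory you configure later in this guide.

```bash
# List everything the benchmark wrote out (default path here is an assumption).
TEST_OUTPUT_DIR=${TEST_OUTPUT_DIR:-/tmp/benchmark_output}
ls -lh "$TEST_OUTPUT_DIR"

# Print the first few rows of any CSV summaries that were produced.
find "$TEST_OUTPUT_DIR" -name '*.csv' -print -exec head -n 5 {} \;
```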
+ +## Getting Started + +We recommend using Kubernetes to deploy the CodeTrans service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the the number of requests. + +```bash +# The way to modify the containered ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy CodeTrans service before benchmarking. + +##### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and test output directory by: + +```bash +export USER_QUERIES="[1, 1, 1, 1]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +##### 4. Data collection + +All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/CodeTrans/benchmark/performance/benchmark.sh b/CodeTrans/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..6eac50baf8 --- /dev/null +++ b/CodeTrans/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=7777 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." 
+ setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." + fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." + fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/CodeTrans/benchmark/performance/benchmark.yaml b/CodeTrans/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..8680e886de --- /dev/null +++ b/CodeTrans/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["codetrans"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. + random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "HuggingFaceH4/mistral-7b-grok" # The LLM model used for the test + test_output_dir: "/home/sdp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. 
If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. + +test_cases: + codetrans: + llm: + run_test: true + service_name: "llm-svc" # Replace with your service name + parameters: + model_name: "HuggingFaceH4/mistral-7b-grok" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: true + service_name: "codetrans-llm-svc" # Replace with your service name + e2e: + run_test: true + service_name: "codetrans-backend-server-svc" # Replace with your service name diff --git a/FaqGen/benchmark/performance/README.md b/FaqGen/benchmark/performance/README.md new file mode 100644 index 0000000000..3cd38ae6ab --- /dev/null +++ b/FaqGen/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# FaqGen Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading llms, serving frameworks etc. + +## Metrics + +The benchmark will report the below metrics, including: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as CSV file named `1_testspec.yaml`. + +## Getting Started + +We recommend using Kubernetes to deploy the FaqGen service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the the number of requests. 
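Before raising the limit as shown in the next block, you can check what containerd currently allows. This is an optional sanity check; property output may vary slightly across systemd versions.

```bash
# Show the open-file limit currently applied to the containerd service.
sudo systemctl show containerd -p LimitNOFILE

# Inside a running container, the effective limit can be checked with:
#   ulimit -n
```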
+ +```bash +# The way to modify the containered ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy FaqGen service before benchmarking. + +##### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and test output directory by: + +```bash +export USER_QUERIES="[1, 1, 1, 1]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +##### 4. Data collection + +All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/FaqGen/benchmark/performance/benchmark.sh b/FaqGen/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..44abdecbb1 --- /dev/null +++ b/FaqGen/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=8888 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." + setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." 
+ fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." + fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/FaqGen/benchmark/performance/benchmark.yaml b/FaqGen/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..2c9c914de3 --- /dev/null +++ b/FaqGen/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["faqgen"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. + random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "meta-llama/Meta-Llama-3-8B-Instruct" # The LLM model used for the test + test_output_dir: "/tmp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. 
+ +test_cases: + faqgen: + llm: + run_test: false + service_name: "faq-tgi-svc" # Replace with your service name + parameters: + model_name: "meta-llama/Meta-Llama-3-8B-Instruct" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: false + service_name: "faq-micro-svc" # Replace with your service name + e2e: + run_test: true + service_name: "faq-mega-server-svc" # Replace with your service name diff --git a/VisualQnA/benchmark/performance/README.md b/VisualQnA/benchmark/performance/README.md new file mode 100644 index 0000000000..41e062422d --- /dev/null +++ b/VisualQnA/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# VisualQnA Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading llms, serving frameworks etc. + +## Metrics + +The benchmark will report the below metrics, including: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as CSV file named `1_testspec.yaml`. + +## Getting Started + +We recommend using Kubernetes to deploy the VisualQnA service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the the number of requests. + +```bash +# The way to modify the containered ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy VisualQnA service before benchmarking. 
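A quick sanity check before starting the benchmark can save a wasted run. The commands below are a sketch for a Kubernetes deployment; pod and service names depend on how you deployed VisualQnA.

```bash
# All VisualQnA pods should be Running and Ready before benchmarking.
kubectl get pods -o wide

# Confirm the backend mega-service is exposed (name may differ in your cluster).
kubectl get svc | grep -i visualqna
```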
+ +##### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and test output directory by: + +```bash +export USER_QUERIES="[1, 1, 1, 1]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +##### 4. Data collection + +All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/VisualQnA/benchmark/performance/benchmark.sh b/VisualQnA/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..44abdecbb1 --- /dev/null +++ b/VisualQnA/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=8888 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." + setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." + fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." 
+ fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/VisualQnA/benchmark/performance/benchmark.yaml b/VisualQnA/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..9ddf922936 --- /dev/null +++ b/VisualQnA/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["visualqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by defeult. + random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "llava-hf/llava-v1.6-mistral-7b-hf" # The LLM model used for the test + test_output_dir: "/tmp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. 
+ +test_cases: + visualqna: + lvm: + run_test: true + service_name: "llm-svc" # Replace with your service name + parameters: + model_name: "llava-hf/llava-v1.6-mistral-7b-hf" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + lvmserve: + run_test: true + service_name: "lvm-serving-svc" # Replace with your service name + e2e: + run_test: true + service_name: "visualqna-backend-server-svc" # Replace with your service name From 5eb3d2869fc646c3529037321f1a1ad4ba19160e Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Sun, 3 Nov 2024 17:17:19 -0800 Subject: [PATCH 02/12] Update AgentQnA example for v1.1 release (#885) Signed-off-by: minmin-intel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- AgentQnA/README.md | 76 +++++++++++++------ .../docker_compose/intel/cpu/xeon/README.md | 3 + .../intel/cpu/xeon/compose_openai.yaml | 8 +- .../cpu/xeon/launch_agent_service_openai.sh | 2 +- .../intel/hpu/gaudi/compose.yaml | 39 ++-------- .../gaudi/launch_agent_service_tgi_gaudi.sh | 16 +--- .../intel/hpu/gaudi/launch_tgi_gaudi.sh | 25 ++++++ .../intel/hpu/gaudi/tgi_gaudi.yaml | 30 ++++++++ ..._build_images.sh => step1_build_images.sh} | 0 ..._tool.sh => step2_start_retrieval_tool.sh} | 0 ...ep3_ingest_data_and_validate_retrieval.sh} | 0 ...step4_launch_and_validate_agent_openai.sh} | 0 ...=> step4_launch_and_validate_agent_tgi.sh} | 31 ++++++-- AgentQnA/tests/test.py | 25 ++++++ AgentQnA/tests/test_compose_on_gaudi.sh | 24 ++++-- AgentQnA/tools/supervisor_agent_tools.yaml | 2 +- AgentQnA/tools/worker_agent_tools.py | 35 ++++++--- 17 files changed, 212 insertions(+), 104 deletions(-) create mode 100644 AgentQnA/docker_compose/intel/cpu/xeon/README.md create mode 100644 AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh create mode 100644 AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml rename AgentQnA/tests/{1_build_images.sh => step1_build_images.sh} (100%) rename AgentQnA/tests/{2_start_retrieval_tool.sh => step2_start_retrieval_tool.sh} (100%) rename AgentQnA/tests/{3_ingest_data_and_validate_retrieval.sh => step3_ingest_data_and_validate_retrieval.sh} (100%) rename AgentQnA/tests/{4_launch_and_validate_agent_openai.sh => step4_launch_and_validate_agent_openai.sh} (100%) rename AgentQnA/tests/{4_launch_and_validate_agent_tgi.sh => step4_launch_and_validate_agent_tgi.sh} (64%) create mode 100644 AgentQnA/tests/test.py diff --git a/AgentQnA/README.md b/AgentQnA/README.md index e6cfaf7c9f..9c351a856f 100644 --- a/AgentQnA/README.md +++ b/AgentQnA/README.md @@ -81,17 +81,13 @@ flowchart LR 3. Hierarchical agent can further improve performance. Expert worker agents, such as retrieval agent, knowledge graph agent, SQL agent, etc., can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information together to provide a comprehensive answer. -### Roadmap +## Deployment with docker -- v0.9: Worker agent uses open-source websearch tool (duckduckgo), agents use OpenAI GPT-4o-mini as llm backend. -- v1.0: Worker agent uses OPEA retrieval megaservice as tool. -- v1.0 or later: agents use open-source llm backend. -- v1.1 or later: add safeguards +1. Build agent docker image -## Getting started + Note: this is optional. The docker images will be automatically pulled when running the docker compose commands. This step is only needed if pulling images failed. -1. Build agent docker image
- First, clone the opea GenAIComps repo + First, clone the opea GenAIComps repo. ``` export WORKDIR= @@ -106,35 +102,63 @@ flowchart LR docker build -t opea/agent-langchain:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/agent/langchain/Dockerfile . ``` -2. Launch tool services
- In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs. - - ``` - docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0 - ``` - -3. Set up environment for this example
- First, clone this repo +2. Set up environment for this example
+ First, clone this repo. ``` cd $WORKDIR git clone https://github.com/opea-project/GenAIExamples.git ``` - Second, set up env vars + Second, set up env vars. ``` export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ - # optional: OPANAI_API_KEY + # for using open-source llms + export HUGGINGFACEHUB_API_TOKEN= + export HF_CACHE_DIR= #so that no need to redownload every time + + # optional: OPANAI_API_KEY if you want to use OpenAI models export OPENAI_API_KEY= ``` -4. Launch agent services
- The configurations of the supervisor agent and the worker agent are defined in the docker-compose yaml file. We currently use openAI GPT-4o-mini as LLM, and we plan to add support for llama3.1-70B-instruct (served by TGI-Gaudi) in a subsequent release. - To use openai llm, run command below. +3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service) + + First, launch the mega-service. + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool + bash launch_retrieval_tool.sh + ``` + + Then, ingest data into the vector database. Here we provide an example. You can ingest your own data. + + ``` + bash run_ingest_data.sh + ``` + +4. Launch other tools.
+ In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs. + + ``` + docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0 + ``` + +5. Launch agent services
+ We provide two options for `llm_engine` of the agents: 1. open-source LLMs, 2. OpenAI models via API calls. + + To use open-source LLMs on Gaudi2, run commands below. + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi + bash launch_tgi_gaudi.sh + bash launch_agent_service_tgi_gaudi.sh + ``` + + To use OpenAI models, run commands below. ``` - cd docker_compose/intel/cpu/xeon + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon bash launch_agent_service_openai.sh ``` @@ -143,10 +167,12 @@ flowchart LR First look at logs of the agent docker containers: ``` -docker logs docgrader-agent-endpoint +# worker agent +docker logs rag-agent-endpoint ``` ``` +# supervisor agent docker logs react-agent-endpoint ``` @@ -170,4 +196,4 @@ curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: app ## How to register your own tools with agent -You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/langchain/README.md#5-customize-agent-strategy). +You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/langchain/README.md). diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/README.md b/AgentQnA/docker_compose/intel/cpu/xeon/README.md new file mode 100644 index 0000000000..852a0476c6 --- /dev/null +++ b/AgentQnA/docker_compose/intel/cpu/xeon/README.md @@ -0,0 +1,3 @@ +# Deployment on Xeon + +We deploy the retrieval tool on Xeon. For LLMs, we support OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md). 
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml index bac5bbc627..837f2a0871 100644 --- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml +++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml @@ -2,11 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 services: - worker-docgrader-agent: + worker-rag-agent: image: opea/agent-langchain:latest - container_name: docgrader-agent-endpoint + container_name: rag-agent-endpoint volumes: - - ${WORKDIR}/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ ports: - "9095:9095" @@ -36,8 +35,9 @@ services: supervisor-react-agent: image: opea/agent-langchain:latest container_name: react-agent-endpoint + depends_on: + - worker-rag-agent volumes: - - ${WORKDIR}/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ ports: - "9090:9090" diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh index 6c2094cc8e..f35e60fd13 100644 --- a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh +++ b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh @@ -7,7 +7,7 @@ export recursion_limit_worker=12 export recursion_limit_supervisor=10 export model="gpt-4o-mini-2024-07-18" export temperature=0 -export max_new_tokens=512 +export max_new_tokens=4096 export OPENAI_API_KEY=${OPENAI_API_KEY} export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions" export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 5200f757e3..6a9d0b4650 100644 --- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -2,37 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 services: - tgi-server: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 - container_name: tgi-server - ports: - - "8085:80" - volumes: - - ${HF_CACHE_DIR}:/data - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - PT_HPU_ENABLE_LAZY_COLLECTIVES: true - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - runtime: habana - cap_add: - - SYS_NICE - ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard ${NUM_SHARDS} - worker-docgrader-agent: + worker-rag-agent: image: opea/agent-langchain:latest - container_name: docgrader-agent-endpoint - depends_on: - - tgi-server + container_name: rag-agent-endpoint volumes: # - ${WORKDIR}/GenAIExamples/AgentQnA/docker_image_build/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ @@ -41,7 +13,7 @@ services: ipc: host environment: ip_address: ${ip_address} - strategy: rag_agent + strategy: rag_agent_llama recursion_limit: ${recursion_limit_worker} llm_engine: tgi HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} @@ -66,8 +38,7 @@ services: image: opea/agent-langchain:latest container_name: react-agent-endpoint depends_on: - - tgi-server - - 
worker-docgrader-agent + - worker-rag-agent volumes: # - ${WORKDIR}/GenAIExamples/AgentQnA/docker_image_build/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ @@ -76,7 +47,7 @@ services: ipc: host environment: ip_address: ${ip_address} - strategy: react_langgraph + strategy: react_llama recursion_limit: ${recursion_limit_supervisor} llm_engine: tgi HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh index f4154fb229..966a037974 100644 --- a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh @@ -15,7 +15,7 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct" export NUM_SHARDS=4 export LLM_ENDPOINT_URL="http://${ip_address}:8085" export temperature=0.01 -export max_new_tokens=512 +export max_new_tokens=4096 # agent related environment variables export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ @@ -27,17 +27,3 @@ export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool" export CRAG_SERVER=http://${ip_address}:8080 docker compose -f compose.yaml up -d - -sleep 5s -echo "Waiting tgi gaudi ready" -n=0 -until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do - docker logs tgi-server &> tgi-gaudi-service.log - n=$((n+1)) - if grep -q Connected tgi-gaudi-service.log; then - break - fi - sleep 5s -done -sleep 5s -echo "Service started successfully" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh new file mode 100644 index 0000000000..75b2a9c7f4 --- /dev/null +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# LLM related environment variables +export HF_CACHE_DIR=${HF_CACHE_DIR} +ls $HF_CACHE_DIR +export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct" +export NUM_SHARDS=4 + +docker compose -f tgi_gaudi.yaml up -d + +sleep 5s +echo "Waiting tgi gaudi ready" +n=0 +until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do + docker logs tgi-server &> tgi-gaudi-service.log + n=$((n+1)) + if grep -q Connected tgi-gaudi-service.log; then + break + fi + sleep 5s +done +sleep 5s +echo "Service started successfully" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml new file mode 100644 index 0000000000..cd5dd202d6 --- /dev/null +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + tgi-server: + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + container_name: tgi-server + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: true + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + runtime: habana + cap_add: + - SYS_NICE + ipc: host + 
command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard ${NUM_SHARDS} diff --git a/AgentQnA/tests/1_build_images.sh b/AgentQnA/tests/step1_build_images.sh similarity index 100% rename from AgentQnA/tests/1_build_images.sh rename to AgentQnA/tests/step1_build_images.sh diff --git a/AgentQnA/tests/2_start_retrieval_tool.sh b/AgentQnA/tests/step2_start_retrieval_tool.sh similarity index 100% rename from AgentQnA/tests/2_start_retrieval_tool.sh rename to AgentQnA/tests/step2_start_retrieval_tool.sh diff --git a/AgentQnA/tests/3_ingest_data_and_validate_retrieval.sh b/AgentQnA/tests/step3_ingest_data_and_validate_retrieval.sh similarity index 100% rename from AgentQnA/tests/3_ingest_data_and_validate_retrieval.sh rename to AgentQnA/tests/step3_ingest_data_and_validate_retrieval.sh diff --git a/AgentQnA/tests/4_launch_and_validate_agent_openai.sh b/AgentQnA/tests/step4_launch_and_validate_agent_openai.sh similarity index 100% rename from AgentQnA/tests/4_launch_and_validate_agent_openai.sh rename to AgentQnA/tests/step4_launch_and_validate_agent_openai.sh diff --git a/AgentQnA/tests/4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh similarity index 64% rename from AgentQnA/tests/4_launch_and_validate_agent_tgi.sh rename to AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh index f7b36da2a3..fde46e0d5a 100644 --- a/AgentQnA/tests/4_launch_and_validate_agent_tgi.sh +++ b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh @@ -17,6 +17,12 @@ if [ ! -d "$HF_CACHE_DIR" ]; then fi ls $HF_CACHE_DIR +function start_tgi(){ + echo "Starting tgi-gaudi server" + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi + bash launch_tgi_gaudi.sh + +} function start_agent_and_api_server() { echo "Starting CRAG server" @@ -25,6 +31,7 @@ function start_agent_and_api_server() { echo "Starting Agent services" cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi bash launch_agent_service_tgi_gaudi.sh + sleep 10 } function validate() { @@ -43,18 +50,22 @@ function validate() { function validate_agent_service() { echo "----------------Test agent ----------------" - local CONTENT=$(http_proxy="" curl http://${ip_address}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ - "query": "Tell me about Michael Jackson song thriller" - }') - local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint") - docker logs docgrader-agent-endpoint + # local CONTENT=$(http_proxy="" curl http://${ip_address}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + # "query": "Tell me about Michael Jackson song thriller" + # }') + export agent_port="9095" + local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py) + local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint") + docker logs rag-agent-endpoint if [ "$EXIT_CODE" == "1" ]; then exit 1 fi - local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ - "query": "Tell me about Michael Jackson song thriller" - }') + # local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + # "query": "Tell me about Michael Jackson song thriller" + # }') + export agent_port="9090" + local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py) local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint") docker logs 
react-agent-endpoint if [ "$EXIT_CODE" == "1" ]; then @@ -64,6 +75,10 @@ function validate_agent_service() { } function main() { + echo "==================== Start TGI ====================" + start_tgi + echo "==================== TGI started ====================" + echo "==================== Start agent ====================" start_agent_and_api_server echo "==================== Agent started ====================" diff --git a/AgentQnA/tests/test.py b/AgentQnA/tests/test.py new file mode 100644 index 0000000000..f0ef934412 --- /dev/null +++ b/AgentQnA/tests/test.py @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests + + +def generate_answer_agent_api(url, prompt): + proxies = {"http": ""} + payload = { + "query": prompt, + } + response = requests.post(url, json=payload, proxies=proxies) + answer = response.json()["text"] + return answer + + +if __name__ == "__main__": + ip_address = os.getenv("ip_address", "localhost") + agent_port = os.getenv("agent_port", "9095") + url = f"http://{ip_address}:{agent_port}/v1/chat/completions" + prompt = "Tell me about Michael Jackson song thriller" + answer = generate_answer_agent_api(url, prompt) + print(answer) diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh index efe1aeeecd..5f7e899dcf 100644 --- a/AgentQnA/tests/test_compose_on_gaudi.sh +++ b/AgentQnA/tests/test_compose_on_gaudi.sh @@ -19,7 +19,6 @@ function stop_crag() { function stop_agent_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi/ - # docker compose -f compose.yaml down container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2) for container_name in $container_list; do cid=$(docker ps -aq --filter "name=$container_name") @@ -28,11 +27,21 @@ function stop_agent_docker() { done } +function stop_tgi(){ + cd $WORKPATH/docker_compose/intel/hpu/gaudi/ + container_list=$(cat tgi_gaudi.yaml | grep container_name | cut -d':' -f2) + for container_name in $container_list; do + cid=$(docker ps -aq --filter "name=$container_name") + echo "Stopping container $container_name" + if [[ ! 
-z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi + done + +} + function stop_retrieval_tool() { echo "Stopping Retrieval tool" local RETRIEVAL_TOOL_PATH=$WORKPATH/../DocIndexRetriever cd $RETRIEVAL_TOOL_PATH/docker_compose/intel/cpu/xeon/ - # docker compose -f compose.yaml down container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2) for container_name in $container_list; do cid=$(docker ps -aq --filter "name=$container_name") @@ -43,25 +52,26 @@ function stop_retrieval_tool() { echo "workpath: $WORKPATH" echo "=================== Stop containers ====================" stop_crag +stop_tgi stop_agent_docker stop_retrieval_tool cd $WORKPATH/tests echo "=================== #1 Building docker images====================" -bash 1_build_images.sh +bash step1_build_images.sh echo "=================== #1 Building docker images completed====================" echo "=================== #2 Start retrieval tool====================" -bash 2_start_retrieval_tool.sh +bash step2_start_retrieval_tool.sh echo "=================== #2 Retrieval tool started====================" echo "=================== #3 Ingest data and validate retrieval====================" -bash 3_ingest_data_and_validate_retrieval.sh +bash step3_ingest_data_and_validate_retrieval.sh echo "=================== #3 Data ingestion and validation completed====================" echo "=================== #4 Start agent and API server====================" -bash 4_launch_and_validate_agent_tgi.sh +bash step4_launch_and_validate_agent_tgi.sh echo "=================== #4 Agent test passed ====================" echo "=================== #5 Stop agent and API server====================" @@ -70,4 +80,6 @@ stop_agent_docker stop_retrieval_tool echo "=================== #5 Agent and API server stopped====================" +echo y | docker system prune + echo "ALL DONE!" diff --git a/AgentQnA/tools/supervisor_agent_tools.yaml b/AgentQnA/tools/supervisor_agent_tools.yaml index 58110e5292..4b53cc9f9f 100644 --- a/AgentQnA/tools/supervisor_agent_tools.yaml +++ b/AgentQnA/tools/supervisor_agent_tools.yaml @@ -25,7 +25,7 @@ get_billboard_rank_date: args_schema: rank: type: int - description: song name + description: the rank of interest, for example 1 for top 1 date: type: str description: date diff --git a/AgentQnA/tools/worker_agent_tools.py b/AgentQnA/tools/worker_agent_tools.py index 1dfdb8409e..fded38ec3a 100644 --- a/AgentQnA/tools/worker_agent_tools.py +++ b/AgentQnA/tools/worker_agent_tools.py @@ -12,16 +12,31 @@ def search_knowledge_base(query: str) -> str: print(url) proxies = {"http": ""} payload = { - "text": query, + "messages": query, } response = requests.post(url, json=payload, proxies=proxies) print(response) - docs = response.json()["documents"] - context = "" - for i, doc in enumerate(docs): - if i == 0: - context = doc - else: - context += "\n" + doc - print(context) - return context + if "documents" in response.json(): + docs = response.json()["documents"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc + else: + context += "\n" + doc + # print(context) + return context + elif "text" in response.json(): + return response.json()["text"] + elif "reranked_docs" in response.json(): + docs = response.json()["reranked_docs"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc["text"] + else: + context += "\n" + doc["text"] + # print(context) + return context + else: + return "Error parsing response from the knowledge base." 
From 3372b9d480dc9dcd2f243359ac4cadebb8edb46a Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:18:49 +0800 Subject: [PATCH 03/12] update accuracy embedding endpoint for no wrapper (#1056) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ChatQnA/benchmark/accuracy/eval_multihop.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ChatQnA/benchmark/accuracy/eval_multihop.py b/ChatQnA/benchmark/accuracy/eval_multihop.py index 9b07ea2e34..a8f2b9911a 100644 --- a/ChatQnA/benchmark/accuracy/eval_multihop.py +++ b/ChatQnA/benchmark/accuracy/eval_multihop.py @@ -41,11 +41,11 @@ def get_reranked_documents(self, query, docs, arguments): return [] def get_retrieved_documents(self, query, arguments): - data = {"text": query} + data = {"inputs": query} headers = {"Content-Type": "application/json"} - response = requests.post(arguments.embedding_endpoint, data=json.dumps(data), headers=headers) + response = requests.post(arguments.tei_embedding_endpoint + "/embed", data=json.dumps(data), headers=headers) if response.ok: - embedding = response.json()["embedding"] + embedding = response.json()[0] else: print(f"Request for embedding failed due to {response.text}.") return [] From 0306c620b541d99f0281bf6969bdfa2914def7f0 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 4 Nov 2024 11:28:43 +0800 Subject: [PATCH 04/12] Update TGI CPU image to latest official release 2.4.0 (#1035) Signed-off-by: lvliang-intel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/compose_multilang.yaml | 2 +- .../intel/cpu/xeon/manifest/audioqna.yaml | 2 +- .../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- ChatQnA/docker_compose/intel/cpu/xeon/README.md | 4 ++-- ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/compose_qdrant.yaml | 2 +- .../intel/cpu/xeon/compose_without_rerank.yaml | 2 +- ChatQnA/kubernetes/intel/README_gmc.md | 2 +- .../cpu/xeon/manifest/chatqna-guardrails.yaml | 4 ++-- .../intel/cpu/xeon/manifest/chatqna.yaml | 2 +- .../intel/cpu/xeon/manifest/chatqna_bf16.yaml | 2 +- ChatQnA/tests/test_compose_on_xeon.sh | 2 +- CodeGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/manifest/codegen.yaml | 2 +- .../intel/cpu/xeon/manifest/codegen_react_ui.yaml | 2 +- CodeGen/tests/test_compose_on_xeon.sh | 2 +- .../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/manifest/codetrans.yaml | 2 +- CodeTrans/tests/test_compose_on_xeon.sh | 2 +- DocSum/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- DocSum/kubernetes/intel/README_gmc.md | 2 +- .../kubernetes/intel/cpu/xeon/manifest/docsum.yaml | 2 +- .../intel/cpu/xeon/manifest/ui/react-docsum.yaml | 2 +- FaqGen/docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/manifest/faqgen_react_ui.yaml | 2 +- .../intel/cpu/xeon/manifest/chatqna.yaml | 2 +- .../intel/cpu/xeon/manifest/codegen.yaml | 2 +- .../kubernetes/intel/cpu/xeon/manifest/docsum.yaml | 2 +- .../kubernetes/intel/cpu/xeon/manifest/faqgen.yaml | 2 +- .../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- SearchQnA/tests/test_compose_on_xeon.sh | 2 +- .../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/manifest/translation.yaml | 2 +- Translation/tests/test_compose_on_xeon.sh | 2 +- VisualQnA/docker_compose/intel/cpu/xeon/README.md | 4 ++-- 
.../docker_compose/intel/cpu/xeon/compose.yaml | 2 +- .../intel/cpu/xeon/manifest/visualqna.yaml | 2 +- VisualQnA/tests/test_compose_on_xeon.sh | 2 +- VisualQnA/ui/svelte/package.json | 14 +++++++------- 40 files changed, 49 insertions(+), 49 deletions(-) diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml index a0ef81d172..ea3c45b919 100644 --- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -41,7 +41,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml index d8ca1d7f8c..3e20dbc4af 100644 --- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml +++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml @@ -26,7 +26,7 @@ services: https_proxy: ${https_proxy} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml b/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml index bd76774835..6856d2b878 100644 --- a/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml +++ b/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml @@ -247,7 +247,7 @@ spec: - envFrom: - configMapRef: name: audio-qna-config - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" name: llm-dependency-deploy-demo securityContext: capabilities: diff --git a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml index aa6f49bf87..2496b11e87 100644 --- a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml +++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml @@ -42,7 +42,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index 4598c07ec0..3f2766ec57 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -195,7 +195,7 @@ For users in China who are unable to download models directly from Huggingface, export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" model_name="Intel/neural-chat-7b-v3-3" - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id $model_name + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name ``` 2. 
Offline @@ -209,7 +209,7 @@ For users in China who are unable to download models directly from Huggingface, ```bash export HF_TOKEN=${your_hf_token} export model_path="/path/to/model" - docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id /data + docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data ``` ### Setup Environment Variables diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml index 14794e8d4f..0c290b8683 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -73,7 +73,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "9009:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml index 8d37bb83af..ad7df8fa79 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml @@ -72,7 +72,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "6042:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml index e497985f8a..938a6690d3 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml @@ -57,7 +57,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "9009:80" diff --git a/ChatQnA/kubernetes/intel/README_gmc.md b/ChatQnA/kubernetes/intel/README_gmc.md index dab86381fe..860bae7205 100644 --- a/ChatQnA/kubernetes/intel/README_gmc.md +++ b/ChatQnA/kubernetes/intel/README_gmc.md @@ -18,7 +18,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment - tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - retriever: opea/retriever-redis:latest - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 -- tgi-service: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu +- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - chaqna-xeon-backend-server: opea/chatqna:latest Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. 
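Because this change swaps the same TGI CPU image tag across many compose files, manifests, READMEs, and test scripts, a quick repository-wide check can confirm nothing was missed. A minimal sketch, assuming it is run from the root of a GenAIExamples checkout (the tag names come only from the diffs above):

```bash
# Hedged sketch, not part of the patch: verify the new tag resolves and no stale references remain.
docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
if grep -rn "text-generation-inference:sha-e4201f4-intel-cpu" .; then
  echo "Stale sha-e4201f4 references remain"
else
  echo "No stale sha-e4201f4 references found"
fi
```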
diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml index 3d5f367d0f..70aa65bd63 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml @@ -1100,7 +1100,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data @@ -1180,7 +1180,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml index 69e33b873d..744f095915 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml @@ -922,7 +922,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml index 90db7043c7..b182851179 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml @@ -925,7 +925,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index f906dfabbf..3535159b3f 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml index ab1e4150ce..64b74db71f 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml +++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "8028:80" diff --git a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml index 4e6d8f91c8..d0070dc969 100644 --- a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml +++ 
b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml index 5d77fb8cc8..a155af13a0 100644 --- a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml +++ b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/tests/test_compose_on_xeon.sh b/CodeGen/tests/test_compose_on_xeon.sh index 0821cd3cb6..b184c00f31 100644 --- a/CodeGen/tests/test_compose_on_xeon.sh +++ b/CodeGen/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml index 122028b56e..16c05cf363 100644 --- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml +++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: codetrans-tgi-service ports: - "8008:80" diff --git a/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml b/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml index 4429083432..a778a8529e 100644 --- a/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml +++ b/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeTrans/tests/test_compose_on_xeon.sh b/CodeTrans/tests/test_compose_on_xeon.sh index d1f55c9a3d..63fe74f058 100644 --- a/CodeTrans/tests/test_compose_on_xeon.sh +++ b/CodeTrans/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml index 620ee36575..35e673563b 100644 --- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml +++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 
+3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "8008:80" diff --git a/DocSum/kubernetes/intel/README_gmc.md b/DocSum/kubernetes/intel/README_gmc.md index b332292110..6046ca4dcb 100644 --- a/DocSum/kubernetes/intel/README_gmc.md +++ b/DocSum/kubernetes/intel/README_gmc.md @@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks if the microservices listed in the CR yaml file are running, if not it starts them and then proceeds to connect them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. Should you use "kubectl get pods" commands you will see all the component microservices, in particular embedding, retriever, rerank, and llm. The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest` which internally leverages the -the image `ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the +the image `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.5`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`. [NOTE] diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml index 1416bdbcbc..9199888a10 100644 --- a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml +++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml index 61e8799b0e..560e34a215 100644 --- a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml +++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml index 8c5c894aed..59df3093e9 100644 --- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml +++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-xeon-server ports: - "8008:80" diff --git a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml index 
845ba50412..53b2d541f3 100644 --- a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml +++ b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml index 43de640ad3..c3a65e92b6 100644 --- a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml +++ b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml @@ -993,7 +993,7 @@ spec: name: chatqna-tgi-config securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/codegen.yaml b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/codegen.yaml index 6c52c5d921..5eb3cd6eb4 100644 --- a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/codegen.yaml +++ b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/codegen.yaml @@ -229,7 +229,7 @@ spec: name: codegen-tgi-config securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/docsum.yaml b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/docsum.yaml index 0fda41f5e1..44d16ee9a8 100644 --- a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/docsum.yaml +++ b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/docsum.yaml @@ -229,7 +229,7 @@ spec: name: docsum-tgi-config securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/faqgen.yaml b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/faqgen.yaml index 749d984082..2c0b3bffc2 100644 --- a/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/faqgen.yaml +++ b/ProductivitySuite/kubernetes/intel/cpu/xeon/manifest/faqgen.yaml @@ -138,7 +138,7 @@ spec: - configMapRef: name: faqgen-tgi-config securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml index 0b7995118d..53be5846e3 100644 --- a/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/SearchQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -73,7 +73,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/SearchQnA/tests/test_compose_on_xeon.sh 
b/SearchQnA/tests/test_compose_on_xeon.sh index 5436cc1c50..6c73833acc 100644 --- a/SearchQnA/tests/test_compose_on_xeon.sh +++ b/SearchQnA/tests/test_compose_on_xeon.sh @@ -23,7 +23,7 @@ function build_docker_images() { docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml index 108a5086d2..39ea18d460 100644 --- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml +++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "8008:80" diff --git a/Translation/kubernetes/intel/cpu/xeon/manifest/translation.yaml b/Translation/kubernetes/intel/cpu/xeon/manifest/translation.yaml index e30fee338e..9cc8c2798f 100644 --- a/Translation/kubernetes/intel/cpu/xeon/manifest/translation.yaml +++ b/Translation/kubernetes/intel/cpu/xeon/manifest/translation.yaml @@ -361,7 +361,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/Translation/tests/test_compose_on_xeon.sh b/Translation/tests/test_compose_on_xeon.sh index 2d0c5306d5..b7fc6acb39 100644 --- a/Translation/tests/test_compose_on_xeon.sh +++ b/Translation/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="translation translation-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/README.md b/VisualQnA/docker_compose/intel/cpu/xeon/README.md index 8f0d5b6b34..eb1ef817b0 100644 --- a/VisualQnA/docker_compose/intel/cpu/xeon/README.md +++ b/VisualQnA/docker_compose/intel/cpu/xeon/README.md @@ -67,12 +67,12 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt ### 4. Pull TGI Xeon Image ```bash -docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu +docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu ``` Then run the command `docker images`, you will have the following 5 Docker Images: -1. `ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu` +1. `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu` 2. `opea/lvm-tgi:latest` 3. `opea/visualqna:latest` 4. 
`opea/visualqna-ui:latest` diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml index 553b13908b..33b5e189b1 100644 --- a/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/VisualQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: llava-tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-llava-xeon-server ports: - "8399:80" diff --git a/VisualQnA/kubernetes/intel/cpu/xeon/manifest/visualqna.yaml b/VisualQnA/kubernetes/intel/cpu/xeon/manifest/visualqna.yaml index 4d3ee3bf24..1f1b2d316a 100644 --- a/VisualQnA/kubernetes/intel/cpu/xeon/manifest/visualqna.yaml +++ b/VisualQnA/kubernetes/intel/cpu/xeon/manifest/visualqna.yaml @@ -216,7 +216,7 @@ spec: name: visualqna-tgi-config securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/VisualQnA/tests/test_compose_on_xeon.sh b/VisualQnA/tests/test_compose_on_xeon.sh index 8829896387..4d9c194833 100644 --- a/VisualQnA/tests/test_compose_on_xeon.sh +++ b/VisualQnA/tests/test_compose_on_xeon.sh @@ -21,7 +21,7 @@ function build_docker_images() { echo "Build all the images with --no-cache, check docker_image_build.log for details..." docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/VisualQnA/ui/svelte/package.json b/VisualQnA/ui/svelte/package.json index 6444d18c14..e2a39a2c4d 100644 --- a/VisualQnA/ui/svelte/package.json +++ b/VisualQnA/ui/svelte/package.json @@ -15,8 +15,7 @@ "@fortawesome/free-solid-svg-icons": "6.2.0", "@playwright/test": "^1.33.0", "@sveltejs/adapter-auto": "1.0.0-next.75", - "@sveltejs/adapter-static": "^3.0.0", - "@sveltejs/kit": "^2.0.0", + "@sveltejs/kit": "^1.30.4", "@tailwindcss/typography": "0.5.7", "@types/debug": "4.1.7", "@types/node": "^20.12.13", @@ -29,20 +28,21 @@ "eslint": "^8.16.0", "eslint-config-prettier": "^8.3.0", "eslint-plugin-neverthrow": "1.1.4", + "eslint-plugin-svelte3": "^4.0.0", "postcss": "^8.4.31", "postcss-load-config": "^4.0.1", "postcss-preset-env": "^8.3.2", "prettier": "^2.8.8", "prettier-plugin-svelte": "^2.7.0", "prettier-plugin-tailwindcss": "^0.3.0", - "svelte": "^4.0.0", - "svelte-check": "^3.0.0", + "svelte": "^3.59.1", + "svelte-check": "^2.7.1", "svelte-fa": "3.0.3", - "svelte-preprocess": "^6.0.2", + "svelte-preprocess": "^4.10.7", "tailwindcss": "^3.1.5", "tslib": "^2.3.1", - "typescript": "^5.0.0", - "vite": "^5.0.0" + "typescript": "^4.7.4", + "vite": "^4.5.2" }, "type": "module", "dependencies": { From 7f7ad0e256c21483d409c167ef0a5cab329d473e Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Mon, 4 Nov 2024 17:08:15 +0800 Subject: [PATCH 05/12] Inject commit for the release docker image (#1060) Signed-off-by: ZePan110 --- .github/workflows/_example-workflow.yml | 6 ++++++ .github/workflows/manual-example-workflow.yml | 6 ++++++ .github/workflows/manual-image-build.yml | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml index 07e857d61b..cfed39c952 100644 --- 
a/.github/workflows/_example-workflow.yml +++ b/.github/workflows/_example-workflow.yml @@ -40,6 +40,11 @@ on: default: "main" required: false type: string + inject_commit: + default: false + required: false + type: string + jobs: #################################################################################################### # Image Build @@ -83,6 +88,7 @@ jobs: docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml service_list: ${{ inputs.services }} registry: ${OPEA_IMAGE_REPO}opea + inject_commit: ${{ inputs.inject_commit }} tag: ${{ inputs.tag }} #################################################################################################### diff --git a/.github/workflows/manual-example-workflow.yml b/.github/workflows/manual-example-workflow.yml index 03ba728c79..9e31f26d78 100644 --- a/.github/workflows/manual-example-workflow.yml +++ b/.github/workflows/manual-example-workflow.yml @@ -50,6 +50,11 @@ on: description: 'OPEA branch for image build' required: false type: string + inject_commit: + default: true + description: "inject commit to docker images true or false" + required: false + type: string permissions: read-all jobs: @@ -101,4 +106,5 @@ jobs: test_k8s: ${{ fromJSON(inputs.test_k8s) }} test_gmc: ${{ fromJSON(inputs.test_gmc) }} opea_branch: ${{ inputs.opea_branch }} + inject_commit: ${{ inputs.inject_commit }} secrets: inherit diff --git a/.github/workflows/manual-image-build.yml b/.github/workflows/manual-image-build.yml index 8a0b0cf2c5..53ba750ed6 100644 --- a/.github/workflows/manual-image-build.yml +++ b/.github/workflows/manual-image-build.yml @@ -30,6 +30,12 @@ on: description: 'OPEA branch for image build' required: false type: string + inject_commit: + default: true + description: "inject commit to docker images true or false" + required: false + type: string + jobs: get-test-matrix: runs-on: ubuntu-latest @@ -56,4 +62,5 @@ jobs: services: ${{ inputs.services }} tag: ${{ inputs.tag }} opea_branch: ${{ inputs.opea_branch }} + inject_commit: ${{ inputs.inject_commit }} secrets: inherit From 78331ee67883eab96123ac6f3e748744cdd79b0c Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Mon, 4 Nov 2024 17:22:56 +0800 Subject: [PATCH 06/12] Add nightly image build and publish action (#1067) Signed-off-by: chensuyue Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nightly-docker-build-publish.yml | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 .github/workflows/nightly-docker-build-publish.yml diff --git a/.github/workflows/nightly-docker-build-publish.yml b/.github/workflows/nightly-docker-build-publish.yml new file mode 100644 index 0000000000..fd159d6f09 --- /dev/null +++ b/.github/workflows/nightly-docker-build-publish.yml @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Nightly build/publish latest docker images + +on: + schedule: + - cron: "30 1 * * *" + workflow_dispatch: + +env: + examples: "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA" + node: gaudi + tag: latest + publish_tags: latest + +jobs: + get-build-matrix: + runs-on: ubuntu-latest + outputs: + examples: ${{ steps.get-matrix.outputs.examples }} + steps: + - name: Create Matrix + id: get-matrix + run: | + examples=($(echo ${{ env.examples }} | tr ',' ' ')) + examples_json=$(printf '%s\n' 
"${examples[@]}" | sort -u | jq -R '.' | jq -sc '.') + echo "examples=$examples_json" >> $GITHUB_OUTPUT + + build: + needs: get-build-matrix + strategy: + matrix: + example: ${{ fromJSON(needs.get-build-matrix.outputs.examples) }} + fail-fast: false + uses: ./.github/workflows/_example-workflow.yml + with: + node: ${{ env.node }} + example: ${{ matrix.example }} + inject_commit: true + secrets: inherit + + get-image-list: + uses: ./.github/workflows/_get-image-list.yml + with: + examples: ${{ env.examples }} + + publish: + needs: [get-image-list, build] + strategy: + matrix: + image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }} + runs-on: "docker-build-${{ env.node }}" + steps: + - uses: docker/login-action@v3.2.0 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Image Publish + uses: opea-project/validation/actions/image-publish@main + with: + local_image_ref: ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ env.tag }} + image_name: opea/${{ matrix.image }} + publish_tags: ${{ env.publish_tags }} From c2b7bd25d916a3792aae5d6f28a1c4bf1416501e Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Mon, 4 Nov 2024 22:54:19 +0800 Subject: [PATCH 07/12] Use docker stop instead of docker compose stop to avoid container clean up issue (#1068) Signed-off-by: chensuyue Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/_run-docker-compose.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml index fe86a60392..60bf70dcb8 100644 --- a/.github/workflows/_run-docker-compose.yml +++ b/.github/workflows/_run-docker-compose.yml @@ -141,7 +141,11 @@ jobs: flag=${flag#test_} yaml_file=$(find . -type f -wholename "*${{ inputs.hardware }}/${flag}.yaml") echo $yaml_file - docker compose -f $yaml_file stop && docker compose -f $yaml_file rm -f || true + container_list=$(cat $yaml_file | grep container_name | cut -d':' -f2) + for container_name in $container_list; do + cid=$(docker ps -aq --filter "name=$container_name") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + done docker system prune -f docker rmi $(docker images --filter reference="*:5000/*/*" -q) || true From cf86aceb18f638838aa4044d72b0e40c3553679e Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Tue, 5 Nov 2024 09:14:44 +0800 Subject: [PATCH 08/12] Update nightly image build jobs (#1070) Signed-off-by: chensuyue --- .../nightly-docker-build-publish.yml | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/workflows/nightly-docker-build-publish.yml b/.github/workflows/nightly-docker-build-publish.yml index fd159d6f09..275e1862b7 100644 --- a/.github/workflows/nightly-docker-build-publish.yml +++ b/.github/workflows/nightly-docker-build-publish.yml @@ -9,48 +9,54 @@ on: workflow_dispatch: env: - examples: "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA" - node: gaudi - tag: latest - publish_tags: latest + EXAMPLES: "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA" + TAG: "latest" + PUBLISH_TAGS: "latest" jobs: get-build-matrix: runs-on: ubuntu-latest outputs: - examples: ${{ steps.get-matrix.outputs.examples }} + examples_json: ${{ steps.get-matrix.outputs.examples_json }} + EXAMPLES: ${{ steps.get-matrix.outputs.EXAMPLES }} + TAG: ${{ steps.get-matrix.outputs.TAG }} + PUBLISH_TAGS: ${{ steps.get-matrix.outputs.PUBLISH_TAGS }} steps: - name: Create Matrix id: get-matrix run: | - examples=($(echo ${{ env.examples }} | tr ',' ' ')) + examples=($(echo ${EXAMPLES} | tr ',' ' ')) examples_json=$(printf '%s\n' "${examples[@]}" | sort -u | jq -R '.' 
| jq -sc '.') - echo "examples=$examples_json" >> $GITHUB_OUTPUT + echo "examples_json=$examples_json" >> $GITHUB_OUTPUT + echo "EXAMPLES=$EXAMPLES" >> $GITHUB_OUTPUT + echo "TAG=$TAG" >> $GITHUB_OUTPUT + echo "PUBLISH_TAGS=$PUBLISH_TAGS" >> $GITHUB_OUTPUT build: needs: get-build-matrix strategy: matrix: - example: ${{ fromJSON(needs.get-build-matrix.outputs.examples) }} + example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }} fail-fast: false uses: ./.github/workflows/_example-workflow.yml with: - node: ${{ env.node }} + node: gaudi example: ${{ matrix.example }} inject_commit: true secrets: inherit get-image-list: + needs: get-build-matrix uses: ./.github/workflows/_get-image-list.yml with: - examples: ${{ env.examples }} + examples: ${{ needs.get-build-matrix.outputs.EXAMPLES }} publish: - needs: [get-image-list, build] + needs: [get-build-matrix, get-image-list, build] strategy: matrix: image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }} - runs-on: "docker-build-${{ env.node }}" + runs-on: "docker-build-gaudi" steps: - uses: docker/login-action@v3.2.0 with: @@ -60,6 +66,6 @@ jobs: - name: Image Publish uses: opea-project/validation/actions/image-publish@main with: - local_image_ref: ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ env.tag }} + local_image_ref: ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ needs.get-build-matrix.outputs.TAG }} image_name: opea/${{ matrix.image }} - publish_tags: ${{ env.publish_tags }} + publish_tags: ${{ needs.get-build-matrix.outputs.PUBLISH_TAGS }} From a0921f127f776f3b5131c45889e11f8e96dd3f13 Mon Sep 17 00:00:00 2001 From: xiguiw <111278656+xiguiw@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:35:12 +0800 Subject: [PATCH 09/12] [Doc] Fix broken build instruction (#1063) Signed-off-by: Wang, Xigui --- ChatQnA/docker_compose/intel/hpu/gaudi/README.md | 4 ++-- ChatQnA/docker_compose/nvidia/gpu/README.md | 2 +- DocSum/docker_compose/intel/hpu/gaudi/README.md | 2 +- SearchQnA/docker_compose/intel/hpu/gaudi/README.md | 4 ++-- Translation/docker_compose/intel/hpu/gaudi/README.md | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index 43aa720f02..d3237e2fee 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -103,7 +103,7 @@ docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy ```bash git clone https://github.com/opea-project/GenAIExamples.git - cd GenAIExamples/ChatQnA/docker + cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` @@ -123,7 +123,7 @@ docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy ```bash git clone https://github.com/opea-project/GenAIExamples.git - cd GenAIExamples/ChatQnA/docker + cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank . 
``` diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md index 5cd8d3ef08..cc8cb7193c 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/README.md +++ b/ChatQnA/docker_compose/nvidia/gpu/README.md @@ -95,7 +95,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples.git -cd GenAIExamples/ChatQnA/docker +cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . cd ../../.. ``` diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md index 3480750db7..cf655dd088 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/README.md +++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md @@ -28,7 +28,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/docker +cd GenAIExamples/DocSum docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md index 7870aa629f..8777e77863 100644 --- a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md @@ -38,14 +38,14 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples.git -cd GenAIExamples/SearchQnA/docker +cd GenAIExamples/SearchQnA docker build --no-cache -t opea/searchqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` Then you need to build the last Docker image `opea/searchqna:latest`, which represents the Mega service through following commands: ```bash -cd GenAIExamples/SearchQnA/docker +cd GenAIExamples/SearchQnA docker build --no-cache -t opea/searchqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` diff --git a/Translation/docker_compose/intel/hpu/gaudi/README.md b/Translation/docker_compose/intel/hpu/gaudi/README.md index a9e807a127..23d7acf12f 100644 --- a/Translation/docker_compose/intel/hpu/gaudi/README.md +++ b/Translation/docker_compose/intel/hpu/gaudi/README.md @@ -35,7 +35,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/Translation/docker +cd GenAIExamples/Translation docker build -t opea/translation:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . 
``` From 2d9aeb37156087bfb724870d3f193a7e8a5c0bfa Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Tue, 5 Nov 2024 17:01:40 +0800 Subject: [PATCH 10/12] fix wrong format which break online doc build (#1073) Co-authored-by: ZhangJianyu --- CodeGen/benchmark/performance/README.md | 4 ++-- CodeTrans/benchmark/performance/README.md | 4 ++-- FaqGen/benchmark/performance/README.md | 4 ++-- VisualQnA/benchmark/performance/README.md | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CodeGen/benchmark/performance/README.md b/CodeGen/benchmark/performance/README.md index 04bd383142..a9d1e9d5f6 100644 --- a/CodeGen/benchmark/performance/README.md +++ b/CodeGen/benchmark/performance/README.md @@ -55,7 +55,7 @@ sudo systemctl daemon-reload; sudo systemctl restart containerd Please deploy CodeGen service before benchmarking. -##### Run Benchmark Test +#### Run Benchmark Test Before the benchmark, we can configure the number of test queries and test output directory by: @@ -72,6 +72,6 @@ bash benchmark.sh -n The argument `-n` refers to the number of test nodes. -##### 4. Data collection +#### Data collection All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/CodeTrans/benchmark/performance/README.md b/CodeTrans/benchmark/performance/README.md index 5e447d1eee..4b519de980 100644 --- a/CodeTrans/benchmark/performance/README.md +++ b/CodeTrans/benchmark/performance/README.md @@ -55,7 +55,7 @@ sudo systemctl daemon-reload; sudo systemctl restart containerd Please deploy CodeTrans service before benchmarking. -##### Run Benchmark Test +#### Run Benchmark Test Before the benchmark, we can configure the number of test queries and test output directory by: @@ -72,6 +72,6 @@ bash benchmark.sh -n The argument `-n` refers to the number of test nodes. -##### 4. Data collection +#### Data collection All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/FaqGen/benchmark/performance/README.md b/FaqGen/benchmark/performance/README.md index 3cd38ae6ab..0587a85a1e 100644 --- a/FaqGen/benchmark/performance/README.md +++ b/FaqGen/benchmark/performance/README.md @@ -55,7 +55,7 @@ sudo systemctl daemon-reload; sudo systemctl restart containerd Please deploy FaqGen service before benchmarking. -##### Run Benchmark Test +#### Run Benchmark Test Before the benchmark, we can configure the number of test queries and test output directory by: @@ -72,6 +72,6 @@ bash benchmark.sh -n The argument `-n` refers to the number of test nodes. -##### 4. Data collection +#### Data collection All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/VisualQnA/benchmark/performance/README.md b/VisualQnA/benchmark/performance/README.md index 41e062422d..45e76558ca 100644 --- a/VisualQnA/benchmark/performance/README.md +++ b/VisualQnA/benchmark/performance/README.md @@ -55,7 +55,7 @@ sudo systemctl daemon-reload; sudo systemctl restart containerd Please deploy VisualQnA service before benchmarking. -##### Run Benchmark Test +#### Run Benchmark Test Before the benchmark, we can configure the number of test queries and test output directory by: @@ -72,6 +72,6 @@ bash benchmark.sh -n The argument `-n` refers to the number of test nodes. -##### 4. 
Data collection +#### Data collection All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. From 944ae4794812aca806c3dad4c44392e12a167361 Mon Sep 17 00:00:00 2001 From: "Wang, Kai Lawrence" <109344418+wangkl2@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:22:21 +0800 Subject: [PATCH 11/12] [ChatQnA] Fix the service connection issue on GPU and modify the emb backend (#1059) Signed-off-by: Wang, Kai Lawrence --- ChatQnA/docker_compose/nvidia/gpu/README.md | 6 ++-- .../docker_compose/nvidia/gpu/compose.yaml | 31 +++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md index cc8cb7193c..31ab0549b8 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/README.md +++ b/ChatQnA/docker_compose/nvidia/gpu/README.md @@ -97,7 +97,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op git clone https://github.com/opea-project/GenAIExamples.git cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -cd ../../.. +cd ../.. ``` ### 5. Build UI Docker Image @@ -107,7 +107,7 @@ Construct the frontend Docker image using the command below: ```bash cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . -cd ../../../.. +cd ../../../ ``` ### 6. Build React UI Docker Image (Optional) @@ -117,7 +117,7 @@ Construct the frontend Docker image using the command below: ```bash cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -cd ../../../.. +cd ../../.. ``` ### 7. 
Build Nginx Docker Image diff --git a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml index c35866b101..ba504c2eb3 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml +++ b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml @@ -20,10 +20,10 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - REDIS_HOST: ${REDIS_HOST} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 @@ -39,13 +39,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -58,12 +51,13 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db INDEX_NAME: ${INDEX_NAME} TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + image: ghcr.io/huggingface/text-embeddings-inference:1.5 container_name: tei-reranking-server ports: - "8808:80" @@ -123,11 +117,14 @@ services: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} - - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - MEGA_SERVICE_HOST_IP=chaqna-backend-server + - EMBEDDING_SERVER_HOST_IP=tei-embedding-service + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} + - RETRIEVER_SERVICE_HOST_IP=retriever + - RERANK_SERVER_HOST_IP=tei-reranking-service + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} + - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} ipc: host restart: always chaqna-ui-server: From 2de7c0ba89f48c646abeadce45c89bb225a07fe0 Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Thu, 7 Nov 2024 09:38:19 +0800 Subject: [PATCH 12/12] Enhance CI hardware list detect (#1077) Signed-off-by: chensuyue --- .github/workflows/nightly-docker-build-publish.yml | 2 +- .github/workflows/scripts/get_test_matrix.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-docker-build-publish.yml b/.github/workflows/nightly-docker-build-publish.yml index 275e1862b7..544c69924a 100644 --- a/.github/workflows/nightly-docker-build-publish.yml +++ b/.github/workflows/nightly-docker-build-publish.yml @@ -5,7 +5,7 @@ name: Nightly build/publish latest docker images on: schedule: - - cron: "30 1 * * *" + - cron: "30 13 * * *" # UTC time workflow_dispatch: env: diff --git a/.github/workflows/scripts/get_test_matrix.sh b/.github/workflows/scripts/get_test_matrix.sh index ac373f350a..a024617027 100644 --- a/.github/workflows/scripts/get_test_matrix.sh +++ b/.github/workflows/scripts/get_test_matrix.sh @@ -9,12 +9,15 @@ set -e changed_files=$changed_files test_mode=$test_mode 
run_matrix="{\"include\":[" -hardware_list="xeon gaudi" # current support hardware list examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u) for example in ${examples}; do cd $WORKSPACE/$example if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi + cd tests + ls -l + hardware_list=$(find . -type f -name "test_compose*_on_*.sh" | cut -d/ -f2 | cut -d. -f1 | awk -F'_on_' '{print $2}'| sort -u) + echo "Test supported hardware list = ${hardware_list}" run_hardware="" if [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | cut -d'/' -f2 | grep -E '*.py|Dockerfile*|ui|docker_image_build' ) ]]; then