✨ Helm Chart for OpenVINO vLLM #403
base: main
@@ -9,37 +9,91 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following services:
- [redis-vector-db](../common/redis-vector-db/README.md)
- [reranking-usvc](../common/reranking-usvc/README.md)
- [teirerank](../common/teirerank/README.md)
- [llm-uservice](../common/llm-uservice/README.md)
- [tgi](../common/tgi/README.md)

For LLM inference, two more microservices are required. Either [TGI](https://github.com/huggingface/text-generation-inference) or [vLLM](https://github.com/vllm-project/vllm) can be used as the LLM backend; depending on that choice, the following microservices become part of the ChatQnA dependencies.

1. To use **TGI** as the inference service, the following two microservices are required:

   - [llm-uservice](../common/llm-uservice/README.md)
   - [tgi](../common/tgi/README.md)

2. To use **vLLM** as the inference service, the following two microservices are required:

   - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md)
Comment on lines +13 to +22:

- Review comment: Ditto, why add wrappers?
- Review comment: This PR is from the 1.0 release time, so with some old code.
- Review comment: Sounds good, but note that I'm testing my PR only with the vLLM Gaudi version, i.e. currently both CPU and GPU/OpenVINO support need to be added / tested after it. That PR also has quite a few TODO comments about vLLM options where some feedback would be needed / appreciated.
   - [vllm](../common/vllm/README.md)

> **_NOTE :_** Only one LLM inference engine should be deployed at a time; set up either TGI or vLLM, not both. To achieve this, conditional flags are added to the chart dependencies: switching the flag of one service off and the other on gives a proper setup of all ChatQnA dependencies (see the sketch below).

Review comment: Why could there not be multiple inferencing engines? ChatQnA has 4 inferencing subservices for which it is already using 2 inferencing engines, TEI and TGI, and I do not see why it could not use e.g. TEI for embed + rerank, TGI for guardrails, and vLLM for the LLM. Please rephrase.
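The toggle uses Helm's standard `condition` field on chart dependencies. A minimal sketch of what the relevant entries in the ChatQnA `Chart.yaml` could look like (the names, versions and paths here are illustrative assumptions, not the chart's exact contents):

```yaml
# Illustrative sketch only: dependency versions and paths are assumptions.
# Each inference engine is pulled in only when its boolean flag is set.
dependencies:
  - name: tgi
    version: 1.0.0
    repository: file://../common/tgi
    condition: tgi.enabled # rendered only when tgi.enabled=true
  - name: vllm
    version: 1.0.0
    repository: file://../common/vllm
    condition: vllm.enabled # rendered only when vllm.enabled=true
```

At install time, `--set tgi.enabled=false --set vllm.enabled=true` (as in the vLLM commands below) then selects the engine.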
## Installing the Chart

Please follow these steps to install the ChatQnA chart:

1. Clone the GenAIInfra repository:

```bash
git clone https://github.com/opea-project/GenAIInfra.git
```
2. Set up the dependencies and the required environment variables:

```bash
cd GenAIInfra/helm-charts/
./update_dependency.sh
helm dependency update chatqna
export HFTOKEN="insert-your-huggingface-token-here"
export MODELDIR="/mnt/opea-models"
export MODELNAME="Intel/neural-chat-7b-v3-3"
```
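Optionally (this check is not part of the original steps), you can confirm that the subcharts were resolved before installing:

```bash
# Lists the subcharts declared by the chatqna chart and their status.
helm dependency list chatqna
```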
3. Depending on the device targeted for running ChatQnA, use one of the following installation commands:

```bash
# Install the chart on a Xeon machine

# If you would like to use the traditional UI, change the image as well as the containerPort within the values:
# append this at the end of the command: "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"

helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
```
```bash
# To use Gaudi device
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
```

Review comment: Now that there's support for both TGI and vLLM, all these comments here could state which one is used, e.g. like this:

Suggested change
```bash
# To use Nvidia GPU
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
```

```bash
# To include guardrail component in chatqna on Xeon
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
```

```bash
# To include guardrail component in chatqna on Gaudi
helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
```
> **_NOTE :_** The default installation uses [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as the inference engine. To use vLLM as the inference engine, see below.

```bash
# To use the vLLM inference engine on a Xeon device

helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true

# To use the OpenVINO-optimized vLLM inference engine on a Xeon device

helm install -f ./chatqna/vllm-openvino-values.yaml chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true
```
### IMPORTANT NOTE

1. Make sure your `MODELDIR` exists on the node where your workload is scheduled so that the downloaded model can be cached for the next use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model.

2. If you are behind a proxy, set the `http_proxy`, `https_proxy` and `no_proxy` values while installing the chart, for example as sketched below.
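A hedged example of passing proxy settings at install time; the value names below (`global.http_proxy` and friends) are assumptions, so check `chatqna/values.yaml` for the exact keys the chart exposes:

```bash
# Assumed global proxy values; adjust to match the chart's values.yaml.
helm install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --set global.modelUseHostPath=${MODELDIR} \
  --set global.http_proxy=${http_proxy} \
  --set global.https_proxy=${https_proxy} \
  --set global.no_proxy=${no_proxy}
```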
Comment on lines +95 to +96:

Review comment: IMHO duplicating general information to application READMEs is not maintainable, there are too many of them. Instead you could include a link to the general options (…).
## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
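As an optional sketch (not part of the chart README), you can wait for readiness before testing; first startup can take a while because models are downloaded:

```bash
# Block until every pod in the current namespace reports Ready.
kubectl get pods
kubectl wait --for=condition=Ready pod --all --timeout=15m
```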
@@ -52,8 +106,9 @@ Run the command `kubectl port-forward svc/chatqna 8888:8888` to expose the service
Open another terminal and run the following command to verify that the service is working:

```bash
curl http://localhost:8888/v1/chatqna \
    -X POST \
    -H "Content-Type: application/json" \
    -d '{"messages": "What is the revenue of Nike in 2023?"}'
```

Review comment: Why add redundant …?
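If the gateway does not respond, one hedged way to narrow things down is to query the vLLM backend directly; the service name and port below are assumptions, so check `kubectl get svc` for the actual values in your deployment:

```bash
# Forward the (assumed) vLLM service locally and list the served model.
kubectl port-forward svc/chatqna-vllm 2080:80 &
# vLLM's OpenAI-compatible server exposes the model list here:
curl http://localhost:2080/v1/models
```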
@@ -71,12 +126,13 @@ Open a browser to access `http://<k8s-node-ip-address>:${port}` to play with the
## Values

| Key                        | Type   | Default                       | Description                                                                             |
| -------------------------- | ------ | ----------------------------- | --------------------------------------------------------------------------------------- |
| image.repository           | string | `"opea/chatqna"`              |                                                                                         |
| service.port               | string | `"8888"`                      |                                                                                         |
| tgi.LLM_MODEL_ID           | string | `"Intel/neural-chat-7b-v3-3"` | Model id from https://huggingface.co/, or a predownloaded model directory              |
| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Model id from https://huggingface.co/, or a predownloaded model directory              |
| global.monitoring          | bool   | false                         | Enable usage metrics for the service components. See ../monitoring.md before enabling! |
## Troubleshooting

Review comment: As this is identical to the values file, it should be a symlink, not a copy of it.
@@ -0,0 +1,25 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  enabled: false

vllm:
  enabled: true
  openvino_enabled: true
  image:
    repository: opea/vllm-openvino
    pullPolicy: IfNotPresent
    # Overrides the image tag whose default is the chart appVersion.
    tag: "latest"

  extraCmdArgs: []

  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

  CUDA_GRAPHS: "0"
  VLLM_CPU_KVCACHE_SPACE: 50
  VLLM_OPENVINO_KVCACHE_SPACE: 32
  OMPI_MCA_btl_vader_single_copy_mechanism: none

  ov_command: ["/bin/bash"]
@@ -0,0 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
  enabled: false

vllm:
  enabled: true
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
  openvino_enabled: true

Review comment (on `openvino_enabled`): Does not conform to Helm best practices (https://helm.sh/docs/chart_best_practices/values/); should be either …

  image:
    repository: opea/vllm-openvino
    pullPolicy: IfNotPresent

Review comment (on `pullPolicy`): Drop the value, it breaks CI testing for …

    # Overrides the image tag whose default is the chart appVersion.
    tag: "latest"

  extraCmdArgs: []

  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

  CUDA_GRAPHS: "0"
  VLLM_CPU_KVCACHE_SPACE: 50
  VLLM_OPENVINO_KVCACHE_SPACE: 32
  OMPI_MCA_btl_vader_single_copy_mechanism: none

  ov_command: ["/bin/bash"]
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v2
name: llm-ctrl-uservice
description: A Helm chart for the LLM controller microservice, which connects to the vLLM microservice to provide inference.
type: application
version: 1.0.0
appVersion: "v1.0"
dependencies:
  - name: vllm
    version: 1.0.0
    repository: file://../vllm
    condition: vllm.enabled
Review comment: Why are you adding wrappers? They were removed over a month ago for v1.1 (#474), are unnecessary, and the LLM wrapper uses a langserve component with a problematic license (opea-project/GenAIComps#264).
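For completeness, a minimal sketch (paths assumed from the README links above) of how this subchart dependency would be resolved when working on the wrapper chart by itself:

```bash
# The chart above declares vllm as a local file:// dependency; pull it in
# and run a basic sanity check before packaging or installing.
cd GenAIInfra/helm-charts/common/llm-ctrl-uservice
helm dependency update .
helm lint .
```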