Commit
Merge branch 'main' into multiple-model-with-remote-service
sgurunat authored Nov 13, 2024
2 parents 53500d1 + 7adbba6 commit d7414b9
Showing 80 changed files with 924 additions and 85 deletions.
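Most of the commit is a mechanical version bump of the tgi-gaudi image from 2.0.5 to 2.0.6, alongside a new vLLM profiling section in the ChatQnA Xeon README. As a hedged illustration (not part of the commit), such a bump could be reproduced locally with GNU sed, assuming a full checkout of the repository:

```bash
# Rewrite every pinned tgi-gaudi 2.0.5 reference to 2.0.6 across the tree.
grep -rl 'ghcr.io/huggingface/tgi-gaudi:2.0.5' . \
  | xargs sed -i 's|ghcr.io/huggingface/tgi-gaudi:2.0.5|ghcr.io/huggingface/tgi-gaudi:2.0.6|g'
```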
2 changes: 1 addition & 1 deletion AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml
@@ -3,7 +3,7 @@

services:
tgi-server:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-server
ports:
- "8085:80"
2 changes: 1 addition & 1 deletion AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -51,7 +51,7 @@ services:
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
- "3006:80"
2 changes: 1 addition & 1 deletion AudioQnA/kubernetes/intel/README_gmc.md
@@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment
Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services.
For Gaudi:

- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5
- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6
- whisper-gaudi: opea/whisper-gaudi:latest
- speecht5-gaudi: opea/speecht5-gaudi:latest

2 changes: 1 addition & 1 deletion AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml
@@ -271,7 +271,7 @@ spec:
- envFrom:
- configMapRef:
name: audio-qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
name: llm-dependency-deploy-demo
securityContext:
capabilities:
2 changes: 1 addition & 1 deletion AudioQnA/tests/test_compose_on_gaudi.sh
@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="audioqna whisper-gaudi asr llm-tgi speecht5-gaudi tts"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}

2 changes: 1 addition & 1 deletion AudioQnA/tests/test_compose_on_xeon.sh
@@ -22,7 +22,7 @@ function build_docker_images() {
service_list="audioqna whisper asr llm-tgi speecht5 tts"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
docker images && sleep 1s
}

2 changes: 1 addition & 1 deletion AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -54,7 +54,7 @@ services:
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
container_name: tgi-gaudi-server
ports:
- "3006:80"
2 changes: 1 addition & 1 deletion AvatarChatbot/tests/test_compose_on_gaudi.sh
@@ -29,7 +29,7 @@ function build_docker_images() {
service_list="avatarchatbot whisper-gaudi asr llm-tgi speecht5-gaudi tts wav2lip-gaudi animation"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6

docker images && sleep 1s
}
2 changes: 1 addition & 1 deletion AvatarChatbot/tests/test_compose_on_xeon.sh
@@ -29,7 +29,7 @@ function build_docker_images() {
service_list="avatarchatbot whisper asr llm-tgi speecht5 tts wav2lip animation"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6

docker images && sleep 1s
}
2 changes: 1 addition & 1 deletion ChatQnA/benchmark/accuracy/README.md
@@ -48,7 +48,7 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
# for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
```

### Prepare Dataset
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -237,7 +237,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
@@ -255,7 +255,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:
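For a cluster already running one of the manifests above, the same image bump could also be rolled out in place rather than reapplying the full manifest. A minimal sketch, assuming kubectl access and that the Deployment carrying the llm-dependency-deploy container is itself named llm-dependency-deploy (the Deployment name is an assumption; the container name comes from the manifests above):

```bash
# Hypothetical in-place roll-forward of the TGI image; adjust names to your cluster.
kubectl set image deployment/llm-dependency-deploy \
  llm-dependency-deploy=ghcr.io/huggingface/tgi-gaudi:2.0.6
```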
2 changes: 1 addition & 1 deletion ChatQnA/chatqna.yaml
@@ -38,7 +38,7 @@ opea_micro_services:
tgi-service:
host: ${TGI_SERVICE_IP}
ports: ${TGI_SERVICE_PORT}
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
image: ghcr.io/huggingface/tgi-gaudi:2.0.6
volumes:
- "./data:/data"
runtime: habana
51 changes: 51 additions & 0 deletions ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -432,6 +432,57 @@ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
-H "Content-Type: application/json"
```

### Profile Microservices

To further analyze microservice performance, users can follow the instructions below to profile the microservices.

#### 1. vLLM Backend Service

Users can follow the previous section to test the vLLM microservice or the ChatQnA MegaService.
By default, vLLM profiling is not enabled; users can start and stop it with the following commands.

##### Start vLLM profiling
```bash
curl http://${host_ip}:9009/start_profile \
-H "Content-Type: application/json" \
-d '{"model": "Intel/neural-chat-7b-v3-3"}'
```
If profiling starts correctly, logs like the following appear in the vllm-service container:
```bash
INFO api_server.py:361] Starting profiler...
INFO api_server.py:363] Profiler started.
INFO: x.x.x.x:35940 - "POST /start_profile HTTP/1.1" 200 OK
```
After vLLM profiling is started, users can ask questions and get responses from the vLLM microservice
or the ChatQnA MegaService.
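For example, a completion request can be sent while the profiler is running. A minimal sketch, assuming the vLLM service exposes the standard OpenAI-compatible /v1/completions route on the same port used above (the prompt and max_tokens values are illustrative):

```bash
# Illustrative request to generate load while the profiler is active.
curl http://${host_ip}:9009/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is deep learning?", "max_tokens": 32}'
```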
##### Stop vLLM profiling
The following command stops vLLM profiling and generates a *.pt.trace.json.gz file as the profiling result
under the /mnt folder inside the vllm-service container.
```bash
# vLLM Service
curl http://${host_ip}:9009/stop_profile \
-H "Content-Type: application/json" \
-d '{"model": "Intel/neural-chat-7b-v3-3"}'
```
If profiling stops correctly, logs like the following appear in the vllm-service container:
```bash
INFO api_server.py:368] Stopping profiler...
INFO api_server.py:370] Profiler stopped.
INFO: x.x.x.x:41614 - "POST /stop_profile HTTP/1.1" 200 OK
```
After profiling is stopped, use the command below to copy the *.pt.trace.json.gz file out of the /mnt folder:
```bash
docker cp vllm-service:/mnt/ .
```
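The trace lands in a local mnt/ directory; a quick sanity check that the file arrived (the exact file name is generated by the profiler):

```bash
# List the copied trace files; names are assigned by the profiler.
ls -lh mnt/*.pt.trace.json.gz
```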
##### Check profiling result

Open a web browser and go to chrome://tracing or https://ui.perfetto.dev, then load the json.gz file;
the vLLM profiling result should look like the diagram below.
![image](https://github.com/user-attachments/assets/55c7097e-5574-41dc-97a7-5e87c31bc286)
## 🚀 Launch the UI
### Launch with origin port