Merge branch 'main' into vectorstore/milvus
letonghan authored Aug 12, 2024
2 parents 9bd251f + 5262d05 commit 9672613
Showing 12 changed files with 250 additions and 18 deletions.
64 changes: 63 additions & 1 deletion .github/workflows/pr-dockerfile-path-scan.yaml
@@ -14,7 +14,68 @@ concurrency:
cancel-in-progress: true

jobs:
file-change-detection:
Dockerfile-path-change-detection-in-GenAIComps:
runs-on: ubuntu-latest
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*

- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Check for changed Dockerfile paths in yaml
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
used_files=""
merged_commit=$(git log -1 --format='%H')
changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
if [ -n "$changed_files" ]; then
for file in $changed_files; do
if grep -q "$file" .github/workflows/docker/compose/*.yaml; then
is_use="TRUE"
used_files+="$file "
fi
done
fi
if [[ "$is_use" == "TRUE" ]]; then
echo "Warning: Changed Dockerfile paths:"
echo "$used_files"
echo "Please modify the corresponding yaml in GenAIComps/.github/workflows/docker/compose and ask [email protected] for final confirmation."
exit 1
fi
- name: Check for changed Dockerfile paths in readme
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
used_files=""
merged_commit=$(git log -1 --format='%H')
changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
if [ -n "$changed_files" ]; then
for file in $changed_files; do
if grep -q "$file" ./**/*.md; then
is_use="TRUE"
used_files+="$file "
fi
done
fi
if [[ "$is_use" == "TRUE" ]]; then
echo "Warning: Changed Dockerfile paths:"
echo "$used_files"
echo "Please modify the corresponding README in GenAIComps and ask [email protected] for final confirmation."
exit 1
fi
Dockerfile-path-change-detection-in-GenAIExamples:
runs-on: ubuntu-latest
steps:
- name: Clean Up Working Directory
@@ -32,6 +93,7 @@ jobs:
- name: Check for changed Dockerfile paths
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
8 changes: 6 additions & 2 deletions comps/agent/langchain/docker/Dockerfile
@@ -4,6 +4,7 @@
FROM python:3.11-slim

ENV LANG=C.UTF-8
ARG ARCH=cpu

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
Expand All @@ -19,8 +20,11 @@ USER user
COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/agent/langchain/requirements.txt
if [ ${ARCH} = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/agent/langchain/requirements.txt; \
else \
pip install --no-cache-dir -r /home/user/comps/agent/langchain/requirements.txt; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

7 changes: 6 additions & 1 deletion comps/asr/Dockerfile
@@ -9,11 +9,16 @@ RUN useradd -m -s /bin/bash user && \
USER user

ENV LANG=C.UTF-8
ARG ARCH=cpu

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

6 changes: 6 additions & 0 deletions comps/asr/whisper/Dockerfile
@@ -10,6 +10,7 @@ RUN useradd -m -s /bin/bash user && \

# Set environment variables
ENV LANG=en_US.UTF-8
ARG ARCH=cpu

# Install system dependencies
RUN apt-get update \
@@ -21,6 +22,11 @@ USER user

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt ; \
fi
pip list

ENV PYTHONPATH=$PYTHONPATH:/home/user
1 change: 1 addition & 0 deletions comps/asr/whisper/Dockerfile_hpu
@@ -11,6 +11,7 @@ RUN useradd -m -s /bin/bash user && \
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana
ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH

# Install system dependencies
RUN apt-get update \
25 changes: 17 additions & 8 deletions comps/llms/text-generation/vllm-openvino/README.md
@@ -1,5 +1,10 @@
# Use vLLM with OpenVINO

vLLM powered by OpenVINO supports all LLM models from the [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on any x86-64 CPU with at least AVX2 support. The OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
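
As a hedged illustration only (the model name, port, and exact invocation below are assumptions for this sketch, not part of this commit), chunked prefill can be switched on when launching the OpenAI-compatible server:

```bash
# Illustrative sketch: start the OpenAI-compatible vLLM server with chunked
# prefill enabled; prefix caching can be enabled with --enable-prefix-caching.
# The model name and port are placeholders.
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-hf \
    --host 0.0.0.0 \
    --port 8000 \
    --enable-chunked-prefill \
    --max-num-batched-tokens 256
```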

## Build Docker Image

To build the Docker image, run the following command:
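
A sketch of this command, assuming it matches the build script added elsewhere in this commit (image tag and Dockerfile name are taken from that script, not from this README section):

```bash
# Sketch under the assumption above; not copied from this README section.
git clone https://github.com/vllm-project/vllm.git vllm
cd ./vllm
docker build -t vllm:openvino -f Dockerfile.openvino . \
    --build-arg https_proxy=$https_proxy \
    --build-arg http_proxy=$http_proxy
```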
@@ -59,15 +64,19 @@ export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8000"
export LLM_MODEL=<model_name> # example: export LLM_MODEL="meta-llama/Llama-2-7b-hf"
```

## Use Int-8 Weights Compression
## Performance tips

The vLLM OpenVINO backend uses the following environment variables to control its behavior:

- `VLLM_OPENVINO_KVCACHE_SPACE` specifies the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache); a larger value lets vLLM run more requests in parallel. Set this parameter according to your hardware configuration and memory management pattern.

- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` controls the KV cache precision. By default, FP16 / BF16 is used, depending on the platform.

Weights int-8 compression is disabled by default. For better performance and lower memory consumption, weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` command in the examples above.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` enables U8 weights compression during the model loading stage. By default, compression is turned off.

The variable enables the weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
Hence, even when the variable is enabled, compression is applied only to models above a certain size; very small models are not compressed because of the significant accuracy drop this would cause.
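
For illustration (the values and container arguments below are assumptions drawn from the descriptions above, not recommendations from this commit), the variables are passed to the container with `-e`:

```bash
# Illustrative only: pass the OpenVINO backend tuning variables to docker run.
# Values are examples from the descriptions above; the model name is a placeholder.
docker run --rm --name="vllm-openvino-server" -p 8000:8000 \
    -e VLLM_OPENVINO_KVCACHE_SPACE=40 \
    -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
    vllm:openvino --model meta-llama/Llama-2-7b-hf --port 8000
```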
To improve TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).

## Use UInt-8 KV cache Compression
The OpenVINO best-known configuration is:

KV cache uint-8 compression is disabled by default. For better performance and lower memory consumption, KV cache compression can be enabled by setting the environment variable `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`.
To pass the variable in Docker, add `-e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` as an additional argument to the `docker run` command in the examples above.
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
@@ -3,7 +3,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


git clone --branch openvino-model-executor https://github.com/ilya-lavrenov/vllm.git
BASEDIR="$( cd "$( dirname "$0" )" && pwd )"
git clone https://github.com/vllm-project/vllm.git vllm
cd ./vllm/
docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd $BASEDIR && rm -rf vllm
19 changes: 17 additions & 2 deletions comps/llms/text-generation/vllm-openvino/launch_model_server.sh
@@ -42,5 +42,20 @@ port_number=${port:-$default_port}
# Set the Huggingface cache directory variable
HF_CACHE_DIR=$HOME/.cache/huggingface

# Start the model server using Openvino as the backend inference engine. Provide the container name that is unique and meaningful, typically one that includes the model name.
docker run --rm --name="vllm-openvino-server" -p $port_number:$port_number -v $HF_CACHE_DIR:/root/.cache/huggingface vllm:openvino --model $model_name --port $port_number --disable-log-requests --swap-space $swap_space
# Start the model server using OpenVINO as the backend inference engine.
# Provide a container name that is unique and meaningful, typically one that includes the model name.

docker run -d --rm --name="vllm-openvino-server" \
-p $port_number:80 \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$http_proxy \
-e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
-v $HOME/.cache/huggingface:/root/.cache/huggingface \
vllm:openvino /bin/bash -c "\
cd / && \
export VLLM_CPU_KVCACHE_SPACE=50 && \
python3 -m vllm.entrypoints.openai.api_server \
--model \"$model_name\" \
--host 0.0.0.0 \
--port 80"
7 changes: 6 additions & 1 deletion comps/tts/Dockerfile
@@ -7,11 +7,16 @@ RUN useradd -m -s /bin/bash user && \
chown -R user /home/user/
USER user
ENV LANG=C.UTF-8
ARG ARCH=cpu

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

7 changes: 6 additions & 1 deletion comps/tts/speecht5/Dockerfile
@@ -9,6 +9,7 @@ RUN useradd -m -s /bin/bash user && \
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user
ARG ARCH=cpu

# Install system dependencies
RUN apt-get update \
@@ -20,7 +21,11 @@ COPY --chown=user:user comps /home/user/comps
USER user

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

1 change: 1 addition & 0 deletions comps/tts/speecht5/Dockerfile_hpu
@@ -11,6 +11,7 @@ RUN rm -rf /etc/ssh/ssh_host*
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana
ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH

# Install system dependencies
RUN apt-get update \
118 changes: 118 additions & 0 deletions tests/test_llms_text-generation_vllm-openvino.sh
@@ -0,0 +1,118 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe

WORKPATH="$( cd "$( dirname "$0" )" && pwd )"

# Define variables
port=8123
HF_CACHE_DIR=$HOME/.cache/huggingface
DOCKER_IMAGE="vllm:openvino"
CONTAINER_NAME="vllm-openvino-container"

function build_container() {
cd $WORKPATH
git clone https://github.com/vllm-project/vllm.git vllm-openvino
cd ./vllm-openvino/
docker build -t $DOCKER_IMAGE \
-f Dockerfile.openvino \
. \
--build-arg https_proxy=$https_proxy \
--build-arg http_proxy=$http_proxy
cd $WORKPATH
rm -rf vllm-openvino
}

# Function to start Docker container
start_container() {

docker run -d --rm --name=$CONTAINER_NAME \
-p $port:$port \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$http_proxy \
-v $HF_CACHE_DIR:/root/.cache/huggingface \
vllm:openvino /bin/bash -c "\
cd / && \
export VLLM_CPU_KVCACHE_SPACE=50 && \
python3 -m vllm.entrypoints.openai.api_server \
--model \"Intel/neural-chat-7b-v3-3\" \
--host 0.0.0.0 \
--port $port"

# check whether service is fully ready
n=0
until [[ "$n" -ge 300 ]]; do
docker logs $CONTAINER_NAME > /tmp/$CONTAINER_NAME.log 2>&1
n=$((n+1))
if grep -q "Uvicorn running on" /tmp/$CONTAINER_NAME.log; then
break
fi
sleep 3s
done

}

# Cleanup Function
cleanup() {
# Stop and remove Docker container and images
cid=$(docker ps -aq --filter "name=$CONTAINER_NAME")
if [[ ! -z "$cid" ]]; then docker stop $cid || docker rm $cid && sleep 1s; fi
docker rmi -f $DOCKER_IMAGE
rm /tmp/$CONTAINER_NAME.log
}

# Function to test API endpoint
function test_api_endpoint {
local endpoint="$1"
local expected_status="$2"

# Make the HTTP request
if test "$1" = "v1/completions"
then
local response=$(curl "http://localhost:$port/$endpoint" \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is the key advantage of Openvino framework",
"max_tokens": 300,
"temperature": 0.7
}' \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
else
local response=$(curl "http://localhost:$port/$endpoint" \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
fi

# Assert the response status code
if [[ "$response" -eq "$expected_status" ]]; then
echo "PASS: $endpoint returned expected status code: $expected_status"
else
echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)"
fi
}
# Main function
main() {

build_container
start_container

# Sleep to allow the container to start up fully
sleep 10
# Test the /v1/models API
test_api_endpoint "v1/models" 200

# Test the /v1/completions API
test_api_endpoint "v1/completions" 200

cleanup
}

# Call main function
main
