Merge branch 'main' into vectorstore/milvus
letonghan authored Aug 12, 2024
2 parents 9bd251f + 5262d05 commit 9672613
Showing 12 changed files with 250 additions and 18 deletions.
64 changes: 63 additions & 1 deletion .github/workflows/pr-dockerfile-path-scan.yaml
@@ -14,7 +14,68 @@ concurrency:
cancel-in-progress: true

jobs:
file-change-detection:
Dockerfile-path-change-detection-in-GenAIComps:
runs-on: ubuntu-latest
steps:
- name: Clean Up Working Directory
run: sudo rm -rf ${{github.workspace}}/*

- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Check for changed Dockerfile paths in yaml
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
used_files=""
merged_commit=$(git log -1 --format='%H')
changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
if [ -n "$changed_files" ]; then
for file in $changed_files; do
if grep -q "$file" .github/workflows/docker/compose/*.yaml; then
is_use="TRUE"
used_files+="$file "
fi
done
fi
if [[ "$is_use" == "TRUE" ]]; then
echo "Warning: Changed Dockerfile paths:"
echo "$used_files"
echo "Please modify the corresponding yaml in GenAIComps/.github/workflows/docker/compose and ask [email protected] for final confirmation."
exit 1
fi
- name: Check for changed Dockerfile paths in readme
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
used_files=""
merged_commit=$(git log -1 --format='%H')
changed_files="$(git diff --name-status --diff-filter=DR ${{ github.event.pull_request.base.sha }} ${merged_commit} -- '**/Dockerfile**' | cut -f2)"
if [ -n "$changed_files" ]; then
for file in $changed_files; do
if grep -q "$file" ./**/*.md; then
is_use="TRUE"
used_files+="$file "
fi
done
fi
if [[ "$is_use" == "TRUE" ]]; then
echo "Warning: Changed Dockerfile paths:"
echo "$used_files"
echo "Please modify the corresponding README in GenAIComps and ask [email protected] for final confirmation."
exit 1
fi
Dockerfile-path-change-detection-in-GenAIExamples:
runs-on: ubuntu-latest
steps:
- name: Clean Up Working Directory
@@ -32,6 +93,7 @@ jobs:
- name: Check for changed Dockerfile paths
run: |
set -xe
shopt -s globstar
cd ${{github.workspace}}
is_use="FALSE"
8 changes: 6 additions & 2 deletions comps/agent/langchain/docker/Dockerfile
@@ -4,6 +4,7 @@
FROM python:3.11-slim

ENV LANG=C.UTF-8
ARG ARCH=cpu

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
Expand All @@ -19,8 +20,11 @@ USER user
COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/agent/langchain/requirements.txt
if [ ${ARCH} = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/agent/langchain/requirements.txt; \
else \
pip install --no-cache-dir -r /home/user/comps/agent/langchain/requirements.txt; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

7 changes: 6 additions & 1 deletion comps/asr/Dockerfile
@@ -9,11 +9,16 @@ RUN useradd -m -s /bin/bash user && \
USER user

ENV LANG=C.UTF-8
ARG ARCH=cpu

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

6 changes: 6 additions & 0 deletions comps/asr/whisper/Dockerfile
@@ -10,6 +10,7 @@ RUN useradd -m -s /bin/bash user && \

# Set environment variables
ENV LANG=en_US.UTF-8
ARG ARCH=cpu

# Install system dependencies
RUN apt-get update \
@@ -21,6 +22,11 @@ USER user

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt && \
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/asr/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/asr/requirements.txt ; \
fi
pip list

ENV PYTHONPATH=$PYTHONPATH:/home/user
1 change: 1 addition & 0 deletions comps/asr/whisper/Dockerfile_hpu
@@ -11,6 +11,7 @@ RUN useradd -m -s /bin/bash user && \
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana
ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH

# Install system dependencies
RUN apt-get update \
25 changes: 17 additions & 8 deletions comps/llms/text-generation/vllm-openvino/README.md
@@ -1,5 +1,10 @@
# Use vLLM with OpenVINO

vLLM powered by OpenVINO supports all LLM models from the [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on any x86-64 CPU with at least AVX2 support. The OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
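
As a hedged illustration only (the model name, port, and exact invocation below are assumptions for this sketch, not part of this commit), chunked prefill can be switched on when launching the OpenAI-compatible server:

```bash
# Illustrative sketch: start the OpenAI-compatible vLLM server with chunked
# prefill enabled; prefix caching can be enabled with --enable-prefix-caching.
# The model name and port are placeholders.
python3 -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-hf \
    --host 0.0.0.0 \
    --port 8000 \
    --enable-chunked-prefill \
    --max-num-batched-tokens 256
```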

## Build Docker Image

To build the Docker image, run the following command:
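
A sketch of this command, assuming it matches the build script added elsewhere in this commit (image tag and Dockerfile name are taken from that script, not from this README section):

```bash
# Sketch under the assumption above; not copied from this README section.
git clone https://github.com/vllm-project/vllm.git vllm
cd ./vllm
docker build -t vllm:openvino -f Dockerfile.openvino . \
    --build-arg https_proxy=$https_proxy \
    --build-arg http_proxy=$http_proxy
```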
@@ -59,15 +64,19 @@ export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8000"
export LLM_MODEL=<model_name> # example: export LLM_MODEL="meta-llama/Llama-2-7b-hf"
```

## Use Int-8 Weights Compression
## Performance tips

The vLLM OpenVINO backend uses the following environment variables to control its behavior:

- `VLLM_OPENVINO_KVCACHE_SPACE` specifies the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache); a larger value lets vLLM run more requests in parallel. Set this parameter according to your hardware configuration and memory management pattern.

- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` controls the KV cache precision. By default, FP16 / BF16 is used, depending on the platform.

Weights int-8 compression is disabled by default. For better performance and lower memory consumption, weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` command in the examples above.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` enables U8 weights compression during the model loading stage. By default, compression is turned off.

The variable enables the weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
Hence, even when the variable is enabled, compression is applied only to models above a certain size; very small models are not compressed because of the significant accuracy drop this would cause.
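
For illustration (the values and container arguments below are assumptions drawn from the descriptions above, not recommendations from this commit), the variables are passed to the container with `-e`:

```bash
# Illustrative only: pass the OpenVINO backend tuning variables to docker run.
# Values are examples from the descriptions above; the model name is a placeholder.
docker run --rm --name="vllm-openvino-server" -p 8000:8000 \
    -e VLLM_OPENVINO_KVCACHE_SPACE=40 \
    -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
    vllm:openvino --model meta-llama/Llama-2-7b-hf --port 8000
```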
To improve TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).

## Use UInt-8 KV cache Compression
The OpenVINO best-known configuration is:

KV cache uint-8 compression is disabled by default. For better performance and lower memory consumption, KV cache compression can be enabled by setting the environment variable `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`.
To pass the variable in Docker, add `-e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` as an additional argument to the `docker run` command in the examples above.
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
@@ -3,7 +3,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


git clone --branch openvino-model-executor https://github.com/ilya-lavrenov/vllm.git
BASEDIR="$( cd "$( dirname "$0" )" && pwd )"
git clone https://github.com/vllm-project/vllm.git vllm
cd ./vllm/
docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd $BASEDIR && rm -rf vllm
19 changes: 17 additions & 2 deletions comps/llms/text-generation/vllm-openvino/launch_model_server.sh
@@ -42,5 +42,20 @@ port_number=${port:-$default_port}
# Set the Huggingface cache directory variable
HF_CACHE_DIR=$HOME/.cache/huggingface

# Start the model server using Openvino as the backend inference engine. Provide the container name that is unique and meaningful, typically one that includes the model name.
docker run --rm --name="vllm-openvino-server" -p $port_number:$port_number -v $HF_CACHE_DIR:/root/.cache/huggingface vllm:openvino --model $model_name --port $port_number --disable-log-requests --swap-space $swap_space
# Start the model server using OpenVINO as the backend inference engine.
# Provide a container name that is unique and meaningful, typically one that includes the model name.

docker run -d --rm --name="vllm-openvino-server" \
-p $port_number:80 \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$http_proxy \
-e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
-v $HOME/.cache/huggingface:/root/.cache/huggingface \
vllm:openvino /bin/bash -c "\
cd / && \
export VLLM_CPU_KVCACHE_SPACE=50 && \
python3 -m vllm.entrypoints.openai.api_server \
--model \"$model_name\" \
--host 0.0.0.0 \
--port 80"
7 changes: 6 additions & 1 deletion comps/tts/Dockerfile
@@ -7,11 +7,16 @@ RUN useradd -m -s /bin/bash user && \
chown -R user /home/user/
USER user
ENV LANG=C.UTF-8
ARG ARCH=cpu

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

7 changes: 6 additions & 1 deletion comps/tts/speecht5/Dockerfile
@@ -9,6 +9,7 @@ RUN useradd -m -s /bin/bash user && \
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user
ARG ARCH=cpu

# Install system dependencies
RUN apt-get update \
@@ -20,7 +21,11 @@ COPY --chown=user:user comps /home/user/comps
USER user

RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt
if [ "${ARCH}" = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/tts/requirements.txt ; \
else \
pip install --no-cache-dir -r /home/user/comps/tts/requirements.txt ; \
fi

ENV PYTHONPATH=$PYTHONPATH:/home/user

1 change: 1 addition & 0 deletions comps/tts/speecht5/Dockerfile_hpu
@@ -11,6 +11,7 @@ RUN rm -rf /etc/ssh/ssh_host*
# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana
ENV LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH

# Install system dependencies
RUN apt-get update \
118 changes: 118 additions & 0 deletions tests/test_llms_text-generation_vllm-openvino.sh
@@ -0,0 +1,118 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe

WORKPATH="$( cd "$( dirname "$0" )" && pwd )"

# Define variables
port=8123
HF_CACHE_DIR=$HOME/.cache/huggingface
DOCKER_IMAGE="vllm:openvino"
CONTAINER_NAME="vllm-openvino-container"

function build_container() {
cd $WORKPATH
git clone https://github.com/vllm-project/vllm.git vllm-openvino
cd ./vllm-openvino/
docker build -t $DOCKER_IMAGE \
-f Dockerfile.openvino \
. \
--build-arg https_proxy=$https_proxy \
--build-arg http_proxy=$http_proxy
cd $WORKPATH
rm -rf vllm-openvino
}

# Function to start Docker container
start_container() {

docker run -d --rm --name=$CONTAINER_NAME \
-p $port:$port \
--ipc=host \
-e HTTPS_PROXY=$https_proxy \
-e HTTP_PROXY=$http_proxy \
-v $HF_CACHE_DIR:/root/.cache/huggingface \
vllm:openvino /bin/bash -c "\
cd / && \
export VLLM_CPU_KVCACHE_SPACE=50 && \
python3 -m vllm.entrypoints.openai.api_server \
--model \"Intel/neural-chat-7b-v3-3\" \
--host 0.0.0.0 \
--port $port"

# check whether service is fully ready
n=0
until [[ "$n" -ge 300 ]]; do
docker logs $CONTAINER_NAME > /tmp/$CONTAINER_NAME.log 2>&1
n=$((n+1))
if grep -q "Uvicorn running on" /tmp/$CONTAINER_NAME.log; then
break
fi
sleep 3s
done

}

# Cleanup Function
cleanup() {
# Stop and remove Docker container and images
cid=$(docker ps -aq --filter "name=$CONTAINER_NAME")
if [[ ! -z "$cid" ]]; then docker stop $cid || docker rm $cid && sleep 1s; fi
docker rmi -f $DOCKER_IMAGE
rm /tmp/$CONTAINER_NAME.log
}

# Function to test API endpoint
function test_api_endpoint {
local endpoint="$1"
local expected_status="$2"

# Make the HTTP request
if test "$1" = "v1/completions"
then
local response=$(curl "http://localhost:$port/$endpoint" \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is the key advantage of Openvino framework",
"max_tokens": 300,
"temperature": 0.7
}' \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
else
local response=$(curl "http://localhost:$port/$endpoint" \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
fi

# Assert the response status code
if [[ "$response" -eq "$expected_status" ]]; then
echo "PASS: $endpoint returned expected status code: $expected_status"
else
echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)"
fi
}
# Main function
main() {

build_container
start_container

# Sleep to allow the container to start up fully
sleep 10
# Test the /v1/models API
test_api_endpoint "v1/models" 200

# Test the /v1/completions API
test_api_endpoint "v1/completions" 200

cleanup
}

# Call main function
main
