From 51c777db308323adc3c20d4b80d1e0e816c38d92 Mon Sep 17 00:00:00 2001
From: Sivanantham Chinnaiyan
Date: Wed, 26 Jun 2024 15:57:33 +0530
Subject: [PATCH] Add vllm cpu test

Signed-off-by: Sivanantham Chinnaiyan
---
 .github/workflows/huggingface-cpu.yml    | 104 ++++++++++++++++++
 python/huggingface_vllm_cpu.Dockerfile   |  80 ++++++++++++++
 .../huggingfaceserver/encoder_model.py   |   2 +-
 .../huggingfaceserver/vllm/vllm_model.py |   3 +-
 test/e2e/predictor/test_huggingface.py   |  55 ++++++++-
 5 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/huggingface-cpu.yml
 create mode 100644 python/huggingface_vllm_cpu.Dockerfile

diff --git a/.github/workflows/huggingface-cpu.yml b/.github/workflows/huggingface-cpu.yml
new file mode 100644
index 00000000000..703c2eb25e6
--- /dev/null
+++ b/.github/workflows/huggingface-cpu.yml
@@ -0,0 +1,104 @@
+name: Huggingface Vllm CPU Docker Publisher
+
+on:
+  push:
+    # Publish `master` as Docker `latest` image.
+    branches:
+      - master
+
+    # Publish `v1.2.3` tags as releases.
+    tags:
+      - v*
+  # Run tests for any PRs.
+  pull_request:
+
+env:
+  IMAGE_NAME: huggingfaceserver-vllm-cpu
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Run tests.
+  # See also https://docs.docker.com/docker-hub/builds/automated-testing/
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v4
+
+      - name: Free-up disk space
+        uses: ./.github/actions/free-up-disk-space
+
+      - name: Setup Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: CPU Info
+        run: cat /proc/cpuinfo
+
+      - name: Run tests
+        uses: docker/build-push-action@v5
+        with:
+          platforms: linux/amd64
+          context: python
+          file: python/huggingface_vllm_cpu.Dockerfile
+          push: false
+          # https://github.com/docker/buildx/issues/1533
+          provenance: false
+
+  # Push image to GitHub Packages.
+  # See also https://docs.docker.com/docker-hub/builds/
+#  push:
+#    # Ensure test job passes before pushing image.
+##  needs: test
+#
+#  runs-on: ubuntu-latest
+##  if: github.event_name == 'push'
+#
+#  steps:
+#    - name: Checkout source
+#      uses: actions/checkout@v4
+#
+#    - name: Free-up disk space
+#      uses: ./.github/actions/free-up-disk-space
+#
+#    - name: Setup Docker Buildx
+#      uses: docker/setup-buildx-action@v3
+#
+#    - name: Login to DockerHub
+#      uses: docker/login-action@v3
+#      with:
+#        username: ${{ secrets.DOCKER_USER }}
+#        password: ${{ secrets.DOCKER_PASSWORD }}
+#
+#    - name: Export version variable
+#      run: |
+#        IMAGE_ID=sivanantha/$IMAGE_NAME
+#
+#        # Change all uppercase to lowercase
+#        IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
+#
+#        # Strip git ref prefix from version
+#        VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
+#
+#        # Strip "v" prefix from tag name
+#        # [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
+#
+#        # Use Docker `latest` tag convention
+#        [ "$VERSION" == "master" ] && VERSION=latest
+#
+#        echo VERSION=$VERSION >> $GITHUB_ENV
+#        echo IMAGE_ID=$IMAGE_ID >> $GITHUB_ENV
+#
+#    - name: Build and push
+#      uses: docker/build-push-action@v5
+#      with:
+#        platforms: linux/amd64
+#        context: python
+#        file: python/huggingface_vllm_cpu.Dockerfile
+#        push: true
+#        tags: ${{ env.IMAGE_ID }}:${{ env.VERSION }}
+#        # https://github.com/docker/buildx/issues/1533
+#        provenance: false
diff --git a/python/huggingface_vllm_cpu.Dockerfile b/python/huggingface_vllm_cpu.Dockerfile
new file mode 100644
index 00000000000..f03d1096636
--- /dev/null
+++ b/python/huggingface_vllm_cpu.Dockerfile
@@ -0,0 +1,80 @@
+ARG PYTHON_VERSION=3.10
+ARG BASE_IMAGE=python:${PYTHON_VERSION}-slim-bookworm
+ARG VENV_PATH=/prod_venv
+
+FROM ${BASE_IMAGE} AS builder
+
+# Install Poetry
+ARG POETRY_HOME=/opt/poetry
+ARG POETRY_VERSION=1.7.1
+
+# Install vllm
+ARG VLLM_VERSION=0.5.0.post1
+
+RUN apt-get update -y && apt-get install git gcc-12 g++-12 wget tar -y --no-install-recommends && apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+
+RUN wget https://github.com/vllm-project/vllm/archive/refs/tags/v${VLLM_VERSION}.tar.gz -O vllm.tar.gz && \
+    tar -xzvp -f vllm.tar.gz
+
+RUN python3 -m venv ${POETRY_HOME} && ${POETRY_HOME}/bin/pip3 install poetry==${POETRY_VERSION}
+ENV PATH="$PATH:${POETRY_HOME}/bin"
+
+# Activate virtual env
+ARG VENV_PATH
+ENV VIRTUAL_ENV=${VENV_PATH}
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+
+COPY kserve/pyproject.toml kserve/poetry.lock kserve/
+RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-root --no-interaction
+COPY kserve kserve
+RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-interaction
+
+COPY huggingfaceserver/pyproject.toml huggingfaceserver/poetry.lock huggingfaceserver/
+RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-root --no-interaction
+COPY huggingfaceserver huggingfaceserver
+RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-interaction
+
+# Install vllm
+RUN --mount=type=cache,target=/root/.cache cd vllm-${VLLM_VERSION} && pip install wheel packaging ninja "setuptools>=49.4.0" && \
+    pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+# Performance boost for PyTorch on Intel CPUs
+RUN --mount=type=cache,target=/root/.cache pip install \
+    https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+RUN cd vllm-${VLLM_VERSION} && VLLM_TARGET_DEVICE=cpu python setup.py install
+
+
+FROM ${BASE_IMAGE} AS prod
+
+COPY third_party third_party
+
+# For high performance memory allocation and better cache locality
+RUN apt-get update -y && apt-get install libtcmalloc-minimal4 -y --no-install-recommends && apt-get clean && \
+    rm -rf /var/lib/apt/lists/* \
+    && echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+
+
+# Activate virtual env
+ARG VENV_PATH
+ENV VIRTUAL_ENV=${VENV_PATH}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+RUN useradd kserve -m -u 1000 -d /home/kserve
+
+COPY --from=builder --chown=kserve:kserve $VIRTUAL_ENV $VIRTUAL_ENV
+COPY --from=builder kserve kserve
+COPY --from=builder huggingfaceserver huggingfaceserver
+
+# Set a writable Hugging Face home folder to avoid permission issues. See https://github.com/kserve/kserve/issues/3562
+ENV HF_HOME="/tmp/huggingface"
+# https://huggingface.co/docs/safetensors/en/speed#gpu-benchmark
+ENV SAFETENSORS_FAST_GPU="1"
+# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubdisabletelemetry
+ENV HF_HUB_DISABLE_TELEMETRY="1"
+
+USER 1000
+ENTRYPOINT ["python3", "-m", "huggingfaceserver"]
+
diff --git a/python/huggingfaceserver/huggingfaceserver/encoder_model.py b/python/huggingfaceserver/huggingfaceserver/encoder_model.py
index ae52d09c6c5..40b498e5e78 100644
--- a/python/huggingfaceserver/huggingfaceserver/encoder_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/encoder_model.py
@@ -126,7 +126,7 @@ def load(self) -> bool:
 
         self.max_length = _get_and_verify_max_len(self.model_config, self.max_length)
 
-        # device_map = "auto" enables model parallelism but all model architcture dont support it.
+        # device_map = "auto" enables model parallelism, but not all model architectures support it.
         # For pre-check we initialize the model class without weights to check the `_no_split_modules`
         # device_map = "auto" for models that support this else set to either cuda/cpu
         with init_empty_weights():
diff --git a/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py b/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
index 059af5098e3..7d0226f2106 100644
--- a/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
@@ -44,7 +44,8 @@ def __init__(
         self.vllm_engine_args = engine_args
 
     def load(self) -> bool:
-        self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
+        if torch.cuda.is_available():
+            self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
         self.vllm_engine = AsyncLLMEngine.from_engine_args(self.vllm_engine_args)
         self.openai_serving_completion = OpenAIServingCompletion(self.vllm_engine)
         self.ready = True
diff --git a/test/e2e/predictor/test_huggingface.py b/test/e2e/predictor/test_huggingface.py
index 3933d7b9396..2776b5eafb3 100644
--- a/test/e2e/predictor/test_huggingface.py
+++ b/test/e2e/predictor/test_huggingface.py
@@ -33,14 +33,12 @@
 @pytest.mark.llm
 def test_huggingface_openai_chat_completions():
     service_name = "hf-opt-125m-chat"
-    protocol_version = "v2"
     predictor = V1beta1PredictorSpec(
         min_replicas=1,
         model=V1beta1ModelSpec(
             model_format=V1beta1ModelFormat(
                 name="huggingface",
             ),
-            protocol_version=protocol_version,
             args=[
                 "--model_id",
                 "facebook/opt-125m",
@@ -279,3 +277,56 @@ def test_huggingface_openai_text_2_text():
     assert res["choices"][0]["text"] == "Das ist für Deutschland"
 
     kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
+
+
+@pytest.mark.local
+@pytest.mark.llm
+def test_vllm_openai_chat_completions():
+    service_name = "hf-opt-125m-chat"
+    predictor = V1beta1PredictorSpec(
+        min_replicas=1,
+        model=V1beta1ModelSpec(
+            model_format=V1beta1ModelFormat(
+                name="huggingface",
+            ),
+            args=[
+                "--model_id",
+                "facebook/opt-125m",
+                "--model_revision",
+                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
+                "--tokenizer_revision",
+                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
+                "--backend",
+                "vllm",
+                "--device",
+                "cpu",
+            ],
+            resources=V1ResourceRequirements(
+                requests={"cpu": "1", "memory": "2Gi"},
+                limits={"cpu": "1", "memory": "4Gi"},
+            ),
+        ),
+    )
+
+    isvc = V1beta1InferenceService(
+        api_version=constants.KSERVE_V1BETA1,
+        kind=constants.KSERVE_KIND,
+        metadata=client.V1ObjectMeta(
+            name=service_name, namespace=KSERVE_TEST_NAMESPACE
+        ),
+        spec=V1beta1InferenceServiceSpec(predictor=predictor),
+    )
+
+    kserve_client = KServeClient(
+        config_file=os.environ.get("KUBECONFIG", "~/.kube/config")
+    )
+    kserve_client.create(isvc)
+    kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE)
+
+    res = generate(service_name, "./data/opt_125m_input_generate.json")
+    assert (
+        res["choices"][0]["message"]["content"]
+        == "I'm not sure if this is a good idea, but I'm not sure if I should"
+    )
+
+    kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
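
Not part of the patch: the workflow above only builds the image (push: false) and the new e2e test is gated behind the local marker, so a quick way to exercise the image is to build and run it directly. The sketch below mirrors the workflow's build context and the e2e test's runtime arguments; the image tag, the host port mapping, and the assumption that the server listens on port 8080 are illustrative and not taken from the patch.

    # Build from the repository root, mirroring the workflow's
    # "context: python" and "file: python/huggingface_vllm_cpu.Dockerfile".
    # The tag "huggingfaceserver-vllm-cpu:dev" is illustrative.
    docker build -t huggingfaceserver-vllm-cpu:dev -f python/huggingface_vllm_cpu.Dockerfile python

    # Run with the same arguments the e2e test passes to the predictor;
    # the port mapping assumes the server's default HTTP port is 8080.
    docker run --rm -p 8080:8080 huggingfaceserver-vllm-cpu:dev \
      --model_id facebook/opt-125m \
      --model_revision 27dcfa74d334bc871f3234de431e71c6eeba5dd6 \
      --tokenizer_revision 27dcfa74d334bc871f3234de431e71c6eeba5dd6 \
      --backend vllm --device cpu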