From 51c777db308323adc3c20d4b80d1e0e816c38d92 Mon Sep 17 00:00:00 2001
From: Sivanantham Chinnaiyan
Date: Wed, 26 Jun 2024 15:57:33 +0530
Subject: [PATCH] Add vllm cpu test

Signed-off-by: Sivanantham Chinnaiyan
---
 .github/workflows/huggingface-cpu.yml    | 104 ++++++++++++++++++
 python/huggingface_vllm_cpu.Dockerfile   |  80 ++++++++++++++
 .../huggingfaceserver/encoder_model.py   |   2 +-
 .../huggingfaceserver/vllm/vllm_model.py |   3 +-
 test/e2e/predictor/test_huggingface.py   |  55 ++++++++-
 5 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/huggingface-cpu.yml
 create mode 100644 python/huggingface_vllm_cpu.Dockerfile

diff --git a/.github/workflows/huggingface-cpu.yml b/.github/workflows/huggingface-cpu.yml
new file mode 100644
index 00000000000..703c2eb25e6
--- /dev/null
+++ b/.github/workflows/huggingface-cpu.yml
@@ -0,0 +1,104 @@
+name: Huggingface Vllm CPU Docker Publisher
+
+on:
+  push:
+    # Publish `master` as Docker `latest` image.
+    branches:
+      - master
+
+    # Publish `v1.2.3` tags as releases.
+    tags:
+      - v*
+  # Run tests for any PRs.
+  pull_request:
+
+env:
+  IMAGE_NAME: huggingfaceserver-vllm-cpu
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Run tests.
+  # See also https://docs.docker.com/docker-hub/builds/automated-testing/
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v4
+
+      - name: Free-up disk space
+        uses: ./.github/actions/free-up-disk-space
+
+      - name: Setup Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: CPU Info
+        run: cat /proc/cpuinfo
+
+      - name: Run tests
+        uses: docker/build-push-action@v5
+        with:
+          platforms: linux/amd64
+          context: python
+          file: python/huggingface_vllm_cpu.Dockerfile
+          push: false
+          # https://github.com/docker/buildx/issues/1533
+          provenance: false
+
+  # Push image to GitHub Packages.
+  # See also https://docs.docker.com/docker-hub/builds/
+#  push:
+#    # Ensure test job passes before pushing image.
+##  needs: test
+#
+#  runs-on: ubuntu-latest
+##  if: github.event_name == 'push'
+#
+#  steps:
+#    - name: Checkout source
+#      uses: actions/checkout@v4
+#
+#    - name: Free-up disk space
+#      uses: ./.github/actions/free-up-disk-space
+#
+#    - name: Setup Docker Buildx
+#      uses: docker/setup-buildx-action@v3
+#
+#    - name: Login to DockerHub
+#      uses: docker/login-action@v3
+#      with:
+#        username: ${{ secrets.DOCKER_USER }}
+#        password: ${{ secrets.DOCKER_PASSWORD }}
+#
+#    - name: Export version variable
+#      run: |
+#        IMAGE_ID=sivanantha/$IMAGE_NAME
+#
+#        # Change all uppercase to lowercase
+#        IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
+#
+#        # Strip git ref prefix from version
+#        VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
+#
+#        # Strip "v" prefix from tag name
+#        # [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
+#
+#        # Use Docker `latest` tag convention
+#        [ "$VERSION" == "master" ] && VERSION=latest
+#
+#        echo VERSION=$VERSION >> $GITHUB_ENV
+#        echo IMAGE_ID=$IMAGE_ID >> $GITHUB_ENV
+#
+#    - name: Build and push
+#      uses: docker/build-push-action@v5
+#      with:
+#        platforms: linux/amd64
+#        context: python
+#        file: python/huggingface_vllm_cpu.Dockerfile
+#        push: true
+#        tags: ${{ env.IMAGE_ID }}:${{ env.VERSION }}
+#        # https://github.com/docker/buildx/issues/1533
+#        provenance: false
diff --git a/python/huggingface_vllm_cpu.Dockerfile b/python/huggingface_vllm_cpu.Dockerfile
new file mode 100644
index 00000000000..f03d1096636
--- /dev/null
+++ b/python/huggingface_vllm_cpu.Dockerfile
@@ -0,0 +1,80 @@
+ARG PYTHON_VERSION=3.10
+ARG BASE_IMAGE=python:${PYTHON_VERSION}-slim-bookworm
+ARG VENV_PATH=/prod_venv
+
+FROM ${BASE_IMAGE} AS builder
+
+# Install Poetry
+ARG POETRY_HOME=/opt/poetry
+ARG POETRY_VERSION=1.7.1
+
+# Install vllm
+ARG VLLM_VERSION=0.5.0.post1
+
+RUN apt-get update -y && apt-get install git gcc-12 g++-12 wget tar -y --no-install-recommends && apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+
+RUN wget https://github.com/vllm-project/vllm/archive/refs/tags/v${VLLM_VERSION}.tar.gz -O vllm.tar.gz && \
+    tar -xzvp -f vllm.tar.gz
+
+RUN python3 -m venv ${POETRY_HOME} && ${POETRY_HOME}/bin/pip3 install poetry==${POETRY_VERSION}
+ENV PATH="$PATH:${POETRY_HOME}/bin"
+
+# Activate virtual env
+ARG VENV_PATH
+ENV VIRTUAL_ENV=${VENV_PATH}
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+
+COPY kserve/pyproject.toml kserve/poetry.lock kserve/
+RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-root --no-interaction
+COPY kserve kserve
+RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-interaction
+
+COPY huggingfaceserver/pyproject.toml huggingfaceserver/poetry.lock huggingfaceserver/
+RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-root --no-interaction
+COPY huggingfaceserver huggingfaceserver
+RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-interaction
+
+# Install vllm
+RUN --mount=type=cache,target=/root/.cache cd vllm-${VLLM_VERSION} && pip install wheel packaging ninja "setuptools>=49.4.0" && \
+    pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+# Performance boost for PyTorch on Intel CPUs
+RUN --mount=type=cache,target=/root/.cache pip install \
+    https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+RUN cd vllm-${VLLM_VERSION} && VLLM_TARGET_DEVICE=cpu python setup.py install
+
+
+FROM ${BASE_IMAGE} AS prod
+
+COPY third_party third_party
+
+# For high performance memory allocation and better cache locality
+RUN apt-get update -y && apt-get install libtcmalloc-minimal4 -y --no-install-recommends && apt-get clean && \
+    rm -rf /var/lib/apt/lists/* \
+    && echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+
+
+# Activate virtual env
+ARG VENV_PATH
+ENV VIRTUAL_ENV=${VENV_PATH}
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+RUN useradd kserve -m -u 1000 -d /home/kserve
+
+COPY --from=builder --chown=kserve:kserve $VIRTUAL_ENV $VIRTUAL_ENV
+COPY --from=builder kserve kserve
+COPY --from=builder huggingfaceserver huggingfaceserver
+
+# Set a writable Hugging Face home folder to avoid permission issues. See https://github.com/kserve/kserve/issues/3562
+ENV HF_HOME="/tmp/huggingface"
+# https://huggingface.co/docs/safetensors/en/speed#gpu-benchmark
+ENV SAFETENSORS_FAST_GPU="1"
+# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubdisabletelemetry
+ENV HF_HUB_DISABLE_TELEMETRY="1"
+
+USER 1000
+ENTRYPOINT ["python3", "-m", "huggingfaceserver"]
+
diff --git a/python/huggingfaceserver/huggingfaceserver/encoder_model.py b/python/huggingfaceserver/huggingfaceserver/encoder_model.py
index ae52d09c6c5..40b498e5e78 100644
--- a/python/huggingfaceserver/huggingfaceserver/encoder_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/encoder_model.py
@@ -126,7 +126,7 @@ def load(self) -> bool:
 
         self.max_length = _get_and_verify_max_len(self.model_config, self.max_length)
 
-        # device_map = "auto" enables model parallelism but all model architcture dont support it.
+        # device_map = "auto" enables model parallelism, but not all model architectures support it.
         # For pre-check we initialize the model class without weights to check the `_no_split_modules`
         # device_map = "auto" for models that support this else set to either cuda/cpu
         with init_empty_weights():
diff --git a/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py b/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
index 059af5098e3..7d0226f2106 100644
--- a/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/vllm/vllm_model.py
@@ -44,7 +44,8 @@ def __init__(
         self.vllm_engine_args = engine_args
 
     def load(self) -> bool:
-        self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
+        if torch.cuda.is_available():
+            self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
         self.vllm_engine = AsyncLLMEngine.from_engine_args(self.vllm_engine_args)
         self.openai_serving_completion = OpenAIServingCompletion(self.vllm_engine)
         self.ready = True
diff --git a/test/e2e/predictor/test_huggingface.py b/test/e2e/predictor/test_huggingface.py
index 3933d7b9396..2776b5eafb3 100644
--- a/test/e2e/predictor/test_huggingface.py
+++ b/test/e2e/predictor/test_huggingface.py
@@ -33,14 +33,12 @@
 @pytest.mark.llm
 def test_huggingface_openai_chat_completions():
     service_name = "hf-opt-125m-chat"
-    protocol_version = "v2"
     predictor = V1beta1PredictorSpec(
         min_replicas=1,
         model=V1beta1ModelSpec(
             model_format=V1beta1ModelFormat(
                 name="huggingface",
             ),
-            protocol_version=protocol_version,
             args=[
                 "--model_id",
                 "facebook/opt-125m",
@@ -279,3 +277,56 @@ def test_huggingface_openai_text_2_text():
     assert res["choices"][0]["text"] == "Das ist für Deutschland"
 
     kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
+
+
+@pytest.mark.local
+@pytest.mark.llm
+def test_vllm_openai_chat_completions():
+    service_name = "hf-opt-125m-chat"
+    predictor = V1beta1PredictorSpec(
+        min_replicas=1,
+        model=V1beta1ModelSpec(
+            model_format=V1beta1ModelFormat(
+                name="huggingface",
+            ),
+            args=[
+                "--model_id",
+                "facebook/opt-125m",
+                "--model_revision",
+                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
+                "--tokenizer_revision",
+                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
+                "--backend",
+                "vllm",
+                "--device",
+                "cpu",
+            ],
+            resources=V1ResourceRequirements(
+                requests={"cpu": "1", "memory": "2Gi"},
+                limits={"cpu": "1", "memory": "4Gi"},
+            ),
+        ),
+    )
+
+    isvc = V1beta1InferenceService(
+        api_version=constants.KSERVE_V1BETA1,
+        kind=constants.KSERVE_KIND,
+        metadata=client.V1ObjectMeta(
+            name=service_name, namespace=KSERVE_TEST_NAMESPACE
+        ),
+        spec=V1beta1InferenceServiceSpec(predictor=predictor),
+    )
+
+    kserve_client = KServeClient(
+        config_file=os.environ.get("KUBECONFIG", "~/.kube/config")
+    )
+    kserve_client.create(isvc)
+    kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE)
+
+    res = generate(service_name, "./data/opt_125m_input_generate.json")
+    assert (
+        res["choices"][0]["message"]["content"]
+        == "I'm not sure if this is a good idea, but I'm not sure if I should"
+    )
+
+    kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
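
Not part of the patch: the workflow above only builds the image (push: false) and the new e2e test is gated behind the local marker, so a quick way to exercise the image is to build and run it directly. The sketch below mirrors the workflow's build context and the e2e test's runtime arguments; the image tag, the host port mapping, and the assumption that the server listens on port 8080 are illustrative and not taken from the patch.

    # Build from the repository root, mirroring the workflow's
    # "context: python" and "file: python/huggingface_vllm_cpu.Dockerfile".
    # The tag "huggingfaceserver-vllm-cpu:dev" is illustrative.
    docker build -t huggingfaceserver-vllm-cpu:dev -f python/huggingface_vllm_cpu.Dockerfile python

    # Run with the same arguments the e2e test passes to the predictor;
    # the port mapping assumes the server's default HTTP port is 8080.
    docker run --rm -p 8080:8080 huggingfaceserver-vllm-cpu:dev \
      --model_id facebook/opt-125m \
      --model_revision 27dcfa74d334bc871f3234de431e71c6eeba5dd6 \
      --tokenizer_revision 27dcfa74d334bc871f3234de431e71c6eeba5dd6 \
      --backend vllm --device cpu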