Commit 51c777d
Add vllm cpu test
Signed-off-by: Sivanantham Chinnaiyan <[email protected]>
sivanantha321 committed Jun 26, 2024
1 parent d19e310 commit 51c777d
Showing 5 changed files with 240 additions and 4 deletions.
104 changes: 104 additions & 0 deletions .github/workflows/huggingface-cpu.yml
@@ -0,0 +1,104 @@
name: Hugging Face vLLM CPU Docker Publisher

on:
  push:
    # Publish `master` as Docker `latest` image.
    branches:
      - master

    # Publish `v1.2.3` tags as releases.
    tags:
      - v*
  # Run tests for any PRs.
  pull_request:

env:
  IMAGE_NAME: huggingfaceserver-vllm-cpu

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Run tests.
  # See also https://docs.docker.com/docker-hub/builds/automated-testing/
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Free-up disk space
        uses: ./.github/actions/free-up-disk-space

      - name: Setup Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: CPU Info
        run: cat /proc/cpuinfo

      - name: Run tests
        uses: docker/build-push-action@v5
        with:
          platforms: linux/amd64
          context: python
          file: python/huggingface_vllm_cpu.Dockerfile
          push: false
          # https://github.com/docker/buildx/issues/1533
          provenance: false

  # Push image to GitHub Packages.
  # See also https://docs.docker.com/docker-hub/builds/
  # push:
  #   # Ensure test job passes before pushing image.
  #   needs: test
  #
  #   runs-on: ubuntu-latest
  #   if: github.event_name == 'push'
  #
  #   steps:
  #     - name: Checkout source
  #       uses: actions/checkout@v4
  #
  #     - name: Free-up disk space
  #       uses: ./.github/actions/free-up-disk-space
  #
  #     - name: Setup Docker Buildx
  #       uses: docker/setup-buildx-action@v3
  #
  #     - name: Login to DockerHub
  #       uses: docker/login-action@v3
  #       with:
  #         username: ${{ secrets.DOCKER_USER }}
  #         password: ${{ secrets.DOCKER_PASSWORD }}
  #
  #     - name: Export version variable
  #       run: |
  #         IMAGE_ID=sivanantha/$IMAGE_NAME
  #
  #         # Change all uppercase to lowercase
  #         IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
  #
  #         # Strip git ref prefix from version
  #         VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
  #
  #         # Strip "v" prefix from tag name
  #         # [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
  #
  #         # Use Docker `latest` tag convention
  #         [ "$VERSION" == "master" ] && VERSION=latest
  #
  #         echo VERSION=$VERSION >> $GITHUB_ENV
  #         echo IMAGE_ID=$IMAGE_ID >> $GITHUB_ENV
  #
  #     - name: Build and push
  #       uses: docker/build-push-action@v5
  #       with:
  #         platforms: linux/amd64
  #         context: python
  #         file: python/huggingface_vllm_cpu.Dockerfile
  #         push: true
  #         tags: ${{ env.IMAGE_ID }}:${{ env.VERSION }}
  #         # https://github.com/docker/buildx/issues/1533
  #         provenance: false
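
For local debugging, the CI build can be reproduced with Docker Buildx. A minimal sketch mirroring the "Run tests" step above, run from the repository root (the `-t` tag is illustrative and not part of the workflow):

docker buildx build \
  --platform linux/amd64 \
  --provenance=false \
  -f python/huggingface_vllm_cpu.Dockerfile \
  -t huggingfaceserver-vllm-cpu:dev \
  python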
80 changes: 80 additions & 0 deletions python/huggingface_vllm_cpu.Dockerfile
@@ -0,0 +1,80 @@
ARG PYTHON_VERSION=3.10
ARG BASE_IMAGE=python:${PYTHON_VERSION}-slim-bookworm
ARG VENV_PATH=/prod_venv

FROM ${BASE_IMAGE} AS builder

# Install Poetry
ARG POETRY_HOME=/opt/poetry
ARG POETRY_VERSION=1.7.1

# Install vllm
ARG VLLM_VERSION=0.5.0.post1

RUN apt-get update -y && apt-get install git gcc-12 g++-12 wget tar -y --no-install-recommends && apt-get clean && \
    rm -rf /var/lib/apt/lists/*
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12


RUN wget https://github.com/vllm-project/vllm/archive/refs/tags/v${VLLM_VERSION}.tar.gz -O vllm.tar.gz && \
    tar -xzvp -f vllm.tar.gz

RUN python3 -m venv ${POETRY_HOME} && ${POETRY_HOME}/bin/pip3 install poetry==${POETRY_VERSION}
ENV PATH="$PATH:${POETRY_HOME}/bin"

# Activate virtual env
ARG VENV_PATH
ENV VIRTUAL_ENV=${VENV_PATH}
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"


COPY kserve/pyproject.toml kserve/poetry.lock kserve/
RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-root --no-interaction
COPY kserve kserve
RUN --mount=type=cache,target=/root/.cache cd kserve && poetry install --no-interaction

COPY huggingfaceserver/pyproject.toml huggingfaceserver/poetry.lock huggingfaceserver/
RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-root --no-interaction
COPY huggingfaceserver huggingfaceserver
RUN --mount=type=cache,target=/root/.cache cd huggingfaceserver && poetry install --no-interaction

# Install vllm
RUN --mount=type=cache,target=/root/.cache cd vllm-${VLLM_VERSION} && pip install wheel packaging ninja "setuptools>=49.4.0" && \
    pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
# Performance boost for PyTorch on Intel CPUs
RUN --mount=type=cache,target=/root/.cache pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
RUN cd vllm-${VLLM_VERSION} && VLLM_TARGET_DEVICE=cpu python setup.py install


FROM ${BASE_IMAGE} AS prod

COPY third_party third_party

# For high-performance memory allocation and better cache locality
RUN apt-get update -y && apt-get install libtcmalloc-minimal4 -y --no-install-recommends && apt-get clean && \
    rm -rf /var/lib/apt/lists/* \
    && echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc


# Activate virtual env
ARG VENV_PATH
ENV VIRTUAL_ENV=${VENV_PATH}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

RUN useradd kserve -m -u 1000 -d /home/kserve

COPY --from=builder --chown=kserve:kserve $VIRTUAL_ENV $VIRTUAL_ENV
COPY --from=builder kserve kserve
COPY --from=builder huggingfaceserver huggingfaceserver

# Set a writable Hugging Face home folder to avoid permission issues. See https://github.com/kserve/kserve/issues/3562
ENV HF_HOME="/tmp/huggingface"
# https://huggingface.co/docs/safetensors/en/speed#gpu-benchmark
ENV SAFETENSORS_FAST_GPU="1"
# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubdisabletelemetry
ENV HF_HUB_DISABLE_TELEMETRY="1"

USER 1000
ENTRYPOINT ["python3", "-m", "huggingfaceserver"]
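
Assuming the image was tagged as in the Buildx sketch above, the server can be exercised locally; the arguments mirror the e2e test added below, and port 8080 is assumed to be the model server's HTTP port:

docker run --rm -p 8080:8080 huggingfaceserver-vllm-cpu:dev \
  --model_id facebook/opt-125m --backend vllm --device cpu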

@@ -126,7 +126,7 @@ def load(self) -> bool:

        self.max_length = _get_and_verify_max_len(self.model_config, self.max_length)

-       # device_map = "auto" enables model parallelism but all model architcture dont support it.
+       # device_map = "auto" enables model parallelism, but not all model architectures support it.
        # For pre-check we initialize the model class without weights to check the `_no_split_modules`
        # device_map = "auto" for models that support this else set to either cuda/cpu
        with init_empty_weights():
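
A minimal sketch of the pre-check this comment describes, assuming the `transformers` and `accelerate` APIs; the model id is illustrative:

import torch
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("facebook/opt-125m")  # illustrative model id
with init_empty_weights():
    # Instantiate the architecture only; no weight tensors are allocated.
    model = AutoModelForCausalLM.from_config(config)
# Architectures that declare _no_split_modules can be sharded with device_map="auto";
# otherwise fall back to a single explicit device.
if getattr(model, "_no_split_modules", None):
    device_map = "auto"
else:
    device_map = "cuda" if torch.cuda.is_available() else "cpu"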
@@ -44,7 +44,8 @@ def __init__(
        self.vllm_engine_args = engine_args

    def load(self) -> bool:
-        self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
+        if torch.cuda.is_available():
+            self.vllm_engine_args.tensor_parallel_size = torch.cuda.device_count()
        self.vllm_engine = AsyncLLMEngine.from_engine_args(self.vllm_engine_args)
        self.openai_serving_completion = OpenAIServingCompletion(self.vllm_engine)
        self.ready = True
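
The guard matters because `torch.cuda.device_count()` returns 0 on a CPU-only host, and a `tensor_parallel_size` of 0 would fail when the engine is constructed; skipping the assignment keeps vLLM's default of 1. A minimal sketch of the same logic, assuming vLLM's `AsyncEngineArgs`:

import torch
from vllm.engine.arg_utils import AsyncEngineArgs

engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # tensor_parallel_size defaults to 1
if torch.cuda.is_available():
    # Shard across all visible GPUs only when CUDA is actually present.
    engine_args.tensor_parallel_size = torch.cuda.device_count()
# On a CPU-only host, the default of 1 is kept, which the CPU backend expects.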
55 changes: 53 additions & 2 deletions test/e2e/predictor/test_huggingface.py
@@ -33,14 +33,12 @@
@pytest.mark.llm
def test_huggingface_openai_chat_completions():
    service_name = "hf-opt-125m-chat"
-    protocol_version = "v2"
    predictor = V1beta1PredictorSpec(
        min_replicas=1,
        model=V1beta1ModelSpec(
            model_format=V1beta1ModelFormat(
                name="huggingface",
            ),
-            protocol_version=protocol_version,
            args=[
                "--model_id",
                "facebook/opt-125m",

@@ -279,3 +277,56 @@ def test_huggingface_openai_text_2_text():
    assert res["choices"][0]["text"] == "Das ist für Deutschland"

    kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)


@pytest.mark.local
@pytest.mark.llm
def test_vllm_openai_chat_completions():
    service_name = "hf-opt-125m-chat"
    predictor = V1beta1PredictorSpec(
        min_replicas=1,
        model=V1beta1ModelSpec(
            model_format=V1beta1ModelFormat(
                name="huggingface",
            ),
            args=[
                "--model_id",
                "facebook/opt-125m",
                "--model_revision",
                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
                "--tokenizer_revision",
                "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
                "--backend",
                "vllm",
                "--device",
                "cpu",
            ],
            resources=V1ResourceRequirements(
                requests={"cpu": "1", "memory": "2Gi"},
                limits={"cpu": "1", "memory": "4Gi"},
            ),
        ),
    )

    isvc = V1beta1InferenceService(
        api_version=constants.KSERVE_V1BETA1,
        kind=constants.KSERVE_KIND,
        metadata=client.V1ObjectMeta(
            name=service_name, namespace=KSERVE_TEST_NAMESPACE
        ),
        spec=V1beta1InferenceServiceSpec(predictor=predictor),
    )

    kserve_client = KServeClient(
        config_file=os.environ.get("KUBECONFIG", "~/.kube/config")
    )
    kserve_client.create(isvc)
    kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE)

    res = generate(service_name, "./data/opt_125m_input_generate.json")
    assert (
        res["choices"][0]["message"]["content"]
        == "I'm not sure if this is a good idea, but I'm not sure if I should"
    )

    kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
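
The fixture `./data/opt_125m_input_generate.json` is not included in this diff. Since the assertion reads `choices[0].message.content`, it is presumably an OpenAI-style chat completions request; a hypothetical shape, for illustration only (all fields are assumptions):

{
  "messages": [{"role": "user", "content": "..."}],
  "max_tokens": 20,
  "temperature": 0
}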
