[pull] main from vllm-project:main #11

Merged 65 commits on May 7, 2024. Changes from all commits are shown below.

Commits (65):
4bb53e2  [BugFix] fix num_lookahead_slots missing in async executor (#4165) (leiwen83, Apr 30, 2024)
b31a1fb  [Doc] add visualization for multi-stage dockerfile (#4456) (prashantgupta24, Apr 30, 2024)
111815d  [Kernel] Support Fp8 Checkpoints (Dynamic + Static) (#4332) (robertgshaw2-neuralmagic, Apr 30, 2024)
a494140  [Frontend] Support complex message content for chat completions endpo… (fgreinacher, Apr 30, 2024)
715c2d8  [Frontend] [Core] Tensorizer: support dynamic `num_readers`, update v… (alpayariyak, Apr 30, 2024)
dd1a50a  [Bugfix][Minor] Make ignore_eos effective (#4468) (bigPYJ1151, Apr 30, 2024)
6ad58f4  fix_tokenizer_snapshot_download_bug (#4493) (kingljl, Apr 30, 2024)
ee37328  Unable to find Punica extension issue during source code installation… (kingljl, May 1, 2024)
2e240c6  [Core] Centralize GPU Worker construction (#4419) (njhill, May 1, 2024)
f458112  [Misc][Typo] type annotation fix (#4495) (HarryWu99, May 1, 2024)
a822eb3  [Misc] fix typo in block manager (#4453) (Juelianqvq, May 1, 2024)
c3845d8  Allow user to define whitespace pattern for outlines (#4305) (robcaulk, May 1, 2024)
d6f4bd7  [Misc]Add customized information for models (#4132) (jeejeelee, May 1, 2024)
6f1df80  [Test] Add ignore_eos test (#4519) (rkooo567, May 1, 2024)
a88bb9b  [Bugfix] Fix the fp8 kv_cache check error that occurs when failing to… (AnyISalIn, May 1, 2024)
4dc8026  [Bugfix] Fix 307 Redirect for `/metrics` (#4523) (robertgshaw2-neuralmagic, May 1, 2024)
e491c7e  [Doc] update(example model): for OpenAI compatible serving (#4503) (fpaupier, May 1, 2024)
6990912  [Bugfix] Use random seed if seed is -1 (#4531) (sasha0552, May 1, 2024)
8b798ee  [CI/Build][Bugfix] VLLM_USE_PRECOMPILED should skip compilation (#4534) (tjohnson31415, May 1, 2024)
b38e42f  [Speculative decoding] Add ngram prompt lookup decoding (#4237) (leiwen83, May 1, 2024)
24750f4  [Core] Enable prefix caching with block manager v2 enabled (#4142) (leiwen83, May 1, 2024)
a657bfc  [Core] Add `multiproc_worker_utils` for multiprocessing-based workers… (njhill, May 1, 2024)
24bb4fe  [Kernel] Update fused_moe tuning script for FP8 (#4457) (pcmoritz, May 1, 2024)
c47ba4a  [Bugfix] Add validation for seed (#4529) (sasha0552, May 1, 2024)
3a922c1  [Bugfix][Core] Fix and refactor logging stats (#4336) (esmeetu, May 1, 2024)
6ef09b0  [Core][Distributed] fix pynccl del error (#4508) (youkaichao, May 1, 2024)
c9d852d  [Misc] Remove Mixtral device="cuda" declarations (#4543) (pcmoritz, May 1, 2024)
826b82a  [Misc] Fix expert_ids shape in MoE (#4517) (WoosukKwon, May 1, 2024)
b8afa8b  [MISC] Rework logger to enable pythonic custom logging configuration … (May 2, 2024)
0d62fe5  [Bug fix][Core] assert num_new_tokens == 1 fails when SamplingParams.… (rkooo567, May 2, 2024)
5e401bc  [CI]Add regression tests to ensure the async engine generates metrics… (ronensc, May 2, 2024)
cf8cac8  [mypy][6/N] Fix all the core subdirectory typing (#4450) (rkooo567, May 2, 2024)
2a85f93  [Core][Distributed] enable multiple tp group (#4512) (youkaichao, May 2, 2024)
7038e8b  [Kernel] Support running GPTQ 8-bit models in Marlin (#4533) (alexm-neuralmagic, May 2, 2024)
fb087af  [mypy][7/N] Cover all directories (#4555) (rkooo567, May 2, 2024)
5ad60b0  [Misc] Exclude the `tests` directory from being packaged (#4552) (itechbear, May 2, 2024)
1ff0c73  [BugFix] Include target-device specific requirements.txt in sdist (#4… (markmc, May 2, 2024)
5b8a7c1  [Misc] centralize all usage of environment variables (#4548) (youkaichao, May 2, 2024)
32881f3  [kernel] fix sliding window in prefix prefill Triton kernel (#4405) (mmoskal, May 2, 2024)
9b5c9f9  [CI/Build] AMD CI pipeline with extended set of tests. (#4267) (Alexei-V-Ivanov-AMD, May 2, 2024)
0f8a914  [Core] Ignore infeasible swap requests. (#4557) (rkooo567, May 2, 2024)
344a5d0  [Core][Distributed] enable allreduce for multiple tp groups (#4566) (youkaichao, May 3, 2024)
808632d  [BugFix] Prevent the task of `_force_log` from being garbage collecte… (Atry, May 3, 2024)
ce3f1ee  [Misc] remove chunk detected debug logs (#4571) (DefTruth, May 3, 2024)
2d7bce9  [Doc] add env vars to the doc (#4572) (youkaichao, May 3, 2024)
3521ba4  [Core][Model runner refactoring 1/N] Refactor attn metadata term (#4518) (rkooo567, May 3, 2024)
7e65477  [Bugfix] Allow "None" or "" to be passed to CLI for string args that … (mgoin, May 3, 2024)
f8e7add  Fix/async chat serving (#2727) (schoennenbeck, May 3, 2024)
43c413e  [Kernel] Use flashinfer for decoding (#4353) (LiuXiaoxuanPKU, May 3, 2024)
ab50275  [Speculative decoding] Support target-model logprobs (#4378) (cadedaniel, May 3, 2024)
344bf7c  [Misc] add installation time env vars (#4574) (youkaichao, May 3, 2024)
bc8ad68  [Misc][Refactor] Introduce ExecuteModelData (#4540) (comaniac, May 4, 2024)
36fb68f  [Doc] Chunked Prefill Documentation (#4580) (rkooo567, May 4, 2024)
2a05201  [Kernel] Support MoE Fp8 Checkpoints for Mixtral (Static Weights with… (mgoin, May 4, 2024)
021b1a2  [CI] check size of the wheels (#4319) (simon-mo, May 4, 2024)
4302987  [Bugfix] Fix inappropriate content of model_name tag in Prometheus me… (DearPlanet, May 4, 2024)
8d8357c  bump version to v0.4.2 (#4600) (simon-mo, May 5, 2024)
c7f2cf2  [CI] Reduce wheel size by not shipping debug symbols (#4602) (simon-mo, May 5, 2024)
0650e59  Disable cuda version check in vllm-openai image (#4530) (zhaoyang-star, May 5, 2024)
323f27b  [Bugfix] Fix `asyncio.Task` not being subscriptable (#4623) (DarkLight1337, May 6, 2024)
e186d37  [CI] use ccache actions properly in release workflow (#4629) (simon-mo, May 6, 2024)
19cb471  [CI] Add retry for agent lost (#4633) (cadedaniel, May 6, 2024)
bd99d22  Update lm-format-enforcer to 0.10.1 (#4631) (noamgat, May 6, 2024)
a98187c  [Kernel] Make static FP8 scaling more robust (#4570) (pcmoritz, May 7, 2024)
63575bc  [Core][Optimization] change python dict to pytorch tensor (#4607) (youkaichao, May 7, 2024)
Files changed:
36 changes: 36 additions & 0 deletions .buildkite/check-wheel-size.py (new file)

@@ -0,0 +1,36 @@
import os
import zipfile

MAX_SIZE_MB = 100


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
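The Dockerfile change at the bottom of this diff wires this script into the image build (`RUN python3 check-wheel-size.py dist`), so an oversized wheel fails the build early. The same invocation works as a local sanity check after building a wheel:

```bash
# run the size check against the directory that setup.py writes wheels to
python3 .buildkite/check-wheel-size.py dist
```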
58 changes: 25 additions & 33 deletions .buildkite/run-amd-test.sh

@@ -1,10 +1,11 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script build the ROCm docker image and runs test inside it.
 set -ex
 
 # Print ROCm version
 echo "--- ROCm info"
 rocminfo
 
+echo "--- Resetting GPUs"
+
 echo "reset" > /opt/amdgpu/etc/gpu_state
 
@@ -16,37 +17,28 @@ while true; do
         fi
 done
 
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-export HIP_VISIBLE_DEVICES=1
-docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-  timeout=300
-  counter=0
-
-  while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-    sleep 1
-    counter=$((counter + 1))
-    if [ $counter -ge $timeout ]; then
-      echo "Timeout after $timeout seconds"
-      break
-    fi
-  done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-  localhost:8000/generate \
-  -d '{"prompt": "San Francisco is a"}'
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+container_name=rocm_${sha}
+docker build \
+  -t ${container_name} \
+  -f Dockerfile.rocm \
+  --progress plain \
+  .
+
+remove_docker_container() {
+  docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+docker run \
+  --device /dev/kfd --device /dev/dri \
+  --network host \
+  --rm \
+  -e HF_TOKEN \
+  --name ${container_name} \
+  ${container_name} \
+  /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
5 changes: 5 additions & 0 deletions .buildkite/run-benchmarks.sh

@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
 tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md
 
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+    exit 0
+fi
+
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
23 changes: 21 additions & 2 deletions .buildkite/test-pipeline.yaml

@@ -17,27 +17,38 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Core Test
+  mirror_hardwares: [amd]
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
   command: pytest -v -s test_comm_ops.py
   working_dir: "/vllm-workspace/tests/distributed"
-  num_gpus: 2 # only support 1 or 2 for now.
+  num_gpus: 2
 
 - label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
 
   num_gpus: 2 # only support 1 or 2 for now.
+  mirror_hardwares: [amd]
 
   commands:
   - pytest -v -s test_pynccl.py
   - pytest -v -s test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
+- label: Distributed Tests (Multiple Groups)
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 4
+  commands:
+  - pytest -v -s test_pynccl.py
+
 - label: Engine Test
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
@@ -48,6 +59,7 @@
 
 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
   commands:
   # install aws cli for llava_example.py
   - pip install awscli
@@ -61,29 +73,35 @@
   parallelism: 4
 
 - label: Models Test
+  mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
   - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 
 - label: Llava Test
+  mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
   - pytest -v -s models/test_llava.py
 
 - label: Prefix Caching Test
+  mirror_hardwares: [amd]
   commands:
   - pytest -v -s prefix_caching
 
 - label: Samplers Test
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_logits_processor.py
 
 - label: Worker Test
+  mirror_hardwares: [amd]
   command: pytest -v -s worker
 
 - label: Speculative decoding tests
+  mirror_hardwares: [amd]
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
@@ -101,6 +119,7 @@
 
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh
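Every `mirror_hardwares: [amd]` annotation above is consumed by the updated template in the next file: any step carrying it gets cloned into the new AMD group. A minimal sketch of opting a future step in (the label and test file are hypothetical):

```yaml
- label: My Feature Test              # hypothetical step
  mirror_hardwares: [amd]             # also run this step on the AMD queue
  command: pytest -v -s test_my_feature.py
```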
28 changes: 23 additions & 5 deletions .buildkite/test-template.j2

@@ -14,20 +14,33 @@ steps:
       automatic:
         - exit_status: -1 # Agent was lost
           limit: 5
+        - exit_status: -10 # Agent was lost
+          limit: 5
   - wait
 
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+        env:
+          DOCKER_BUILDKIT: "1"
+    {% endif %}
+    {% endfor %}
 
   - label: "Neuron Test"
+    depends_on: ~
     agents:
       queue: neuron
     command: bash .buildkite/run-neuron-test.sh
     soft_fail: true
 
-  - label: "CPU Test"
+  - label: "Intel Test"
+    depends_on: ~
     command: bash .buildkite/run-cpu-test.sh
 
 {% for step in steps %}
@@ -42,9 +55,14 @@
       automatic:
         - exit_status: -1 # Agent was lost
           limit: 5
+        - exit_status: -10 # Agent was lost
+          limit: 5
     plugins:
       - kubernetes:
           podSpec:
+            {% if step.num_gpus %}
+            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+            {% endif %}
            volumes:
              - name: dshm
                emptyDir:
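For a concrete picture of the loop above, the Core Test step from the pipeline would render roughly as follows, assuming `default_working_dir` resolves to `/vllm-workspace/tests` (an assumption; the actual default lives in the pipeline configuration):

```yaml
# hypothetical rendered output for the mirrored "Core Test" step
- label: "AMD: Core Test"
  agents:
    queue: amd
  command: bash .buildkite/run-amd-test.sh "'cd /vllm-workspace/tests && pytest -v -s core'"
  env:
    DOCKER_BUILDKIT: "1"
```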
8 changes: 4 additions & 4 deletions .github/workflows/mypy.yaml

@@ -33,6 +33,7 @@ jobs:
     - name: Mypy
       run: |
         mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
         mypy vllm/distributed --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
@@ -42,9 +43,8 @@
         mypy vllm/engine --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
         mypy vllm/spec_decode --config-file pyproject.toml
-        mypy vllm/lora --config-file pyproject.toml
-        mypy vllm/model_executor --config-file pyproject.toml
-
-        # TODO(sang): Fix nested dir
-        mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+        mypy vllm/model_executor --config-file pyproject.toml
5 changes: 5 additions & 0 deletions .github/workflows/publish.yml

@@ -58,6 +58,9 @@
 
     - name: Setup ccache
       uses: hendrikmuhs/ccache-action@v1.2
+      with:
+        create-symlink: true
+        key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
 
     - name: Set up Linux Env
       if: ${{ runner.os == 'Linux' }}
@@ -79,6 +82,8 @@
 
     - name: Build wheel
       shell: bash
+      env:
+        CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
       run: |
         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
         wheel_name=$(ls dist/*whl | xargs -n 1 basename)
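Setting `CMAKE_BUILD_TYPE: Release` drops the `-g` debug information that a debug-friendly build type would otherwise embed in the compiled extensions, which is where the wheel-size reduction in this PR comes from. A rough way to gauge the effect on an already-built extension (the file name is hypothetical):

```bash
# compare an extension's size before and after stripping debug info
du -h vllm/_C.abi3.so
strip --strip-debug vllm/_C.abi3.so
du -h vllm/_C.abi3.so
```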
2 changes: 1 addition & 1 deletion .github/workflows/scripts/create_release.js

@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
     generate_release_notes: true,
     name: process.env.RELEASE_TAG,
     owner: context.repo.owner,
-    prerelease: false,
+    prerelease: true,
     repo: context.repo.repo,
     tag_name: process.env.RELEASE_TAG,
   });
16 changes: 12 additions & 4 deletions Dockerfile

@@ -1,9 +1,13 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
+# Please update any changes made here to
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip git
@@ -12,7 +16,7 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
 
 WORKDIR /workspace
 
@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     python3 setup.py bdist_wheel --dist-dir=dist
 
+# check the size of the wheel, we cannot upload wheels larger than 100MB
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
+
 # the `vllm_nccl` package must be installed from source distribution
 # pip is too smart to store a wheel in the cache, and other CI jobs
 # will directly use the wheel from the cache, which is not what we want.
@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
 WORKDIR /vllm-workspace
 
 RUN apt-get update -y \
@@ -108,7 +116,7 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
 
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
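A minimal sketch of exercising the updated multi-stage build locally, assuming the final stage keeps its OpenAI-server entrypoint (the tag and model are illustrative):

```bash
# build through all stages; the new check fails the build early
# if the wheel exceeds the 100 MB limit
DOCKER_BUILDKIT=1 docker build -t vllm-local .

# run the OpenAI-compatible server from the resulting image
docker run --gpus all -p 8000:8000 vllm-local --model facebook/opt-125m
```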