Commit

Sync with [email protected]

dtrifiro committed Jul 23, 2024
2 parents e7a4a41 + bb2fc08 commit c2ef5a8
Showing 282 changed files with 11,844 additions and 3,885 deletions.
14 changes: 0 additions & 14 deletions .buildkite/download-images.sh

This file was deleted.

@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.905
- name: "exact_match,flexible-extract"
value: 0.905
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.752
- name: "exact_match,flexible-extract"
value: 0.754
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.753
- name: "exact_match,flexible-extract"
value: 0.753
limit: 1000
num_fewshot: 5
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.758
- name: "exact_match,flexible-extract"
value: 0.759
limit: 1000
num_fewshot: 5
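
The four YAML files above are accuracy baselines: each records the GSM8K exact-match scores expected from a given quantized checkpoint, together with the sample limit and few-shot count used to produce them. A minimal sketch of how such a baseline could be checked against fresh lm-eval results follows; the helper name, the tolerance, and the shape of the measured results are illustrative assumptions, not necessarily what vLLM's lm-eval-harness test actually does.

import yaml

RTOL = 0.02  # assumed relative tolerance; the real harness may use a different value

def check_baseline(config_path, measured):
    # `measured` is assumed to look like:
    #   {"gsm8k": {"exact_match,strict-match": 0.75, "exact_match,flexible-extract": 0.76}}
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    for task in cfg["tasks"]:
        for metric in task["metrics"]:
            expected = metric["value"]
            got = measured[task["name"]][metric["name"]]
            assert got >= expected * (1 - RTOL), (
                f"{cfg['model_name']} {task['name']}/{metric['name']}: "
                f"expected ~{expected}, got {got}"
            )
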
1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/configs/models-large.txt
@@ -1,3 +1,4 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
2 changes: 2 additions & 0 deletions .buildkite/lm-eval-harness/configs/models-small.txt
@@ -2,4 +2,6 @@ Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE
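
The lm_eval call above (this hunk drops add_bos_token=true from model_args) has a rough Python-API equivalent. The sketch below assumes lm-eval >= 0.4 with the vllm backend installed; the argument names simply mirror the CLI flags and are not taken from anything in this repository.

import lm_eval

# Mirrors the shell invocation above for one of the small-model baselines.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform,"
        "tensor_parallel_size=1,distributed_executor_backend=ray,"
        "trust_remote_code=true,max_model_len=4096"
    ),
    tasks=["gsm8k"],
    num_fewshot=5,
    limit=1000,
    batch_size="auto",
)
print(results["results"]["gsm8k"])
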
4 changes: 3 additions & 1 deletion .buildkite/release-pipeline.yaml
@@ -3,13 +3,15 @@ steps:
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
7 changes: 7 additions & 0 deletions .buildkite/run-amd-test.sh
@@ -66,11 +66,18 @@ trap remove_docker_container EXIT

echo "--- Running container"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"

docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${@}"
16 changes: 16 additions & 0 deletions .buildkite/run-tpu-test.sh
@@ -0,0 +1,16 @@
set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
python3 /workspace/vllm/examples/offline_inference_tpu.py
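
The TPU test finishes by running examples/offline_inference_tpu.py inside the container. For orientation, a minimal sketch of what an offline-inference example of this kind typically looks like with vLLM's Python API is below; the prompts, sampling settings, and model name are placeholders, not the script's actual contents.

from vllm import LLM, SamplingParams

# Placeholder prompts; the real example ships its own.
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=64)

llm = LLM(model="facebook/opt-125m")  # assumed small model; the TPU example may use a different one
outputs = llm.generate(prompts, sampling_params)
for out in outputs:
    print(out.prompt, "->", out.outputs[0].text)
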
29 changes: 13 additions & 16 deletions .buildkite/test-pipeline.yaml
@@ -12,7 +12,6 @@ steps:
fast_check_only: true
commands:
- pytest -v -s async_engine # Async Engine
- bash ../.buildkite/download-images.sh # Inputs
- pytest -v -s test_inputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
@@ -22,7 +21,7 @@
fast_check: true
fast_check_only: true
commands:
- apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
- apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
@@ -45,16 +44,18 @@
mirror_hardwares: [amd]
fast_check: true
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
mirror_hardwares: [amd]
fast_check: true
commands:
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py

@@ -73,7 +74,7 @@
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

@@ -82,10 +83,11 @@
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- bash ../.buildkite/download-images.sh
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
@@ -110,23 +112,19 @@
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py

- pytest -v -s distributed/test_pipeline_parallel.py

- label: Engine Test
mirror_hardwares: [amd]
commands:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
@@ -147,6 +145,7 @@
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
@@ -155,7 +154,6 @@
- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

@@ -175,7 +173,6 @@
- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models -m vlm

- label: Prefix Caching Test
@@ -225,7 +222,7 @@
- label: Tensorizer Test
#mirror_hardwares: [amd]
commands:
- apt-get install curl libsodium23
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader

21 changes: 0 additions & 21 deletions .github/workflows/add_label_automerge.yml

This file was deleted.

23 changes: 0 additions & 23 deletions .github/workflows/add_label_ready_comment.yml

This file was deleted.

6 changes: 4 additions & 2 deletions CMakeLists.txt
@@ -33,7 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")

#
# Try to find python package with an executable that exactly matches
@@ -101,7 +101,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
else()
@@ -151,6 +151,7 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -171,6 +172,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
