diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 4f1729d46dae2..1a4dae8f65e99 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -9,36 +9,33 @@ CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu . # Setup cleanup -remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } +remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 # offline inference - docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference.py" + python3 examples/offline_inference/offline_inference.py" # Run basic model test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pip install -r vllm/requirements-test.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model @@ -46,26 +43,26 @@ function cpu_tests() { pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ 
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" # online inference - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 4fc6d089cc666..1e5ff77895a38 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference.py + python3 examples/offline_inference/offline_inference.py ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index fa4f74fca7a11..a50570ab53438 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py \ No newline at end of file diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index d4e0cc963dfc3..a75af91a577c8 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6b12f424fd828..380f7a44a429a 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 770dad6ffa3a1..13605a3e97142 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -14,4 +14,4 @@ remove_docker_container # For HF_TOKEN. 
source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index e0a12afbe7320..160e10aa3bb9b 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference.py - python3 examples/offline_inference_cli.py -tp 2 + python3 examples/offline_inference/offline_inference.py + python3 examples/offline_inference/offline_inference_cli.py -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dcfe228ce8eae..f883595f6d9ad 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -38,7 +38,7 @@ steps: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/dev/sampling_params.html + - grep \"sig sig-object py\" build/html/api/params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true @@ -187,19 +187,19 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference.py - - python3 cpu_offload.py - - python3 offline_inference_chat.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference_encoder_decoder.py - - python3 offline_inference_classification.py - - python3 offline_inference_embedding.py - - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/offline_inference.py + - python3 offline_inference/cpu_offload.py + - python3 offline_inference/offline_inference_chat.py + - python3 offline_inference/offline_inference_with_prefix.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/offline_inference_vision_language.py + - python3 
offline_inference/offline_inference_vision_language_multi_image.py + - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/offline_inference_encoder_decoder.py + - python3 offline_inference/offline_inference_classification.py + - python3 offline_inference/offline_inference_embedding.py + - python3 offline_inference/offline_inference_scoring.py + - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d2060d..ee768db63c96c 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -27,7 +27,7 @@ jobs: version: v3.10.1 - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm + run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - name: Setup minio run: | @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | diff --git a/.gitignore b/.gitignore index bb7e4d5b244a8..89dab8f13bab1 100644 --- a/.gitignore +++ b/.gitignore @@ -79,10 +79,7 @@ instance/ # Sphinx documentation docs/_build/ -docs/source/getting_started/examples/*.rst -!**/*.template.rst -docs/source/getting_started/examples/*.md -!**/*.template.md +docs/source/getting_started/examples/ # 
PyBuilder .pybuilder/ diff --git a/Dockerfile b/Dockerfile index 088314eb38dbe..4542bc9cf0bd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.md and -# docs/source/assets/dev/dockerfile-stages-dependency.png +# docs/source/contributing/dockerfile/dockerfile.md and +# docs/source/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### @@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image # define sagemaker first, so it is not default from `docker build` FROM vllm-openai-base AS vllm-sagemaker -COPY examples/sagemaker-entrypoint.sh . +COPY examples/online_serving/sagemaker-entrypoint.sh . RUN chmod +x sagemaker-entrypoint.sh ENTRYPOINT ["./sagemaker-entrypoint.sh"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 8bd188ffde408..32bcbfa9cc168 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install -U pip # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 971248577983f..d3cd1c7b313bc 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -4,7 +4,7 @@ USER root ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" -RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 +RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev # Some packages in requirements-cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba @@ -18,9 +18,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi -# These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ + RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ diff --git a/README.md b/README.md index 652268ec29cac..1f82229f39537 100644 --- a/README.md +++ b/README.md @@ -90,28 +90,33 @@ vLLM is a community project. Our compute resources for development and testing a - +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. 
diff --git a/SECURITY.md b/SECURITY.md index ad3f1f16ab560..de0032d26c87b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). --- diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 0a14aedd5feba..e669ce4db299d 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -52,7 +52,7 @@ def run_to_completion(profile_dir: Optional[str] = None): llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - print(p.key_averages()) + print(p.key_averages().table(sort_by="self_cuda_time_total")) else: start_time = time.perf_counter() llm.generate(dummy_prompts, diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 68f7ca1af05ad..714abca2a5ff7 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(MACOSX_FOUND TRUE) +endif() + + # # Define environment variables for special configurations # @@ -13,6 +18,9 @@ endif() include_directories("${CMAKE_SOURCE_DIR}/csrc") + +set (ENABLE_NUMA TRUE) + # # Check the compile flags # @@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") "-mf16c" ) endif() -list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -execute_process(COMMAND cat /proc/cpuinfo - RESULT_VARIABLE CPUINFO_RET - OUTPUT_VARIABLE CPUINFO) +if(MACOSX_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-Xpreprocessor" + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +endif() -if (NOT CPUINFO_RET EQUAL 0) - message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") +if (NOT MACOSX_FOUND) + execute_process(COMMAND cat /proc/cpuinfo + RESULT_VARIABLE CPUINFO_RET + OUTPUT_VARIABLE CPUINFO) + if (NOT CPUINFO_RET EQUAL 0) + message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo") + endif() endif() + function (find_isa CPUINFO TARGET OUT) string(FIND ${CPUINFO} ${TARGET} ISA_FOUND) if(NOT ISA_FOUND EQUAL -1) @@ -54,12 +72,17 @@ endfunction() is_avx512_disabled(AVX512_DISABLED) -find_isa(${CPUINFO} "avx2" AVX2_FOUND) -find_isa(${CPUINFO} "avx512f" AVX512_FOUND) -find_isa(${CPUINFO} "POWER10" POWER10_FOUND) -find_isa(${CPUINFO} "POWER9" POWER9_FOUND) -find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support -find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(APPLE_SILICON_FOUND TRUE) +else() + find_isa(${CPUINFO} "avx2" AVX2_FOUND) + find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "POWER10" POWER10_FOUND) + find_isa(${CPUINFO} "POWER9" POWER9_FOUND) + find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support + find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 
support +endif() + if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -103,6 +126,9 @@ elseif (ASIMD_FOUND) set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") endif() list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +elseif(APPLE_SILICON_FOUND) + message(STATUS "Apple Silicon Detected") + set(ENABLE_NUMA OFF) else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() @@ -139,7 +165,12 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -list(APPEND LIBS numa) +if(ENABLE_NUMA) + list(APPEND LIBS numa) +else() + message(STATUS "NUMA is disabled") + add_compile_definitions(-DVLLM_NUMA_DISABLED) +endif() # # _C extension diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 73e0f8cb2e0fb..ae062a5b86892 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec { vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); } } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. if (remainder > 0) { float16x8_t temp = reg.val[full_blocks]; - for (int i = 0; i < remainder; ++i) { - reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) + { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; } } } diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index 1138a55df2f05..42a1c1d924bac 100644 --- 
a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -1,10 +1,22 @@ -#include -#include -#include -#include +#ifndef VLLM_NUMA_DISABLED + #include + #include + #include + #include +#endif #include "cpu_types.hpp" +#ifdef VLLM_NUMA_DISABLED +std::string init_cpu_threads_env(const std::string& cpu_ids) { + return std::string( + "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has " + "no effect to setup thread affinity."); +} + +#endif + +#ifndef VLLM_NUMA_DISABLED std::string init_cpu_threads_env(const std::string& cpu_ids) { bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); TORCH_CHECK(omp_cpu_mask->size > 0); @@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { omp_lock_t writelock; omp_init_lock(&writelock); -#pragma omp parallel for schedule(static, 1) + #pragma omp parallel for schedule(static, 1) for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { cpu_set_t mask; CPU_ZERO(&mask); @@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { return ss.str(); } +#endif \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf1020d5..5b801f79d1f26 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -18,3 +18,7 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf "$(SOURCEDIR)/getting_started/examples" diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 25a700033cc9e..64cf6ef8fc19d 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -3,6 +3,7 @@ sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 myst-parser==3.0.1 sphinx-argparse==0.4.0 +sphinx-togglebutton==0.3.2 msgspec cloudpickle diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md similarity index 100% rename from docs/source/dev/engine/async_llm_engine.md rename to docs/source/api/engine/async_llm_engine.md diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md similarity index 100% rename from docs/source/dev/engine/engine_index.md rename to docs/source/api/engine/index.md diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md similarity index 100% rename from docs/source/dev/engine/llm_engine.md rename to docs/source/api/engine/llm_engine.md diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md new file mode 100644 index 0000000000000..51e24795a34cf --- /dev/null +++ b/docs/source/api/multimodal/index.md @@ -0,0 +1,28 @@ +(multi-modality)= + +# Multi-Modality + +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. + +Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). + +## Module Contents + +```{eval-rst} +.. 
autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` + +## Submodules + +```{toctree} +:maxdepth: 1 + +inputs +parse +processing +profiling +registry +``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md new file mode 100644 index 0000000000000..3d89666113229 --- /dev/null +++ b/docs/source/api/multimodal/inputs.md @@ -0,0 +1,49 @@ +# Input Definitions + +## User-facing inputs + +```{eval-rst} +.. autodata:: vllm.multimodal.MultiModalDataDict +``` + +## Internal data structures + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.PlaceholderRange + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.NestedTensors +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 + :members: + :show-inheritance: +``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md new file mode 100644 index 0000000000000..4676139efe626 --- /dev/null +++ b/docs/source/api/multimodal/parse.md @@ -0,0 +1,9 @@ +# Data Parsing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.parse + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md new file mode 100644 index 0000000000000..0d81c8d3966ee --- /dev/null +++ b/docs/source/api/multimodal/processing.md @@ -0,0 +1,9 @@ +# Data Processing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.processing + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md new file mode 100644 index 0000000000000..b455145212202 --- /dev/null +++ b/docs/source/api/multimodal/profiling.md @@ -0,0 +1,9 @@ +# Memory Profiling + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.profiling + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md new file mode 100644 index 0000000000000..0737a4385cf32 --- /dev/null +++ b/docs/source/api/multimodal/registry.md @@ -0,0 +1,9 @@ +# Registry + +## Module Contents + +```{eval-rst} +.. 
automodule:: vllm.multimodal.registry + :members: + :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md similarity index 100% rename from docs/source/dev/offline_inference/offline_index.md rename to docs/source/api/offline_inference/index.md diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md similarity index 100% rename from docs/source/dev/offline_inference/llm.md rename to docs/source/api/offline_inference/llm.md diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md similarity index 100% rename from docs/source/dev/offline_inference/llm_inputs.md rename to docs/source/api/offline_inference/llm_inputs.md diff --git a/docs/source/api/params.md b/docs/source/api/params.md new file mode 100644 index 0000000000000..a3b4d9cbb44ec --- /dev/null +++ b/docs/source/api/params.md @@ -0,0 +1,22 @@ +# Optional Parameters + +Optional parameters for vLLM APIs. + +(sampling-params)= + +## Sampling Parameters + +```{eval-rst} +.. autoclass:: vllm.SamplingParams + :members: +``` + +(pooling-params)= + +## Pooling Parameters + +```{eval-rst} +.. autoclass:: vllm.PoolingParams + :members: +``` + diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png similarity index 100% rename from docs/source/assets/dev/dockerfile-stages-dependency.png rename to docs/source/assets/contributing/dockerfile-stages-dependency.png diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index c6f83b3a92ca0..9d2af4c13b088 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -5,26 +5,32 @@ vLLM is a community project. Our compute resources for development and testing a +Cash Donations: - a16z +- Dropbox +- Sequoia Capital +- Skywork AI +- ZhenFund + +Compute Resources: - AMD - Anyscale - AWS - Crusoe Cloud - Databricks - DeepInfra -- Dropbox - Google Cloud - Lambda Lab - Nebius +- Novita AI - NVIDIA - Replicate - Roblox - RunPod -- Sequoia Capital -- Skywork AI - Trainy - UC Berkeley - UC San Diego -- ZhenFund + +Slack Sponsor: Anyscale We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. diff --git a/docs/source/conf.py b/docs/source/conf.py index 71394c5302a39..1ce11fe057071 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,10 @@ "sphinx.ext.autosummary", "myst_parser", "sphinxarg.ext", + "sphinx_togglebutton", +] +myst_enable_extensions = [ + "colon_fence", ] # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 38ea956ba8dfb..cb142318b8724 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -17,7 +17,7 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > ```{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 46210957c19ec..97de40ff469f1 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. ### OpenAI Server diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md new file mode 100644 index 0000000000000..422dc13e6a644 --- /dev/null +++ b/docs/source/contributing/vulnerability_management.md @@ -0,0 +1,43 @@ +# Vulnerability Management + +## Reporting Vulnerabilities + +As mentioned in the [security +policy](https://github.com/vllm-project/vllm/tree/main/SECURITY.md), security +vulnerabilities may be reported privately to the project via +[GitHub](https://github.com/vllm-project/vllm/security/advisories/new). + +## Vulnerability Management Team + +Once a vulnerability has been reported to the project, the Vulnerability +Management Team (VMT) is responsible for managing the vulnerability. The VMT is +responsible for: + +- Triaging the vulnerability. +- Coordinating with reporters and project maintainers on vulnerability analysis + and resolution. +- Drafting of security advisories for confirmed vulnerabilities, as appropriate. +- Coordination with project maintainers on a coordinated release of the fix and + security advisory. + +### Security Advisories + +Advisories are published via GitHub through the same system used to report +vulnerabilities. More information on the process can be found in the [GitHub +documentation](https://docs.github.com/en/code-security/security-advisories/working-with-repository-security-advisories/about-repository-security-advisories). + +### Team Members + +We prefer to keep all vulnerability-related communication on the security report +on GitHub. However, if you need to contact the VMT directly for an urgent issue, +you may contact the following individuals: + +- Simon Mo - simon.mo@hey.com +- Russell Bryant - rbryant@redhat.com + +## Slack Discussion + +You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) +to discuss security-related topics. However, please do not disclose any +vulnerabilities in this channel. If you need to report a vulnerability, please +use the GitHub security advisory system or contact a VMT member privately. diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index f02a943026922..657e7f2bc72cc 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -61,7 +61,7 @@ run: | echo 'Starting gradio server...' 
git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://localhost:8081/v1 \ @@ -321,7 +321,7 @@ run: | echo 'Starting gradio server...' git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ + python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ -m $MODEL_NAME \ --port 8811 \ --model-url http://$ENDPOINT/v1 \ diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 5e0dd021ad02e..cec503ef2f77d 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -53,7 +53,7 @@ for output in outputs: ``` More API details can be found in the {doc}`Offline Inference -` section of the API docs. +` section of the API docs. The code for the `LLM` class can be found in . diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md deleted file mode 100644 index bcccd284879bb..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.md +++ /dev/null @@ -1,16 +0,0 @@ -(adding-multimodal-plugin)= - -# Adding a Multimodal Plugin - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. - -```{note} -This article is a work in progress. -``` - -% TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md deleted file mode 100644 index e4f2171e84ff7..0000000000000 --- a/docs/source/design/multimodal/multimodal_index.md +++ /dev/null @@ -1,83 +0,0 @@ -(multi-modality)= - -# Multi-Modality - -```{eval-rst} -.. currentmodule:: vllm.multimodal -``` - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following [this guide](#adding-multimodal-plugin). - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). - -## Guides - -```{toctree} -:maxdepth: 1 - -adding_multimodal_plugin -``` - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal -``` - -### Registry - -```{eval-rst} -.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalRegistry - :members: - :show-inheritance: -``` - -### Base Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.base - :members: - :show-inheritance: -``` - -### Input Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: -``` - -### Audio Classes - -```{eval-rst} -.. 
automodule:: vllm.multimodal.audio - :members: - :show-inheritance: -``` - -### Image Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: -``` - -### Video Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.video - :members: - :show-inheritance: -``` diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md deleted file mode 100644 index 74b2c57443e4b..0000000000000 --- a/docs/source/dev/pooling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md deleted file mode 100644 index bdc36af5153db..0000000000000 --- a/docs/source/dev/sampling_params.md +++ /dev/null @@ -1,6 +0,0 @@ -# Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 645dc60807dd3..efa2efc66192e 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. ## Benchmarks diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index cf06916d70f44..b00d05147bb32 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index c02fbf0605a8c..3679595e3d4d0 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index f200c722d1d42..50edaf81fddd3 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -28,7 +28,7 @@ Here is an example of how to enable this feature: ```python # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. +# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own. 
from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=1.3, top_p=0.8) diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index bc8a0aa14dc5a..903acadb71426 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -159,6 +159,72 @@ A variety of speculative models of this type are available on HF hub: - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) - [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) +## Speculating using EAGLE based draft models + +The following code configures vLLM to use speculative decoding where proposals are generated by +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + tensor_parallel_size=4, + speculative_model="path/to/modified/eagle/model", + speculative_draft_tensor_parallel_size=1, +) + +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +``` + +A few important things to consider when using the EAGLE based draft models: + +1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be + used directly with vLLM due to differences in the expected layer names and model definition. + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + to convert them. Note that this script does not modify the model's weights. + + In the above example, use the script to first convert + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + and then use the converted checkpoint as the draft model in vLLM. + +2. The EAGLE based draft models need to be run without tensor parallelism + (i.e. speculative_draft_tensor_parallel_size is set to 1), although + it is possible to run the main model using tensor parallelism (see example above). + +3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is + reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under + investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). 
+ + +A variety of EAGLE draft models are available on the Hugging Face hub: + +| Base Model | EAGLE on Hugging Face | # EAGLE Parameters | +|---------------------------------------------------------------------|-------------------------------------------|--------------------| +| Vicuna-7B-v1.3 | yuhuili/EAGLE-Vicuna-7B-v1.3 | 0.24B | +| Vicuna-13B-v1.3 | yuhuili/EAGLE-Vicuna-13B-v1.3 | 0.37B | +| Vicuna-33B-v1.3 | yuhuili/EAGLE-Vicuna-33B-v1.3 | 0.56B | +| LLaMA2-Chat 7B | yuhuili/EAGLE-llama2-chat-7B | 0.24B | +| LLaMA2-Chat 13B | yuhuili/EAGLE-llama2-chat-13B | 0.37B | +| LLaMA2-Chat 70B | yuhuili/EAGLE-llama2-chat-70B | 0.99B | +| Mixtral-8x7B-Instruct-v0.1 | yuhuili/EAGLE-mixtral-instruct-8x7B | 0.28B | +| LLaMA3-Instruct 8B | yuhuili/EAGLE-LLaMA3-Instruct-8B | 0.25B | +| LLaMA3-Instruct 70B | yuhuili/EAGLE-LLaMA3-Instruct-70B | 0.99B | +| Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | +| Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | + + ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 26c09bb0d8a0c..ccd9a6a1b1a14 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -Full example: +Full example: ## Experimental Automatic Parsing (OpenAI API) @@ -257,4 +257,4 @@ outputs = llm.generate( print(outputs[0].outputs[0].text) ``` -Full example: +Full example: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aef32f7559f74..aaa13d0fb6d3f 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,54 +1,239 @@ +import itertools import re +from dataclasses import dataclass, field from pathlib import Path +ROOT_DIR = Path(__file__).parent.parent.parent.resolve() +ROOT_DIR_RELATIVE = '../../../..' +EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" + def fix_case(text: str) -> str: - subs = [ - ("api", "API"), - ("llm", "LLM"), - ("vllm", "vLLM"), - ("openai", "OpenAI"), - ("multilora", "MultiLoRA"), - ] - for sub in subs: - text = re.sub(*sub, text, flags=re.IGNORECASE) + subs = { + "api": "API", + "Cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "vllm": "vLLM", + "openai": "OpenAI", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) return text -def generate_title(filename: str) -> str: - # Turn filename into a title - title = filename.replace("_", " ").title() - # Handle acronyms and names - title = fix_case(title) - return f"# {title}" +@dataclass +class Index: + """ + Index class to generate a structured document index. + + Attributes: + path (Path): The path save the index file to. + title (str): The title of the index. + description (str): A brief description of the index. + caption (str): An optional caption for the table of contents. + maxdepth (int): The maximum depth of the table of contents. Defaults to 1. 
+ documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. + + Methods: + generate() -> str: + Generates the index content as a string in the specified format. + """ # noqa: E501 + path: Path + title: str + description: str + caption: str + maxdepth: int = 1 + documents: list[str] = field(default_factory=list) + + def generate(self) -> str: + content = f"# {self.title}\n\n{self.description}\n\n" + content += "```{toctree}\n" + content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" + content += "\n".join(self.documents) + "\n```\n" + return content + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. + other_files (list[Path]): List of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. 
+ """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + # Convert the path to a relative path from __file__ + make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( + ROOT_DIR) + + content = f"Source .\n\n" + include = "include" if self.main_file.suffix == ".md" else \ + "literalinclude" + if include == "literalinclude": + content += f"# {self.title}\n\n" + content += f":::{{{include}}} {make_relative(self.main_file)}\n" + if include == "literalinclude": + content += f":language: {self.main_file.suffix[1:]}\n" + content += ":::\n\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in self.other_files: + include = "include" if file.suffix == ".md" else "literalinclude" + content += f":::{{admonition}} {file.relative_to(self.path)}\n" + content += ":class: dropdown\n\n" + content += f":::{{{include}}} {make_relative(file)}\n:::\n" + content += ":::\n\n" + + return content def generate_examples(): - root_dir = Path(__file__).parent.parent.parent.resolve() - - # Source paths - script_dir = root_dir / "examples" - script_paths = sorted(script_dir.glob("*.py")) - - # Destination paths - doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] - - # Generate the example docs for each example script - for script_path, doc_path in zip(script_paths, doc_paths): - # Make script_path relative to doc_path and call it include_path - include_path = '../../../..' 
/ script_path.relative_to(root_dir) - content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: .\n\n" - f"```{{literalinclude}} {include_path}\n" - ":language: python\n" - ":linenos:\n```") + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + # Create empty indices + examples_index = Index( + path=EXAMPLE_DOC_DIR / "examples_index.md", + title="Examples", + description= + "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using from examples found in .", # noqa: E501 + caption="Examples", + maxdepth=2) + # Category indices stored in reverse order because they are inserted into + # examples_index.documents at index 0 in order + category_indices = { + "other": + Index( + path=EXAMPLE_DOC_DIR / "examples_other_index.md", + title="Other", + description= + "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 + caption="Examples", + ), + "online_serving": + Index( + path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", + title="Online Serving", + description= + "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 + caption="Examples", + ), + "offline_inference": + Index( + path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", + title="Offline Inference", + description= + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + caption="Examples", + ), + } + + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in category_indices: + category_dir = EXAMPLE_DIR / category + globs = [category_dir.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path, category)) + # Find examples in subdirectories + for path in category_dir.glob("*/*.md"): + examples.append(Example(path.parent, category)) + # Find uncategorised examples + globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path)) + # Find examples in subdirectories + for path in EXAMPLE_DIR.glob("*/*.md"): + # Skip categorised examples + if path.parent.name in category_indices: + continue + examples.append(Example(path.parent)) + + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" with open(doc_path, "w+") as f: - f.write(content) - - # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.md") as f: - examples_index = f.read() - with open(doc_dir / "examples_index.md", "w+") as f: - example_docs = "\n".join(path.stem + ".md" for path in script_paths) - f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) + f.write(example.generate()) + # Add the example to the appropriate index + index = category_indices.get(example.category, examples_index) + index.documents.append(example.path.stem) + + # Generate the index files + for category_index in category_indices.values(): + if category_index.documents: + examples_index.documents.insert(0, category_index.path.name) + with open(category_index.path, "w+") as f: + f.write(category_index.generate()) + + with open(examples_index.path, "w+") as f: + f.write(examples_index.generate()) diff --git 
a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md deleted file mode 100644 index de7a91c0ffa48..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.md +++ /dev/null @@ -1,8 +0,0 @@ -# Examples - -```{toctree} -:maxdepth: 1 -:caption: Scripts - -%EXAMPLE_DOCS% -``` \ No newline at end of file diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md new file mode 100644 index 0000000000000..b55e4384d064d --- /dev/null +++ b/docs/source/getting_started/installation/cpu-apple.md @@ -0,0 +1,51 @@ +(installation-apple)= + +# Installation for macOS + +vLLM has experimental support for macOS with Apple Silicon. For now, users must build vLLM from source to run it natively on macOS. For more details, such as running vLLM in a Docker container, see the [ARM CPU Documentation](#installation-arm). + +Currently, the CPU implementation for macOS supports the FP32 and FP16 datatypes. + +## Requirements + +- **Operating System**: `macOS Sonoma` or later +- **SDK**: `Xcode 15.4` or later with Command Line Tools +- **Compilers**: `Apple Clang >= 15.0.0` + + + +## Build and installation + +After installing Xcode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source. + +``` +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -r requirements-cpu.txt +$ pip install -e . +``` + +```{note} +On macOS, `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. +``` + + + +## Troubleshooting + +If the build fails with errors like the following snippet, where standard C++ headers cannot be found, try removing and reinstalling your +[Command Line Tools for Xcode](https://developer.apple.com/download/all/). + +``` +[...] fatal error: 'map' file not found + 1 | #include + | ^~~~~ + 1 error generated. + [2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o + +[...] fatal error: 'cstddef' file not found + 10 | #include + | ^~~~~~~~~ + 1 error generated. +``` + diff --git a/docs/source/getting_started/installation/cpu-arm.md b/docs/source/getting_started/installation/cpu-arm.md index a46e2c010600d..e199073ed721f 100644 --- a/docs/source/getting_started/installation/cpu-arm.md +++ b/docs/source/getting_started/installation/cpu-arm.md @@ -2,7 +2,7 @@ # Installation for ARM CPUs -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more).
For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - CPU backend inference capabilities - Relevant runtime environment variables @@ -20,7 +20,7 @@ Contents: ## Requirements - **Operating System**: Linux or macOS -- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) +- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS - **Instruction Set Architecture (ISA)**: NEON support is required (arm-backend-quick-start-dockerfile)= diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bbb2d1872ef39..bb046dd0fd9dc 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library $ find / -name *libtcmalloc* # find the dynamic link library path $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference.py # run vLLM +$ python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference.py +$ python examples/offline_inference/offline_inference.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md index 7ea10bb8b59ff..419b8163fc034 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -12,24 +12,43 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin ## Install released versions -You can install vLLM using pip: +### Create a new Python environment + +You can create a new Python environment using `conda`: ```console $ # (Recommended) Create a new conda environment. $ conda create -n myenv python=3.12 -y $ conda activate myenv - -$ # Install vLLM with CUDA 12.1. -$ pip install vllm ``` ```{note} -Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See for more details. +[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. In particular, the PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. 
+``` + +Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: + +```console +$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +$ uv venv myenv --python 3.12 --seed +$ source myenv/bin/activate +``` + +In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + +Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. + +### Install vLLM + +You can install vLLM using either `pip` or `uv pip`: + +```console +$ # Install vLLM with CUDA 12.1. +$ pip install vllm # If you are using pip. +$ uv pip install vllm # If you are using uv. ``` -````{note} -As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. -We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: ```console $ # Install vLLM with CUDA 11.8. @@ -38,29 +57,47 @@ $ export PYTHON_VERSION=310 $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` -In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. - -Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. -```` - (install-the-latest-code)= ## Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. 
+ +### Install the latest code using `pip` ```console -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: +`--pre` is required for `pip` to consider pre-released versions. + +If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), a limitation of `pip` means you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels; the actual versions are contained in the wheel metadata (the wheels listed in the extra index URL have the correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + +### Install the latest code using `uv` + +Another way to install the latest code is to use `uv`: + +```console +$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +``` + +If you want to access the wheels for previous commits (e.g. to bisect a behavior change or performance regression), you can specify the commit hash in the URL: + +```console +$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. +The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
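Whichever installer you use, it is worth confirming which build actually landed in the environment, since the wheel file name always shows the `1.0.0.dev` placeholder. The following is a minimal sketch using only the Python standard library; it assumes the package is installed under the distribution name `vllm`.

```python
from importlib.metadata import version

# The wheel *file name* uses the 1.0.0.dev placeholder, but the installed
# metadata records the actual version that was built for that commit/nightly.
print("Installed vLLM version:", version("vllm"))
```

If this prints the latest public release rather than a dev version, the wheel was most likely resolved from the default index instead of the extra index (the `pip` behavior described above).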
+ +### Install the latest code using `docker` Another way to access the latest code is to use the docker images: @@ -89,7 +126,7 @@ $ cd vllm $ VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel and use the compiled libraries from there in the install. +This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index 83de1aff409b2..0ebadca2ccec9 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -11,6 +11,7 @@ gpu-cuda gpu-rocm cpu-x86 cpu-arm +cpu-apple hpu-gaudi tpu xpu diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index be4e3b9bd1bc5..c1ab5478eb652 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ $ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 3f9556165ece4..2808e1b386801 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -42,7 +42,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. 
You can find more information about the sampling parameters [here](#sampling-params). ```python prompts = [ @@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 5a0310da0f2cb..f5efe0bef7506 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging diff --git a/docs/source/index.md b/docs/source/index.md index c335155bd6e14..6747a7fcce4fe 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -137,10 +137,10 @@ community/sponsors :caption: API Reference :maxdepth: 2 -dev/sampling_params -dev/pooling_params -dev/offline_inference/offline_index -dev/engine/engine_index +api/offline_inference/index +api/engine/index +api/multimodal/index +api/params ``` % Design Documents: Details about vLLM internals @@ -154,7 +154,6 @@ design/huggingface_integration design/plugin_system design/kernel/paged_attention design/input_processing/model_inputs_index -design/multimodal/multimodal_index design/automatic_prefix_caching design/multiprocessing ``` @@ -169,6 +168,7 @@ contributing/overview contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index +contributing/vulnerability_management ``` # Indices and tables diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index 42ed5c795dd27..ae17e3437bca6 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). 
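For reference, the Completions API call shown in the quickstart above can be written out in full with the official OpenAI Python client. This is only a sketch: the server address, API key, and generation settings are assumptions that must match however the vLLM server was launched.

```python
from openai import OpenAI

# Assumed values: point these at your own vLLM server and served model.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
    max_tokens=32,
    temperature=0.8,
)
print("Completion result:", completion.choices[0].text)
```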
For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). ```{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 383299d61b5dd..6228c7c2ac957 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 12ded68eb30b5..3e4407cfdc233 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -65,7 +65,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +80,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,7 +102,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Inference diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 590bea992d1fc..3ba34c77205e5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -430,6 +430,9 @@ You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. + +Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be +loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). ``` If your model is not in the above list, we will try to automatically convert the model using @@ -640,7 +643,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I+ + V+ @@ -710,7 +713,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ - - + - ✅︎ * - `Qwen2VLForConditionalGeneration` - Qwen2-VL - T + IE+ + VE+ @@ -724,7 +727,7 @@ See [this page](#generative-models) for more information on how to use generativ - `fixie-ai/ultravox-v0_3` - - ✅︎ - - + - ✅︎ ``` E Pre-computed embeddings can be inputted for this modality. 
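To make the pooling snippets above concrete, here is a small end-to-end sketch of `LLM.score`. The model name and the `task` override are illustrative assumptions; the output attribute access mirrors the fragments quoted above.

```python
from vllm import LLM

# Model choice and task override are assumptions for illustration; any
# cross-encoder style model supported by vLLM's pooling runner should work.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

# score() takes a query and one or more candidate texts and returns one
# output per pair; here we unpack the single result.
(output,) = llm.score(
    "What is the capital of France?",
    "The capital of France is Paris.",
)
print(f"Score: {output.outputs.score}")
```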
diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index b1703249d7224..4e0a9ef6ecf7d 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0efa09f2869ca..9f5e1b908d786 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. 
@@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. ``` -Full example: +Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 83178f7811825..79092ab208784 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -23,7 +23,7 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. ```{seealso} -[API Reference](/dev/offline_inference/offline_index) +[API Reference](/api/offline_inference/index) ``` ## Configuration Options diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1e5ea6357d202..ec5a367594743 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -191,11 +191,11 @@ The order of priorities is `command line > config file values > defaults`. Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -222,11 +222,11 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. +The following [sampling parameters](#sampling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -255,11 +255,11 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= ### Score API @@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
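As a client-side companion to the server APIs described above, the sketch below issues a multimodal chat request (the image-URL case from the multimodal inputs section) through the official OpenAI Python client. The server URL, API key, model name, and image URL are placeholders and must match your running vLLM deployment.

```python
from openai import OpenAI

# Placeholder values: adjust to your own vLLM server and served multimodal model.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
image_url = "https://example.com/some-image.jpg"  # any reachable image URL

chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```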
-Code example: +Code example: #### Single inference @@ -447,7 +447,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. +The following [pooling parameters](#pooling-params) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/examples/aqlm_example.py b/examples/offline_inference/aqlm_example.py similarity index 100% rename from examples/aqlm_example.py rename to examples/offline_inference/aqlm_example.py diff --git a/examples/cpu_offload.py b/examples/offline_inference/cpu_offload.py similarity index 100% rename from examples/cpu_offload.py rename to examples/offline_inference/cpu_offload.py diff --git a/examples/florence2_inference.py b/examples/offline_inference/florence2_inference.py similarity index 92% rename from examples/florence2_inference.py rename to examples/offline_inference/florence2_inference.py index b58ac2e1f7ed4..49dd2c331db5a 100644 --- a/examples/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -3,7 +3,8 @@ encoder/decoder models, specifically Florence-2 ''' # TODO(Isotr0py): -# Move to offline_inference_vision_language.py after porting vision backbone +# Move to offline_inference/offline_inference_vision_language.py +# after porting vision backbone from vllm import LLM, SamplingParams dtype = "float" diff --git a/examples/gguf_inference.py b/examples/offline_inference/gguf_inference.py similarity index 100% rename from examples/gguf_inference.py rename to examples/offline_inference/gguf_inference.py diff --git a/examples/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py similarity index 100% rename from examples/llm_engine_example.py rename to examples/offline_inference/llm_engine_example.py diff --git a/examples/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py similarity index 100% rename from examples/lora_with_quantization_inference.py rename to examples/offline_inference/lora_with_quantization_inference.py diff --git a/examples/multilora_inference.py b/examples/offline_inference/multilora_inference.py similarity index 100% rename from examples/multilora_inference.py rename to examples/offline_inference/multilora_inference.py diff --git a/examples/offline_chat_with_tools.py b/examples/offline_inference/offline_chat_with_tools.py similarity index 100% rename from examples/offline_chat_with_tools.py rename to examples/offline_inference/offline_chat_with_tools.py diff --git a/examples/offline_inference.py b/examples/offline_inference/offline_inference.py similarity index 100% rename from examples/offline_inference.py rename to examples/offline_inference/offline_inference.py diff --git a/examples/offline_inference_arctic.py b/examples/offline_inference/offline_inference_arctic.py similarity index 100% rename from examples/offline_inference_arctic.py rename to examples/offline_inference/offline_inference_arctic.py diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference/offline_inference_audio_language.py similarity index 100% rename from examples/offline_inference_audio_language.py rename to examples/offline_inference/offline_inference_audio_language.py diff --git a/examples/offline_inference_chat.py b/examples/offline_inference/offline_inference_chat.py similarity index 100% rename from examples/offline_inference_chat.py rename to 
examples/offline_inference/offline_inference_chat.py diff --git a/examples/offline_inference_classification.py b/examples/offline_inference/offline_inference_classification.py similarity index 100% rename from examples/offline_inference_classification.py rename to examples/offline_inference/offline_inference_classification.py diff --git a/examples/offline_inference_cli.py b/examples/offline_inference/offline_inference_cli.py similarity index 100% rename from examples/offline_inference_cli.py rename to examples/offline_inference/offline_inference_cli.py diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference/offline_inference_distributed.py similarity index 100% rename from examples/offline_inference_distributed.py rename to examples/offline_inference/offline_inference_distributed.py diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference/offline_inference_embedding.py similarity index 100% rename from examples/offline_inference_embedding.py rename to examples/offline_inference/offline_inference_embedding.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference/offline_inference_encoder_decoder.py similarity index 100% rename from examples/offline_inference_encoder_decoder.py rename to examples/offline_inference/offline_inference_encoder_decoder.py diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference/offline_inference_mlpspeculator.py similarity index 100% rename from examples/offline_inference_mlpspeculator.py rename to examples/offline_inference/offline_inference_mlpspeculator.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference/offline_inference_neuron.py similarity index 100% rename from examples/offline_inference_neuron.py rename to examples/offline_inference/offline_inference_neuron.py diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference/offline_inference_neuron_int8_quantization.py similarity index 100% rename from examples/offline_inference_neuron_int8_quantization.py rename to examples/offline_inference/offline_inference_neuron_int8_quantization.py diff --git a/examples/offline_inference_openai.md b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md similarity index 90% rename from examples/offline_inference_openai.md rename to examples/offline_inference/offline_inference_openai/offline_inference_openai.md index 2436417cb543a..6278a1943fe4a 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference/offline_inference_openai/offline_inference_openai.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. -[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ``` -python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -66,10 +66,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run ``` -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
``` -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ``` -$ cat openai_example_batch.jsonl +$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl Now upload your batch file to your S3 bucket. ``` -aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/openai_example_batch.jsonl b/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl similarity index 100% rename from examples/openai_example_batch.jsonl rename to examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl diff --git a/examples/offline_inference_pixtral.py b/examples/offline_inference/offline_inference_pixtral.py similarity index 100% rename from examples/offline_inference_pixtral.py rename to examples/offline_inference/offline_inference_pixtral.py diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference/offline_inference_scoring.py similarity index 100% rename from examples/offline_inference_scoring.py rename to examples/offline_inference/offline_inference_scoring.py diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference/offline_inference_structured_outputs.py similarity index 100% rename from examples/offline_inference_structured_outputs.py rename to examples/offline_inference/offline_inference_structured_outputs.py diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference/offline_inference_tpu.py similarity index 100% rename from examples/offline_inference_tpu.py rename to examples/offline_inference/offline_inference_tpu.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference/offline_inference_vision_language.py similarity index 100% rename from examples/offline_inference_vision_language.py rename to examples/offline_inference/offline_inference_vision_language.py diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference/offline_inference_vision_language_embedding.py similarity index 100% rename from examples/offline_inference_vision_language_embedding.py rename to examples/offline_inference/offline_inference_vision_language_embedding.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference/offline_inference_vision_language_multi_image.py similarity index 93% rename from examples/offline_inference_vision_language_multi_image.py rename to 
examples/offline_inference/offline_inference_vision_language_multi_image.py index 6af8d7768e75d..cf2e90a325c6a 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference/offline_inference_vision_language_multi_image.py @@ -23,7 +23,7 @@ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[str]] + stop_token_ids: Optional[List[int]] image_data: List[Image] chat_template: Optional[str] @@ -44,12 +44,14 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" "<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None) + chat_template=None, + ) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -166,7 +168,8 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) - prompt = f"<|image|><|image|><|begin_of_text|>{question}" + placeholders = "<|image|>" * len(image_urls) + prompt = f"{placeholders}<|begin_of_text|>{question}" return ModelRequestData( llm=llm, prompt=prompt, @@ -209,6 +212,31 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "mistral-community/pixtral-12b" + + # Adjust this as necessary to fit in GPU + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + tensor_parallel_size=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "[IMG]" * len(image_urls) + prompt = f"[INST]{question}\n{placeholders}[/INST]" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended @@ -244,7 +272,8 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, + image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -274,6 +303,7 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( llm=llm, prompt=prompt, @@ -348,7 +378,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: "mllama": load_mllama, "NVLM_D": load_nvlm_d, "phi3_v": load_phi3v, - "qwen_vl_chat": load_qwenvl_chat, + "pixtral_hf": load_pixtral_hf, + "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, } diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference/offline_inference_whisper.py similarity index 100% rename from examples/offline_inference_whisper.py rename to examples/offline_inference/offline_inference_whisper.py diff --git a/examples/offline_inference_with_default_generation_config.py b/examples/offline_inference/offline_inference_with_default_generation_config.py similarity 
index 100% rename from examples/offline_inference_with_default_generation_config.py rename to examples/offline_inference/offline_inference_with_default_generation_config.py diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference/offline_inference_with_prefix.py similarity index 100% rename from examples/offline_inference_with_prefix.py rename to examples/offline_inference/offline_inference_with_prefix.py diff --git a/examples/offline_inference_with_profiler.py b/examples/offline_inference/offline_inference_with_profiler.py similarity index 100% rename from examples/offline_inference_with_profiler.py rename to examples/offline_inference/offline_inference_with_profiler.py diff --git a/examples/offline_profile.py b/examples/offline_inference/offline_profile.py similarity index 99% rename from examples/offline_profile.py rename to examples/offline_inference/offline_profile.py index 46afe8aa2604b..187a05e4d70a2 100644 --- a/examples/offline_profile.py +++ b/examples/offline_inference/offline_profile.py @@ -363,7 +363,7 @@ def abort_requests(): example: ``` - python examples/offline_profile.py \\ + python examples/offline_inference/offline_profile.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ --enforce-eager run_num_steps -n 2 diff --git a/examples/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py similarity index 100% rename from examples/save_sharded_state.py rename to examples/offline_inference/save_sharded_state.py diff --git a/examples/api_client.py b/examples/online_serving/api_client.py similarity index 100% rename from examples/api_client.py rename to examples/online_serving/api_client.py diff --git a/examples/chart-helm/.helmignore b/examples/online_serving/chart-helm/.helmignore similarity index 100% rename from examples/chart-helm/.helmignore rename to examples/online_serving/chart-helm/.helmignore diff --git a/examples/chart-helm/Chart.yaml b/examples/online_serving/chart-helm/Chart.yaml similarity index 100% rename from examples/chart-helm/Chart.yaml rename to examples/online_serving/chart-helm/Chart.yaml diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md new file mode 100644 index 0000000000000..6aa126d4fd22c --- /dev/null +++ b/examples/online_serving/chart-helm/README.md @@ -0,0 +1,21 @@ +# Helm Charts + +This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. + +## Files + +- Chart.yaml: Defines the chart metadata including name, version, and maintainers. +- ct.yaml: Configuration for chart testing. +- lintconf.yaml: Linting rules for YAML files. +- values.schema.json: JSON schema for validating values.yaml. +- values.yaml: Default values for the Helm chart. +- templates/_helpers.tpl: Helper templates for defining common configurations. +- templates/configmap.yaml: Template for creating ConfigMaps. +- templates/custom-objects.yaml: Template for custom Kubernetes objects. +- templates/deployment.yaml: Template for creating Deployments. +- templates/hpa.yaml: Template for Horizontal Pod Autoscaler. +- templates/job.yaml: Template for Kubernetes Jobs. +- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. +- templates/pvc.yaml: Template for Persistent Volume Claims. +- templates/secrets.yaml: Template for Kubernetes Secrets. 
+- templates/service.yaml: Template for creating Services. \ No newline at end of file diff --git a/examples/chart-helm/ct.yaml b/examples/online_serving/chart-helm/ct.yaml similarity index 100% rename from examples/chart-helm/ct.yaml rename to examples/online_serving/chart-helm/ct.yaml diff --git a/examples/chart-helm/lintconf.yaml b/examples/online_serving/chart-helm/lintconf.yaml similarity index 100% rename from examples/chart-helm/lintconf.yaml rename to examples/online_serving/chart-helm/lintconf.yaml diff --git a/examples/chart-helm/templates/_helpers.tpl b/examples/online_serving/chart-helm/templates/_helpers.tpl similarity index 100% rename from examples/chart-helm/templates/_helpers.tpl rename to examples/online_serving/chart-helm/templates/_helpers.tpl diff --git a/examples/chart-helm/templates/configmap.yaml b/examples/online_serving/chart-helm/templates/configmap.yaml similarity index 100% rename from examples/chart-helm/templates/configmap.yaml rename to examples/online_serving/chart-helm/templates/configmap.yaml diff --git a/examples/chart-helm/templates/custom-objects.yaml b/examples/online_serving/chart-helm/templates/custom-objects.yaml similarity index 100% rename from examples/chart-helm/templates/custom-objects.yaml rename to examples/online_serving/chart-helm/templates/custom-objects.yaml diff --git a/examples/chart-helm/templates/deployment.yaml b/examples/online_serving/chart-helm/templates/deployment.yaml similarity index 100% rename from examples/chart-helm/templates/deployment.yaml rename to examples/online_serving/chart-helm/templates/deployment.yaml diff --git a/examples/chart-helm/templates/hpa.yaml b/examples/online_serving/chart-helm/templates/hpa.yaml similarity index 100% rename from examples/chart-helm/templates/hpa.yaml rename to examples/online_serving/chart-helm/templates/hpa.yaml diff --git a/examples/chart-helm/templates/job.yaml b/examples/online_serving/chart-helm/templates/job.yaml similarity index 100% rename from examples/chart-helm/templates/job.yaml rename to examples/online_serving/chart-helm/templates/job.yaml diff --git a/examples/chart-helm/templates/poddisruptionbudget.yaml b/examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml similarity index 100% rename from examples/chart-helm/templates/poddisruptionbudget.yaml rename to examples/online_serving/chart-helm/templates/poddisruptionbudget.yaml diff --git a/examples/chart-helm/templates/pvc.yaml b/examples/online_serving/chart-helm/templates/pvc.yaml similarity index 100% rename from examples/chart-helm/templates/pvc.yaml rename to examples/online_serving/chart-helm/templates/pvc.yaml diff --git a/examples/chart-helm/templates/secrets.yaml b/examples/online_serving/chart-helm/templates/secrets.yaml similarity index 100% rename from examples/chart-helm/templates/secrets.yaml rename to examples/online_serving/chart-helm/templates/secrets.yaml diff --git a/examples/chart-helm/templates/service.yaml b/examples/online_serving/chart-helm/templates/service.yaml similarity index 100% rename from examples/chart-helm/templates/service.yaml rename to examples/online_serving/chart-helm/templates/service.yaml diff --git a/examples/chart-helm/values.schema.json b/examples/online_serving/chart-helm/values.schema.json similarity index 100% rename from examples/chart-helm/values.schema.json rename to examples/online_serving/chart-helm/values.schema.json diff --git a/examples/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml similarity index 100% rename from 
examples/chart-helm/values.yaml rename to examples/online_serving/chart-helm/values.yaml diff --git a/examples/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh similarity index 100% rename from examples/disaggregated_prefill.sh rename to examples/online_serving/disaggregated_prefill.sh diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py similarity index 100% rename from examples/gradio_openai_chatbot_webserver.py rename to examples/online_serving/gradio_openai_chatbot_webserver.py diff --git a/examples/gradio_webserver.py b/examples/online_serving/gradio_webserver.py similarity index 100% rename from examples/gradio_webserver.py rename to examples/online_serving/gradio_webserver.py diff --git a/examples/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py similarity index 100% rename from examples/openai_chat_completion_client.py rename to examples/online_serving/openai_chat_completion_client.py diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_completion_client_for_multimodal.py rename to examples/online_serving/openai_chat_completion_client_for_multimodal.py diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py similarity index 100% rename from examples/openai_chat_completion_client_with_tools.py rename to examples/online_serving/openai_chat_completion_client_with_tools.py diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py similarity index 100% rename from examples/openai_chat_completion_structured_outputs.py rename to examples/online_serving/openai_chat_completion_structured_outputs.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/openai_chat_embedding_client_for_multimodal.py rename to examples/online_serving/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/openai_completion_client.py b/examples/online_serving/openai_completion_client.py similarity index 100% rename from examples/openai_completion_client.py rename to examples/online_serving/openai_completion_client.py diff --git a/examples/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py similarity index 100% rename from examples/openai_cross_encoder_score.py rename to examples/online_serving/openai_cross_encoder_score.py diff --git a/examples/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py similarity index 100% rename from examples/openai_embedding_client.py rename to examples/online_serving/openai_embedding_client.py diff --git a/examples/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py similarity index 100% rename from examples/openai_pooling_client.py rename to examples/online_serving/openai_pooling_client.py diff --git a/examples/production_monitoring/Otel.md b/examples/online_serving/opentelemetry/Otel.md similarity index 100% rename from examples/production_monitoring/Otel.md rename to examples/online_serving/opentelemetry/Otel.md diff --git a/examples/production_monitoring/dummy_client.py 
b/examples/online_serving/opentelemetry/dummy_client.py similarity index 100% rename from examples/production_monitoring/dummy_client.py rename to examples/online_serving/opentelemetry/dummy_client.py diff --git a/examples/production_monitoring/README.md b/examples/online_serving/prometheus_grafana/README.md similarity index 95% rename from examples/production_monitoring/README.md rename to examples/online_serving/prometheus_grafana/README.md index 807c0470e7b30..c49e5306a1cb4 100644 --- a/examples/production_monitoring/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -1,4 +1,4 @@ -# vLLM + Prometheus/Grafana +# Prometheus and Grafana This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. @@ -6,7 +6,7 @@ Install: - [`docker`](https://docs.docker.com/engine/install/) - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) -### Launch +## Launch Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: ```bash @@ -35,11 +35,11 @@ python3 ../../benchmarks/benchmark_serving.py \ Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM. -### Grafana Dashboard +## Grafana Dashboard Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`). -#### Add Prometheus Data Source +### Add Prometheus Data Source Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. @@ -47,7 +47,7 @@ On Prometheus configuration page, we need to add the `Prometheus Server URL` in Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". -#### Import Dashboard +### Import Dashboard Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. 
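Before (or after) importing the dashboard, you can confirm that the Prometheus data source actually has vLLM series to scrape. The short Python sketch below is illustrative only and not part of the example files; it assumes the OpenAI-compatible server from the Launch step is still running on `localhost:8000`, fetches the raw `/metrics` page, and lists the exported `vllm:` metric names.

```python
# Quick check that the server is exporting Prometheus metrics.
# Assumes the OpenAI-compatible server is listening on localhost:8000.
from urllib.request import urlopen

METRICS_URL = "http://localhost:8000/metrics"

with urlopen(METRICS_URL) as resp:
    body = resp.read().decode("utf-8")

# Keep only the vLLM metric names (lines starting with "vllm:"),
# skipping the "# HELP" / "# TYPE" comment lines and label sets.
metric_names = sorted({
    line.split("{")[0].split(" ")[0]
    for line in body.splitlines()
    if line.startswith("vllm:")
})
for name in metric_names:
    print(name)
```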
You should see a screen that looks like the following: diff --git a/examples/production_monitoring/docker-compose.yaml b/examples/online_serving/prometheus_grafana/docker-compose.yaml similarity index 100% rename from examples/production_monitoring/docker-compose.yaml rename to examples/online_serving/prometheus_grafana/docker-compose.yaml diff --git a/examples/production_monitoring/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json similarity index 100% rename from examples/production_monitoring/grafana.json rename to examples/online_serving/prometheus_grafana/grafana.json diff --git a/examples/production_monitoring/prometheus.yaml b/examples/online_serving/prometheus_grafana/prometheus.yaml similarity index 100% rename from examples/production_monitoring/prometheus.yaml rename to examples/online_serving/prometheus_grafana/prometheus.yaml diff --git a/examples/run_cluster.sh b/examples/online_serving/run_cluster.sh similarity index 100% rename from examples/run_cluster.sh rename to examples/online_serving/run_cluster.sh diff --git a/examples/sagemaker-entrypoint.sh b/examples/online_serving/sagemaker-entrypoint.sh similarity index 100% rename from examples/sagemaker-entrypoint.sh rename to examples/online_serving/sagemaker-entrypoint.sh diff --git a/examples/fp8/README.md b/examples/other/fp8/README.md similarity index 88% rename from examples/fp8/README.md rename to examples/other/fp8/README.md index 181c36558fcff..4e8031d954113 100644 --- a/examples/fp8/README.md +++ b/examples/other/fp8/README.md @@ -20,12 +20,12 @@ Before incorporating the FP8 datatype for inference workloads, you must adhere t ### 2. Convert HF model into a quantized HF model. Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). -`quantize.py` (examples/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). +`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). -The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/fp8/quantizer/README.md`. +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`. ### 3. Extract KV Cache Scaling Factors from quantized HF model. -`extract_scales.py` (examples/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: +`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: 1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. 
These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. @@ -35,7 +35,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found a ```python # prerequisites: # - Quantized HF LLaMa 2 model -python3 examples/fp8/extract_scales.py --help +python3 examples/other/fp8/extract_scales.py --help Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] KV Scale Extraction Example @@ -52,11 +52,11 @@ Optional arguments: ``` ```python Example: -python3 examples/fp8/extract_scales.py --quantized_model --tp_size --output_dir +python3 examples/other/fp8/extract_scales.py --quantized_model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. -```python +``` # prerequisites: # - LLaMa 2 kv_cache_scales.json file @@ -90,7 +90,7 @@ optional arguments: --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria. --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. 
``` -``` Example: +```console python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --quantization-param-path --model -```python +``` diff --git a/examples/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py similarity index 100% rename from examples/fp8/extract_scales.py rename to examples/other/fp8/extract_scales.py diff --git a/examples/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md similarity index 100% rename from examples/fp8/quantizer/README.md rename to examples/other/fp8/quantizer/README.md diff --git a/examples/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py similarity index 100% rename from examples/fp8/quantizer/quantize.py rename to examples/other/fp8/quantizer/quantize.py diff --git a/examples/logging_configuration.md b/examples/other/logging_configuration.md similarity index 100% rename from examples/logging_configuration.md rename to examples/other/logging_configuration.md diff --git a/examples/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py similarity index 96% rename from examples/tensorize_vllm_model.py rename to examples/other/tensorize_vllm_model.py index dd77a4ad0c6b7..5fff1fdf502c9 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -25,7 +25,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -45,7 +45,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.tensorize_vllm_model \ +python -m examples.offline_inference.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -63,11 +63,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.tensorize_vllm_model serialize --help`. +`python -m examples.offline_inference.tensorize_vllm_model serialize --help`. Or for deserializing: -`python -m examples.tensorize_vllm_model deserialize --help`. +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -88,7 +88,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.tensorize_vllm_model deserialize --help` +`python -m examples.offline_inference.tensorize_vllm_model deserialize --help` under the `tensorizer options` section. 
These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/pyproject.toml b/pyproject.toml index 45fa4bff4e680..0ac3f39ef7a5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ build-backend = "setuptools.build_meta" line-length = 80 exclude = [ # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py" ] [tool.ruff.lint.per-file-ignores] diff --git a/python_only_dev.py b/python_only_dev.py index f70b4984025b3..7d95ac96e6e4b 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -7,7 +7,7 @@ or export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install -e . """ # noqa diff --git a/requirements-cpu.txt b/requirements-cpu.txt index e62f313297762..056fbf5a7adec 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for CPUs -torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" -torch==2.5.1; platform_machine == "aarch64" +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" +torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -datasets # for benchmark scripts \ No newline at end of file +datasets # for benchmark scripts diff --git a/requirements-test.in b/requirements-test.in index fb4179c3d8423..4b4dc376d1fa5 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests peft +pqdm ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests diff --git a/requirements-test.txt b/requirements-test.txt index 3771577fe8ed0..f576e42afcbbf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -48,6 +48,8 @@ botocore==1.35.57 # awscli # boto3 # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm buildkite-test-collector==0.1.9 # via -r requirements-test.in certifi==2024.8.30 @@ -342,6 +344,8 @@ pooch==1.8.2 # via librosa portalocker==2.10.1 # via sacrebleu +pqdm==0.2.0 + # via -r requirements-test.in propcache==0.2.0 # via yarl protobuf==5.28.3 diff --git a/setup.py b/setup.py index ba6953dbdc174..b6c1f5bc8ac3f 100644 --- a/setup.py +++ b/setup.py @@ -34,9 +34,14 @@ def load_module_from_path(module_name, path): VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE -if not sys.platform.startswith("linux"): +if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning( - "vLLM only supports Linux platform (including WSL). " + "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") + VLLM_TARGET_DEVICE = "cpu" +elif not (sys.platform.startswith("linux") + or sys.platform.startswith("darwin")): + logger.warning( + "vLLM only supports Linux platform (including WSL) and MacOS." 
"Building on %s, " "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" @@ -252,7 +257,7 @@ def run(self): class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" def run(self) -> None: wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 37a6d334ee60c..689d17be81889 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -1,58 +1,132 @@ +import itertools +from functools import partial + import pytest from PIL import Image -from transformers import AutoTokenizer +from pqdm.threads import pqdm -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_next(): - from vllm.model_executor.models.llava_next import ( - LlavaNextMultiModalProcessor) - return LlavaNextMultiModalProcessor +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = processor.apply(prompt, mm_data, {}) -@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) -@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198), - (161, 184), (184, 161)]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements( - processor_for_llava_next, - model_id: str, - image_size: tuple[int, int], + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 1 + assert first_placeholder["length"] == ( + len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs + + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, num_imgs: int, -): + image_sizes: list[ImageSize], +) -> None: """ - Ensure LlavaNextMultiModalProcessor handles prompt replacement properly. + Ensure LlavaNextMultiModalProcessor + handles prompt replacement properly for input images. 
""" + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] - # The processor will throw an error if there is a mismatch - # in the prompt replacements - processor = processor_for_llava_next(ctx) - processed_inputs = processor.apply(prompt, mm_data, {}) + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) - image_placeholders = processed_inputs["mm_placeholders"]["image"] - assert len(image_placeholders) == num_imgs - first_placeholder = image_placeholders[0] +@pytest.mark.skip("This test takes around 2 hours to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) - # NOTE: There is a BOS token - assert first_placeholder["offset"] == 1 - assert first_placeholder["length"] == ( - len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py index ed3e2db799be7..a033354f0e9b8 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -1,60 +1,132 @@ +import itertools +from functools import partial + import pytest from PIL import Image -from transformers import AutoTokenizer +from pqdm.threads import pqdm -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.parse import ImageSize +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import cached_get_tokenizer from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_llava_onevision(): - from vllm.model_executor.models.llava_onevision import ( - LlavaOnevisionMultiModalProcessor) - return LlavaOnevisionMultiModalProcessor +def _validate_image_prompt_replacements_one( + processor: BaseMultiModalProcessor, + num_imgs: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + prompt = "" * num_imgs + image = Image.new("RGB", size=image_size) + mm_data = {"image": [image] * num_imgs} + try: + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processed_inputs = processor.apply(prompt, mm_data, {}) -@pytest.mark.parametrize("model_id", - ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) -@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183), (198, 176), (176, 198), - (161, 184), (184, 161)]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_prompt_replacements( - processor_for_llava_onevision, - model_id: str, - image_size: tuple[int, int], + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + assert first_placeholder["offset"] == 0 + assert first_placeholder["length"] == len( + processed_inputs["prompt_token_ids"]) // 
num_imgs + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +def _test_image_prompt_replacements( + processor, + *, num_imgs: int, -): + image_sizes: list[ImageSize], +) -> None: """ - Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement - properly. + Ensure LlavaOnevisionMultiModalProcessor + handles prompt replacement properly for input images. """ + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_prompt_replacements_one, + processor, + num_imgs, + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements_regression(model_id, num_imgs): ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + + image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), + (488, 183), (2560, 1669)] + image_sizes = [ + size for w, h in image_ratios + for size in [ImageSize(w, h), ImageSize(h, w)] + ] + + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) - # Build the image str / prompt based on the number of images we pass - prompt = "" * num_imgs - mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} - # The processor will throw an error if there is a mismatch - # in the prompt replacements - processor = processor_for_llava_onevision(ctx) - processed_inputs = processor.apply(prompt, mm_data, {}) +@pytest.mark.skip("This test takes around 2 hours to run. 
" + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("num_imgs", [1]) +def test_processor_prompt_replacements_all(model_id, num_imgs): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) - image_placeholders = processed_inputs["mm_placeholders"]["image"] - assert len(image_placeholders) == num_imgs + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() - first_placeholder = image_placeholders[0] + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(64, 1024), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) - # NOTE: There is a BOS token - assert first_placeholder["offset"] == 0 - assert first_placeholder["length"] == len( - processed_inputs["prompt_token_ids"]) // num_imgs + _test_image_prompt_replacements( + processor, + num_imgs=num_imgs, + image_sizes=image_sizes, + ) diff --git a/tests/models/decoder_only/vision_language/processing/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py index 249045b3c04ce..c5b77260c6544 100644 --- a/tests/models/decoder_only/vision_language/processing/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py @@ -1,21 +1,13 @@ """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext -from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import _ImageAssets from ....utils import build_model_context -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_phi3v(): - from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor - return Phi3VMultiModalProcessor - - @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) # yapf: disable @pytest.mark.parametrize( @@ -29,7 +21,6 @@ def processor_for_phi3v(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_phi3v, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, int], @@ -37,21 +28,26 @@ def test_processor_override( num_imgs: int, ): """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Avoid initializing CUDA early + from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID + ctx = build_model_context( model_name=model_id, tokenizer_name=model_id, trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" 
for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processor = processor_for_phi3v(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size diff --git a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py index b9ac887edf90f..0d54802f2b733 100644 --- a/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py @@ -1,19 +1,12 @@ import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import _ImageAssets from ....utils import build_model_context -# Fixtures lazy import to avoid initializing CUDA during test collection -@pytest.fixture() -def processor_for_qwen2_vl(): - from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor - return Qwen2VLMultiModalProcessor - - @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # yapf: disable @pytest.mark.parametrize( @@ -24,7 +17,6 @@ def processor_for_qwen2_vl(): # yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - processor_for_qwen2_vl, image_assets: _ImageAssets, model_id: str, mm_processor_kwargs: dict[str, object], @@ -39,18 +31,20 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ctx = InputProcessingContext(ctx.model_config, tokenizer) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) # Build the image str / prompt based on the number of images we pass prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs mm_data = {"image": [image_assets[0].pil_image] * num_imgs} - processor = processor_for_qwen2_vl(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size - hf_processor = processor._get_hf_processor(**mm_processor_kwargs) + hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 75d878217b657..d98bd9736b65f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -10,12 +10,17 @@ from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_mm_placeholders, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache, + PromptReplacement, + find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable +from vllm.multimodal.profiling 
import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby @@ -431,7 +436,7 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=6, @@ -445,13 +450,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=5, @@ -459,7 +464,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=7, @@ -472,13 +477,13 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=0, start_idx=1, replacement=[32000, 32000], ), - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_1", item_idx=1, start_idx=3, @@ -486,7 +491,7 @@ def test_find_replace_tokens( ), ], "pattern_3": [ - _PlaceholderInfo( + PlaceholderInfo( modality="pattern_3", item_idx=0, start_idx=6, @@ -577,19 +582,15 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - - processor = processor_factory(ctx, cache=None) - profiler = processor.profiling_info + profiler = MultiModalProfiler(processor) mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - profiler.get_supported_mm_limits = mock_supported_mm_limits + processor.info.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -597,7 +598,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - profiler.get_mm_limits() + profiler.get_dummy_data(model_config.max_model_len) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -620,16 +621,12 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): revision=None, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] - ctx = InputProcessingContext( + processor = MULTIMODAL_REGISTRY.create_processor( model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), ) - processor = processor_factory(ctx, cache=None) - rng = np.random.RandomState(0) image = _rand_img(rng, min_wh=128, max_wh=256) if num_images == 0: @@ -681,9 +678,9 @@ def _test_processing_cache_correctness( hf_overrides=hf_overrides, limit_mm_per_prompt=limit_mm_per_prompt, ) - model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( 
model_config, tokenizer=cached_get_tokenizer(model_config.tokenizer), @@ -691,8 +688,9 @@ def _test_processing_cache_correctness( # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) - baseline_processor = processor_factory(ctx, cache=None) - cached_processor = processor_factory(ctx, cache=cache) + baseline_processor = factories.build_processor(ctx, cache=None) + cached_processor = factories.build_processor(ctx, cache=cache) + dummy_inputs = baseline_processor.dummy_inputs rng = np.random.RandomState(0) @@ -724,7 +722,7 @@ def _test_processing_cache_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor.profiling_info.get_dummy_processor_inputs( + prompt = dummy_inputs.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, ).prompt_text diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 06dfebbb95527..ac64edfd4ec9d 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -2,13 +2,17 @@ import torch -from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - LlavaMultiModalProcessor) +from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder, + LlavaForConditionalGeneration, + LlavaMultiModalProcessor, + LlavaProcessingInfo) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor, + info=LlavaProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class MyLlava(LlavaForConditionalGeneration): def compute_logits( diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 0d27cf9f152e0..57518bd3e8299 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -5,7 +5,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference.py") + "examples", "offline_inference/offline_inference.py") runpy.run_path(example_file) # check if the plugin is loaded correctly diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 0b0792b6b845f..bf409d2d97aa1 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -163,8 +163,8 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): multilora_inference = import_from_path( - "examples.multilora_inference", - EXAMPLES_PATH / "multilora_inference.py", + "examples.offline_inference.multilora_inference", + EXAMPLES_PATH / "offline_inference/multilora_inference.py", ) model_ref = "meta-llama/Llama-2-7b-hf" diff --git a/tests/test_utils.py b/tests/test_utils.py index 32a6b0aed66aa..0285b00d73be1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,6 +5,7 @@ import pytest import torch +from vllm_test_utils import monitor from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs, get_open_port, memory_profiling, merge_async_iterators, @@ -289,8 +290,16 @@ def test_memory_profiling(): weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB 
+ def measure_current_non_torch(): + free, total = torch.cuda.mem_get_info() + current_used = total - free + current_torch = torch.cuda.memory_reserved() + current_non_torch = current_used - current_torch + return current_non_torch + with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result: + weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) del spike @@ -298,7 +307,15 @@ def test_memory_profiling(): # Add some extra non-torch memory 256 MiB (simulate NCCL) handle2 = lib.cudaMalloc(256 * 1024 * 1024) + # this is an analytic value, it is exact, + # we only have 256 MiB non-torch memory increase + measured_diff = monitored_values.values[-1] - monitored_values.values[0] + assert measured_diff == 256 * 1024 * 1024 + # Check that the memory usage is within 5% of the expected values + # 5% tolerance is caused by PyTorch caching allocator, + # we cannot control PyTorch's behavior of its internal buffers, + # which causes a small error (<10 MiB in practice) non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index bf0b62a5b75e3..6505c81546bb0 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -4,5 +4,6 @@ """ from .blame import BlameResult, blame +from .monitor import MonitoredValues, monitor -__all__ = ["blame", "BlameResult"] +__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"] diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py new file mode 100644 index 0000000000000..a237f53a75d18 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -0,0 +1,68 @@ +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator, Generic, TypeVar + +_T = TypeVar("_T") + + +@dataclasses.dataclass +class MonitoredValues(Generic[_T]): + values: list[_T] = dataclasses.field(default_factory=list) + trace_stacks: list[str] = dataclasses.field(default_factory=list) + + +@contextlib.contextmanager +def monitor( + measure_func: Callable[[], + _T]) -> Generator[MonitoredValues[_T], None, None]: + """ + Trace the function calls to continuously monitor the change of + a value. + + Usage: + + ```python + + def measure_func(): + ... # measure the current value + return current_value + + with monitor(measure_func) as monitored_values: + # do something + + monitored_values.values # all changes of the values + monitored_values.trace_stacks # trace stacks of every change + ``` + """ + monitored_values = MonitoredValues[_T]() + + def _trace_calls(frame, event, arg=None): + nonlocal monitored_values + if event in ['line']: + # triggered by every line of Python code. + # only Python functions will trigger it, + # c/cpp functions will not trigger it. 
+ try: + # Temporarily disable the trace function + sys.settrace(None) + # do a measurement + current_value = measure_func() + if len(monitored_values.values + ) == 0 or current_value != monitored_values.values[-1]: + monitored_values.values.append(current_value) + monitored_values.trace_stacks.append("".join( + traceback.format_stack())) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + try: + sys.settrace(_trace_calls) + yield monitored_values + finally: + sys.settrace(None) diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 394ca8663e189..49366abc7fb56 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -31,7 +31,7 @@ def get_entries(node, curr_depth=0): type=str, required=True, help="json trace file output by " - "examples/offline_profile.py") + "examples/offline_inference/offline_profile.py") parser.add_argument("--phase", type=str, required=True, diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index da7a28da15c19..fa88ed4204d8f 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -534,11 +534,11 @@ def make_plot_title_suffix(profile_json: dict) -> str: if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--json-trace", - type=str, - required=True, - help="json trace file output by examples/offline_profile.py") + parser.add_argument("--json-trace", + type=str, + required=True, + help="json trace file output by \ + examples/offline_inference/offline_profile.py") parser.add_argument("--output-directory", type=str, required=False, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index eb2f69df42624..afb350591e562 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -35,10 +35,6 @@ def register_fake(fn): # activation ops -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - torch.ops._C.silu_and_mul(out, x) - - def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: torch.ops._C.gelu_and_mul(out, x) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index a8dd628b9cd6f..87655530cead4 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -145,6 +145,7 @@ def wrap_inductor(graph: fx.GraphModule, example_inputs, additional_inductor_config, compilation_config: CompilationConfig, + vllm_backend: "VllmBackend", graph_index: int = 0, num_graphs: int = 1, runtime_shape: Optional[int] = None, @@ -176,7 +177,7 @@ def wrap_inductor(graph: fx.GraphModule, # see https://github.com/pytorch/pytorch/issues/138980 graph = copy.deepcopy(graph) - cache_data = compilation_config.inductor_hash_cache + cache_data = vllm_backend.inductor_hash_cache if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash @@ -196,7 +197,7 @@ def wrap_inductor(graph: fx.GraphModule, hash_str, example_inputs, True, False) assert inductor_compiled_graph is not None, ( "Inductor cache lookup failed. Please remove" - f"the cache file {compilation_config.inductor_hash_cache.cache_file_path} and try again." # noqa + f"the cache file {cache_data.cache_file_path} and try again." 
# noqa ) # Inductor calling convention (function signature): @@ -354,7 +355,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): def __init__(self, module: torch.fx.GraphModule, compile_submod_names: List[str], vllm_config: VllmConfig, - graph_pool): + graph_pool, vllm_backend: "VllmBackend"): super().__init__(module) from torch._guards import detect_fake_mode self.fake_mode = detect_fake_mode() @@ -362,6 +363,7 @@ def __init__(self, module: torch.fx.GraphModule, self.compilation_config = vllm_config.compilation_config self.graph_pool = graph_pool self.vllm_config = vllm_config + self.vllm_backend = vllm_backend def run(self, *args): fake_args = [ @@ -389,6 +391,7 @@ def call_module(self, target: torch.fx.node.Target, args, self.compilation_config.inductor_compile_config, self.compilation_config, + self.vllm_backend, graph_index=index, num_graphs=len(self.compile_submod_names), runtime_shape=None, @@ -397,7 +400,7 @@ def call_module(self, target: torch.fx.node.Target, self.module.__dict__[target] = PiecewiseBackend( submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, - compiled_graph_for_general_shape) + compiled_graph_for_general_shape, self.vllm_backend) compilation_counter.num_piecewise_capturable_graphs_seen += 1 @@ -430,6 +433,7 @@ class VllmBackend: post_grad_passes: Sequence[Callable] sym_tensor_indices: List[int] input_buffers: List[torch.Tensor] + inductor_hash_cache: InductorHashCache def __init__( self, @@ -472,6 +476,53 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + if not self.compilation_config.cache_dir: + # no provided cache dir, generate one based on the known factors + # that affects the compilation. if none of the factors change, + # the cache dir will be the same so that we can reuse the compiled + # graph. + + # 1. factors come from the vllm_config (it mainly summarizes how the + # model is created) + vllm_config = self.vllm_config + config_hash = vllm_config.compute_hash() + + # 2. 
factors come from the code files that are traced by Dynamo ( + # it mainly summarizes how the model is used in forward pass) + forward_code_files = list( + sorted(self.compilation_config.traced_files)) + self.compilation_config.traced_files.clear() + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", + "\n".join(forward_code_files)) + hash_content = [] + for filepath in forward_code_files: + hash_content.append(filepath) + with open(filepath) as f: + hash_content.append(f.read()) + import hashlib + code_hash = hashlib.md5( + "\n".join(hash_content).encode()).hexdigest() + + # combine the two hashes to generate the cache dir + hash_key = hashlib.md5( + f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] + cache_dir = os.path.join( + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, + f"rank_{vllm_config.parallel_config.rank}") + else: + cache_dir = self.compilation_config.cache_dir + os.makedirs(cache_dir, exist_ok=True) + + disabled = envs.VLLM_DISABLE_COMPILE_CACHE + self.inductor_hash_cache: InductorHashCache = InductorHashCache( + cache_dir, disabled=disabled) + if disabled: + logger.info("vLLM's torch.compile cache is disabled.") + else: + logger.info("Using cache directory: %s for vLLM's torch.compile", + cache_dir) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 @@ -507,8 +558,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, - self.vllm_config, - self.graph_pool).run(*example_inputs) + self.vllm_config, self.graph_pool, + self).run(*example_inputs) self._called = True @@ -577,7 +628,8 @@ class PiecewiseBackend: def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, graph_pool: Any, piecewise_compile_index: int, total_piecewise_compiles: int, sym_shape_indices: List[int], - compiled_graph_for_general_shape: Callable): + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend): """ The backend for piecewise compilation. It mainly handles the compilation and cudagraph capturing. 
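The cache-directory derivation added to `VllmBackend.__call__` above condenses to the sketch below: the compile cache key combines a hash of the vLLM config with a hash of every source file Dynamo traced, so changing either the config or the traced model code lands in a fresh cache directory. The helper name and flat arguments here are illustrative; the real code reads these values from `vllm_config` and `compilation_config`.

```python
# Sketch of the cache-key scheme used by VllmBackend.__call__ above,
# with the VllmConfig/CompilationConfig plumbing stubbed out.
import hashlib
import os


def compile_cache_dir(config_hash: str, traced_files: list[str],
                      cache_root: str, rank: int) -> str:
    # Hash the traced files (paths and contents) in a stable order.
    hash_content = []
    for filepath in sorted(traced_files):
        hash_content.append(filepath)
        with open(filepath) as f:
            hash_content.append(f.read())
    code_hash = hashlib.md5("\n".join(hash_content).encode()).hexdigest()

    # Combine the config hash and the code hash into a short key.
    hash_key = hashlib.md5(
        f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
    return os.path.join(cache_root, "torch_compile_cache", hash_key,
                        f"rank_{rank}")
```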
@@ -597,6 +649,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.graph_pool = graph_pool self.piecewise_compile_index = piecewise_compile_index self.total_piecewise_compiles = total_piecewise_compiles + self.vllm_backend = vllm_backend self.is_first_graph = piecewise_compile_index == 0 self.is_last_graph = ( @@ -634,7 +687,7 @@ def check_for_ending_compilation(self): if self.is_last_graph and not self.to_be_compiled_sizes: # no specific sizes to compile # save the hash of the inductor graph for the next run - self.compilation_config.inductor_hash_cache.save_to_file() + self.vllm_backend.inductor_hash_cache.save_to_file() end_monitoring_torch_compile(self.vllm_config) def __call__(self, *args) -> Any: @@ -662,6 +715,7 @@ def __call__(self, *args) -> Any: args, self.compilation_config.inductor_compile_config, self.compilation_config, + self.vllm_backend, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 805a217ee6ca1..10513111ea7f1 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,8 +1,10 @@ import inspect from typing import Callable, Dict, List, Optional, TypeVar, Union, overload +from unittest.mock import patch import torch import torch.nn as nn +from torch._dynamo.symbolic_convert import InliningInstructionTranslator from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher @@ -196,7 +198,31 @@ def __call__(self, *args, **kwargs): # we need to control all the compilation of the model. torch._dynamo.eval_frame.remove_from_cache( self.original_code_object) - return self.compiled_callable(*args, **kwargs) + + # collect all relevant files traced by Dynamo, + # so that the compilation cache can trigger re-compilation + # properly when any of these files change. + + # 1. the file containing the top-level forward function + self.vllm_config.compilation_config.traced_files.add( + self.original_code_object.co_filename) + + # 2. 
every time Dynamo sees a function call, it will inline + # the function by calling InliningInstructionTranslator.inline_call + # we hijack this function to know all the functions called + # during Dynamo tracing, and their corresponding files + inline_call = InliningInstructionTranslator.inline_call + + def patched_inline_call(parent, func, args, kwargs): + code = func.get_code() + self.vllm_config.compilation_config.traced_files.add( + code.co_filename) + return inline_call(parent, func, args, kwargs) + + with patch.object(InliningInstructionTranslator, 'inline_call', + patched_inline_call): + output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can # dispatch to the compiled code directly, without going through diff --git a/vllm/config.py b/vllm/config.py index 8b824a1fca511..6dabeb3861af2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,7 +3,7 @@ import enum import hashlib import json -import os +import sys import warnings from contextlib import contextmanager from dataclasses import dataclass, field, replace @@ -381,16 +381,16 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str, """ if is_s3(model) or is_s3(tokenizer): if is_s3(model): - self.s3_model = S3Model() - self.s3_model.pull_files(model, allow_pattern=["*config.json"]) + s3_model = S3Model() + s3_model.pull_files(model, allow_pattern=["*config.json"]) self.model_weights = self.model - self.model = self.s3_model.dir + self.model = s3_model.dir if is_s3(tokenizer): - self.s3_tokenizer = S3Model() - self.s3_tokenizer.pull_files( + s3_tokenizer = S3Model() + s3_tokenizer.pull_files( model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) - self.tokenizer = self.s3_tokenizer.dir + self.tokenizer = s3_tokenizer.dir def _init_multimodal_config( self, limit_mm_per_prompt: Optional[Mapping[str, int]] @@ -2051,6 +2051,11 @@ def __post_init__(self): f"max_cpu_loras ({self.max_cpu_loras}) must be >= " f"max_loras ({self.max_loras})") + def verify_with_cache_config(self, cache_config: CacheConfig): + # TODO LoRA supports CPU offload. + if cache_config.cpu_offload_gb > 0: + raise ValueError("CPU offload is not supported with LoRA yet.") + def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype @@ -2254,6 +2259,17 @@ def _get_and_verify_dtype( "supported for POWERPC.") torch_dtype = torch.bfloat16 + # TODO: change this condition to check if the platform support bf16 + # instead of checking the OS. For instance M2 shall supports bf16 + # already. But we need to modify `cpu_extension.cmake` to activate + # the feature in the build. + if (current_platform.is_cpu() and sys.platform.startswith("darwin") + and current_platform.get_cpu_architecture() + == CpuArchEnum.ARM and config_dtype == torch.bfloat16): + logger.info("For macOS with Apple Silicon, currently bfloat16 " + "is not supported. 
Setting dtype to float16.") + torch_dtype = torch.float16 + if current_platform.is_hpu() and config_dtype == torch.float16: logger.info( "For HPU, we cast models to bfloat16 instead of" @@ -2761,9 +2777,8 @@ def model_post_init(self, __context: Any) -> None: # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr + traced_files: Set[str] = PrivateAttr compilation_time: float = PrivateAttr - # should be InductorHashCache, but Pydantic does not support it - inductor_hash_cache: Any = PrivateAttr # Per-model forward context # Mainly used to store attention cls @@ -2801,6 +2816,7 @@ def __repr__(self) -> str: "compilation_time", "bs_to_padded_graph_size", "pass_config", + "traced_files", } return self.model_dump_json(exclude=exclude, exclude_unset=True) @@ -2860,6 +2876,7 @@ def model_post_init(self, __context: Any) -> None: self.enabled_custom_ops = Counter() self.disabled_custom_ops = Counter() + self.traced_files = set() self.static_forward_context = {} self.compilation_time = 0.0 @@ -2882,29 +2899,6 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE - if not self.cache_dir: - # no provided cache dir, generate one based on the known factors - # that affects the compilation. if none of the factors change, - # the cache dir will be the same so that we can reuse the compiled - # graph. - hash_key = vllm_config.compute_hash() - cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, - f"rank_{vllm_config.parallel_config.rank}") - os.makedirs(cache_dir, exist_ok=True) - self.cache_dir = cache_dir - - disabled = envs.VLLM_DISABLE_COMPILE_CACHE - from vllm.compilation.backends import InductorHashCache - self.inductor_hash_cache: InductorHashCache = InductorHashCache( - self.cache_dir, disabled=disabled) - if disabled: - logger.info("vLLM's torch.compile cache is disabled.") - else: - logger.info( - "Using cache directory: %s for vLLM's torch.compile", - self.cache_dir) - from vllm.compilation.backends import VllmBackend return VllmBackend(vllm_config) @@ -3138,6 +3132,7 @@ def __post_init__(self): self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: + self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_with_scheduler_config( self.scheduler_config) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 9f97b0f01ad8a..4ced991f62f66 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -247,7 +247,8 @@ def __init__( self.handle = Handle( connect_ip=connect_ip, local_reader_ranks=local_reader_ranks, - buffer_handle=self.buffer.handle(), + buffer_handle=self.buffer.handle() + if self.buffer is not None else None, local_subscribe_port=local_subscribe_port, remote_subscribe_port=remote_subscribe_port, ) diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index dab2d10c4c9d0..e20c992a381a3 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -22,7 +22,7 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution ## Disaggregated prefilling -The example usage is in [this 
file](../../../examples/disaggregated_prefill.sh). +The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh). Here is the diagram of how we run disaggretgated prefilling. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e94664308cf8d..0850bab6bb7e1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1157,6 +1157,12 @@ def create_engine_config(self, if self.enable_chunked_prefill and self.pipeline_parallel_size > 1: raise ValueError("Multi-Step Chunked-Prefill is not supported " "for pipeline-parallel-size > 1") + from vllm.platforms import current_platform + if current_platform.is_cpu(): + logger.warning("Multi-Step (--num-scheduler-steps > 1) is " + "currently not supported for CPUs and has been " + "disabled.") + self.num_scheduler_steps = 1 # make sure num_lookahead_slots is set the higher value depending on # if we are using speculative decoding or multi-step diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 047f699e4f277..bc1471e1f534d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -7,6 +7,7 @@ import re import signal import socket +import sys import tempfile import uuid from argparse import Namespace @@ -805,6 +806,8 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + # Workaround to work on macOS + fd=sock.fileno() if sys.platform.startswith("darwin") else None, **uvicorn_kwargs, ) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b362ee0cac328..6ddc1eb76f10d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_info_once, print_warning_once diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2d9d024e03e80..b22b3f1594f24 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -323,6 +323,7 @@ def dummy_data_for_profiling( # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer if mm_registry.has_processor(model_config): @@ -331,7 +332,8 @@ def dummy_data_for_profiling( trust_remote_code=model_config.trust_remote_code, ) processor = mm_registry.create_processor(model_config, tokenizer) - dummy_data = processor.get_dummy_data(seq_len) + profiler = MultiModalProfiler(processor) + dummy_data = profiler.get_dummy_data(seq_len) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 34d65ed51ef3f..32456fee06a28 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -10,6 +10,7 @@ get_tensor_model_parallel_world_size) from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.utils import set_weight_attrs +from 
vllm.platforms import current_platform from vllm.utils import LazyDict @@ -58,27 +59,31 @@ class SiluAndMul(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + self.op = torch.ops._C.silu_and_mul + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + self.op = ipex_ops.silu_and_mul + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - from vllm import _custom_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) + self.op(out, x) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: - from vllm._ipex_ops import ipex_ops as ops - d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) + self.op(out, x) return out diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 4741d69de11ac..87993267c05b5 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -4,7 +4,6 @@ import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size, try_get_optimal_moe_config) from vllm.scalar_type import scalar_types @@ -301,7 +300,8 @@ def fused_marlin_moe( False, ) - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) + torch.ops._C.silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) intermediate_cache3 = torch.ops._moe_C.marlin_gemm_moe( intermediate_cache2, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 4101facbe7874..1bb6bc753d37c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -753,7 +753,8 @@ def fused_experts_impl(hidden_states: torch.Tensor, use_int8_w8a16=use_int8_w8a16, block_shape=block_shape) - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + torch.ops._C.silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) invoke_fused_moe_kernel(intermediate_cache2, w2, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a9c1fa7221217..0033fbff0e9ac 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -452,9 +452,9 @@ def _load_model_serialized_cpu( """Load a serialized model with tensorizer to the CPU. This is only necessary when the model isn't vLLM-tensorized (see - examples/tensorize_vllm_model.py) This should still be faster than - default HuggingFace loading, but will be slower than loading a - vLLM-tensorized model. + examples/other/tensorize_vllm_model.py) This should still + be faster than default HuggingFace loading, but will be slower than + loading a vLLM-tensorized model. """ device_config = vllm_config.device_config model_config = vllm_config.model_config @@ -472,7 +472,7 @@ def _load_model_serialized( """Load a serialized model with tensorizer. Expects a vLLM-tensorized model. 
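
The `SiluAndMul` hunk just above moves kernel selection into `__init__` (binding `self.op` once per platform) instead of importing `_custom_ops` inside every forward call, while `forward_native` remains the mathematical reference. Below is a minimal sketch of that reference math and the bind-once dispatch pattern using only public PyTorch; `SiluAndMulSketch` and `_eager_op` are illustrative stand-ins, not vLLM classes, and the compiled `torch.ops._C.silu_and_mul` binding is not assumed to be available.

```python
import torch
import torch.nn.functional as F


def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    """Split the last dim in half: silu(first half) * (second half)."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]


class SiluAndMulSketch(torch.nn.Module):
    """Bind the kernel once at construction, like the diff's __init__."""

    def __init__(self):
        super().__init__()
        # The real module picks torch.ops._C.silu_and_mul or an IPEX op
        # based on current_platform; this sketch always uses the eager
        # fallback so it runs anywhere.
        self.op = self._eager_op

    @staticmethod
    def _eager_op(out: torch.Tensor, x: torch.Tensor) -> None:
        out.copy_(silu_and_mul_reference(x))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        out = torch.empty(x.shape[:-1] + (d, ), dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out


if __name__ == "__main__":
    x = torch.randn(4, 16)
    assert torch.allclose(SiluAndMulSketch()(x), silu_and_mul_reference(x))
```

Binding `self.op` at construction also lets the fused-MoE kernels above call `torch.ops._C.silu_and_mul` directly, without routing through the `_custom_ops` wrapper on every invocation.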
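
The compilation wrapper change at the start of this section patches `InliningInstructionTranslator.inline_call` so that every function Dynamo inlines has its source file recorded into `compilation_config.traced_files`. A framework-free sketch of that hijack-and-record pattern with `unittest.mock.patch.object` follows; `Tracer` is a toy stand-in for the Dynamo translator (the real hook receives Dynamo's function object and calls `get_code()`, whereas plain Python functions and `__code__` stand in here).

```python
from unittest.mock import patch


class Tracer:
    """Toy stand-in for InliningInstructionTranslator (not the real API)."""

    def inline_call(self, func, *args, **kwargs):
        return func(*args, **kwargs)


def run_and_collect_files(tracer: Tracer, funcs):
    traced_files = set()
    original = Tracer.inline_call  # keep the unpatched implementation

    def patched_inline_call(self, func, *args, **kwargs):
        # Record where the inlined function was defined, then delegate.
        traced_files.add(func.__code__.co_filename)
        return original(self, func, *args, **kwargs)

    with patch.object(Tracer, "inline_call", patched_inline_call):
        results = [tracer.inline_call(f) for f in funcs]
    return results, traced_files


if __name__ == "__main__":
    out, files = run_and_collect_files(Tracer(), [lambda: 1, lambda: 2])
    print(out, files)
```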
See the - examples/tensorize_vllm_model.py example script + examples/other/tensorize_vllm_model.py example script for serializing vLLM models.""" device_config = vllm_config.device_config @@ -529,7 +529,8 @@ class ShardedStateLoader(BaseModelLoader): Model loader that directly loads each worker's model state dict, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint. See - `examples/save_sharded_state.py` for creating a sharded checkpoint. + `examples/offline_inference/save_sharded_state.py` for creating a sharded + checkpoint. """ DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 8b929f299c8d8..fbd4937112e11 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -155,7 +155,7 @@ class TensorizerArgs: encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/tensorize_vllm_model.py. + examples/other/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -363,12 +363,12 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" ) -> Generator[Tuple[str, torch.Tensor], None, None]: - logger.warning( - "Deserializing HuggingFace models is not optimized for " - "loading on vLLM, as tensorizer is forced to load to CPU. " - "Consider deserializing a vLLM model instead for faster " - "load times. See the examples/tensorize_vllm_model.py example " - "script for serializing vLLM models.") + logger.warning("Deserializing HuggingFace models is not optimized for " + "loading on vLLM, as tensorizer is forced to load to CPU. " + "Consider deserializing a vLLM model instead for faster " + "load times. See the " + "examples/other/tensorize_vllm_model.py example script " + "for serializing vLLM models.") deserializer_args = tensorizer_args.deserializer_params stream_params = tensorizer_args.stream_params diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8aa0c98df70d2..a2c991cfdb74e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -503,7 +503,8 @@ def kv_cache_scales_loader( KV cache scaling factors. The serialization should represent a dictionary whose keys are the TP ranks and values are another dictionary mapping layers to their KV cache scaling factors. 
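
The `kv_cache_scales_loader` docstring above describes the expected serialization: a JSON object keyed by TP rank whose values map layers to KV-cache scaling factors. A hedged parsing sketch follows; the flat two-level schema and the sample numbers are assumptions for illustration only, and `examples/other/fp8/extract_scales.py` remains the authoritative source of the format.

```python
import json
from typing import Dict


def parse_kv_cache_scales(raw_json: str, tp_rank: int) -> Dict[int, float]:
    """Return {layer_index: scale} for one TP rank (assumed schema)."""
    data = json.loads(raw_json)
    # JSON object keys are strings; convert rank/layer keys back to ints.
    rank_scales = data[str(tp_rank)]
    return {int(layer): float(scale) for layer, scale in rank_scales.items()}


if __name__ == "__main__":
    example = '{"0": {"0": 0.023, "1": 0.019}, "1": {"0": 0.022, "1": 0.018}}'
    print(parse_kv_cache_scales(example, tp_rank=0))  # {0: 0.023, 1: 0.019}
```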
- Keep this function in sync with the output of examples/fp8/extract_scales.py + Keep this function in sync with the output of + examples/other/fp8/extract_scales.py """ try: with open(filename) as f: diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 2e649f10c0765..089062ab53fc3 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -23,10 +23,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) @@ -445,33 +445,33 @@ def build_mm_projector(config: PretrainedConfig): ) -class AriaProcessingMixin(ProcessingMixin): +class AriaProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config() - def _get_vision_config(self) -> AriaVisionConfig: - return self._get_hf_config().vision_config - - def _get_num_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return max(hf_config.projector_patch_to_query_dict.values()) - - -class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo): + def get_vision_config(self) -> AriaVisionConfig: + return self.get_hf_config().vision_config def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) + + +class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - vision_config = self._get_vision_config() + vision_config = self.info.get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) @@ -483,7 +483,7 @@ def get_dummy_processor_inputs( num_images=num_images) } - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_token: str = hf_processor.image_token # type: ignore return ProcessorInputs( @@ -492,10 +492,7 @@ def get_dummy_processor_inputs( ) -class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return AriaProfilingInfo(self.ctx) +class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]): def _get_mm_fields_config( self, @@ -513,10 +510,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index - num_image_tokens = self._get_num_image_tokens() + num_image_tokens = self.info.get_num_image_tokens() return [ PromptReplacement( @@ -527,7 +524,9 @@ def _get_prompt_replacements( ] 
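
The Aria hunks above, together with the registration change that follows, show the refactor this PR applies across the multimodal models: the old `*ProcessingMixin`/`*ProfilingInfo` pair becomes a single `*ProcessingInfo` class for config and token-count lookups, a `*DummyInputsBuilder` reaches it through `self.info`, and both are handed to `register_processor(..., info=..., dummy_inputs=...)`. Below is a schematic, framework-free sketch of that shape; every name in it is a placeholder, not vLLM's actual API.

```python
from dataclasses import dataclass
from typing import Mapping, Optional


@dataclass
class FakeHFConfig:
    """Placeholder for the HF config the info object would wrap."""
    projector_patch_to_query_dict: Mapping[int, int]


class ProcessingInfoSketch:
    """Owns config lookups and token counting (cf. AriaProcessingInfo)."""

    def __init__(self, hf_config: FakeHFConfig):
        self.hf_config = hf_config

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_num_image_tokens(self) -> int:
        # Mirrors the max-over-projector-dict logic in the hunk above.
        return max(self.hf_config.projector_patch_to_query_dict.values())


class DummyInputsBuilderSketch:
    """Builds profiling inputs; reaches config helpers via self.info."""

    def __init__(self, info: ProcessingInfoSketch):
        self.info = info

    def get_dummy_prompt(self, num_images: int) -> str:
        return "<image>" * num_images


if __name__ == "__main__":
    info = ProcessingInfoSketch(FakeHFConfig({0: 64, 1: 128}))
    builder = DummyInputsBuilderSketch(info)
    print(info.get_num_image_tokens(), builder.get_dummy_prompt(2))
```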
-@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor, + info=AriaProcessingInfo, + dummy_inputs=AriaDummyInputsBuilder) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index fd45783f167b4..7dfc0b687c6e3 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -17,10 +17,10 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .blip import BlipVisionModel @@ -397,30 +397,30 @@ def forward( return sequence_output -class Blip2ProcessingMixin(ProcessingMixin): +class Blip2ProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Blip2Config) - def _get_num_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return hf_config.num_query_tokens - - -class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + return hf_config.num_query_tokens + + +class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config max_image_size = vision_config.image_size @@ -439,10 +439,7 @@ def get_dummy_processor_inputs( ) -class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Blip2ProfilingInfo(self.ctx) +class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): def _get_mm_fields_config( self, @@ -460,7 +457,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - num_image_tokens = self._get_num_image_tokens() + num_image_tokens = self.info.get_num_image_tokens() return [ PromptReplacement( @@ -491,7 +488,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, + info=Blip2ProcessingInfo, + dummy_inputs=Blip2DummyInputsBuilder) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 73ed73b61ebf9..acff926891bbe 100644 --- a/vllm/model_executor/models/chameleon.py +++ 
b/vllm/model_executor/models/chameleon.py @@ -30,10 +30,10 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once @@ -49,33 +49,34 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -class ChameleonProcessingMixin(ProcessingMixin): +class ChameleonProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(ChameleonConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(ChameleonProcessor) - def _get_num_image_tokens(self) -> int: - processor = self._get_hf_processor() - return processor.image_seq_length - - -class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_num_image_tokens()} + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + processor = self.get_hf_processor() + return processor.image_seq_length + + +class ChameleonDummyInputsBuilder( + BaseDummyInputsBuilder[ChameleonProcessingInfo]): def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - config = self._get_hf_config() + config = self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) @@ -93,11 +94,8 @@ def get_dummy_processor_inputs( ) -class ChameleonMultiModalProcessor(ChameleonProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return ChameleonProfilingInfo(self.ctx) +class ChameleonMultiModalProcessor( + BaseMultiModalProcessor[ChameleonProcessingInfo]): def _get_mm_fields_config( self, @@ -112,7 +110,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - processor = self._get_hf_processor(**hf_processor_mm_kwargs) + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) return [ PromptReplacement( @@ -120,7 +118,7 @@ def _get_prompt_replacements( target="", replacement="".join([ processor.image_start_token, - processor.image_token * self._get_num_image_tokens(), + processor.image_token * self.info.get_num_image_tokens(), processor.image_end_token, ]), ) @@ -916,7 +914,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + ChameleonMultiModalProcessor, + info=ChameleonProcessingInfo, + dummy_inputs=ChameleonDummyInputsBuilder) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 1bde45cb140cb..dd69f6c9a5aff 100644 --- a/vllm/model_executor/models/clip.py +++ 
b/vllm/model_executor/models/clip.py @@ -20,11 +20,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) + repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index c937fcb0978b9..59af5f0b3ae98 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -33,11 +33,11 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -64,24 +64,38 @@ class FuyuImagePatchInputs(TypedDict): """ -class FuyuProcessingMixin(ProcessingMixin): +class FuyuProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(FuyuConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(FuyuProcessor) - def _get_image_processor(self) -> FuyuImageProcessor: - return self._get_hf_processor().image_processor + def get_image_processor(self) -> FuyuImageProcessor: + return self.get_hf_processor().image_processor - def _get_image_feature_grid_size( + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() + + max_ncols, max_nrows = self.get_image_feature_grid_size( + image_width=target_width, + image_height=target_height, + ) + max_image_tokens = (max_ncols + 1) * max_nrows + + return {"image": max_image_tokens} + + def get_image_feature_grid_size( self, *, image_width: int, image_height: int, ) -> tuple[int, int]: - image_processor = self._get_image_processor() + image_processor = self.get_image_processor() target_width = image_processor.size["width"] target_height = image_processor.size["height"] @@ -97,34 +111,21 @@ def _get_image_feature_grid_size( nrows = math.ceil(image_height / 30) return ncols, nrows - -class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() - - max_ncols, max_nrows = self._get_image_feature_grid_size( - image_width=target_width, - image_height=target_height, - ) - max_image_tokens = (max_ncols + 1) * max_nrows - - return {"image": max_image_tokens} - - def _get_image_size_with_most_features(self) -> 
ImageSize: - image_processor = self._get_image_processor() + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() return ImageSize(width=image_processor.size["width"], height=image_processor.size["height"]) + +class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) mm_data = { @@ -140,10 +141,7 @@ def get_dummy_processor_inputs( ) -class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return FuyuProfilingInfo(self.ctx) +class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): def _call_hf_processor( self, @@ -156,7 +154,7 @@ def _call_hf_processor( # Avoid warning from HF logger for text-only input # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id # Tokenizer won't add boa_token_id by default, we add it manually. - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore prompt_ids = tokenizer.encode(prompt) + [boa_token_id] return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -196,10 +194,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) @@ -207,7 +205,7 @@ def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - ncols, nrows = self._get_image_feature_grid_size( + ncols, nrows = self.info.get_image_feature_grid_size( image_width=image_size.width, image_height=image_size.height, ) @@ -244,7 +242,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, + info=FuyuProcessingInfo, + dummy_inputs=FuyuDummyInputsBuilder) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 4299af8cd03a2..8d94acf3b21d5 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,7 +1,7 @@ -from abc import ABC, abstractmethod +from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) + Protocol, Set, Tuple, TypedDict, TypeVar, Union) import torch import torch.nn as nn @@ -25,11 +25,11 @@ MultiModalInputsV2, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize) + ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingCache, - ProcessingMixin, PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, 
ProcessorInputs + BaseProcessingInfo, ProcessingCache, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -105,34 +105,23 @@ class LlavaLikeProcessor(Protocol): image_token: Final[str] -class BaseLlavaProcessingMixin(ProcessingMixin, ABC): +class BaseLlavaProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self) -> LlavaLikeConfig: + def get_hf_config(self) -> LlavaLikeConfig: return self.ctx.get_hf_config(LlavaConfig) - def _get_vision_encoder_info(self): - return get_vision_encoder_info(self._get_hf_config()) + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) @abstractmethod - def _get_hf_processor(self) -> LlavaLikeProcessor: + def get_hf_processor(self) -> LlavaLikeProcessor: raise NotImplementedError - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self._get_hf_config() - vision_encoder_info = self._get_vision_encoder_info() + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} def _apply_feature_select_strategy( self, @@ -147,28 +136,42 @@ def _apply_feature_select_strategy( msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() -class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) - def _get_image_size_with_most_features(self) -> ImageSize: - vision_encoder_info = self._get_vision_encoder_info() + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() width = height = vision_encoder_info.get_image_size() return ImageSize(width=width, height=height) - def _get_max_image_tokens(self) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_image_tokens( + return self.get_num_image_tokens( image_width=target_width, image_height=target_height, ) + +_I = TypeVar("_I", bound=BaseLlavaProcessingInfo) + + +class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -176,9 +179,10 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token - target_width, target_height = self._get_image_size_with_most_features() + target_width, 
target_height = \ + self.info.get_image_size_with_most_features() mm_data = { "image": @@ -193,23 +197,13 @@ def get_dummy_processor_inputs( ) -class LlavaProcessingMixin(BaseLlavaProcessingMixin): +class LlavaProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaProcessor) -class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo): - pass - - -class BaseLlavaMultiModalProcessor(LlavaProcessingMixin, - BaseMultiModalProcessor): - - # Copied from BaseMultiModalProcessor - @abstractmethod - def _get_profiling_info(self) -> BaseProfilingInfo: - raise NotImplementedError +class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): # Copied from BaseMultiModalProcessor @abstractmethod @@ -226,7 +220,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index def get_replacement(item_idx: int): @@ -237,7 +231,7 @@ def get_replacement(item_idx: int): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, ) @@ -253,10 +247,8 @@ def get_replacement(item_idx: int): ] -class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaProfilingInfo(self.ctx) +class LlavaMultiModalProcessor( + BaseLlavaMultiModalProcessor[LlavaProcessingInfo]): def _get_mm_fields_config( self, @@ -269,21 +261,14 @@ def _get_mm_fields_config( ) -class PixtralHFProcessingMixin(BaseLlavaProcessingMixin): +class PixtralHFProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(PixtralProcessor) -class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo): - pass - - -class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return PixtralHFProfilingInfo(self.ctx) +class PixtralHFMultiModalProcessor( + BaseMultiModalProcessor[PixtralHFProcessingInfo]): def _call_hf_processor( self, @@ -328,10 +313,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token image_break_token = processor.image_break_token image_end_token = processor.image_end_token @@ -363,26 +348,40 @@ def get_replacement(item_idx: int): ] +def _build_llava_or_pixtral_hf_info( + ctx: InputProcessingContext, ) -> BaseLlavaProcessingInfo: + hf_config = ctx.get_hf_config(LlavaConfig) + + if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFProcessingInfo(ctx) + + return LlavaProcessingInfo(ctx) + + def _build_llava_or_pixtral_hf_processor( - ctx: InputProcessingContext, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True, ) -> BaseMultiModalProcessor: - hf_config = 
ctx.get_hf_config(LlavaConfig) - - if isinstance(hf_config.vision_config, PixtralVisionConfig): + if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( - ctx, + info, + dummy_inputs, # type: ignore cache=cache, enable_sanity_checks=enable_sanity_checks, ) - return LlavaMultiModalProcessor( - ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks, - ) + if isinstance(info, LlavaProcessingInfo): + return LlavaMultiModalProcessor( + info, + dummy_inputs, # type: ignore + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) + + raise NotImplementedError(type(info)) def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: @@ -460,7 +459,9 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) +@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor, + info=_build_llava_or_pixtral_hf_info, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -546,6 +547,12 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") + if self.config.vision_config.model_type == "pixtral": + return LlavaImagePixelInputs( + type="pixel_values", + data=flatten_bn(pixel_values), + ) + return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -721,11 +728,11 @@ def apply( mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index # Assume that it doesn't depend on the image size - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=-1, image_height=-1, ) @@ -790,6 +797,8 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` -@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, + info=LlavaProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 258352416d4a7..fda4f22d366b1 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,8 +1,8 @@ +from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, Union) + Protocol, Set, Tuple, TypedDict, TypeVar, Union) -import numpy as np import torch import torch.nn as nn from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor @@ -17,13 +17,12 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors from vllm.multimodal.parse import ImageSize -from vllm.multimodal.profiling import BaseProfilingInfo from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin, - BaseLlavaProfilingInfo, LlavaLikeConfig, +from .llava import 
(BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo, + LlavaDummyInputsBuilder, LlavaLikeConfig, LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, @@ -66,23 +65,23 @@ class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): image_grid_pinpoints: Final[list[list[int]]] -class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): +class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): - def _get_hf_config(self) -> LlavaNextLikeConfig: + def get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextProcessor) - # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 - def _get_num_image_tokens( + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 + def get_num_image_tokens( self, *, image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() - vision_encoder_info = self._get_vision_encoder_info() + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, @@ -111,7 +110,7 @@ def _get_num_image_tokens( return unpadded_feature_size + newline_feature_size + base_feature_size - # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 def _get_num_unpadded_features( self, *, @@ -121,42 +120,33 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - # NOTE: Use float32 to remain consistent with HF output - current_height_f = np.float32(npatches * num_patch_height) - current_width_f = np.float32(npatches * num_patch_width) + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width - original_width_f = np.float32(original_width) - original_height_f = np.float32(original_height) + aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height - original_aspect_ratio = original_width_f / original_height_f - current_aspect_ratio = current_width_f / current_height_f - - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width_f / original_width_f - new_height = int(original_height_f * scale_factor) - padding = (current_height_f - new_height) // 2 - current_height_f -= 2 * padding + if aspect_ratio > current_aspect_ratio: + new_height = (original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + current_height = current_height - (2 * padding) else: - scale_factor = current_height_f / original_height_f - new_width = int(original_width_f * scale_factor) - padding = (current_width_f - new_width) // 2 - current_width_f -= 2 * padding + new_width = (original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + current_width = current_width - (2 * padding) - unpadded_features = int(current_height_f * current_width_f) - newline_features = int(current_height_f) + unpadded_features = current_height * current_width + 
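
The `_get_num_unpadded_features` rewrite in this hunk drops the NumPy `float32` intermediates in favour of pure integer arithmetic, following the updated TGI reference linked in the comment. A standalone restatement of that math is given below; the image and grid sizes in the usage line are illustrative only, not values from the PR.

```python
def get_num_unpadded_features(
    original_height: int,
    original_width: int,
    npatches: int,
    num_patch_height: int,
    num_patch_width: int,
) -> tuple[int, int]:
    """Integer-only unpadded/newline feature counts, as in the hunk above."""
    current_height = npatches * num_patch_height
    current_width = npatches * num_patch_width

    aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if aspect_ratio > current_aspect_ratio:
        # Width-limited image: shrink effective height, drop symmetric padding.
        new_height = (original_height * current_width) // original_width
        padding = (current_height - new_height) // 2
        current_height -= 2 * padding
    else:
        # Height-limited image: shrink effective width instead.
        new_width = (original_width * current_height) // original_height
        padding = (current_width - new_width) // 2
        current_width -= 2 * padding

    unpadded_features = current_height * current_width
    newline_features = current_height  # one newline token per kept row
    return unpadded_features, newline_features


if __name__ == "__main__":
    # e.g. a 1000x600 image on a 24-patch grid tiled 2x2
    print(get_num_unpadded_features(600, 1000, 24, 2, 2))
```

The per-image token count in `get_num_image_tokens` then adds these two values to the base feature size reported by the vision tower, as shown earlier in this hunk.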
newline_features = current_height return (unpadded_features, newline_features) - -class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo): - - def _get_image_size_with_most_features(self) -> ImageSize: - hf_config = self._get_hf_config() + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() largest_feature_size, largest_feature_pinpoint = 0, None for (height, width) in hf_config.image_grid_pinpoints: - feat_size = self._get_num_image_tokens(image_width=width, - image_height=height) + feat_size = self.get_num_image_tokens(image_width=width, + image_height=height) if feat_size > largest_feature_size: largest_feature_size = feat_size largest_feature_pinpoint = ImageSize(width=width, @@ -168,11 +158,23 @@ def _get_image_size_with_most_features(self) -> ImageSize: return largest_feature_pinpoint -class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin, - BaseLlavaMultiModalProcessor): +_I = TypeVar("_I", bound=LlavaNextProcessingInfo) + + +class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError + - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaNextProfilingInfo(self.ctx) +class LlavaNextMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]): def _get_mm_fields_config( self, @@ -186,7 +188,9 @@ def _get_mm_fields_config( ) -@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor, + info=LlavaNextProcessingInfo, + dummy_inputs=LlavaDummyInputsBuilder) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 6e82cee1c95a4..5be85d7c0f033 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -17,12 +17,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -47,33 +46,52 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -class LlavaNextVideoProcessingMixin(ProcessingMixin): +class LlavaNextVideoProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(LlavaNextVideoConfig) - def _get_vision_encoder_info(self): - return get_vision_encoder_info(self._get_hf_config()) + def get_vision_encoder_info(self): + return get_vision_encoder_info(self.get_hf_config()) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaNextVideoProcessor) + def 
get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self.get_image_size_with_most_features() + + max_video_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features(seq_len), + ) + + return {"video": max_video_tokens} + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + def _get_num_frame_tokens( self, *, image_width: int, image_height: int, ) -> int: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() spatial_pool_stride = hf_config.spatial_pool_stride - vision_encoder_info = self._get_vision_encoder_info() + vision_encoder_info = self.get_vision_encoder_info() patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, @@ -87,37 +105,14 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames - -class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin, - BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"video": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() - - max_video_tokens = self._get_num_video_tokens( - image_width=target_width, - image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - ) - - return {"video": max_video_tokens} - - def _get_image_size_with_most_features(self) -> ImageSize: - vision_encoder_info = self._get_vision_encoder_info() - width = height = vision_encoder_info.get_image_size() - return ImageSize(width=width, height=height) - def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, @@ -130,7 +125,7 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_videos = mm_config.limit_per_prompt.get("video", 1) @@ -138,6 +133,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_total_frames // max(max_videos, 1), 1) + +class LlavaNextVideoDummyInputsBuilder( + BaseDummyInputsBuilder[LlavaNextVideoProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -145,16 +144,20 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_videos = mm_counts.get("video", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() video_token = processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + 
self.info.get_num_frames_with_most_features(seq_len) mm_data = { "video": self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -165,11 +168,8 @@ def get_dummy_processor_inputs( ) -class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaNextVideoProfilingInfo(self.ctx) +class LlavaNextVideoMultiModalProcessor( + BaseMultiModalProcessor[LlavaNextVideoProcessingInfo]): def _get_mm_fields_config( self, @@ -184,7 +184,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() video_token_id = hf_config.video_token_index def get_replacement(item_idx: int): @@ -195,7 +195,7 @@ def get_replacement(item_idx: int): num_video_tokens = videos.get_feature_size(item_idx) else: image_size = videos.get_frame_size(item_idx) - num_video_tokens = self._get_num_video_tokens( + num_video_tokens = self.info.get_num_video_tokens( image_width=image_size.width, image_height=image_size.height, num_frames=videos.get_num_frames(item_idx), @@ -269,7 +269,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextVideoMultiModalProcessor, + info=LlavaNextVideoProcessingInfo, + dummy_inputs=LlavaNextVideoDummyInputsBuilder, +) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 62dae74e377be..78a47e64d9afc 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -3,7 +3,6 @@ from typing import (Final, Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn from transformers import (BatchFeature, LlavaOnevisionConfig, @@ -18,19 +17,20 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, - VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, + VideoProcessorItems) +from vllm.multimodal.processing import PromptReplacement +from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava -from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor, - LlavaNextProcessingMixin) +from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava +from .llava_next 
import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, + LlavaNextProcessingInfo) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -90,14 +90,25 @@ class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): video_token_index: Final[int] -class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): +class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): - def _get_hf_config(self) -> LlavaOnevisionLikeConfig: + def get_hf_config(self) -> LlavaOnevisionLikeConfig: return self.ctx.get_hf_config(LlavaOnevisionConfig) - def _get_hf_processor(self): + def get_hf_processor(self): return self.ctx.get_hf_processor(LlavaOnevisionProcessor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + + # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 + # with additional logic afterwards taken from LlavaOnevisionProcessor def _get_num_unpadded_features( self, *, @@ -107,35 +118,28 @@ def _get_num_unpadded_features( num_patch_height: int, num_patch_width: int, ) -> tuple[int, int]: - # NOTE: Use float32 to remain consistent with HF output - current_height_f = np.float32(npatches * num_patch_height) - current_width_f = np.float32(npatches * num_patch_width) - - original_width_f = np.float32(original_width) - original_height_f = np.float32(original_height) + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width - original_aspect_ratio = original_width_f / original_height_f - current_aspect_ratio = current_width_f / current_height_f + aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width_f / original_width_f - new_height = int(original_height_f * scale_factor) - padding = (current_height_f - new_height) // 2 - current_height_f -= 2 * padding + if aspect_ratio > current_aspect_ratio: + new_height = (original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + current_height = current_height - (2 * padding) else: - scale_factor = current_height_f / original_height_f - new_width = int(original_width_f * scale_factor) - padding = (current_width_f - new_width) // 2 - current_width_f -= 2 * padding + new_width = (original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + current_width = current_width - (2 * padding) - unpadded_features = int(current_height_f * current_width_f) - newline_features = int(current_height_f) + unpadded_features = current_height * current_width + newline_features = current_height - ratio = math.sqrt(current_height_f * current_width_f / - (9 * npatches**2)) + ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) if ratio > 1.1: - height_factor = int(current_height_f // ratio) - width_factor = int(current_width_f // ratio) + height_factor = int(current_height // ratio) + width_factor = int(current_width // ratio) unpadded_features = height_factor * width_factor newline_features = height_factor @@ -147,16 +151,16 @@ def _get_num_frame_tokens( image_width: int, image_height: int, ) -> 
int: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) - vision_encoder_info = self._get_vision_encoder_info() + vision_encoder_info = self.get_vision_encoder_info() patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, @@ -170,43 +174,14 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames + 1 # Newline token - -class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, - BaseLlavaProfilingInfo): - - def _get_image_size_with_most_features(self) -> ImageSize: - hf_config = self._get_hf_config() - largest_feature_size, largest_feature_pinpoint = 0, None - for (height, width) in hf_config.image_grid_pinpoints: - feat_size = self._get_num_image_tokens(image_width=width, - image_height=height) - if feat_size > largest_feature_size: - largest_feature_size = feat_size - largest_feature_pinpoint = ImageSize(width=width, - height=height) - - if largest_feature_size == 0 or largest_feature_pinpoint is None: - raise ValueError("Cannot have a largest feature size of 0!") - - return largest_feature_pinpoint - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return { - "image": self._get_max_image_tokens(), - "video": self._get_max_video_tokens(seq_len), - } - def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, @@ -219,12 +194,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_images = mm_config.limit_per_prompt.get("image", 1) max_videos = mm_config.limit_per_prompt.get("video", 1) - max_image_tokens = self._get_max_image_tokens() * max_images + max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) max_frames_per_video = min(max_total_frames // max(max_videos, 1), @@ -232,15 +207,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_frames_per_video, 1) - def _get_max_video_tokens(self, seq_len: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_video_tokens( + return self.get_num_video_tokens( image_width=target_width, image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=self.get_num_frames_with_most_features(seq_len), ) + +class LlavaOnevisionDummyInputsBuilder( + LlavaDummyInputsBuilder[LlavaOnevisionProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -249,10 +228,14 @@ def get_dummy_processor_inputs( num_images = 
mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() image_token = processor.image_token video_token = processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) mm_data = { "image": @@ -263,7 +246,7 @@ def get_dummy_processor_inputs( self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -274,11 +257,8 @@ def get_dummy_processor_inputs( ) -class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin, - LlavaNextMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return LlavaOnevisionProfilingInfo(self.ctx) +class LlavaOnevisionMultiModalProcessor( + BaseLlavaNextMultiModalProcessor[LlavaOnevisionProcessingInfo]): def _get_mm_fields_config( self, @@ -309,7 +289,7 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - processor = self._get_hf_processor() + processor = self.info.get_hf_processor() video_token = processor.video_token # LLaVA-OneVision processor doesn't support multiple videos @@ -351,7 +331,7 @@ def _get_prompt_replacements( out_mm_kwargs=out_mm_kwargs, ) - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() video_token_id = hf_config.video_token_index def get_video_replacement(item_idx: int): @@ -362,7 +342,7 @@ def get_video_replacement(item_idx: int): num_video_tokens = videos.get_feature_size(item_idx) else: image_size = videos.get_frame_size(item_idx) - num_video_tokens = self._get_num_video_tokens( + num_video_tokens = self.info.get_num_video_tokens( image_width=image_size.width, image_height=image_size.height, num_frames=videos.get_num_frames(item_idx), @@ -399,7 +379,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + LlavaOnevisionMultiModalProcessor, + info=LlavaOnevisionProcessingInfo, + dummy_inputs=LlavaOnevisionDummyInputsBuilder) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index c8418c14e5fdf..a1b1af35604db 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -34,13 +34,12 @@ MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, - ImageSize) + ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement, - _BoundPromptReplacement, - _PlaceholderInfo) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, + BoundPromptReplacement, + PlaceholderInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -302,9 +301,9 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -class Phi3VProcessingMixin(ProcessingMixin): +class Phi3VProcessingInfo(BaseProcessingInfo): - def _get_hf_processor( + def 
get_hf_processor( self, *, num_crops: Optional[int] = None, @@ -314,39 +313,42 @@ def _get_hf_processor( return self.ctx.get_hf_processor() - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - processor = self._get_hf_processor() - - return processor.calc_num_image_tokens_from_image_size( # type: ignore - width=image_width, - height=image_height, - ) - - -class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() - max_image_tokens = self._get_num_image_tokens( + max_image_tokens = self.get_num_image_tokens( image_width=target_width, image_height=target_height, + processor=None, ) return {"image": max_image_tokens} - def _get_image_size_with_most_features(self) -> ImageSize: + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[ProcessorMixin], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.calc_num_image_tokens_from_image_size( # type: ignore + width=image_width, + height=image_height, + ) + + def get_image_size_with_most_features(self) -> ImageSize: # Result in the max possible feature size (h:w = 16:1) return ImageSize(height=8000, width=50) + +class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -354,7 +356,8 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() mm_data = { "image": @@ -363,7 +366,7 @@ def get_dummy_processor_inputs( num_images=num_images) } - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore return ProcessorInputs( @@ -372,10 +375,7 @@ def get_dummy_processor_inputs( ) -class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Phi3VProfilingInfo(self.ctx) +class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): def _call_hf_processor( self, @@ -416,10 +416,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = hf_processor.img_tokens # type: ignore - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) @@ -431,9 +431,10 @@ def get_replacement_phi3v(item_idx: int): num_image_tokens = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) - num_image_tokens = self._get_num_image_tokens( + num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, + processor=hf_processor, ) return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] @@ -451,9 +452,9 @@ def get_replacement_phi3v(item_idx: int): def 
_apply_prompt_replacements( self, token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, mm_prompt_repls=mm_prompt_repls, @@ -466,7 +467,7 @@ def _apply_prompt_replacements( token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ - _PlaceholderInfo( + PlaceholderInfo( modality=p.modality, item_idx=p.item_idx, start_idx=p.start_idx - 1, @@ -499,7 +500,9 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, + info=Phi3VProcessingInfo, + dummy_inputs=Phi3VDummyInputsBuilder) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 9e1d38512c0b4..37b9989e489ec 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,14 +31,13 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - resolve_visual_encoder_outputs) + consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs try: from xformers import ops as xops @@ -774,7 +773,7 @@ def get_num_image_tokens( ) -> int: return get_pixtral_hf_image_feature_size( image_size=self.vision_config.image_size, - patch_size=self.get_image_size(), + patch_size=self.vision_config.patch_size, ) def get_max_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 01745b5fd53e1..d20fb150f7e39 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -298,7 +298,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): lambda prefix: Qwen2DecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.layers"), + prefix=prefix), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 576b01776e5de..0dff9595c6c08 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -38,11 +38,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser +from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, + MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from 
vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -80,12 +80,12 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -class Qwen2AudioProcessingMixin(ProcessingMixin): +class Qwen2AudioProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization @@ -93,36 +93,37 @@ def _get_hf_processor( ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) - def _get_feature_extractor( + def get_feature_extractor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor - -class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - hf_config = self._get_hf_config() + hf_config = self.get_hf_config() max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 return {"audio": max_output_lengths} + +class Qwen2AudioDummyInputsBuilder( + BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate @@ -139,14 +140,11 @@ def get_dummy_processor_inputs( ) -class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Qwen2AudioProfilingInfo(self.ctx) +class Qwen2AudioMultiModalProcessor( + BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( @@ -161,7 +159,7 @@ def _call_hf_processor( if audios: mm_data["audios"] = audios - feature_extractor = self._get_feature_extractor(**mm_kwargs) + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -194,7 +192,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self._get_hf_config() + hf_config = self.info.get_hf_config() placeholder = hf_config.audio_token_index feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") @@ -234,10 +232,13 @@ def _always_apply_prompt_replacements(self) -> bool: # has already performed processing for multi-audio input when the input # audios are short (the corresponding placeholders may take up fewer # tokens than the number of audio items) - return not 
hasattr(self._get_hf_processor(), "audio_token") + return not hasattr(self.info.get_hf_processor(), "audio_token") -@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor( + Qwen2AudioMultiModalProcessor, + info=Qwen2AudioProcessingInfo, + dummy_inputs=Qwen2AudioDummyInputsBuilder) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -335,13 +336,16 @@ def _process_audio_input(self, selected_audio_feature = audio_outputs.last_hidden_state audio_features = self.multi_modal_projector(selected_audio_feature) num_audios, max_audio_tokens, embed_dim = audio_features.shape + audio_output_lengths = audio_output_lengths.unsqueeze(1) audio_features_mask = torch.arange(max_audio_tokens).expand( - num_audios, max_audio_tokens - ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1) + num_audios, max_audio_tokens).to( + audio_output_lengths.device) < audio_output_lengths masked_audio_features = audio_features[audio_features_mask].view( -1, embed_dim) - return masked_audio_features + # Split to tuple of embeddings for individual audio input. + return torch.split(masked_audio_features, + audio_output_lengths.flatten().tolist()) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: audio_input = self._parse_and_validate_audio_input(**kwargs) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a5c2fb9e84df3..76a810e8f0c20 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -57,18 +57,18 @@ MultiModalFieldConfig, MultiModalKwargs, NestedTensors, VideoItem) from vllm.multimodal.parse import (ImageSize, ModalityDataItems, - MultiModalDataParser) + MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, +from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix) +from .vision import get_vit_attn_backend logger = init_logger(__name__) @@ -709,12 +709,12 @@ def _parse_video_data( return super()._parse_video_data(data) -class Qwen2VLProcessingMixin(ProcessingMixin): +class Qwen2VLProcessingInfo(BaseProcessingInfo): - def _get_hf_config(self): + def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def _get_hf_processor( + def get_hf_processor( self, *, min_pixels: Optional[int] = None, @@ -736,18 +736,27 @@ def _get_hf_processor( return hf_processor - def _get_image_processor( + def get_image_processor( self, *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, ): - hf_processor = self._get_hf_processor(min_pixels=min_pixels, - max_pixels=max_pixels) + hf_processor = self.get_hf_processor(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = hf_processor.image_processor # type: ignore assert isinstance(image_processor, Qwen2VLImageProcessor) return image_processor + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + 
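# Illustrative sketch, not part of the patch: how the boolean mask plus
# torch.split in the Qwen2-Audio _process_audio_input change above turns the
# padded per-batch audio features into a tuple of variable-length embeddings,
# one entry per audio clip. Shapes below are made up for the example.
import torch

num_audios, max_audio_tokens, embed_dim = 2, 5, 4
audio_features = torch.randn(num_audios, max_audio_tokens, embed_dim)
audio_output_lengths = torch.tensor([[3], [5]])  # valid tokens per clip

mask = (torch.arange(max_audio_tokens).expand(num_audios, max_audio_tokens)
        < audio_output_lengths)
flat = audio_features[mask].view(-1, embed_dim)  # (3 + 5, 4)
per_audio = torch.split(flat, audio_output_lengths.flatten().tolist())
assert [t.shape[0] for t in per_audio] == [3, 5]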
return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "video": self.get_max_video_tokens(seq_len), + } + def _get_vision_info( self, *, @@ -755,15 +764,17 @@ def _get_vision_info( image_height: int, num_frames: int = 1, do_resize: bool = True, + image_processor: Optional[Qwen2VLImageProcessor], ) -> tuple[ImageSize, int]: - hf_config = self._get_hf_config() + if image_processor is None: + image_processor = self.get_image_processor() + + hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size - image_processor = self._get_image_processor() - if do_resize: resized_height, resized_width = smart_resize( height=image_height, @@ -787,70 +798,65 @@ def _get_vision_info( return preprocessed_size, num_vision_tokens - def _get_num_image_tokens( + def get_num_image_tokens( self, *, image_width: int, image_height: int, + image_processor: Optional[Qwen2VLImageProcessor], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, + image_processor=image_processor, ) return num_image_tokens - def _get_num_video_tokens( + def get_num_video_tokens( self, *, image_width: int, image_height: int, num_frames: int, + image_processor: Optional[Qwen2VLImageProcessor], ) -> int: _, num_video_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, num_frames=num_frames, + image_processor=image_processor, ) return num_video_tokens - -class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo): - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return { - "image": self._get_max_image_tokens(), - "video": self._get_max_video_tokens(seq_len), - } - - def _get_image_size_with_most_features(self) -> ImageSize: + def get_image_size_with_most_features(self) -> ImageSize: max_image_size, _ = self._get_vision_info( image_width=9999999, image_height=9999999, + image_processor=None, ) return max_image_size - def _get_max_image_tokens(self) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_image_tokens( + return self.get_num_image_tokens( image_width=target_width, image_height=target_height, + image_processor=None, ) def _get_max_video_frames(self, max_tokens: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 while True: next_num_frames = num_frames + 1 - next_max_tokens = self._get_num_video_tokens( + next_max_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=next_num_frames, + image_processor=None, ) if next_max_tokens > max_tokens: @@ -860,12 +866,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int: return num_frames - def _get_dummy_num_frames(self, seq_len: int) -> int: + def get_num_frames_with_most_features(self, seq_len: int) -> int: mm_config = self.ctx.get_mm_config() max_images = mm_config.limit_per_prompt.get("image", 1) max_videos = mm_config.limit_per_prompt.get("video", 1) - 
max_image_tokens = self._get_max_image_tokens() * max_images + max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) @@ -877,15 +883,19 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return num_frames - def _get_max_video_tokens(self, seq_len: int) -> int: - target_width, target_height = self._get_image_size_with_most_features() + def get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() - return self._get_num_video_tokens( + return self.get_num_video_tokens( image_width=target_width, image_height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=self.get_num_frames_with_most_features(seq_len), + image_processor=None, ) + +class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): + def get_dummy_processor_inputs( self, seq_len: int, @@ -894,10 +904,14 @@ def get_dummy_processor_inputs( num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - hf_processor = self._get_hf_processor() + hf_processor = self.info.get_hf_processor() image_token: str = hf_processor.image_token video_token: str = hf_processor.video_token - target_width, target_height = self._get_image_size_with_most_features() + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len) mm_data = { "image": @@ -908,7 +922,7 @@ def get_dummy_processor_inputs( self._get_dummy_videos( width=target_width, height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), + num_frames=target_num_frames, num_videos=num_videos, ) } @@ -919,11 +933,8 @@ def get_dummy_processor_inputs( ) -class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return Qwen2VLProfilingInfo(self.ctx) +class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] + ): def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() @@ -934,8 +945,9 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) - image_processor = self._get_image_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -991,7 +1003,9 @@ def _get_mm_fields_config( ) -@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor, + info=Qwen2VLProcessingInfo, + dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): packed_modules_mapping = { diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 7ea177e94afc0..cca42842bc06e 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -24,11 +24,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens, - resolve_visual_encoder_outputs) 
+ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .vision import VisionEncoderInfo +from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index ba823acecbb56..fada22d685dd6 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -1,6 +1,5 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" - import math from functools import cached_property from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, @@ -14,6 +13,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper.modeling_whisper import WhisperEncoder +from vllm import envs from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn @@ -24,19 +24,21 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessingMixin, - PromptReplacement) -from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings, merge_multimodal_embeddings_from_map) +_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>" +_AUDIO_PLACEHOLDER_TOKEN = 128002 _AUDIO_TOKENS_PER_SECOND = 6.25 @@ -56,47 +58,55 @@ class UltravoxAudioEmbeddingInputs(TypedDict): UltravoxAudioEmbeddingInputs] -class UltravoxProcessingMixin(ProcessingMixin): +class UltravoxProcessingInfo(BaseProcessingInfo): - def _get_hf_processor( + def get_hf_processor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> ProcessorMixin: - return self.ctx.get_hf_processor() + hf_processor = self.ctx.get_hf_processor() + + # NOTE: Ultravox processing definition uses '<|eot_id|>' as the + # placeholder that will cause confusion with the actual end of turn + # token, thus we override placeholder with a reserved special + # token. 
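# Illustrative sketch, not from the patch: the worst-case per-audio token
# budget implied by _AUDIO_TOKENS_PER_SECOND, which Ultravox's
# get_mm_max_tokens_per_item computes a bit further down. The 30 s chunk
# length is an assumption (the WhisperFeatureExtractor default); the real
# value is read from the feature extractor at runtime.
import math

_AUDIO_TOKENS_PER_SECOND = 6.25
chunk_length_s = 30  # assumed default chunk length
max_audio_tokens = math.ceil(chunk_length_s * _AUDIO_TOKENS_PER_SECOND)
assert max_audio_tokens == 188  # budget under this assumption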
+ hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE + return hf_processor - def _get_feature_extractor( + def get_feature_extractor( self, *, # Ignored in initialization sampling_rate: Optional[int] = None, ) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor - -class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) return {"audio": max_audio_tokens} + +class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo] + ): + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate @@ -113,14 +123,11 @@ def get_dummy_processor_inputs( ) -class UltravoxMultiModalProcessor(UltravoxProcessingMixin, - BaseMultiModalProcessor): - - def _get_profiling_info(self) -> BaseProfilingInfo: - return UltravoxProfilingInfo(self.ctx) +class UltravoxMultiModalProcessor( + BaseMultiModalProcessor[UltravoxProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( @@ -131,7 +138,7 @@ def _call_hf_processor( ) -> BatchFeature: # Text-only input not supported in composite processor if not mm_data: - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() prompt_ids = tokenizer.encode( prompt, @@ -150,7 +157,7 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - feature_extractor = self._get_feature_extractor() + feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -198,7 +205,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): @@ -332,7 +339,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder + ) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( @@ -465,11 +475,15 @@ def get_input_embeddings( inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - # TODO(ywang96): use merge_multimodal_embeddings after 
- # v0 is deprecated - merge_multimodal_embeddings_from_map( - inputs_embeds, multimodal_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) + # TODO(ywang96): remove this block after v0 is deprecated. + if not envs.VLLM_USE_V1: + merge_multimodal_embeddings_from_map( + inputs_embeds, multimodal_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) + else: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + _AUDIO_PLACEHOLDER_TOKEN) return inputs_embeds def forward(self, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 31017f16d3c97..43b3c973c97b8 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -8,16 +8,12 @@ from torch.func import functional_call from transformers import PretrainedConfig -import vllm.envs as envs -from vllm.attention.selector import (backend_name_to_enum, - get_global_forced_attn_backend) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors -from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available, print_warning_once +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -281,6 +277,15 @@ def flatten_bn( ... +@overload +def flatten_bn( + x: Union[List[torch.Tensor], torch.Tensor], + *, + concat: bool = False, +) -> Union[List[torch.Tensor], torch.Tensor]: + ... + + def flatten_bn( x: Union[List[torch.Tensor], torch.Tensor], *, @@ -603,37 +608,6 @@ def make_empty_intermediate_tensors( return make_empty_intermediate_tensors -def get_vit_attn_backend(support_fa: bool = False) -> _Backend: - """ - Get the available attention backend for Vision Transformer. - """ - # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. - selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN - else: - print_warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") - selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA - else: - selected_backend = _Backend.XFORMERS - return selected_backend - - def maybe_prefix(prefix: str, name: str) -> str: """Add a prefix to a name if the prefix is non-empty. 
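# Illustrative sketch of the V1 merge path used for Ultravox above, not part
# of the patch: placeholder positions are located by comparing input_ids
# against the reserved audio token id, and the flattened audio embeddings are
# written into those rows. Toy shapes only; the real
# merge_multimodal_embeddings helper handles more cases (e.g. multiple
# placeholder ids and nested embedding lists).
import torch

_AUDIO_PLACEHOLDER_TOKEN = 128002
hidden = 8
input_ids = torch.tensor([1, _AUDIO_PLACEHOLDER_TOKEN,
                          _AUDIO_PLACEHOLDER_TOKEN, 2])
inputs_embeds = torch.zeros(4, hidden)
audio_embeds = torch.randn(2, hidden)  # one row per placeholder position

mask = input_ids == _AUDIO_PLACEHOLDER_TOKEN
inputs_embeds[mask] = audio_embeds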
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 8516c9f7066f7..e6a9e153d9107 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,8 +1,15 @@ from abc import ABC, abstractmethod -from typing import Final, Generic, Protocol, TypeVar +from typing import Final, Generic, Optional, Protocol, TypeVar, Union +import torch from transformers import PretrainedConfig +import vllm.envs as envs +from vllm.attention.selector import (backend_name_to_enum, + get_global_forced_attn_backend) +from vllm.platforms import _Backend, current_platform +from vllm.utils import print_warning_once + _C = TypeVar("_C", bound=PretrainedConfig) @@ -60,3 +67,77 @@ def get_vision_encoder_info( msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) + + +def get_vit_attn_backend(support_fa: bool = False) -> _Backend: + """ + Get the available attention backend for Vision Transformer. + """ + # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. + selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + if selected_backend is None: + backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + selected_backend = backend_name_to_enum(backend_by_env_var) + if selected_backend is None: + # For Volta and Turing GPUs, use xformers instead. + device_available = current_platform.has_device_capability(80) + if device_available and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + selected_backend = _Backend.FLASH_ATTN + else: + print_warning_once( + "Current `vllm-flash-attn` has a bug inside vision module, " + "so we use xformers backend instead. You can run " + "`pip install flash-attn` to use flash-attention backend.") + selected_backend = _Backend.XFORMERS + elif current_platform.is_cpu() or current_platform.is_rocm(): + # ROCM doesn't support xformers + selected_backend = _Backend.TORCH_SDPA + else: + selected_backend = _Backend.XFORMERS + return selected_backend + + +def resolve_visual_encoder_outputs( + encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], + feature_sample_layers: Optional[list[int]], + post_layer_norm: Optional[torch.nn.LayerNorm], + max_possible_layers: int, +) -> torch.Tensor: + """Given the outputs a visual encoder module that may correspond to the + output of the last layer, or a list of hidden states to be stacked, + handle post normalization and resolve it into a single output tensor. + + Args: + encoder_outputs: Output of encoder's last layer or all hidden states. + feature_sample_layers: Optional layer indices to grab from the encoder + outputs; if provided, encoder outputs must be a list. + post_layer_norm: Post norm to apply to the output of the encoder. + max_possible_layers: Total layers in the fully loaded visual encoder. + + """ + if feature_sample_layers is None: + if post_layer_norm is not None: + return post_layer_norm(encoder_outputs) + return encoder_outputs + + # Get the hidden states corresponding to the layer indices. + # Negative values are relative to the full visual encoder, + # so offset them depending on how many layers were loaded. + # NOTE: this assumes that encoder_outputs contains a list + # of hidden states in the same order as the encoder layers + # that produced them. 
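# For example (illustrative numbers, not from the patch): if the fully loaded
# encoder would yield max_possible_layers = 5 outputs but only 4 were
# computed, then offset = 5 - 4 = 1, and a requested layer index of -2 (the
# second-to-last output of the *full* encoder) resolves to
# encoder_outputs[-2 + 1] = encoder_outputs[-1], i.e. the last computed
# output, exactly as it would have in the fully loaded encoder.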
+ offset = max_possible_layers - len(encoder_outputs) + hs_pool = [ + encoder_outputs[layer_idx] + if layer_idx >= 0 else encoder_outputs[layer_idx + offset] + for layer_idx in feature_sample_layers + ] + + # Apply post-norm on the final hidden state if we are using it + uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) + if post_layer_norm is not None and uses_last_layer: + hs_pool[-1] = post_layer_norm(encoder_outputs) + return torch.cat(hs_pool, dim=-1) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7f4029e726332..4941fbac963ca 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -49,9 +49,6 @@ class MultiModalPlugin(ABC): process the same data differently). This registry is in turn used by :class:`~MultiModalRegistry` which acts at a higher level (i.e., the modality of the data). - - See also: - :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8fdcc4b524035..8680e4175593b 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -100,11 +100,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): """ A dictionary containing an entry for each modality type to input. -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin - is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. +The built-in modalities are defined by :class:`MultiModalDataBuiltins`. """ @@ -491,7 +487,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ -A dictionary containing placeholder ranges. +A dictionary containing placeholder ranges for each modality. """ diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6be046ba77ca7..ccff0e857eec4 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -13,14 +13,16 @@ from .audio import resample_audio from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, - ImageItem, ModalityData, MultiModalDataDict, - NestedTensors, VideoItem) + ImageItem, ModalityData, MultiModalDataDict, VideoItem) _T = TypeVar("_T") _I = TypeVar("_I") class ModalityDataItems(ABC, Generic[_T, _I]): + """ + Represents data items for a modality in :class:`MultiModalDataItems`. + """ def __init__(self, data: _T, modality: str) -> None: super().__init__() @@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]: class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + """Base class for data items that are arranged in a list.""" def get_count(self) -> int: return len(self.data) @@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]: return {} -class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): +class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]], + torch.Tensor]): + """ + Base class for data items that are expressed as a batched embedding tensor, + or a list of embedding tensors (one per item). 
+ """ def get_count(self) -> int: return len(self.data) @@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None: class AudioEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "audio") @@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize: class ImageEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "image") @@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize: class VideoEmbeddingItems(EmbeddingItems): - def __init__(self, data: NestedTensors) -> None: + def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: super().__init__(data, "video") @@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None: class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. + As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized + such that each entry corresponds to a list. """ def get_count(self, modality: str, *, strict: bool = True) -> int: @@ -226,7 +234,8 @@ def get_items( class MultiModalDataParser: """ - Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into + :class:`MultiModalDataItems`. Args: target_sr (float, optional): Enables automatic resampling of audio @@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None: self.target_sr = target_sr - def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: + def _is_embeddings( + self, data: object + ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]: if isinstance(data, torch.Tensor): return data.ndim == 3 if is_list_of(data, torch.Tensor): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 41113cd85bd16..07d883d5d7295 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -4,12 +4,13 @@ from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache -from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union +from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, + TypeVar, Union) from transformers import BatchFeature, PretrainedConfig, ProcessorMixin -from vllm import envs -from vllm.inputs import DummyData, InputProcessingContext +import vllm.envs as envs +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) @@ -20,7 +21,9 @@ MultiModalInputsV2, MultiModalKwargs, MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser -from .profiling import BaseProfilingInfo + +if TYPE_CHECKING: + from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -30,24 +33,28 @@ @dataclass class PromptReplacement: + """ + Defines how to replace portions of an input prompt with placeholder tokens. 
+ """ + modality: str """The modality for which the replacement is made.""" target: _PromptSeq - """The text or token sequence to find and replace.""" + """The token sequence (or text) to find and replace.""" replacement: Union[Callable[[int], _PromptSeq], _PromptSeq] = field(repr=False) """ - Given the index of the processed item within :attr:`modality`, output the - replacement text or token sequence. + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). - For convenience, you can pass in the replacement instead of a function - if it does not depend on the input. + For convenience, you can directly pass in the replacement token sequence + (or text) instead of a function if it does not depend on the input. """ - def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": - return _BoundPromptReplacement( + def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement": + return BoundPromptReplacement( tokenizer=tokenizer, modality=self.modality, _target=self.target, @@ -128,7 +135,12 @@ def token_ids(self) -> list[int]: @dataclass -class _BoundPromptReplacement: +class BoundPromptReplacement: + """ + A :class:`PromptReplacement` bound to a tokenizer to automatically + convert :attr:`target` and the result of :meth:`get_replacement` between + token sequence and text representations. + """ tokenizer: AnyTokenizer = field(repr=False) modality: str @@ -141,6 +153,7 @@ def __post_init__(self) -> None: @property def target(self) -> _BoundPromptSequence: + """The token sequence (or text) to find and replace.""" target = self._target return _BoundPromptSequence( @@ -150,6 +163,10 @@ def target(self) -> _BoundPromptSequence: ) def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + """ + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). 
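# Illustrative sketch (hypothetical token id and per-item sizes, not from the
# patch): a PromptReplacement whose replacement is computed per item,
# expanding one placeholder target into that image's number of feature
# tokens, in the same spirit as the model-specific replacements in this patch.
from vllm.multimodal.processing import PromptReplacement

IMAGE_TOKEN_ID = 32000          # hypothetical placeholder id
num_image_tokens = [576, 144]   # hypothetical per-item feature sizes

def get_replacement(item_idx: int) -> list[int]:
    # assumes at most len(num_image_tokens) images in the prompt
    return [IMAGE_TOKEN_ID] * num_image_tokens[item_idx]

repl = PromptReplacement(
    modality="image",
    target=[IMAGE_TOKEN_ID],
    replacement=get_replacement,
)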
+ """ replacement = self._replacement if callable(replacement): cache_key = item_idx @@ -207,7 +224,7 @@ def iter_token_matches( @dataclass(repr=False) class _PromptReplacementMatch(ABC): - prompt_repl: _BoundPromptReplacement + prompt_repl: BoundPromptReplacement @property def modality(self) -> str: @@ -255,7 +272,7 @@ def end_idx(self) -> int: @dataclass -class _PlaceholderInfo: +class PlaceholderInfo: modality: str item_idx: int start_idx: int @@ -274,7 +291,7 @@ def to_range(self) -> PlaceholderRange: def find_token_matches( prompt: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTokenMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -286,7 +303,7 @@ def find_token_matches( def find_text_matches( prompt: str, - prompt_repls: Sequence[_BoundPromptReplacement], + prompt_repls: Sequence[BoundPromptReplacement], ) -> list[_PromptReplacementTextMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ @@ -390,9 +407,9 @@ def replace_text_matches( def _iter_modality_placeholders( prompt: list[int], modality: str, - modality_repls: Sequence[_BoundPromptReplacement], + modality_repls: Sequence[BoundPromptReplacement], modal_item_count: int, -) -> Iterable[_PlaceholderInfo]: +) -> Iterable[PlaceholderInfo]: if modal_item_count == 0: return @@ -413,7 +430,7 @@ def _iter_modality_placeholders( continue if prompt[start_idx:end_idx] == repl_tokens: - yield _PlaceholderInfo( + yield PlaceholderInfo( modality=modality, item_idx=item_idx, start_idx=start_idx, @@ -434,10 +451,10 @@ def _iter_modality_placeholders( def _iter_placeholders( - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[_PlaceholderInfo]: +) -> Iterable[PlaceholderInfo]: """ For each modality, yield each set of placeholder tokens found in :code:`prompt`. @@ -455,10 +472,10 @@ def _iter_placeholders( def find_mm_placeholders( - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Mapping[str, list[_PlaceholderInfo]]: +) -> Mapping[str, list[PlaceholderInfo]]: it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) return dict(full_groupby_modality(it)) @@ -524,29 +541,59 @@ def put( self._cache.put(cache_key, output_kwargs) -class ProcessingMixin: - """ - Contains helper functions to perform processing. +class BaseProcessingInfo: + """Base class to provide the information necessary for data processing.""" - Not to be confused with :class:`transformers.ProcessorMixin`. - """ - ctx: InputProcessingContext + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__() - def _get_tokenizer(self) -> AnyTokenizer: + self.ctx = ctx + + @property + def model_id(self) -> str: + return self.ctx.model_config.model + + def get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def _get_hf_config(self) -> PretrainedConfig: + def get_hf_config(self) -> PretrainedConfig: return self.ctx.get_hf_config() - def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: """ Subclasses can override this method to handle specific kwargs from model config or user inputs. 
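# Illustrative sketch (hypothetical model, fixed numbers assumed): the minimal
# information a model now provides by subclassing BaseProcessingInfo; the two
# methods below are the abstract ones declared just after this point and are
# what the profiler relies on.
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo

class MyProcessingInfo(BaseProcessingInfo):

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}   # unlimited images per prompt

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        return {"image": 576}    # assumed fixed per-image feature size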
""" return self.ctx.get_hf_processor(**kwargs) + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError -class BaseMultiModalProcessor(ProcessingMixin, ABC): + @abstractmethod + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + + +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseMultiModalProcessor(ABC, Generic[_I]): """ Abstract base class to process multi-modal inputs to be used in vLLM. @@ -554,18 +601,19 @@ class BaseMultiModalProcessor(ProcessingMixin, ABC): """ def __init__(self, - ctx: InputProcessingContext, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True) -> None: super().__init__() - self.ctx = ctx + self.info = info + self.dummy_inputs = dummy_inputs self.cache = cache self.enable_sanity_checks = enable_sanity_checks self.data_parser = self._get_data_parser() - self.profiling_info = self._get_profiling_info() def __call__( self, @@ -585,13 +633,6 @@ def _get_data_parser(self) -> MultiModalDataParser: """ return MultiModalDataParser() - def _get_profiling_info(self) -> BaseProfilingInfo: - """ - Get the profiling information to find the worst-case memory usage of - the model. - """ - raise NotImplementedError - def _to_mm_items( self, mm_data: MultiModalDataDict, @@ -602,7 +643,7 @@ def _to_mm_items( """ mm_items = self.data_parser.parse_mm_data(mm_data) - mm_limits = self.ctx.get_mm_config().limit_per_prompt + mm_limits = self.info.ctx.get_mm_config().limit_per_prompt for modality, items in mm_items.items(): limit = mm_limits.get(modality, 1) if len(items) > limit: @@ -646,19 +687,19 @@ def _get_prompt_replacements( def _find_mm_placeholders( self, - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> Mapping[str, list[_PlaceholderInfo]]: + ) -> Mapping[str, list[PlaceholderInfo]]: return find_mm_placeholders(mm_prompt_repls, new_token_ids, mm_item_counts) def _get_hf_mm_data( self, mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() + ) -> tuple[Mapping[str, object], Mapping[str, object]]: + processor_data = dict[str, object]() + passthrough_data = dict[str, object]() for items in mm_items.values(): processor_data.update(items.get_processor_data()) @@ -678,8 +719,8 @@ def _call_hf_processor( Call the HF processor on the prompt text and associated multi-modal data. """ - return self.ctx.call_hf_processor( - self._get_hf_processor(**mm_kwargs), + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), mm_kwargs, ) @@ -738,8 +779,8 @@ def _apply_hf_processor_missing( # Some HF processors (e.g. 
Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self.profiling_info.get_dummy_processor_inputs( - self.ctx.model_config.max_model_len, + dummy_inputs = self.dummy_inputs.get_dummy_processor_inputs( + self.info.ctx.model_config.max_model_len, mm_missing_counts, ) @@ -762,7 +803,7 @@ def _cached_apply_hf_processor( caching the results and reusing cached results. """ cache = self.cache - model_id = self.ctx.model_config.model + model_id = self.info.model_id _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: @@ -838,8 +879,8 @@ def _cached_apply_hf_processor( def _bind_and_group_repls( self, prompt_repls: list[PromptReplacement], - ) -> dict[str, list[_BoundPromptReplacement]]: - tokenizer = self._get_tokenizer() + ) -> dict[str, list[BoundPromptReplacement]]: + tokenizer = self.info.get_tokenizer() it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) return dict(full_groupby_modality(it)) @@ -859,10 +900,10 @@ def _always_apply_prompt_replacements(self) -> bool: def _apply_prompt_replacements( self, token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: - tokenizer = self._get_tokenizer() + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + tokenizer = self.info.get_tokenizer() mm_token_matches = { modality: find_token_matches(token_ids, prompt_repls) @@ -950,7 +991,7 @@ def _validate_mm_kwargs( def _validate_mm_placeholders( self, - mm_placeholders: Mapping[str, list[_PlaceholderInfo]], + mm_placeholders: Mapping[str, list[PlaceholderInfo]], mm_item_counts: Mapping[str, int], *, allow_missing: bool = False, @@ -1001,7 +1042,7 @@ def apply( # instead of rehashing. 
if envs.VLLM_USE_V1: - model_id = self.ctx.model_config.model + model_id = self.info.model_id mm_hashes = { modality: [ MultiModalHasher.hash_kwargs(model_id=model_id, @@ -1046,7 +1087,7 @@ def apply( allow_missing=True, ) - mm_missing_repls = dict[str, list[_BoundPromptReplacement]]() + mm_missing_repls = dict[str, list[BoundPromptReplacement]]() for modality, missing_repl_count in mm_missing_repl_counts.items(): if missing_repl_count == 0: mm_missing_repls[modality] = [] @@ -1059,7 +1100,7 @@ def apply( # If HF processor already inserts placeholder tokens, # there is no need for us to insert them if all(len(repls) == 0 for repls in mm_missing_repls.items()): - tokenizer = self._get_tokenizer() + tokenizer = self.info.get_tokenizer() prompt_text = decode_tokens(tokenizer, prompt_ids) mm_placeholders = hf_mm_placeholders else: @@ -1090,79 +1131,3 @@ def apply( mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) - - def _get_dummy_mm_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: - profiling = self.profiling_info - processor_inputs = profiling.get_dummy_processor_inputs( - seq_len, mm_counts) - - return self.apply( - prompt_text=processor_inputs.prompt_text, - mm_data=processor_inputs.mm_data, - hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, - ) - - def get_dummy_data(self, seq_len: int) -> DummyData: - # Avoid circular import - from vllm.sequence import SequenceData - - profiling = self.profiling_info - mm_counts = profiling.get_mm_limits() - mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len) - if mm_counts.keys() != mm_max_tokens_per_item.keys(): - raise AssertionError( - "The keys returned by `get_supported_mm_limits`" - f"({set(mm_counts.keys())}) should be the same as those " - "returned by `get_mm_max_tokens_per_item` " - f"({set(mm_max_tokens_per_item.keys())})") - - mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - prompt_token_ids = mm_inputs["prompt_token_ids"] - placeholders_by_modality = mm_inputs["mm_placeholders"] - - total_placeholders_by_modality = { - modality: sum(item["length"] for item in placeholders) - for modality, placeholders in placeholders_by_modality.items() - } - expected_placeholders_by_modality = { - modality: mm_max_tokens_per_item[modality] * mm_counts[modality] - for modality in placeholders_by_modality - } - if total_placeholders_by_modality != expected_placeholders_by_modality: - raise AssertionError( - f"The processed dummy data has a total of " - f"{total_placeholders_by_modality} placeholder tokens, which " - f"is not the expected {expected_placeholders_by_modality} " - "tokens.") - - total_len = len(prompt_token_ids) - - # V0 does not support chunked prefill. - if total_len > seq_len and not envs.VLLM_USE_V1: - logger.warning( - "The context length (%d) of the model is too short " - "to hold the multi-modal embeddings in the worst case " - "(%d tokens in total, out of which %s are reserved for " - "multi-modal embeddings). This may cause certain multi-modal " - "inputs to fail during inference, even when the input text is " - "short. 
To avoid this, you should increase `max_model_len`, " - "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, - total_len, total_placeholders_by_modality) - - return DummyData( - seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), - multi_modal_data=None, - multi_modal_placeholders=None, - ) - - prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) - - return DummyData( - seq_data=SequenceData.from_seqs(prompt_token_ids), - multi_modal_data=mm_inputs["mm_kwargs"], - multi_modal_placeholders=placeholders_by_modality, - ) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 2ecf0db1a485d..6f7da1509990f 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,61 +1,46 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Optional +from typing import Generic, TypeVar import numpy as np import numpy.typing as npt from PIL import Image -from vllm.inputs import InputProcessingContext +import vllm.envs as envs +from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict +from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @dataclass class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" + """ + Represents the keyword arguments to + :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + """ prompt_text: str mm_data: MultiModalDataDict hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) -class BaseProfilingInfo(ABC): +_I = TypeVar("_I", bound=BaseProcessingInfo) + + +class BaseDummyInputsBuilder(ABC, Generic[_I]): """ - Abstract base class that provides the information necessary to profile + Abstract base class that constructs the dummy data to profile multi-modal models. """ - def __init__(self, ctx: InputProcessingContext) -> None: + def __init__(self, info: _I) -> None: super().__init__() - self.ctx = ctx - - @abstractmethod - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - """ - Return the maximum supported number of items for each modality. - - A value of `None` means unlimited number of items. - - Omitting a modality from the returned dictionary means that - it is not supported at all. - """ - raise NotImplementedError - - @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - """ - Get the maximum possible number of tokens per data item - for each modality. - - The dictionary returned by this method should have the same - keys as that returned by :meth:`get_supported_mm_limits`. - """ - raise NotImplementedError + self.info = info @abstractmethod def get_dummy_processor_inputs( @@ -64,8 +49,8 @@ def get_dummy_processor_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: """ - Build the multi-modal portion of the input which, after processing, - results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`. + Build the input which, after processing, results in + :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens. 
""" raise NotImplementedError @@ -99,11 +84,33 @@ def _get_dummy_videos( video = np.zeros((num_frames, width, height, 3)) return [video] * num_videos - def get_mm_limits(self) -> Mapping[str, int]: - mm_config = self.ctx.get_mm_config() + +class MultiModalProfiler(Generic[_I]): + """ + Contains code for running memory profiling for multi-modal models. + """ + + def __init__( + self, + processor: BaseMultiModalProcessor[_I], + ) -> None: + super().__init__() + + self.processor = processor + + @property + def processing_info(self) -> BaseProcessingInfo: + return self.processor.info + + @property + def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: + return self.processor.dummy_inputs + + def _get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt - supported_mm_limits = self.get_supported_mm_limits() + supported_mm_limits = self.processing_info.get_supported_mm_limits() mm_limits = { modality: mm_limit_per_prompt.get(modality, 1) @@ -119,3 +126,81 @@ def get_mm_limits(self) -> Mapping[str, int]: f"at most {supported_limit} {modality} items.") return mm_limits + + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputsV2: + factory = self.dummy_inputs + processor_inputs = factory.get_dummy_processor_inputs( + seq_len, mm_counts) + + return self.processor.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + + def get_dummy_data(self, seq_len: int) -> DummyData: + # Avoid circular import + from vllm.sequence import SequenceData + + mm_counts = self._get_mm_limits() + + info = self.processing_info + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) + + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") + + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) + prompt_token_ids = mm_inputs["prompt_token_ids"] + placeholders_by_modality = mm_inputs["mm_placeholders"] + + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") + + total_len = len(prompt_token_ids) + + # V0 does not support chunked prefill. + if total_len > seq_len and not envs.VLLM_USE_V1: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain multi-modal " + "inputs to fail during inference, even when the input text is " + "short. 
To avoid this, you should increase `max_model_len`, " + "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, + total_len, total_placeholders_by_modality) + + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) + + return DummyData( + seq_data=SequenceData.from_seqs(prompt_token_ids), + multi_modal_data=mm_inputs["mm_kwargs"], + multi_modal_placeholders=placeholders_by_modality, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index f75a594a4c4e0..9eceefb08c93f 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,7 +1,8 @@ import functools from collections import UserDict -from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, - Sequence, Type, TypeVar) +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional, + Protocol, Sequence, Type, TypeVar) import torch.nn as nn @@ -14,7 +15,9 @@ from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import BaseMultiModalProcessor, ProcessingCache +from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, + ProcessingCache) +from .profiling import BaseDummyInputsBuilder from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -27,20 +30,59 @@ MM_CACHE_SIZE = 256 N = TypeVar("N", bound=Type[nn.Module]) +_I = TypeVar("_I", bound=BaseProcessingInfo) +_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True) -class MultiModalProcessorFactory(Protocol): +class ProcessingInfoFactory(Protocol[_I_co]): """Constructs a :class:`MultiModalProcessor` instance from the context.""" def __call__( self, ctx: InputProcessingContext, + ) -> _I_co: + ... + + +class DummyInputsBuilderFactory(Protocol[_I]): + """ + Constructs a :class:`BaseDummyInputsBuilder` instance from the context. + """ + + def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: + ... + + +class MultiModalProcessorFactory(Protocol[_I]): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], *, cache: Optional[ProcessingCache] = None, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[_I]: ... 
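
The hunks above split what used to be a single profiling class into a `BaseProcessingInfo` (model-specific limits, tokenizer, context) and a `BaseDummyInputsBuilder` that only constructs profiling inputs, with `MultiModalProfiler` driving `get_dummy_data`. A minimal sketch of a concrete builder under the new interfaces follows; it is not part of this diff, and `MyProcessingInfo`, `MyDummyInputsBuilder`, the 336x336 image size, the 576-token budget, and the `"<image>"` placeholder string are all invented for illustration.

# Illustrative sketch, not part of the diff: a hypothetical implementation of
# the split interfaces introduced above. Concrete names and numbers here are
# assumptions, not taken from any real model integration.
from collections.abc import Mapping
from typing import Optional

from PIL import Image

from vllm.multimodal.processing import BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs


class MyProcessingInfo(BaseProcessingInfo):

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        # None would mean "unlimited"; omitting a modality marks it unsupported.
        return {"image": 1}

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        # Worst-case number of placeholder tokens per image (assumed fixed).
        return {"image": 576}


class MyDummyInputsBuilder(BaseDummyInputsBuilder[MyProcessingInfo]):

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        num_images = mm_counts.get("image", 0)
        # Model-specific knowledge is reached through self.info / self.info.ctx.
        dummy_images = [
            Image.new("RGB", (336, 336), color=0) for _ in range(num_images)
        ]
        return ProcessorInputs(
            prompt_text="<image>" * num_images,
            mm_data={"image": dummy_images},
        )

A `MultiModalProfiler` wrapping the processor built from these pieces would then call `get_dummy_data(seq_len)` exactly as in the code moved into `vllm/multimodal/profiling.py` above.
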
+@dataclass(frozen=True) +class _ProcessorFactories(Generic[_I]): + info: ProcessingInfoFactory[_I] + processor: MultiModalProcessorFactory[_I] + dummy_inputs: DummyInputsBuilderFactory[_I] + + def build_processor( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ): + info = self.info(ctx) + dummy_inputs_builder = self.dummy_inputs(info) + return self.processor(info, dummy_inputs_builder, cache=cache) + + class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message @@ -71,7 +113,7 @@ def __init__( self._plugins = {p.get_data_key(): p for p in plugins} self._processor_factories = ClassRegistry[nn.Module, - MultiModalProcessorFactory]() + _ProcessorFactories]() # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -83,9 +125,6 @@ def __init__( def register_plugin(self, plugin: MultiModalPlugin) -> None: """ Register a multi-modal plugin so it can be recognized by vLLM. - - See also: - :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() @@ -224,7 +263,7 @@ def get_max_tokens_per_item_by_modality( tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.profiling_info.get_mm_max_tokens_per_item(seq_len) + return processor.info.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) @@ -315,7 +354,10 @@ def get_mm_limits_per_prompt( def register_processor( self, - factory: MultiModalProcessorFactory, + processor: MultiModalProcessorFactory[_I], + *, + info: ProcessingInfoFactory[_I], + dummy_inputs: DummyInputsBuilderFactory[_I], ): """ Register a multi-modal processor to a model class. The processor @@ -336,7 +378,11 @@ def wrapper(model_cls: N) -> N: "registered to %s. It is overwritten by the new one.", model_cls, self) - self._processor_factories[model_cls] = factory + self._processor_factories[model_cls] = _ProcessorFactories( + info=info, + dummy_inputs=dummy_inputs, + processor=processor, + ) return model_cls @@ -359,15 +405,15 @@ def create_processor( self, model_config: "ModelConfig", tokenizer: AnyTokenizer, - ) -> BaseMultiModalProcessor: + ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. 
""" model_cls = self._get_model_cls(model_config) - processor_factory = self._processor_factories[model_cls] + factories = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) cache = (None if model_config.disable_mm_preprocessor_cache else self._processing_cache) - return processor_factory(ctx, cache=cache) + return factories.build_processor(ctx, cache=cache) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f4a514ba55d0c..1c6bbf77b926f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -5,7 +5,6 @@ import numpy as np import numpy.typing as npt -import torch from PIL import Image import vllm.envs as envs @@ -285,49 +284,6 @@ def encode_video_base64(frames: npt.NDArray) -> str: return video_io.encode_base64(frames) -def resolve_visual_encoder_outputs( - encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], - feature_sample_layers: Optional[list[int]], - post_layer_norm: Optional[torch.nn.LayerNorm], - max_possible_layers: int, -) -> torch.Tensor: - """Given the outputs a visual encoder module that may correspond to the - output of the last layer, or a list of hidden states to be stacked, - handle post normalization and resolve it into a single output tensor. - - Args: - encoder_outputs: Output of encoder's last layer or all hidden states. - feature_sample_layers: Optional layer indices to grab from the encoder - outputs; if provided, encoder outputs must be a list. - post_layer_norm: Post norm to apply to the output of the encoder. - max_possible_layers: Total layers in the fully loaded visual encoder. - - """ - if feature_sample_layers is None: - if post_layer_norm is not None: - return post_layer_norm(encoder_outputs) - return encoder_outputs - - # Get the hidden states corresponding to the layer indices. - # Negative values are relative to the full visual encoder, - # so offset them depending on how many layers were loaded. - # NOTE: this assumes that encoder_outputs contains a list - # of hidden states in the same order as the encoder layers - # that produced them. 
- offset = max_possible_layers - len(encoder_outputs) - hs_pool = [ - encoder_outputs[layer_idx] - if layer_idx >= 0 else encoder_outputs[layer_idx + offset] - for layer_idx in feature_sample_layers - ] - - # Apply post-norm on the final hidden state if we are using it - uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) - if post_layer_norm is not None and uses_last_layer: - hs_pool[-1] = post_layer_norm(encoder_outputs) - return torch.cat(hs_pool, dim=-1) - - # Utilities for input processors _T = TypeVar("_T", str, int) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index c50eb2cef4cd5..e5fa4f0e4a2f6 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -63,8 +63,8 @@ def load_general_plugins(): from vllm.platforms import current_platform if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa - os.environ['TORCH_COMPILE_DISABLE'] = 'True' + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True if current_platform.is_hpu(): # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) # does not support torch.compile @@ -72,7 +72,6 @@ def load_general_plugins(): # torch.compile support is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' if is_lazy: - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 torch._dynamo.config.disable = True # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) # requires enabling lazy collectives diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 2635c0bccd1c4..b24b7e91a7ae7 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,7 +7,7 @@ class PoolingParams( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """Pooling parameters for embeddings API. + """API parameters for pooling models. This is currently a placeholder. Attributes: additional_data: Any additional data needed for pooling. diff --git a/vllm/sequence.py b/vllm/sequence.py index 0157abbd2eed5..5857f656dfc10 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1108,6 +1108,13 @@ class IntermediateTensors: tensors: Dict[str, torch.Tensor] + def __init__(self, tensors): + # manually define this function, so that + # Dynamo knows `IntermediateTensors()` comes from this file. + # Otherwise, dataclass will generate this function by evaluating + # a string, and we will lose the information about the source file. + self.tensors = tensors + def __getitem__(self, key: Union[str, slice]): if isinstance(key, str): return self.tensors[key] diff --git a/vllm/utils.py b/vllm/utils.py index 63057153f851d..c09cae70e9af8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -524,6 +524,13 @@ def get_open_port() -> int: def find_process_using_port(port: int) -> Optional[psutil.Process]: + # TODO: We can not check for running processes with network + # port on macOS. Therefore, we can not have a full graceful shutdown + # of vLLM. For now, let's not look for processes in this case. 
+ # Ref: https://www.florianreinhard.de/accessdenied-in-psutil/ + if sys.platform.startswith("darwin"): + return None + for conn in psutil.net_connections(): if conn.laddr.port == port: try: @@ -1742,10 +1749,10 @@ class MemorySnapshot: timestamp: float = 0.0 def measure(self): - self.torch_peak_in_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.peak"] - self.torch_memory_in_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] + self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # torch.cuda.memory_reserved() is how many bytes + # PyTorch gets from cuda (by calling cudaMalloc, etc.) + self.torch_memory_in_bytes = torch.cuda.memory_reserved() self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": @@ -1822,10 +1829,10 @@ def memory_profiling( The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. - The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`. + subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`. """ # noqa torch.cuda.reset_peak_memory_stats() diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index baaf3329dc79f..b26716f5c02e6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -5,8 +5,6 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.base import PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager
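
For the registry changes above (`_ProcessorFactories` and the new keyword-only `info=`/`dummy_inputs=` parameters on `register_processor`), a model integration would now register all three factories together. The sketch below is not part of this diff: it reuses the hypothetical `MyProcessingInfo`/`MyDummyInputsBuilder` from the earlier example and invents `MyMultiModalProcessor` and the model class name; only the decorator shape comes from the patch.

# Illustrative sketch, not part of the diff: registering the three factories.
# MyMultiModalProcessor and the model class are hypothetical; MULTIMODAL_REGISTRY
# is the global registry instance exposed by vllm.multimodal.
import torch.nn as nn

from vllm.multimodal import MULTIMODAL_REGISTRY


@MULTIMODAL_REGISTRY.register_processor(
    MyMultiModalProcessor,              # MultiModalProcessorFactory[_I]
    info=MyProcessingInfo,              # ProcessingInfoFactory[_I]
    dummy_inputs=MyDummyInputsBuilder,  # DummyInputsBuilderFactory[_I]
)
class MyModelForConditionalGeneration(nn.Module):
    ...

The registry's `create_processor` then resolves these lazily via `_ProcessorFactories.build_processor`: the info is built from the `InputProcessingContext`, the dummy-inputs builder from the info, and the processor from both, together with the optional `ProcessingCache`.
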
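
The `vllm/utils.py` hunks above also switch the snapshot to `torch.cuda.memory_reserved()`/`torch.cuda.max_memory_reserved()` and reword the `memory_profiling` docstring accordingly. Spelled out as plain arithmetic, the subtraction the docstring describes for the quantity it labels (c.) looks roughly like the sketch below; the function name and parameters are invented, and the real bookkeeping lives in `MemorySnapshot` and `memory_profiling`.

# Illustrative sketch, not part of the diff: the arithmetic described in the
# updated memory_profiling docstring. Name and signature are hypothetical.
import torch


def estimate_non_torch_memory_in_bytes(
    baseline_total_used_in_bytes: int,      # GPU memory in use at the baseline snapshot
    baseline_torch_reserved_in_bytes: int,  # torch.cuda.memory_reserved() at the baseline
    weights_memory_in_bytes: int,           # (a.) reported by the weight loader
) -> int:
    free, total = torch.cuda.mem_get_info()
    total_used = total - free
    torch_reserved_diff = (torch.cuda.memory_reserved()
                           - baseline_torch_reserved_in_bytes)
    # (c.): what remains of the total GPU usage after removing the baseline,
    # the model weights, and the growth of the PyTorch caching allocator.
    return (total_used - baseline_total_used_in_bytes
            - weights_memory_in_bytes
            - torch_reserved_diff)
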