diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 400bb9936e02d..0532550359d6d 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
         help='the fraction of GPU memory to be used for '
         'the model executor, which can range from 0 to 1.'
         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index eb59e38fa2c9d..302746e316514 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -79,6 +79,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,
@@ -104,6 +105,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.
@@ -229,8 +231,9 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.worker_use_ray, args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.worker_use_ray,
+            args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -384,6 +387,14 @@ def main(args: argparse.Namespace):
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
diff --git a/vllm/config.py b/vllm/config.py
index 409b0e1a44f7a..d606ef79d0bc5 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -599,7 +599,7 @@ def __init__(
         if self.distributed_executor_backend is None and self.world_size > 1:
             if is_hip():
                 logger.info("Using torchrun for multi-GPU on "
-                            "ROCM platform. Use --worker-use-ray or "
+                            "ROCm platform. Use --worker-use-ray or "
                             "--distributed-executor-backend={ray, mp} to "
                             "override")
                 if not os.environ.get("RANK"):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e461feb5e05a7..059407c05b220 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -233,7 +233,7 @@ def add_cli_args(
             help='Backend to use for distributed serving. When more than 1 GPU '
             'is used, on CUDA this will be automatically set to "ray" if '
             'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
-            'instead automatically set to torchrun.')
+            'instead set to torchrun by default.')
         parser.add_argument(
             '--worker-use-ray',
             action='store_true',
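
For reference, a minimal sketch of the default-selection rule spelled out in the new help text. This is illustrative only, not code from the patch; the helper name, signature, and arguments below are hypothetical.

```python
from typing import Optional


def pick_default_backend(world_size: int, is_rocm: bool,
                         ray_installed: bool) -> Optional[str]:
    """Mirror the documented defaults when no backend is given explicitly."""
    if world_size <= 1:
        return None  # single GPU: no distributed executor backend is needed
    if is_rocm:
        return "torchrun"  # ROCm defaults to torchrun
    return "ray" if ray_installed else "mp"  # CUDA: prefer Ray if installed
```

With this patch, the benchmark scripts expose the same override as the engine CLI; for example, `python benchmarks/benchmark_latency.py --model <model> --tensor-parallel-size 2 --distributed-executor-backend mp` should force the multiprocessing backend even when Ray is installed.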