Merge pull request opendatahub-io#15 from ROCm/torchrun_cache_init_fix
Correctly calculate the same required number of cache blocks across all torchrun processes
gshtras authored May 15, 2024
2 parents bc20e48 + 2e6b63f commit 4b39609
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion vllm/executor/torchrun_gpu_executor.py
@@ -1,14 +1,16 @@
 import os
 from typing import Dict, List, Optional
 
+import torch
+
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
 from vllm.model_executor.parallel_utils.communication_op import (
-    broadcast_object_list)
+    broadcast_object_list, tensor_model_parallel_all_gather)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
@@ -63,6 +65,17 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
+    def determine_num_available_blocks(self) -> tuple[int, int]:
+        num_gpu_blocks, num_cpu_blocks = (
+            self.driver_worker.determine_num_available_blocks())
+        t = torch.tensor(
+            [[num_gpu_blocks], [num_cpu_blocks]],
+            device="cuda",
+            dtype=torch.int32,
+        )
+        output = tensor_model_parallel_all_gather(t)
+        return (torch.min(output[0]).item(), torch.min(output[1]).item())
+
     def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
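For context, the new determine_num_available_blocks() override makes every torchrun rank agree on the cache-block counts: each rank's locally profiled estimate is all-gathered, and the minimum across ranks is used, so no rank allocates more blocks than the most memory-constrained rank can hold and all ranks build identically sized KV caches. Below is a minimal standalone sketch of the same gather-and-min pattern using plain torch.distributed instead of vLLM's tensor_model_parallel_all_gather; the helper name agree_on_block_counts is made up for illustration, and an already-initialized process group (e.g. started by torchrun with the gloo backend; for nccl the tensors would need to live on the GPU) is assumed.

# Sketch only, not part of the commit: reach consensus on cache-block counts
# across ranks by all-gathering each rank's local estimate and taking the
# minimum, mirroring what determine_num_available_blocks() above does with
# tensor_model_parallel_all_gather(). Assumes torch.distributed is already
# initialized (e.g. by torchrun) with a CPU-friendly backend such as gloo.
import torch
import torch.distributed as dist

def agree_on_block_counts(num_gpu_blocks: int, num_cpu_blocks: int) -> tuple[int, int]:
    local = torch.tensor([num_gpu_blocks, num_cpu_blocks], dtype=torch.int32)
    gathered = [torch.zeros_like(local) for _ in range(dist.get_world_size())]
    dist.all_gather(gathered, local)  # every rank ends up with every rank's counts
    stacked = torch.stack(gathered)   # shape: (world_size, 2)
    # The smallest estimate across ranks is safe for all of them.
    return int(stacked[:, 0].min().item()), int(stacked[:, 1].min().item())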
