Merge pull request opendatahub-io#15 from ROCm/torchrun_cache_init_fix
Correctly calculate the same required number of cache blocks across all torchrun processes
gshtras authored May 15, 2024
2 parents bc20e48 + 2e6b63f commit 4b39609
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion vllm/executor/torchrun_gpu_executor.py
@@ -1,14 +1,16 @@
 import os
 from typing import Dict, List, Optional
 
+import torch
+
 from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.logger import init_logger
 from vllm.model_executor.parallel_utils.communication_op import (
-    broadcast_object_list)
+    broadcast_object_list, tensor_model_parallel_all_gather)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
@@ -63,6 +65,17 @@ def _init_worker(self):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
+    def determine_num_available_blocks(self) -> tuple[int, int]:
+        num_gpu_blocks, num_cpu_blocks = (
+            self.driver_worker.determine_num_available_blocks())
+        t = torch.tensor(
+            [[num_gpu_blocks], [num_cpu_blocks]],
+            device="cuda",
+            dtype=torch.int32,
+        )
+        output = tensor_model_parallel_all_gather(t)
+        return (torch.min(output[0]).item(), torch.min(output[1]).item())
+
     def execute_model(self,
                       seq_group_metadata_list: List[SequenceGroupMetadata],
                       blocks_to_swap_in: Dict[int, int],
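For context, the new determine_num_available_blocks() override makes every torchrun rank agree on the cache-block counts: each rank's locally profiled estimate is all-gathered, and the minimum across ranks is used, so no rank allocates more blocks than the most memory-constrained rank can hold and all ranks build identically sized KV caches. Below is a minimal standalone sketch of the same gather-and-min pattern using plain torch.distributed instead of vLLM's tensor_model_parallel_all_gather; the helper name agree_on_block_counts is made up for illustration, and an already-initialized process group (e.g. started by torchrun with the gloo backend; for nccl the tensors would need to live on the GPU) is assumed.

# Sketch only, not part of the commit: reach consensus on cache-block counts
# across ranks by all-gathering each rank's local estimate and taking the
# minimum, mirroring what determine_num_available_blocks() above does with
# tensor_model_parallel_all_gather(). Assumes torch.distributed is already
# initialized (e.g. by torchrun) with a CPU-friendly backend such as gloo.
import torch
import torch.distributed as dist

def agree_on_block_counts(num_gpu_blocks: int, num_cpu_blocks: int) -> tuple[int, int]:
    local = torch.tensor([num_gpu_blocks, num_cpu_blocks], dtype=torch.int32)
    gathered = [torch.zeros_like(local) for _ in range(dist.get_world_size())]
    dist.all_gather(gathered, local)  # every rank ends up with every rank's counts
    stacked = torch.stack(gathered)   # shape: (world_size, 2)
    # The smallest estimate across ranks is safe for all of them.
    return int(stacked[:, 0].min().item()), int(stacked[:, 1].min().item())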
