[V1][Minor] Set pin_memory=False for token_ids_cpu tensor (vllm-project#11581)

Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: xcnick <[email protected]>
WoosukKwon authored and xcnick committed Dec 31, 2024
1 parent 9ec8df8 commit 08003bf
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ def __init__(
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
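For context on the change: pinned (page-locked) host memory only pays off when a tensor is the source of a host-to-device copy, where it allows asynchronous `non_blocking=True` transfers; a buffer that stays on the CPU, like `token_ids_cpu_tensor` here, gains nothing from pinning and only reserves page-locked RAM. The following is a minimal illustrative sketch, not part of the commit; the variable names and sizes are made up for the example:

```python
import torch

# A buffer that is copied to the GPU each step: pinning it lets the
# host-to-device copy run asynchronously with non_blocking=True.
staging = torch.zeros(
    1024, dtype=torch.int32, device="cpu",
    pin_memory=torch.cuda.is_available(),
)
if torch.cuda.is_available():
    on_gpu = staging.to("cuda", non_blocking=True)

# A buffer that only ever lives on the CPU (e.g. consumed through NumPy):
# pinning it would just consume page-locked memory for no benefit.
cpu_only = torch.zeros(1024, dtype=torch.int32, device="cpu", pin_memory=False)
cpu_only_np = cpu_only.numpy()  # zero-copy NumPy view, as with token_ids_cpu
```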