From 08003bf00a40b78d85683e755fb8328a47c8c1d7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 28 Dec 2024 22:33:12 +0900
Subject: [PATCH] [V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581)

Signed-off-by: Woosuk Kwon
Signed-off-by: xcnick
---
 vllm/v1/worker/gpu_input_batch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 6c4d300ec6efe..e79145300fe06 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -57,11 +57,13 @@ def __init__(
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
         # Find a way to reduce the CPU memory usage.
+        # This buffer is not directly transferred to the GPU, so it does not
+        # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
             (max_num_reqs, max_model_len),
             device="cpu",
             dtype=torch.int32,
-            pin_memory=pin_memory,
+            pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)