From 3940ee7064214cee62e389b2978a2c2388a3bed0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 2 Nov 2024 08:29:43 +0000 Subject: [PATCH] cuda: fix check for GPU device availability The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm if the driver supports checkpoint/restore. [1] https://github.com/NVIDIA/gpu-operator Fixes: #2509 Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index c4fc67fa9f..337954bed6 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) +/** + * Check if a CUDA device is available on the system + */ +static bool is_cuda_device_available(void) +{ + const char *gpu_path = "/proc/driver/nvidia/gpus/"; + struct stat sb; + + if (stat(gpu_path, &sb) != 0) + return false; + + return S_ISDIR(sb.st_mode); +} + int cuda_plugin_init(int stage) { int ret; @@ -481,8 +495,8 @@ int cuda_plugin_init(int stage) } } - if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { - pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { + pr_info("No GPU device found; CUDA plugin is disabled\n"); plugin_disabled = true; return 0; }