Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Nizben committed Dec 12, 2024
1 parent 76ae6b5 commit 1aa2035
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 40 deletions.
21 changes: 17 additions & 4 deletions keopscore/keopscore/binders/nvrtc/keops_nvrtc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,15 +633,28 @@ template <typename TYPE> class KeOps_module {
if (tagHostDevice == 0)
CUDA_SAFE_CALL(cuMemFree(p_data));

// if (RR.tagRanges == 1) {
// CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)lookup_d));
// if (SS.nbatchdims > 0) {
// CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)slices_x_d));
// CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)ranges_y_d));
// CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)offsets_d));
// }
// }

if (RR.tagRanges == 1) {
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)lookup_d));
if (SS.nbatchdims > 0) {
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)lookup_d));
// Always free slices_x_d and ranges_y_d if they are allocated
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)slices_x_d));
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)ranges_y_d));
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)offsets_d));
}

// offsets_d is only allocated if nbatchdims > 0
if (SS.nbatchdims > 0) {
CUDA_SAFE_CALL(cuMemFree((CUdeviceptr)offsets_d));
}
}


// end_ = end = clock();
////std::cout << " time for last part : " << double(//end_ - start_) /
/// CLOCKS_PER_SEC << std::endl; /std::cout << "time for launch_keops inner
Expand Down
170 changes: 134 additions & 36 deletions keopscore/keopscore/config/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,48 +296,146 @@ def print_nvrtc_flags(self):
"""Print the NVRTC flags for CUDA compilation."""
print(f"NVRTC Flags: {self.nvrtc_flags}")


def get_gpu_props(self):
    """
    Retrieve GPU properties and set related attributes.

    Queries the CUDA driver API through ctypes to count the available
    GPUs and to read, for each device, the maximum number of threads per
    block and the maximum shared memory per block.  On any failure the
    configuration degrades to CPU-only mode (``n_gpus = 0``, empty
    compile flags) with a warning, rather than raising.

    Returns:
        tuple: ``(self.n_gpus, self.gpu_compile_flags)`` where
        ``gpu_compile_flags`` is a ``-D...`` preprocessor-flag string
        consumed by the nvrtc build.
    """

    def safe_call(d, result):
        # True iff the driver call for device nr `d` succeeded; emits a
        # warning otherwise (the caller then switches to CPU mode).
        test = result == self.CUDA_SUCCESS
        if not test:
            KeOps_Warning(
                f"""
                    CUDA was detected, the driver API has been initialized,
                    but there was an error detecting properties of GPU device nr {d}.
                    Switching to CPU only.
                """
            )
        return test

    # Attempt to load the CUDA driver library.
    # ctypes.CDLL raises OSError (or TypeError when find_library returns
    # None) instead of returning a falsy handle, so the load must be
    # guarded explicitly for the CPU fallback below to be reachable.
    try:
        cuda_path = find_library("cuda")
        libcuda = ctypes.CDLL(cuda_path) if cuda_path else None
    except OSError:
        libcuda = None
    if not libcuda:
        KeOps_Warning("cuda library not found. Switching to CPU only.")
        self.n_gpus = 0
        self.gpu_compile_flags = ""
        return self.n_gpus, self.gpu_compile_flags

    # Initialize the CUDA driver API.
    result = libcuda.cuInit(0)
    if result != self.CUDA_SUCCESS:
        KeOps_Warning(
            "cuda was detected, but driver API could not be initialized. Switching to CPU only."
        )
        self.n_gpus = 0
        self.gpu_compile_flags = ""
        return self.n_gpus, self.gpu_compile_flags

    # Get the GPU count.
    nGpus = ctypes.c_int()
    result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus))
    if result != self.CUDA_SUCCESS:
        KeOps_Warning(
            "cuda was detected, driver API has been initialized, but no working GPU found. Switching to CPU only."
        )
        self.n_gpus = 0
        self.gpu_compile_flags = ""
        return self.n_gpus, self.gpu_compile_flags

    self.n_gpus = nGpus.value
    # No device present: nothing to query, return immediately.
    if self.n_gpus == 0:
        self.gpu_compile_flags = ""
        return self.n_gpus, self.gpu_compile_flags

    # Query each GPU for its properties.
    MaxThreadsPerBlock = [0] * self.n_gpus
    SharedMemPerBlock = [0] * self.n_gpus
    test = True

    for d in range(self.n_gpus):
        device = ctypes.c_int()
        if not safe_call(d, libcuda.cuDeviceGet(ctypes.byref(device), d)):
            test = False
            break

        output = ctypes.c_int()
        if not safe_call(
            d,
            libcuda.cuDeviceGetAttribute(
                ctypes.byref(output),
                self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                device,
            ),
        ):
            test = False
            break
        MaxThreadsPerBlock[d] = output.value

        if not safe_call(
            d,
            libcuda.cuDeviceGetAttribute(
                ctypes.byref(output),
                self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                device,
            ),
        ):
            test = False
            break
        SharedMemPerBlock[d] = output.value

    # Any per-device query failed: switch to CPU mode.
    if not test:
        self.n_gpus = 0
        self.gpu_compile_flags = ""
        return self.n_gpus, self.gpu_compile_flags

    # Build the compile-flags string from the collected GPU properties.
    self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
    for d in range(self.n_gpus):
        self.gpu_compile_flags += (
            f"-DMAXTHREADSPERBLOCK{d}={MaxThreadsPerBlock[d]} "
        )
        self.gpu_compile_flags += (
            f"-DSHAREDMEMPERBLOCK{d}={SharedMemPerBlock[d]} "
        )

    return self.n_gpus, self.gpu_compile_flags


# def get_gpu_props(self):
# """Retrieve GPU properties and set related attributes."""
# try:
# libcuda = ctypes.CDLL(find_library("cuda"))
# nGpus = ctypes.c_int()
# result = libcuda.cuInit(0)
# if result != self.CUDA_SUCCESS:
# KeOps_Warning("cuInit failed; no CUDA driver available.")
# self.n_gpus = 0
# return self.n_gpus, self.gpu_compile_flags
# result = libcuda.cuDeviceGetCount(ctypes.byref(nGpus))
# if result != self.CUDA_SUCCESS:
# KeOps_Warning("cuDeviceGetCount failed.")
# self.n_gpus = 0
# return self.n_gpus, self.gpu_compile_flags
# self.n_gpus = nGpus.value
# self.gpu_compile_flags = f"-DMAXIDGPU={self.n_gpus - 1} "
# for d in range(self.n_gpus):
# device = ctypes.c_int()
# libcuda.cuDeviceGet(ctypes.byref(device), d)
# max_threads = ctypes.c_int()
# libcuda.cuDeviceGetAttribute(
# ctypes.byref(max_threads),
# self.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
# device,
# )
# shared_mem = ctypes.c_int()
# libcuda.cuDeviceGetAttribute(
# ctypes.byref(shared_mem),
# self.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
# device,
# )
# self.gpu_compile_flags += (
# f"-DMAXTHREADSPERBLOCK{d}={max_threads.value} "
# )
# self.gpu_compile_flags += f"-DSHAREDMEMPERBLOCK{d}={shared_mem.value} "
# return self.n_gpus, self.gpu_compile_flags
# except Exception as e:
# KeOps_Warning(f"Error retrieving GPU properties: {e}")
# self.n_gpus = 0
# return self.n_gpus, self.gpu_compile_flags

def print_all(self):
"""
Print all CUDA-related configuration and system health status.
Expand Down

0 comments on commit 1aa2035

Please sign in to comment.