Check CUDA memory pool support #3931

Closed
CMakeLists.txt: 10 additions, 0 deletions

@@ -83,6 +83,7 @@ option(LLAMA_CUBLAS "llama: use CUDA"
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_USE_CUDA_POOL "llama: use the CUDA memory pool instead of the custom pool" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -270,6 +271,11 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
+
+    if (LLAMA_CUDA_USE_CUDA_POOL)
+        add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL)
+    endif()
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_MMQ)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_USE_CUDA_POOL)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL)
+    endif()
+
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
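With the two blocks above, configuring the build with the new flag, for example

cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_USE_CUDA_POOL=ON

defines GGML_USE_CUDA_MEMORY_POOL for both the CUDA and ROCm targets; ggml-cuda.cu maps this to its internal CUDA_USE_MEMORY_POOL switch below. The feature stays opt-in: without the flag, the existing custom pool implementation is used unchanged. (The out-of-tree build directory in the command is illustrative.)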
ggml-cuda.cu: 102 additions, 19 deletions

@@ -108,6 +108,10 @@
 #define CUDA_USE_TENSOR_CORES
 #endif

+#if defined(GGML_USE_CUDA_MEMORY_POOL)
+#define CUDA_USE_MEMORY_POOL
+#endif
+
 // max batch size to use MMQ kernels when tensor cores are available
 #define MMQ_MAX_BATCH_SIZE 32

@@ -5844,8 +5848,48 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor);
+
+#if defined(CUDA_USE_MEMORY_POOL)
+        bool support_mem_pool = true;
+#if CUDART_VERSION >= 12000
+        support_mem_pool = (prop.memoryPoolsSupported == 1);
+#endif
+        if (support_mem_pool) {
+            cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
+            if (err == cudaSuccess) {
+                size_t threshold = UINT64_MAX;
+                err = cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &threshold);
+                if (err == cudaSuccess) {
+                    fprintf(stderr, ", CUDA memory pool is supported\n");
+                } else {
+                    g_cudaMemPools[id] = nullptr;
+                    fprintf(stderr, ", CUDA memory pool is not supported (release threshold error)\n");
+                }
+            } else {
+                g_cudaMemPools[id] = nullptr;
+                fprintf(stderr, ", CUDA memory pool is not supported (can't load default pool)\n");
+            }
+            // test alloc/dealloc
+            if (err == cudaSuccess) {
+                void * testPtr;
+                size_t testSize = 1024;
+                err = cudaMallocFromPoolAsync(&testPtr, testSize, g_cudaMemPools[id], g_cudaStreams[id][0]);
+                if (err == cudaSuccess) {
+                    err = cudaFreeAsync(testPtr, g_cudaStreams[id][0]);
+                    if (err != cudaSuccess) {
+                        g_cudaMemPools[id] = nullptr;
+                        fprintf(stderr, ", CUDA memory pool is not supported (deallocation failed)\n");
+                    }
+                } else {
+                    g_cudaMemPools[id] = nullptr;
+                    fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n");
+                }
+            }
+        } else {
+            fprintf(stderr, ", CUDA memory pool is not supported\n");
+        }
+#endif
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -5854,6 +5898,52 @@ void ggml_init_cublas() {
     g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
+
+#if defined(CUDA_USE_MEMORY_POOL)
+    if (g_device_count > 1) {
+        // give the devices access to each other's memory pools
+        if (g_cudaMemPools[g_main_device] != nullptr) {
+            cudaMemPool_t main_device_pool;
+            cudaMemAccessDesc desc_main_device = {};
+            desc_main_device.location.type = cudaMemLocationTypeDevice;
+            desc_main_device.location.id = g_main_device;
+            desc_main_device.flags = cudaMemAccessFlagsProtReadWrite;
+            CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device));
+            for (int id = 0; id < g_device_count; ++id) {
+                if (id == g_main_device) continue;
+
+                if (g_cudaMemPools[id] == nullptr) {
+                    fprintf(stderr,
+                            "Warning: Device %d doesn't support CUDA memory pools, skipping pool access config\n",
+                            id);
+                    continue;
+                }
+
+                cudaMemAccessDesc desc_device = {};
+                desc_device.location.type = cudaMemLocationTypeDevice;
+                desc_device.location.id = id;
+                desc_device.flags = cudaMemAccessFlagsProtReadWrite;
+                cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */);
+                if (err != cudaSuccess) {
+                    fprintf(stderr, "Can't give device %d access to the main device's memory pool\n", id);
+                }
+                cudaMemPool_t mempool;
+                CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id));
+                err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */);
+                if (err != cudaSuccess) {
+                    fprintf(stderr, "Can't give the main device access to device %d's memory pool\n", id);
+                }
+            }
+        } else {
+            fprintf(stderr,
+                    "WARNING: Your main GPU device doesn't support CUDA memory pools. Using the custom memory pool implementation.\n");
+            for (int id = 0; id < g_device_count; ++id) {
+                g_cudaMemPools[id] = nullptr;
+            }
+        }
+    }
+#endif
+
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }
@@ -5869,13 +5959,6 @@ void ggml_init_cublas() {
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
-        // configure memory pool
-        cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
-        if (err == cudaSuccess) {
-            size_t threshold = UINT64_MAX;
-            CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &threshold));
-        }
     }

// configure logging to stdout
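Note the corresponding removal above: an earlier revision configured the pool unconditionally while creating the cuBLAS handles. That setup now happens once per device in the properties loop, where it is validated and nulled out on failure.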
@@ -6375,7 +6458,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

     if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream);
         ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
                               ne00, 1, sizeof(float), 0, 0,
                               ne00, 1, sizeof(half), 0, 0, stream);
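The ggml_cuda_pool_malloc_async helper that replaces ggml_cuda_pool_malloc here is not shown in this excerpt. A plausible sketch of its contract, assuming it falls back to the custom pool on devices where the probe failed (the body below is illustrative guesswork; only the signature is taken from the call sites):

// Illustrative sketch, not the actual patch code: allocate from the device's
// validated CUDA memory pool when one exists, otherwise fall back to the
// pre-existing custom pool, which reports the (possibly rounded-up) size it
// actually reserved through actual_size.
static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int device, cudaStream_t stream) {
    if (g_cudaMemPools[device] != nullptr) {
        void * ptr = nullptr;
        CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[device], stream));
        *actual_size = size; // pool allocations are exact-sized
        return ptr;
    }
    return ggml_cuda_pool_malloc(size, actual_size);
}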
@@ -6776,22 +6859,22 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream);
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }

     if (use_src1 && !src1_stays_on_host) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream);
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+        dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream);
     }

     // do the computation
@@ -6803,18 +6886,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }

+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
+        ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream);
     }
     if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
+        ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream);
     }
     if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
+        ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream);
     }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
 }

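A matching sketch for the free side, under the same assumptions as above (illustrative only, not the patch's actual code):

// Illustrative sketch: stream-ordered free when the device has a validated
// memory pool, synchronous custom-pool free otherwise.
static void ggml_cuda_pool_free_async(void * ptr, size_t size, int device, cudaStream_t stream) {
    if (g_cudaMemPools[device] != nullptr) {
        CUDA_CHECK(cudaFreeAsync(ptr, stream));
        return;
    }
    ggml_cuda_pool_free(ptr, size);
}

Note also the reordering in the last hunk: the cudaDeviceSynchronize for a CPU-backed dst now runs before the temporaries are released, so the device-to-host copy has finished before the fallback path, whose frees are immediate rather than stream-ordered, can recycle those buffers.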