From fbb0cf35de5eb27268183a78582c98909cc0f39c Mon Sep 17 00:00:00 2001 From: FindHao Date: Thu, 10 Dec 2020 00:43:49 +0000 Subject: [PATCH 1/3] remove redundant memory traffic --- src/convolutional_layer.c | 6 +++--- src/dark_cuda.c | 18 ++++++++++++++++++ src/dark_cuda.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 1d52dd1d23c..7aa51c52d58 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -717,7 +717,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (train) l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); } - l.output_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); + l.output_gpu = cuda_make_array_init2zero(total_batch*out_h*out_w*n); if (train) l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n); if(binary){ @@ -761,9 +761,9 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } if (train) { - l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); + l.x_gpu = cuda_make_array_init2zero(total_batch*out_h*out_w*n); #ifndef CUDNN - l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); + l.x_norm_gpu = cuda_make_array_init2zero( total_batch*out_h*out_w*n); #endif // CUDNN } } diff --git a/src/dark_cuda.c b/src/dark_cuda.c index fec06473880..a0339b28a2e 100644 --- a/src/dark_cuda.c +++ b/src/dark_cuda.c @@ -380,6 +380,24 @@ float *cuda_make_array(float *x, size_t n) return x_gpu; } +float *cuda_make_array_init2zero(size_t n) { + float *x_gpu; + size_t size = sizeof(float) * n; + cudaError_t status = cudaMalloc((void **)&x_gpu, size); + // cudaError_t status = cudaMallocManaged((void **)&x_gpu, size, + // cudaMemAttachGlobal); status = cudaMemAdvise(x_gpu, size, + // cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId); + if (status != cudaSuccess) + fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n"); + CHECK_CUDA(status); + // status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + status =cudaMemset(x_gpu, 0, size); + CHECK_CUDA(status); + if (!x_gpu) + error("Cuda malloc failed\n"); + return x_gpu; +} + void **cuda_make_array_pointers(void **x, size_t n) { void **x_gpu; diff --git a/src/dark_cuda.h b/src/dark_cuda.h index 0e5f39f9a35..6fd7c786cbb 100644 --- a/src/dark_cuda.h +++ b/src/dark_cuda.h @@ -64,6 +64,7 @@ extern "C" { float *cuda_make_array_pinned_preallocated(float *x, size_t n); float *cuda_make_array_pinned(float *x, size_t n); float *cuda_make_array(float *x, size_t n); + float *cuda_make_array_init2zero(size_t n); void **cuda_make_array_pointers(void **x, size_t n); int *cuda_make_int_array(size_t n); int *cuda_make_int_array_new_api(int *x, size_t n); From 95fc755db45bbd6e4c7306f3eef613e250076004 Mon Sep 17 00:00:00 2001 From: FindHao Date: Mon, 14 Dec 2020 21:53:22 +0000 Subject: [PATCH 2/3] remove more uncessnary memory copy and change memset to async --- src/dark_cuda.c | 2 +- src/route_layer.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dark_cuda.c b/src/dark_cuda.c index a0339b28a2e..d8e3e796686 100644 --- a/src/dark_cuda.c +++ b/src/dark_cuda.c @@ -391,7 +391,7 @@ float *cuda_make_array_init2zero(size_t n) { fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n"); CHECK_CUDA(status); // status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); - status =cudaMemset(x_gpu, 0, size); + status =cudaMemsetAsync(x_gpu, 0, size, 0); CHECK_CUDA(status); if (!x_gpu) error("Cuda malloc failed\n"); diff --git a/src/route_layer.c b/src/route_layer.c index 2e0699d59a5..57574f15f46 100644 --- a/src/route_layer.c +++ b/src/route_layer.c @@ -34,8 +34,8 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz l.forward_gpu = forward_route_layer_gpu; l.backward_gpu = backward_route_layer_gpu; - l.delta_gpu = cuda_make_array(l.delta, outputs*batch); - l.output_gpu = cuda_make_array(l.output, outputs*batch); + l.delta_gpu = cuda_make_array_init2zero(outputs*batch); + l.output_gpu = cuda_make_array_init2zero(outputs*batch); #endif return l; } From a589476f188f1b8b64767c54c49a079a16189972 Mon Sep 17 00:00:00 2001 From: Stefano Sinigardi Date: Fri, 1 Sep 2023 14:25:35 +0200 Subject: [PATCH 3/3] restore build compatibility --- src/dark_cuda.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dark_cuda.c b/src/dark_cuda.c index b1dae9ea25e..2a76c60c49b 100644 --- a/src/dark_cuda.c +++ b/src/dark_cuda.c @@ -512,8 +512,7 @@ float *cuda_make_array_init2zero(size_t n) { // status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); status =cudaMemsetAsync(x_gpu, 0, size, 0); CHECK_CUDA(status); - if (!x_gpu) - error("Cuda malloc failed\n"); + if (!x_gpu) error("Cuda malloc failed", DARKNET_LOC); return x_gpu; }