Skip to content

Commit

Permalink
CUDA: fix sum.cu compilation for CUDA < 11.7
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGaessler committed Sep 20, 2024
1 parent 722ec1e commit 7f7e684
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions ggml/src/ggml-cuda/sum.cu
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#define USE_CUB
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

#ifdef USE_CUB
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#endif // USE_CUB

#include "sumrows.cuh"
#include "sum.cuh"

#include <cstdint>

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#ifdef USE_CUB
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
Expand All @@ -21,7 +25,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#endif // USE_CUB
}

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
Expand Down

0 comments on commit 7f7e684

Please sign in to comment.