From d994354286b93e7f57e53683b97847f9711ea6cf Mon Sep 17 00:00:00 2001 From: manauref Date: Thu, 4 Jun 2020 23:13:52 -0400 Subject: [PATCH] First attempt at global CartField reduction. WORK IN PROGRESS. This commit is mainly to push the error checking functions in Cuda/GkylCudaFuncs.h, which are simply copied over from cuda_helper.h --- Cuda/GkylCudaFuncs.h | 430 +++++++++++++++++++++++++++++------ Unit/Reduce.cu | 411 +++++++++++++++++++++++++++++++++ Unit/test_CudaReductions.lua | 182 +++++++++++++++ 3 files changed, 951 insertions(+), 72 deletions(-) create mode 100644 Unit/Reduce.cu create mode 100644 Unit/test_CudaReductions.lua diff --git a/Cuda/GkylCudaFuncs.h b/Cuda/GkylCudaFuncs.h index c0b44e7f5..0cb9ffd6d 100644 --- a/Cuda/GkylCudaFuncs.h +++ b/Cuda/GkylCudaFuncs.h @@ -9,80 +9,366 @@ #define GKYL_CUDA_FUNCS_H #include +#include extern "C" { -// Pre-defined objects and constants - -// Error codes (enum cudaError) - DECL_GET_CUDA_OBJECT(int, cudaSuccess); - DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidValue); - DECL_GET_CUDA_OBJECT(int, cudaErrorMemoryAllocation); - DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidMemcpyDirection); - -// Codes for CudaMemcpy (enum cudaMemcpyKind) - DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToHost); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToDevice); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToHost); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToDevice); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDefault); - -// Flags for cudaMallocManaged - DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachGlobal); - DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachHost); - -// This struct has most elements in cudaDeviceProp struct. Directly -// using that struct from LuaJIT is causing a segfault and so this -// copying is needed - typedef struct { - char name[256]; - size_t totalGlobalMem; - size_t sharedMemPerBlock; - int regsPerBlock; - int warpSize; - size_t memPitch; - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int clockRate; - size_t totalConstMem; - int major; - int minor; - size_t textureAlignment; - size_t texturePitchAlignment; - int deviceOverlap; - int multiProcessorCount; - int kernelExecTimeoutEnabled; - int integrated; - int canMapHostMemory; - int computeMode; - int concurrentKernels; - int ECCEnabled; - int asyncEngineCount; - int unifiedAddressing; - int memoryClockRate; - int memoryBusWidth; - int l2CacheSize; - int maxThreadsPerMultiProcessor; - int streamPrioritiesSupported; - int globalL1CacheSupported; - int localL1CacheSupported; - size_t sharedMemPerMultiprocessor; - int regsPerMultiprocessor; - int managedMemory; - int isMultiGpuBoard; - int multiGpuBoardGroupID; - int singleToDoublePrecisionPerfRatio; - int pageableMemoryAccess; - int concurrentManagedAccess; - int computePreemptionSupported; - int canUseHostPointerForRegisteredMem; - int cooperativeLaunch; - int cooperativeMultiDeviceLaunch; - int pageableMemoryAccessUsesHostPageTables; - int directManagedMemAccessFromHost; - } GkDeviceProp; - - int GkCuda_GetDeviceProp(GkDeviceProp *prop, int dev); + // Pre-defined objects and constants + + // Error codes (enum cudaError) + DECL_GET_CUDA_OBJECT(int, cudaSuccess); + DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidValue); + DECL_GET_CUDA_OBJECT(int, cudaErrorMemoryAllocation); + DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidMemcpyDirection); + + // Codes for CudaMemcpy (enum cudaMemcpyKind) + DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToHost); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToDevice); + DECL_GET_CUDA_OBJECT(int, 
cudaMemcpyDeviceToHost); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToDevice); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyDefault); + + // Flags for cudaMallocManaged + DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachGlobal); + DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachHost); + + // This struct has most elements in cudaDeviceProp struct. Directly + // using that struct from LuaJIT is causing a segfault and so this + // copying is needed + typedef struct { + char name[256]; + size_t totalGlobalMem; + size_t sharedMemPerBlock; + int regsPerBlock; + int warpSize; + size_t memPitch; + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int clockRate; + size_t totalConstMem; + int major; + int minor; + size_t textureAlignment; + size_t texturePitchAlignment; + int deviceOverlap; + int multiProcessorCount; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int computeMode; + int concurrentKernels; + int ECCEnabled; + int asyncEngineCount; + int unifiedAddressing; + int memoryClockRate; + int memoryBusWidth; + int l2CacheSize; + int maxThreadsPerMultiProcessor; + int streamPrioritiesSupported; + int globalL1CacheSupported; + int localL1CacheSupported; + size_t sharedMemPerMultiprocessor; + int regsPerMultiprocessor; + int managedMemory; + int isMultiGpuBoard; + int multiGpuBoardGroupID; + int singleToDoublePrecisionPerfRatio; + int pageableMemoryAccess; + int concurrentManagedAccess; + int computePreemptionSupported; + int canUseHostPointerForRegisteredMem; + int cooperativeLaunch; + int cooperativeMultiDeviceLaunch; + int pageableMemoryAccessUsesHostPageTables; + int directManagedMemAccessFromHost; + } GkDeviceProp; + + int GkCuda_GetDeviceProp(GkDeviceProp *prop, int dev); + +} + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case 
CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError,
+// but will not exit the program in case an error is detected.
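+// Usage illustration (not part of the helpers copied from cuda_helper.h): checkCudaErrors()
+// wraps a runtime call directly, e.g. checkCudaErrors(cudaDeviceSynchronize()), and aborts
+// with the decoded error name on failure, while getLastCudaError()/printLastCudaError() take
+// a message string and typically follow a kernel launch, e.g.
+// getLastCudaError("reduction kernel launch failed").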
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
 #endif // GKYL_CUDA_FUNCS_H
diff --git a/Unit/Reduce.cu b/Unit/Reduce.cu
new file mode 100644
index 000000000..388ca275c
--- /dev/null
+++ b/Unit/Reduce.cu
@@ -0,0 +1,411 @@
+// Gkyl ------------------------------------------------------------------------
+//
+// Functions to compute reductions on the GPU (CUDA).
+//
+// _______     ___
+// + 6 @ |||| # P ||| +
+//------------------------------------------------------------------------------
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cg = cooperative_groups;   // Alias used by the cg:: handles in the kernels below.
+
+extern "C"
+{
+  void getNumBlocksAndThreads(GkDeviceProp *prop, int numElements, int maxBlocks,
+                              int maxThreads, int *blocks, int *threads);
+  void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop,
+                    GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+}
+
+bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); }
+
+unsigned int nextPow2(unsigned int x) {
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+}
+
+#ifndef MIN
+#define MIN(x, y) ((x < y) ? x : y)
+#endif
+
+#ifndef MAX
+#define MAX(x, y) ((x > y) ? x : y)
+#endif
+
+// Compute the number of threads and blocks to use for the given reduction
+// kernel. We set threads/block to the minimum of maxThreads and n/2,
+// and we cap the number of blocks at the specified maximum, because
+// each thread in the kernel can process a variable number of elements.
+void getNumBlocksAndThreads(GkDeviceProp *prop, int numElements, int maxBlocks,
+                            int maxThreads, int *blocks, int *threads) {
+
+  threads[0] = (numElements < maxThreads * 2) ? nextPow2((numElements + 1) / 2) : maxThreads;
+  blocks[0]  = (numElements + (threads[0] * 2 - 1)) / (threads[0] * 2);
+
+  if ((float)threads[0] * blocks[0] >
+      (float)(prop->maxGridSize)[0] * prop->maxThreadsPerBlock) {
+    printf("n is too large, please choose a smaller number!\n");
+  }
+
+  if (blocks[0] > (prop->maxGridSize)[0]) {
+    printf("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
+           blocks[0], (prop->maxGridSize)[0], threads[0] * 2, threads[0]);
+
+    blocks[0]  /= 2;
+    threads[0] *= 2;
+  }
+
+  blocks[0] = MIN(maxBlocks, blocks[0]);
+}
+
+// This version adds multiple elements per thread sequentially. This reduces
+// the overall cost of the algorithm while keeping the work complexity O(n) and
+// the step complexity O(log n). (Brent's Theorem optimization)
+// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
+// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
+// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
+template <unsigned int BLOCKSIZE, bool nIsPow2>
+__global__ void d_reduceCartField(GkylCartField_t *fIn, double *redPerBlock) {
+  // Handle to thread block group.
+  cg::thread_block cgThreadBlock = cg::this_thread_block();
+  extern __shared__ double sdata[];  // Stores partial reductions.
+
+  // Perform first level of reduction, reading from global memory, writing to shared memory.
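+  // Illustrative access pattern (example numbers, not fixed by this kernel): with
+  // BLOCKSIZE = 128 and gridDim.x = 4, gridSize = 2*128*4 = 1024, so thread tID of block 0
+  // visits cells tID, tID+128, then tID+1024, tID+1152, and so on, keeping a running
+  // maximum until numCellsTot is exhausted.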
+  unsigned int tID = threadIdx.x;
+  unsigned int linearIdx = blockIdx.x * BLOCKSIZE * 2 + threadIdx.x;
+  unsigned int gridSize = BLOCKSIZE * 2 * gridDim.x;
+
+  double myMax = 0;
+
+  GkylRange_t *localRange = fIn->localRange;
+  Gkyl::GenIndexer localIdxr(localRange);
+  Gkyl::GenIndexer fIdxr = fIn->genIndexer();
+
+  int idx[2];    // Need to template over CDIM+VDIM.
+  localIdxr.invIndex(linearIdx, idx);
+  int linIdx = fIdxr.index(idx);
+
+  const double *fld = fIn->getDataPtrAt(linIdx);
+  unsigned int numCellsTot = localRange->volume();
+
+  // We reduce multiple elements per thread. The number is determined by the
+  // number of active thread blocks (via gridDim). More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread.
+  while (linearIdx < numCellsTot) {
+    myMax = MAX(myMax, fld[0]);
+
+    // Ensure we don't read out of bounds (optimized away for powerOf2 sized arrays).
+    if (nIsPow2 || linearIdx + BLOCKSIZE < numCellsTot) {
+      unsigned int newLinearIdx = linearIdx + BLOCKSIZE;
+      localIdxr.invIndex(newLinearIdx, idx);
+      linIdx = fIdxr.index(idx);
+      fld = fIn->getDataPtrAt(linIdx);
+
+      myMax = MAX(myMax, fld[0]);
+    }
+
+    linearIdx += gridSize;
+  }
+
+  // Each thread puts its local max into shared memory.
+  sdata[tID] = myMax;
+  cg::sync(cgThreadBlock);
+
+  // Do reduction in shared mem.
+  if ((BLOCKSIZE >= 512) && (tID < 256)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 256]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 256) && (tID < 128)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 128]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 128) && (tID < 64)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 64]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cgThreadBlock);
+
+  if (cgThreadBlock.thread_rank() < 32) {
+    // Fetch final intermediate max from 2nd warp.
+    if (BLOCKSIZE >= 64) myMax = MAX(myMax, sdata[tID + 32]);
+    // Reduce final warp using shuffle.
+    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
+      myMax = MAX(myMax, tile32.shfl_down(myMax, offset));
+    }
+  }
+
+  // Write result for this block to global mem.
+  if (cgThreadBlock.thread_rank() == 0) redPerBlock[blockIdx.x] = myMax;
+}
+
+// This algorithm reduces multiple elements per thread sequentially. This reduces
+// the overall cost of the algorithm while keeping the work complexity O(n) and
+// the step complexity O(log n). (Brent's Theorem optimization)
+// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
+// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
+// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
+template <unsigned int BLOCKSIZE, bool nIsPow2>
+__global__ void d_reduceMax(double *dataIn, double *out, unsigned int nElements) {
+  // Handle to thread block group.
+  cg::thread_block cgThreadBlock = cg::this_thread_block();
+  extern __shared__ double sdata[];  // Stores partial reductions.
+
+  // Perform first level of reduction, reading from global memory, writing to shared memory.
+  unsigned int tID = threadIdx.x;
+  unsigned int linearIdx = blockIdx.x * BLOCKSIZE * 2 + threadIdx.x;
+  unsigned int gridSize = BLOCKSIZE * 2 * gridDim.x;
+
+  double myMax = 0;
+
+  // We reduce multiple elements per thread. The number is determined by the
+  // number of active thread blocks (via gridDim).
+  // More blocks will result in a larger gridSize and therefore fewer elements per thread.
+  while (linearIdx < nElements) {
+    myMax = MAX(myMax, dataIn[linearIdx]);
+
+    // Ensure we don't read out of bounds (optimized away for powerOf2 sized arrays).
+    if (nIsPow2 || linearIdx + BLOCKSIZE < nElements) myMax = MAX(myMax, dataIn[linearIdx + BLOCKSIZE]);
+
+    linearIdx += gridSize;
+  }
+
+  // Each thread puts its local max into shared memory.
+  sdata[tID] = myMax;
+  cg::sync(cgThreadBlock);
+
+  // Do reduction in shared mem.
+  if ((BLOCKSIZE >= 512) && (tID < 256)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 256]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 256) && (tID < 128)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 128]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 128) && (tID < 64)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 64]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cgThreadBlock);
+
+  if (cgThreadBlock.thread_rank() < 32) {
+    // Fetch final intermediate max from 2nd warp.
+    if (BLOCKSIZE >= 64) myMax = MAX(myMax, sdata[tID + 32]);
+    // Reduce final warp using shuffle.
+    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
+      myMax = MAX(myMax, tile32.shfl_down(myMax, offset));
+    }
+  }
+
+  // Write result for this block to global mem.
+  if (cgThreadBlock.thread_rank() == 0) out[blockIdx.x] = myMax;
+}
+
+void reduceCartField(int numCellsTot, int blocks, int threads, GkylCartField_t *fIn, double *blockRed) {
+  // Launch the device kernel that reduces a CartField to an array,
+  // each element of the array being the reduction performed by a block.
+
+  // When there is only one warp per block, we need to allocate two warps'
+  // worth of shared memory so that we don't index shared memory out of bounds.
+  int smemSize = (threads <= 32) ? 2 * threads * sizeof(double) : threads * sizeof(double);
+
+  if (isPow2(numCellsTot)) {
+    switch (threads) {
+      case 512:
+        d_reduceCartField<512, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 256:
+        d_reduceCartField<256, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 128:
+        d_reduceCartField<128, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 64:
+        d_reduceCartField<64, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 32:
+        d_reduceCartField<32, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 16:
+        d_reduceCartField<16, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 8:
+        d_reduceCartField<8, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 4:
+        d_reduceCartField<4, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 2:
+        d_reduceCartField<2, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 1:
+        d_reduceCartField<1, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+    }
+  } else {
+    switch (threads) {
+      case 512:
+        d_reduceCartField<512, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 256:
+        d_reduceCartField<256, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 128:
+        d_reduceCartField<128, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 64:
+        d_reduceCartField<64, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 32:
+        d_reduceCartField<32, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 16:
+        d_reduceCartField<16, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 8:
+        d_reduceCartField<8, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 4:
+        d_reduceCartField<4, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 2:
+        d_reduceCartField<2, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 1:
+        d_reduceCartField<1, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+    }
+  }
+}
+
+void reduceMax(int numElements, int blocks, int threads, double *d_dataIn, double *d_dataOut) {
+  // Launch the device kernel that reduces a device array 'd_dataIn'
+  // containing 'numElements' elements.
+
+  // When there is only one warp per block, we need to allocate two warps'
+  // worth of shared memory so that we don't index shared memory out of bounds.
+  int smemSize = (threads <= 32) ? 2 * threads * sizeof(double) : threads * sizeof(double);
+
+  if (isPow2(numElements)) {
+    switch (threads) {
+      case 512:
+        d_reduceMax<512, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 256:
+        d_reduceMax<256, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 128:
+        d_reduceMax<128, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 64:
+        d_reduceMax<64, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 32:
+        d_reduceMax<32, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 16:
+        d_reduceMax<16, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 8:
+        d_reduceMax<8, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 4:
+        d_reduceMax<4, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 2:
+        d_reduceMax<2, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 1:
+        d_reduceMax<1, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+    }
+  } else {
+    switch (threads) {
+      case 512:
+        d_reduceMax<512, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 256:
+        d_reduceMax<256, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 128:
+        d_reduceMax<128, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 64:
+        d_reduceMax<64, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 32:
+        d_reduceMax<32, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 16:
+        d_reduceMax<16, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 8:
+        d_reduceMax<8, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 4:
+        d_reduceMax<4, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 2:
+        d_reduceMax<2, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 1:
+        d_reduceMax<1, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+    }
+  }
+}
+
+void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop,
+                  GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out) {
+  // Find the maximum in the CartField 'fIn' (type double) and place
+  // it in the device-memory variable 'out'.
+  // This function follows 'reduce6' (using Cooperative Groups) in cuda-samples:
+  //   https://github.com/NVIDIA/cuda-samples/tree/master/Samples/reduction
+  // The algorithm uses two temporary device arrays, 'blockOut' and 'intermediate',
+  // each of size numBlocks, which must have been allocated already.
+
+  // Call the kernel that reduces a CartField (fIn) into a device array (blockOut)
+  // which contains the reduction performed by each block.
+  reduceCartField(numCellsTot, numBlocks, numThreads, fIn, blockOut);
+
+  // Reduce partial block reductions on the GPU.
+  int newNum = numBlocks;
+  while (newNum > 1) {
+    int threads = 0, blocks = 0;
+
+    getNumBlocksAndThreads(prop, newNum, maxBlocks, maxThreads, &blocks, &threads);
+
+    checkCudaErrors(cudaMemcpy(intermediate, blockOut, newNum * sizeof(double), cudaMemcpyDeviceToDevice));
+
+    reduceMax(newNum, blocks, threads, intermediate, blockOut);
+
+    newNum = (newNum + (threads * 2 - 1)) / (threads * 2);
+  }
+
+  // Copy final reduction to output variable.
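+  // blockOut[0] now holds the global maximum (for example, 64 partial block maxima collapse
+  // to a single value after one pass with 32 threads; numbers for illustration only), so a
+  // one-element device-to-device copy suffices.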
+  checkCudaErrors(cudaMemcpy(out, blockOut, sizeof(double), cudaMemcpyDeviceToDevice));
+}
+
diff --git a/Unit/test_CudaReductions.lua b/Unit/test_CudaReductions.lua
new file mode 100644
index 000000000..219c97009
--- /dev/null
+++ b/Unit/test_CudaReductions.lua
@@ -0,0 +1,182 @@
+-- Gkyl ------------------------------------------------------------------------
+--
+-- Test for CUDA kernels that perform reductions.
+--
+-- _______     ___
+-- + 6 @ |||| # P ||| +
+--------------------------------------------------------------------------------
+
+-- Don't do anything if we were not built with CUDA.
+if GKYL_HAVE_CUDA == false then
+   print("**** Can't run CUDA tests without CUDA enabled GPUs!")
+   return 0
+end
+
+local Unit        = require "Unit"
+local DataStruct  = require "DataStruct"
+local Grid        = require "Grid"
+local Basis       = require "Basis"
+local lume        = require "Lib.lume"
+local cudaRunTime = require "Cuda.RunTime"
+local cudaAlloc   = require "Cuda.Alloc"
+local Alloc       = require "Lib.Alloc"
+local ffi         = require "ffi"
+
+local assert_equal = Unit.assert_equal
+local stats        = Unit.stats
+
+ffi.cdef [[
+  double reduceMax(int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop, GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+]]
+
+local function createGrid(lo, up, nCells)
+   local gridOut = Grid.RectCart {
+      lower = lo,
+      upper = up,
+      cells = nCells,
+   }
+   return gridOut
+end
+
+local function createBasis(dim, pOrder, bKind)
+   local basis
+   if (bKind=="Ser") then
+      basis = Basis.CartModalSerendipity { ndim = dim, polyOrder = pOrder }
+   elseif (bKind=="Max") then
+      basis = Basis.CartModalMaxOrder { ndim = dim, polyOrder = pOrder }
+   elseif (bKind=="Tensor") then
+      basis = Basis.CartModalTensor { ndim = dim, polyOrder = pOrder }
+   else
+      assert(false, "Invalid basis")
+   end
+   return basis
+end
+
+local function createField(grid, basis, deviceCopy, ghosts, isP0, vComp)
+   vComp = vComp or 1
+   if ghosts then
+      ghostCells = {1, 1}
+   else
+      ghostCells = {0, 0}
+   end
+   if isP0 then
+      numBasisElements = 1
+   else
+      numBasisElements = basis:numBasis()*vComp
+   end
+   local fld = DataStruct.Field {
+      onGrid           = grid,
+      numComponents    = numBasisElements,
+      ghost            = ghostCells,
+      createDeviceCopy = deviceCopy,
+      metaData = {
+         polyOrder = basis:polyOrder(),
+         basisType = basis:id()
+      },
+   }
+   return fld
+end
+
+local function nextPowerOf2(nIn)
+   return math.floor(2^(math.ceil(math.log(nIn, 2))))
+end
+
+local function prevPowerOf2(nIn)
+   return math.floor(2^(math.floor(math.log(nIn, 2))))
+end
+
+local function getNumBlocksAndThreadsReduceP0(numElements, maxBlocks, maxThreads, devProp)
+   -- Given a 1-component field (p=0) with numElements cells, establish how
+   -- many thread blocks and threads/block to use in performing a reduction
+   -- on a GPU with device properties devProp.
+ -- This function follows 'reduce6' (using Cooperative Groups) in cuda-samples: + -- https://github.com/NVIDIA/cuda-samples/tree/master/Samples/reduction + + local numBlocks, numThreads + if numElements < maxThreads*2 then + numThreads = nextPowerOf2(math.floor((numElements+1)/2)) + else + numThreads = maxThreads + end + + numBlocks = math.floor((numElements + (numThreads*2-1))/(numThreads*2)) + + if (numBlocks > devProp.maxGridSize[1]) then + print(string.format("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)",numBlocks, devProp.maxGridSize[1], numThreads*2, numThreads)) + numBlocks = math.floor(numBlocks/2) + numThreads = numThreads*2 + end + + numBlocks = math.min(maxBlocks, numBlocks) + + return numBlocks, numThreads +end + + +local pOrder = 1 +local basis = "Ser" +local phaseLower = {0.0, -6.0} +local phaseUpper = {1.0, 6.0} +local phaseNumCells = {8, 32} +local confLower = { phaseLower[1]} +local confUpper = { phaseUpper[1]} +local confNumCells = {phaseNumCells[1]} + +local devNum, _ = cudaRunTime.GetDevice() +local err = cudaRunTime.SetDevice(devNum) +local devProp, err = cudaRunTime.GetDeviceProperties(devNum) + +local numCellsTot = lume.reduce(phaseNumCells, function(a,b) return a*b end) +local numBlocksMAX = 64 +local numThreadsMAX = GKYL_DEFAULT_NUM_THREADS + +-- Phase-space and config-space grids. +local phaseGrid = createGrid(phaseLower, phaseUpper, phaseNumCells) +local confGrid = createGrid(confLower, confUpper, confNumCells) +-- Basis functions. +local phaseBasis = createBasis(phaseGrid:ndim(), pOrder, basis) +local confBasis = createBasis(confGrid:ndim(), pOrder, basis) +-- Fields. +local p0Field = createField(phaseGrid, phaseBasis, true, false, true) +-- Initialize field to random numbers. +math.randomseed(1000*os.time()) +local fldRange = p0Field:localRange() +local fldIdxr = p0Field:genIndexer() +for idx in fldRange:rowMajorIter() do + local fldItr = p0Field:get(fldIdxr( idx )) + fldItr[1] = math.random() +end +-- Get the maximum on the CPU (for reference). +maxVal = -9.0e19 +for idx in fldRange:rowMajorIter() do + local fldItr = p0Field:get(fldIdxr( idx )) + maxVal = math.max(maxVal,fldItr[1]) +end +print(string.format(" CPU max = %f",maxVal)) +print("") + + +print(" Operation on the GPU:") +-- Establish the number of blocks and threads to use. +local numBlocks, numThreads = getNumBlocksAndThreadsReduceP0(numCellsTot,numBlocksMAX,numThreadsMAX,devProp) +print(string.format(" Use %d blocks of size %d (threads)",numBlocks,numThreads)) + +-- Allocate device memory needed for intermediate reductions. +local d_blockRed, d_intermediateRed = cudaAlloc.Double(numBlocks), cudaAlloc.Double(numBlocks) +local d_maxVal = cudaAlloc.Double(1) + +--ffi.C.reduceMax(numBlocks, numThreads, numBlocksMAX, numThreadsMAX, devProp, +-- p0Field._onDevice, d_blockRed, d_intermediateRed, d_maxVal) + +-- Test that the value found is correct. +local maxVal_gpu = Alloc.Double(1) +local err = d_maxVal:copyDeviceToHost(maxVal_gpu) + +print(string.format(" GPU max = %f",maxVal_gpu[1])) +print("") +print(string.format(" Error = %f",maxVal-maxVal_gpu[1])) + +cudaRunTime.Free(d_blockRed) +cudaRunTime.Free(d_intermediateRed) +cudaRunTime.Free(d_maxVal) +
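+-- Note on hooking up the call above (illustrative sketch only; 'p0Field._onDevice' and the
+-- other names are taken from this test, everything else is an assumption): the cdef earlier
+-- in this file declares 'reduceMax', but Reduce.cu exports 'cartFieldMax', which returns void
+-- and takes the total number of cells as its first argument. Once the two are reconciled,
+-- the declaration and call might look like:
+--
+--   ffi.cdef [[
+--     void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads,
+--                       GkDeviceProp *prop, GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+--   ]]
+--   ffi.C.cartFieldMax(numCellsTot, numBlocks, numThreads, numBlocksMAX, numThreadsMAX, devProp,
+--                      p0Field._onDevice, d_blockRed, d_intermediateRed, d_maxVal)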