From d994354286b93e7f57e53683b97847f9711ea6cf Mon Sep 17 00:00:00 2001 From: manauref Date: Thu, 4 Jun 2020 23:13:52 -0400 Subject: [PATCH] First attempt at global CartField reduction. WORK IN PROGRESS. This commit is mainly to push the error checking functions in Cuda/GkylCudaFuncs.h, which are simply copied over from cuda_helper.h --- Cuda/GkylCudaFuncs.h | 430 +++++++++++++++++++++++++++++------ Unit/Reduce.cu | 411 +++++++++++++++++++++++++++++++++ Unit/test_CudaReductions.lua | 182 +++++++++++++++ 3 files changed, 951 insertions(+), 72 deletions(-) create mode 100644 Unit/Reduce.cu create mode 100644 Unit/test_CudaReductions.lua diff --git a/Cuda/GkylCudaFuncs.h b/Cuda/GkylCudaFuncs.h index c0b44e7f5..0cb9ffd6d 100644 --- a/Cuda/GkylCudaFuncs.h +++ b/Cuda/GkylCudaFuncs.h @@ -9,80 +9,366 @@ #define GKYL_CUDA_FUNCS_H #include +#include extern "C" { -// Pre-defined objects and constants - -// Error codes (enum cudaError) - DECL_GET_CUDA_OBJECT(int, cudaSuccess); - DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidValue); - DECL_GET_CUDA_OBJECT(int, cudaErrorMemoryAllocation); - DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidMemcpyDirection); - -// Codes for CudaMemcpy (enum cudaMemcpyKind) - DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToHost); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToDevice); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToHost); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToDevice); - DECL_GET_CUDA_OBJECT(int, cudaMemcpyDefault); - -// Flags for cudaMallocManaged - DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachGlobal); - DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachHost); - -// This struct has most elements in cudaDeviceProp struct. Directly -// using that struct from LuaJIT is causing a segfault and so this -// copying is needed - typedef struct { - char name[256]; - size_t totalGlobalMem; - size_t sharedMemPerBlock; - int regsPerBlock; - int warpSize; - size_t memPitch; - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int clockRate; - size_t totalConstMem; - int major; - int minor; - size_t textureAlignment; - size_t texturePitchAlignment; - int deviceOverlap; - int multiProcessorCount; - int kernelExecTimeoutEnabled; - int integrated; - int canMapHostMemory; - int computeMode; - int concurrentKernels; - int ECCEnabled; - int asyncEngineCount; - int unifiedAddressing; - int memoryClockRate; - int memoryBusWidth; - int l2CacheSize; - int maxThreadsPerMultiProcessor; - int streamPrioritiesSupported; - int globalL1CacheSupported; - int localL1CacheSupported; - size_t sharedMemPerMultiprocessor; - int regsPerMultiprocessor; - int managedMemory; - int isMultiGpuBoard; - int multiGpuBoardGroupID; - int singleToDoublePrecisionPerfRatio; - int pageableMemoryAccess; - int concurrentManagedAccess; - int computePreemptionSupported; - int canUseHostPointerForRegisteredMem; - int cooperativeLaunch; - int cooperativeMultiDeviceLaunch; - int pageableMemoryAccessUsesHostPageTables; - int directManagedMemAccessFromHost; - } GkDeviceProp; - - int GkCuda_GetDeviceProp(GkDeviceProp *prop, int dev); + // Pre-defined objects and constants + + // Error codes (enum cudaError) + DECL_GET_CUDA_OBJECT(int, cudaSuccess); + DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidValue); + DECL_GET_CUDA_OBJECT(int, cudaErrorMemoryAllocation); + DECL_GET_CUDA_OBJECT(int, cudaErrorInvalidMemcpyDirection); + + // Codes for CudaMemcpy (enum cudaMemcpyKind) + DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToHost); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyHostToDevice); + DECL_GET_CUDA_OBJECT(int, 
cudaMemcpyDeviceToHost); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyDeviceToDevice); + DECL_GET_CUDA_OBJECT(int, cudaMemcpyDefault); + + // Flags for cudaMallocManaged + DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachGlobal); + DECL_GET_CUDA_OBJECT(unsigned, cudaMemAttachHost); + + // This struct has most elements in cudaDeviceProp struct. Directly + // using that struct from LuaJIT is causing a segfault and so this + // copying is needed + typedef struct { + char name[256]; + size_t totalGlobalMem; + size_t sharedMemPerBlock; + int regsPerBlock; + int warpSize; + size_t memPitch; + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int clockRate; + size_t totalConstMem; + int major; + int minor; + size_t textureAlignment; + size_t texturePitchAlignment; + int deviceOverlap; + int multiProcessorCount; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int computeMode; + int concurrentKernels; + int ECCEnabled; + int asyncEngineCount; + int unifiedAddressing; + int memoryClockRate; + int memoryBusWidth; + int l2CacheSize; + int maxThreadsPerMultiProcessor; + int streamPrioritiesSupported; + int globalL1CacheSupported; + int localL1CacheSupported; + size_t sharedMemPerMultiprocessor; + int regsPerMultiprocessor; + int managedMemory; + int isMultiGpuBoard; + int multiGpuBoardGroupID; + int singleToDoublePrecisionPerfRatio; + int pageableMemoryAccess; + int concurrentManagedAccess; + int computePreemptionSupported; + int canUseHostPointerForRegisteredMem; + int cooperativeLaunch; + int cooperativeMultiDeviceLaunch; + int pageableMemoryAccessUsesHostPageTables; + int directManagedMemAccessFromHost; + } GkDeviceProp; + + int GkCuda_GetDeviceProp(GkDeviceProp *prop, int dev); + +} + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case 
CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError,
+// but will not exit the program in case an error is detected.
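+// Usage illustration (not part of the helpers copied from cuda_helper.h): checkCudaErrors()
+// wraps a runtime call directly, e.g. checkCudaErrors(cudaDeviceSynchronize()), and aborts
+// with the decoded error name on failure, while getLastCudaError()/printLastCudaError() take
+// a message string and typically follow a kernel launch, e.g.
+// getLastCudaError("reduction kernel launch failed").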
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
 #endif // GKYL_CUDA_FUNCS_H
diff --git a/Unit/Reduce.cu b/Unit/Reduce.cu
new file mode 100644
index 000000000..388ca275c
--- /dev/null
+++ b/Unit/Reduce.cu
@@ -0,0 +1,411 @@
+// Gkyl ------------------------------------------------------------------------
+//
+// Functions to compute reductions on the GPU (CUDA).
+//
+// _______     ___
+// + 6 @ |||| # P ||| +
+//------------------------------------------------------------------------------
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cg = cooperative_groups;   // Alias used by the cg:: handles in the kernels below.
+
+extern "C"
+{
+  void getNumBlocksAndThreads(GkDeviceProp *prop, int numElements, int maxBlocks,
+                              int maxThreads, int *blocks, int *threads);
+  void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop,
+                    GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+}
+
+bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); }
+
+unsigned int nextPow2(unsigned int x) {
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+}
+
+#ifndef MIN
+#define MIN(x, y) ((x < y) ? x : y)
+#endif
+
+#ifndef MAX
+#define MAX(x, y) ((x > y) ? x : y)
+#endif
+
+// Compute the number of threads and blocks to use for the given reduction
+// kernel. We set threads/block to the minimum of maxThreads and n/2,
+// and we cap the number of blocks at the specified maximum, because
+// each thread in the kernel can process a variable number of elements.
+void getNumBlocksAndThreads(GkDeviceProp *prop, int numElements, int maxBlocks,
+                            int maxThreads, int *blocks, int *threads) {
+
+  threads[0] = (numElements < maxThreads * 2) ? nextPow2((numElements + 1) / 2) : maxThreads;
+  blocks[0]  = (numElements + (threads[0] * 2 - 1)) / (threads[0] * 2);
+
+  if ((float)threads[0] * blocks[0] >
+      (float)(prop->maxGridSize)[0] * prop->maxThreadsPerBlock) {
+    printf("n is too large, please choose a smaller number!\n");
+  }
+
+  if (blocks[0] > (prop->maxGridSize)[0]) {
+    printf("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
+           blocks[0], (prop->maxGridSize)[0], threads[0] * 2, threads[0]);
+
+    blocks[0]  /= 2;
+    threads[0] *= 2;
+  }
+
+  blocks[0] = MIN(maxBlocks, blocks[0]);
+}
+
+// This version adds multiple elements per thread sequentially. This reduces
+// the overall cost of the algorithm while keeping the work complexity O(n) and
+// the step complexity O(log n). (Brent's Theorem optimization)
+// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
+// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
+// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
+template <unsigned int BLOCKSIZE, bool nIsPow2>
+__global__ void d_reduceCartField(GkylCartField_t *fIn, double *redPerBlock) {
+  // Handle to thread block group.
+  cg::thread_block cgThreadBlock = cg::this_thread_block();
+  extern __shared__ double sdata[];  // Stores partial reductions.
+
+  // Perform first level of reduction, reading from global memory, writing to shared memory.
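+  // Illustrative access pattern (example numbers, not fixed by this kernel): with
+  // BLOCKSIZE = 128 and gridDim.x = 4, gridSize = 2*128*4 = 1024, so thread tID of block 0
+  // visits cells tID, tID+128, then tID+1024, tID+1152, and so on, keeping a running
+  // maximum until numCellsTot is exhausted.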
+  unsigned int tID = threadIdx.x;
+  unsigned int linearIdx = blockIdx.x * BLOCKSIZE * 2 + threadIdx.x;
+  unsigned int gridSize = BLOCKSIZE * 2 * gridDim.x;
+
+  double myMax = 0;
+
+  GkylRange_t *localRange = fIn->localRange;
+  Gkyl::GenIndexer localIdxr(localRange);
+  Gkyl::GenIndexer fIdxr = fIn->genIndexer();
+
+  int idx[2];    // Need to template over CDIM+VDIM.
+  localIdxr.invIndex(linearIdx, idx);
+  int linIdx = fIdxr.index(idx);
+
+  const double *fld = fIn->getDataPtrAt(linIdx);
+  unsigned int numCellsTot = localRange->volume();
+
+  // We reduce multiple elements per thread. The number is determined by the
+  // number of active thread blocks (via gridDim). More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread.
+  while (linearIdx < numCellsTot) {
+    myMax = MAX(myMax, fld[0]);
+
+    // Ensure we don't read out of bounds (optimized away for powerOf2 sized arrays).
+    if (nIsPow2 || linearIdx + BLOCKSIZE < numCellsTot) {
+      unsigned int newLinearIdx = linearIdx + BLOCKSIZE;
+      localIdxr.invIndex(newLinearIdx, idx);
+      linIdx = fIdxr.index(idx);
+      fld = fIn->getDataPtrAt(linIdx);
+
+      myMax = MAX(myMax, fld[0]);
+    }
+
+    linearIdx += gridSize;
+  }
+
+  // Each thread puts its local max into shared memory.
+  sdata[tID] = myMax;
+  cg::sync(cgThreadBlock);
+
+  // Do reduction in shared mem.
+  if ((BLOCKSIZE >= 512) && (tID < 256)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 256]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 256) && (tID < 128)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 128]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 128) && (tID < 64)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 64]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cgThreadBlock);
+
+  if (cgThreadBlock.thread_rank() < 32) {
+    // Fetch final intermediate max from 2nd warp.
+    if (BLOCKSIZE >= 64) myMax = MAX(myMax, sdata[tID + 32]);
+    // Reduce final warp using shuffle.
+    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
+      myMax = MAX(myMax, tile32.shfl_down(myMax, offset));
+    }
+  }
+
+  // Write result for this block to global mem.
+  if (cgThreadBlock.thread_rank() == 0) redPerBlock[blockIdx.x] = myMax;
+}
+
+// This algorithm reduces multiple elements per thread sequentially. This reduces
+// the overall cost of the algorithm while keeping the work complexity O(n) and
+// the step complexity O(log n). (Brent's Theorem optimization)
+// Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
+// In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
+// If blockSize > 32, allocate blockSize*sizeof(T) bytes.
+template <unsigned int BLOCKSIZE, bool nIsPow2>
+__global__ void d_reduceMax(double *dataIn, double *out, unsigned int nElements) {
+  // Handle to thread block group.
+  cg::thread_block cgThreadBlock = cg::this_thread_block();
+  extern __shared__ double sdata[];  // Stores partial reductions.
+
+  // Perform first level of reduction, reading from global memory, writing to shared memory.
+  unsigned int tID = threadIdx.x;
+  unsigned int linearIdx = blockIdx.x * BLOCKSIZE * 2 + threadIdx.x;
+  unsigned int gridSize = BLOCKSIZE * 2 * gridDim.x;
+
+  double myMax = 0;
+
+  // We reduce multiple elements per thread. The number is determined by the
+  // number of active thread blocks (via gridDim).
+  // More blocks will result in a larger gridSize and therefore fewer elements per thread.
+  while (linearIdx < nElements) {
+    myMax = MAX(myMax, dataIn[linearIdx]);
+
+    // Ensure we don't read out of bounds (optimized away for powerOf2 sized arrays).
+    if (nIsPow2 || linearIdx + BLOCKSIZE < nElements) myMax = MAX(myMax, dataIn[linearIdx + BLOCKSIZE]);
+
+    linearIdx += gridSize;
+  }
+
+  // Each thread puts its local max into shared memory.
+  sdata[tID] = myMax;
+  cg::sync(cgThreadBlock);
+
+  // Do reduction in shared mem.
+  if ((BLOCKSIZE >= 512) && (tID < 256)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 256]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 256) && (tID < 128)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 128]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  if ((BLOCKSIZE >= 128) && (tID < 64)) {
+    sdata[tID] = myMax = MAX(myMax, sdata[tID + 64]);
+  }
+
+  cg::sync(cgThreadBlock);
+
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cgThreadBlock);
+
+  if (cgThreadBlock.thread_rank() < 32) {
+    // Fetch final intermediate max from 2nd warp.
+    if (BLOCKSIZE >= 64) myMax = MAX(myMax, sdata[tID + 32]);
+    // Reduce final warp using shuffle.
+    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
+      myMax = MAX(myMax, tile32.shfl_down(myMax, offset));
+    }
+  }
+
+  // Write result for this block to global mem.
+  if (cgThreadBlock.thread_rank() == 0) out[blockIdx.x] = myMax;
+}
+
+void reduceCartField(int numCellsTot, int blocks, int threads, GkylCartField_t *fIn, double *blockRed) {
+  // Launch the device kernel that reduces a CartField to an array,
+  // each element of the array being the reduction performed by a block.
+
+  // When there is only one warp per block, we need to allocate two warps'
+  // worth of shared memory so that we don't index shared memory out of bounds.
+  int smemSize = (threads <= 32) ? 2 * threads * sizeof(double) : threads * sizeof(double);
+
+  if (isPow2(numCellsTot)) {
+    switch (threads) {
+      case 512:
+        d_reduceCartField<512, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 256:
+        d_reduceCartField<256, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 128:
+        d_reduceCartField<128, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 64:
+        d_reduceCartField<64, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 32:
+        d_reduceCartField<32, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 16:
+        d_reduceCartField<16, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 8:
+        d_reduceCartField<8, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 4:
+        d_reduceCartField<4, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 2:
+        d_reduceCartField<2, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 1:
+        d_reduceCartField<1, true><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+    }
+  } else {
+    switch (threads) {
+      case 512:
+        d_reduceCartField<512, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 256:
+        d_reduceCartField<256, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 128:
+        d_reduceCartField<128, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 64:
+        d_reduceCartField<64, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 32:
+        d_reduceCartField<32, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 16:
+        d_reduceCartField<16, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 8:
+        d_reduceCartField<8, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 4:
+        d_reduceCartField<4, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 2:
+        d_reduceCartField<2, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+      case 1:
+        d_reduceCartField<1, false><<<blocks, threads, smemSize>>>(fIn, blockRed);
+        break;
+    }
+  }
+}
+
+void reduceMax(int numElements, int blocks, int threads, double *d_dataIn, double *d_dataOut) {
+  // Launch the device kernel that reduces a device array 'd_dataIn'
+  // containing 'numElements' elements.
+
+  // When there is only one warp per block, we need to allocate two warps'
+  // worth of shared memory so that we don't index shared memory out of bounds.
+  int smemSize = (threads <= 32) ? 2 * threads * sizeof(double) : threads * sizeof(double);
+
+  if (isPow2(numElements)) {
+    switch (threads) {
+      case 512:
+        d_reduceMax<512, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 256:
+        d_reduceMax<256, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 128:
+        d_reduceMax<128, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 64:
+        d_reduceMax<64, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 32:
+        d_reduceMax<32, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 16:
+        d_reduceMax<16, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 8:
+        d_reduceMax<8, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 4:
+        d_reduceMax<4, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 2:
+        d_reduceMax<2, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 1:
+        d_reduceMax<1, true><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+    }
+  } else {
+    switch (threads) {
+      case 512:
+        d_reduceMax<512, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 256:
+        d_reduceMax<256, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 128:
+        d_reduceMax<128, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 64:
+        d_reduceMax<64, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 32:
+        d_reduceMax<32, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 16:
+        d_reduceMax<16, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 8:
+        d_reduceMax<8, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 4:
+        d_reduceMax<4, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 2:
+        d_reduceMax<2, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+      case 1:
+        d_reduceMax<1, false><<<blocks, threads, smemSize>>>(d_dataIn, d_dataOut, numElements);
+        break;
+    }
+  }
+}
+
+void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop,
+                  GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out) {
+  // Find the maximum in the CartField 'fIn' (type double) and place
+  // it in the device-memory variable 'out'.
+  // This function follows 'reduce6' (using Cooperative Groups) in cuda-samples:
+  //   https://github.com/NVIDIA/cuda-samples/tree/master/Samples/reduction
+  // The algorithm uses two temporary device arrays, 'blockOut' and 'intermediate',
+  // each of size numBlocks, which must have been allocated already.
+
+  // Call the kernel that reduces a CartField (fIn) into a device array (blockOut)
+  // which contains the reduction performed by each block.
+  reduceCartField(numCellsTot, numBlocks, numThreads, fIn, blockOut);
+
+  // Reduce partial block reductions on the GPU.
+  int newNum = numBlocks;
+  while (newNum > 1) {
+    int threads = 0, blocks = 0;
+
+    getNumBlocksAndThreads(prop, newNum, maxBlocks, maxThreads, &blocks, &threads);
+
+    checkCudaErrors(cudaMemcpy(intermediate, blockOut, newNum * sizeof(double), cudaMemcpyDeviceToDevice));
+
+    reduceMax(newNum, blocks, threads, intermediate, blockOut);
+
+    newNum = (newNum + (threads * 2 - 1)) / (threads * 2);
+  }
+
+  // Copy final reduction to output variable.
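+  // blockOut[0] now holds the global maximum (for example, 64 partial block maxima collapse
+  // to a single value after one pass with 32 threads; numbers for illustration only), so a
+  // one-element device-to-device copy suffices.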
+  checkCudaErrors(cudaMemcpy(out, blockOut, sizeof(double), cudaMemcpyDeviceToDevice));
+}
+
diff --git a/Unit/test_CudaReductions.lua b/Unit/test_CudaReductions.lua
new file mode 100644
index 000000000..219c97009
--- /dev/null
+++ b/Unit/test_CudaReductions.lua
@@ -0,0 +1,182 @@
+-- Gkyl ------------------------------------------------------------------------
+--
+-- Test for CUDA kernels that perform reductions.
+--
+-- _______     ___
+-- + 6 @ |||| # P ||| +
+--------------------------------------------------------------------------------
+
+-- Don't do anything if we were not built with CUDA.
+if GKYL_HAVE_CUDA == false then
+   print("**** Can't run CUDA tests without CUDA enabled GPUs!")
+   return 0
+end
+
+local Unit        = require "Unit"
+local DataStruct  = require "DataStruct"
+local Grid        = require "Grid"
+local Basis       = require "Basis"
+local lume        = require "Lib.lume"
+local cudaRunTime = require "Cuda.RunTime"
+local cudaAlloc   = require "Cuda.Alloc"
+local Alloc       = require "Lib.Alloc"
+local ffi         = require "ffi"
+
+local assert_equal = Unit.assert_equal
+local stats        = Unit.stats
+
+ffi.cdef [[
+  double reduceMax(int numBlocks, int numThreads, int maxBlocks, int maxThreads, GkDeviceProp *prop, GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+]]
+
+local function createGrid(lo, up, nCells)
+   local gridOut = Grid.RectCart {
+      lower = lo,
+      upper = up,
+      cells = nCells,
+   }
+   return gridOut
+end
+
+local function createBasis(dim, pOrder, bKind)
+   local basis
+   if (bKind=="Ser") then
+      basis = Basis.CartModalSerendipity { ndim = dim, polyOrder = pOrder }
+   elseif (bKind=="Max") then
+      basis = Basis.CartModalMaxOrder { ndim = dim, polyOrder = pOrder }
+   elseif (bKind=="Tensor") then
+      basis = Basis.CartModalTensor { ndim = dim, polyOrder = pOrder }
+   else
+      assert(false, "Invalid basis")
+   end
+   return basis
+end
+
+local function createField(grid, basis, deviceCopy, ghosts, isP0, vComp)
+   vComp = vComp or 1
+   if ghosts then
+      ghostCells = {1, 1}
+   else
+      ghostCells = {0, 0}
+   end
+   if isP0 then
+      numBasisElements = 1
+   else
+      numBasisElements = basis:numBasis()*vComp
+   end
+   local fld = DataStruct.Field {
+      onGrid           = grid,
+      numComponents    = numBasisElements,
+      ghost            = ghostCells,
+      createDeviceCopy = deviceCopy,
+      metaData = {
+         polyOrder = basis:polyOrder(),
+         basisType = basis:id()
+      },
+   }
+   return fld
+end
+
+local function nextPowerOf2(nIn)
+   return math.floor(2^(math.ceil(math.log(nIn, 2))))
+end
+
+local function prevPowerOf2(nIn)
+   return math.floor(2^(math.floor(math.log(nIn, 2))))
+end
+
+local function getNumBlocksAndThreadsReduceP0(numElements, maxBlocks, maxThreads, devProp)
+   -- Given a 1-component field (p=0) with numElements cells, establish how
+   -- many thread blocks and threads/block to use in performing a reduction
+   -- on a GPU with device properties devProp.
+ -- This function follows 'reduce6' (using Cooperative Groups) in cuda-samples: + -- https://github.com/NVIDIA/cuda-samples/tree/master/Samples/reduction + + local numBlocks, numThreads + if numElements < maxThreads*2 then + numThreads = nextPowerOf2(math.floor((numElements+1)/2)) + else + numThreads = maxThreads + end + + numBlocks = math.floor((numElements + (numThreads*2-1))/(numThreads*2)) + + if (numBlocks > devProp.maxGridSize[1]) then + print(string.format("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)",numBlocks, devProp.maxGridSize[1], numThreads*2, numThreads)) + numBlocks = math.floor(numBlocks/2) + numThreads = numThreads*2 + end + + numBlocks = math.min(maxBlocks, numBlocks) + + return numBlocks, numThreads +end + + +local pOrder = 1 +local basis = "Ser" +local phaseLower = {0.0, -6.0} +local phaseUpper = {1.0, 6.0} +local phaseNumCells = {8, 32} +local confLower = { phaseLower[1]} +local confUpper = { phaseUpper[1]} +local confNumCells = {phaseNumCells[1]} + +local devNum, _ = cudaRunTime.GetDevice() +local err = cudaRunTime.SetDevice(devNum) +local devProp, err = cudaRunTime.GetDeviceProperties(devNum) + +local numCellsTot = lume.reduce(phaseNumCells, function(a,b) return a*b end) +local numBlocksMAX = 64 +local numThreadsMAX = GKYL_DEFAULT_NUM_THREADS + +-- Phase-space and config-space grids. +local phaseGrid = createGrid(phaseLower, phaseUpper, phaseNumCells) +local confGrid = createGrid(confLower, confUpper, confNumCells) +-- Basis functions. +local phaseBasis = createBasis(phaseGrid:ndim(), pOrder, basis) +local confBasis = createBasis(confGrid:ndim(), pOrder, basis) +-- Fields. +local p0Field = createField(phaseGrid, phaseBasis, true, false, true) +-- Initialize field to random numbers. +math.randomseed(1000*os.time()) +local fldRange = p0Field:localRange() +local fldIdxr = p0Field:genIndexer() +for idx in fldRange:rowMajorIter() do + local fldItr = p0Field:get(fldIdxr( idx )) + fldItr[1] = math.random() +end +-- Get the maximum on the CPU (for reference). +maxVal = -9.0e19 +for idx in fldRange:rowMajorIter() do + local fldItr = p0Field:get(fldIdxr( idx )) + maxVal = math.max(maxVal,fldItr[1]) +end +print(string.format(" CPU max = %f",maxVal)) +print("") + + +print(" Operation on the GPU:") +-- Establish the number of blocks and threads to use. +local numBlocks, numThreads = getNumBlocksAndThreadsReduceP0(numCellsTot,numBlocksMAX,numThreadsMAX,devProp) +print(string.format(" Use %d blocks of size %d (threads)",numBlocks,numThreads)) + +-- Allocate device memory needed for intermediate reductions. +local d_blockRed, d_intermediateRed = cudaAlloc.Double(numBlocks), cudaAlloc.Double(numBlocks) +local d_maxVal = cudaAlloc.Double(1) + +--ffi.C.reduceMax(numBlocks, numThreads, numBlocksMAX, numThreadsMAX, devProp, +-- p0Field._onDevice, d_blockRed, d_intermediateRed, d_maxVal) + +-- Test that the value found is correct. +local maxVal_gpu = Alloc.Double(1) +local err = d_maxVal:copyDeviceToHost(maxVal_gpu) + +print(string.format(" GPU max = %f",maxVal_gpu[1])) +print("") +print(string.format(" Error = %f",maxVal-maxVal_gpu[1])) + +cudaRunTime.Free(d_blockRed) +cudaRunTime.Free(d_intermediateRed) +cudaRunTime.Free(d_maxVal) +
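+-- Note on hooking up the call above (illustrative sketch only; 'p0Field._onDevice' and the
+-- other names are taken from this test, everything else is an assumption): the cdef earlier
+-- in this file declares 'reduceMax', but Reduce.cu exports 'cartFieldMax', which returns void
+-- and takes the total number of cells as its first argument. Once the two are reconciled,
+-- the declaration and call might look like:
+--
+--   ffi.cdef [[
+--     void cartFieldMax(int numCellsTot, int numBlocks, int numThreads, int maxBlocks, int maxThreads,
+--                       GkDeviceProp *prop, GkylCartField_t *fIn, double *blockOut, double *intermediate, double *out);
+--   ]]
+--   ffi.C.cartFieldMax(numCellsTot, numBlocks, numThreads, numBlocksMAX, numThreadsMAX, devProp,
+--                      p0Field._onDevice, d_blockRed, d_intermediateRed, d_maxVal)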