diff --git a/src/cl/sum.cl b/src/cl/sum.cl index 0c93b043..7b29353b 100644 --- a/src/cl/sum.cl +++ b/src/cl/sum.cl @@ -13,8 +13,7 @@ __kernel void sum_gpu_1(__global const unsigned int *arr, atomic_add(sum, arr[gid]); } -#define VALUES_PER_WORKITEM 32 -#define GROUP_SIZE 128 +#define VALUES_PER_WORKITEM 256 __kernel void sum_gpu_2(__global const unsigned int* arr, __global unsigned int* sum, @@ -50,6 +49,7 @@ __kernel void sum_gpu_3(__global const unsigned int *arr, atomic_add(sum, res); } +#define GROUP_SIZE 128 __kernel void sum_gpu_4(__global const unsigned int *arr, __global unsigned int *sum, unsigned int n) { @@ -70,7 +70,9 @@ __kernel void sum_gpu_4(__global const unsigned int *arr, atomic_add(sum, group_res); } } +#undef GROUP_SIZE +#define GROUP_SIZE 64 __kernel void sum_gpu_5(__global const unsigned int *arr, __global unsigned int *sum, unsigned int n) { @@ -94,4 +96,5 @@ __kernel void sum_gpu_5(__global const unsigned int *arr, if (lid == 0) atomic_add(sum, buf[0]); -} \ No newline at end of file +} +#undef GROUP_SIZE \ No newline at end of file diff --git a/src/main_sum.cpp b/src/main_sum.cpp index 53afed47..0dff8b54 100644 --- a/src/main_sum.cpp +++ b/src/main_sum.cpp @@ -19,11 +19,9 @@ void raiseFail(const T &a, const T &b, std::string message, std::string filename template -void runSumKernel(KERNEL_NAME_TYPE kernelName, KERNEL_LENGTH_TYPE kernelLength, const std::string& kernel_func_name, const ARRAY_TYPE& arr, unsigned int n, int benchmarkingIters = 20) { +void runSumKernel(KERNEL_NAME_TYPE kernelName, KERNEL_LENGTH_TYPE kernelLength, const std::string& kernel_func_name, const ARRAY_TYPE& arr, unsigned int n, int work_size, int global_work_size, int benchmarkingIters = 20) { ocl::Kernel sumKernel(kernelName, kernelLength , kernel_func_name); sumKernel.compile(); - unsigned int workGroupSize = 128; - unsigned int global_work_size = (n + workGroupSize - 1) / workGroupSize * workGroupSize; gpu::gpu_mem_32u res; res.resizeN(1); @@ -32,7 +30,7 @@ void runSumKernel(KERNEL_NAME_TYPE kernelName, KERNEL_LENGTH_TYPE kernelLength, for (int iter = 0; iter < benchmarkingIters; ++iter) { sum = 0; res.writeN(&sum, 1); - sumKernel.exec(gpu::WorkSize(workGroupSize, global_work_size), arr, res, n); + sumKernel.exec(gpu::WorkSize(work_size, global_work_size), arr, res, n); t.nextLap(); } res.readN(&sum, 1); @@ -99,10 +97,20 @@ int main(int argc, char **argv) arrBuff.resizeN(n); arrBuff.writeN(as.data(), n); - runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_1", arrBuff, n); - runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_2", arrBuff, n); - runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_3", arrBuff, n); - runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_4", arrBuff, n); - runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_5", arrBuff, n); + auto get_global_size = [](unsigned int n, int workSize) { + return (n + workSize - 1) / workSize * workSize; + }; + int workSize = 256, globalWorkSize = get_global_size(n, workSize); + runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_1", arrBuff, n, workSize, globalWorkSize); + workSize = 256; + globalWorkSize = (n + workSize - 1) / workSize; + runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_2", arrBuff, n, workSize, globalWorkSize); + runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_3", arrBuff, n, workSize, globalWorkSize); + workSize = 128; + globalWorkSize = get_global_size(n, workSize); + runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_4", arrBuff, n, workSize, globalWorkSize); + workSize = 64; + globalWorkSize = get_global_size(n, workSize); + runSumKernel(sum_kernel, sum_kernel_length, "sum_gpu_5", arrBuff, n, workSize, globalWorkSize); } }