From 06e3deb83ae9f99c38eb26785e417ee2ca6a5223 Mon Sep 17 00:00:00 2001 From: AvvAL Date: Mon, 30 Oct 2023 01:05:01 +0300 Subject: [PATCH] done --- CMakeLists.txt | 4 ++- src/cl/matrix_transpose.cl | 26 +++++++++++++++++++ src/cl/prefix_sum.cl | 11 ++++++++ src/cl/radix.cl | 48 +++++++++++++++++++++++++++++++++-- src/main_radix.cpp | 52 +++++++++++++++++++++++++++++++++++--- 5 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 src/cl/matrix_transpose.cl create mode 100644 src/cl/prefix_sum.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 909de69b..f9421647 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,5 +21,7 @@ endif() # Обратите внимание что это происходит на этапе компиляции, кроме того необходимо чтобы файл src/cl/aplusb_cl.h был перечислен среди исходников для компиляции при вызове add_executable convertIntoHeader(src/cl/radix.cl src/cl/radix_cl.h radix_kernel) -add_executable(radix src/main_radix.cpp src/cl/radix_cl.h) +convertIntoHeader(src/cl/prefix_sum.cl src/cl/prefix_sum_cl.h prefix_sum_kernel) +convertIntoHeader(src/cl/matrix_transpose.cl src/cl/matrix_transpose.cl.h matrix_transpose_kernel) +add_executable(radix src/main_radix.cpp src/cl/prefix_sum_cl.h src/cl/matrix_transpose.cl.h src/cl/radix_cl.h) target_link_libraries(radix libclew libgpu libutils) diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl new file mode 100644 index 00000000..90449634 --- /dev/null +++ b/src/cl/matrix_transpose.cl @@ -0,0 +1,26 @@ +#ifdef __CLION_IDE__ + +#include + +#endif + +#define TS 16 +__kernel void matrix_transpose(__global float *a, __global float *at, unsigned int M, unsigned int K) { + const unsigned int gx = get_group_id(0) * TS; + const unsigned int gy = get_group_id(1) * TS; + const unsigned int lx = get_local_id(0); + const unsigned int ly = get_local_id(1); + + __local float buf[TS][TS + 1]; + + if (gx + lx < M && gy + ly < K) { + buf[ly][lx] = a[(gy + ly) * M + (gx + lx)]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (gx + ly < M && gy + lx < K) { + at[(gx + ly) * K + (gy + lx)] = buf[lx][ly]; + } + +} \ No newline at end of file diff --git a/src/cl/prefix_sum.cl b/src/cl/prefix_sum.cl new file mode 100644 index 00000000..145ab49f --- /dev/null +++ b/src/cl/prefix_sum.cl @@ -0,0 +1,11 @@ +#ifdef __CLION_IDE__ +#include +#endif + +__kernel void prefix_sum(__global unsigned int* as, __global unsigned int* res, unsigned int k) { + unsigned gid = get_global_id(0); + if (gid >= k) { + res[gid] = as[gid] + as[gid - k]; + } else + res[gid] = as[gid]; +} \ No newline at end of file diff --git a/src/cl/radix.cl b/src/cl/radix.cl index f8d09a80..44b1256e 100644 --- a/src/cl/radix.cl +++ b/src/cl/radix.cl @@ -1,3 +1,47 @@ -__kernel void radix(__global unsigned int *as) { - // TODO +#ifdef __CLION_IDE__ +#include +#include +#endif + +#define BITS_PER_ITER 4 +#define BITS_VALUE (1 << BITS_PER_ITER) + +__kernel void radix_count(__global unsigned int *as, __global unsigned int* counting, unsigned int shift) { + __local unsigned int count_local[BITS_VALUE]; + int group_id = get_group_id(0); + int global_id = get_global_id(0); + int local_id = get_local_id(0); + + if (local_id < BITS_VALUE) { + count_local[local_id] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + atomic_inc(&count_local[(as[global_id] >> shift) & (BITS_VALUE - 1)]); + barrier(CLK_LOCAL_MEM_FENCE); + + if (local_id < BITS_VALUE) { + counting[group_id * BITS_VALUE + local_id] = count_local[local_id]; + } +} + +#define WORK_SIZE 128 +__kernel void radix(__global unsigned int *as, __global unsigned int* bs, __global unsigned int* prefix_sum, __global unsigned int* cnt, unsigned int shift) { + unsigned int global_id = get_global_id(0); + unsigned int local_id = get_local_id(0); + unsigned int group_id = get_group_id(0); + unsigned int group_cnt = get_num_groups(0); + + __local unsigned int local_as[WORK_SIZE]; + + local_as[local_id] = (as[global_id] >> shift) & (BITS_VALUE - 1); + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int offset = 0; + unsigned int cur_value = local_as[local_id]; + for (int i = 0; i < local_id; ++i) { + offset += cur_value == local_as[i] ? 1 : 0; + } + int prev_values = prefix_sum[group_cnt * cur_value + group_id] - cnt[group_id * BITS_VALUE + cur_value]; + bs[prev_values + offset] = as[global_id]; } diff --git a/src/main_radix.cpp b/src/main_radix.cpp index b5bbb311..cb8a6301 100644 --- a/src/main_radix.cpp +++ b/src/main_radix.cpp @@ -6,6 +6,8 @@ // Этот файл будет сгенерирован автоматически в момент сборки - см. convertIntoHeader в CMakeLists.txt:18 #include "cl/radix_cl.h" +#include "cl/matrix_transpose.cl.h" +#include "cl/prefix_sum_cl.h" #include #include @@ -50,21 +52,64 @@ int main(int argc, char **argv) { std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl; std::cout << "CPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl; } - /* - gpu::gpu_mem_32u as_gpu; + + unsigned int work_size = 128; + unsigned int work_group_cnt = n / work_size; + + unsigned int total_bit = 32; + unsigned int bit_per_iter = 4; + + unsigned int numbers_per_cnt_group = 1 << bit_per_iter; + unsigned int total_counting_size = numbers_per_cnt_group * work_group_cnt; + + gpu::gpu_mem_32u counting_gpu; + counting_gpu.resizeN(total_counting_size); + + gpu::gpu_mem_32u prefix_sum_gpu; + prefix_sum_gpu.resizeN(total_counting_size); + + gpu::gpu_mem_32u prefix_sum_tmp_gpu; + prefix_sum_tmp_gpu.resizeN(total_counting_size); + + gpu::gpu_mem_32u as_gpu, bs_gpu; as_gpu.resizeN(n); + bs_gpu.resizeN(n); + std::vector test(total_counting_size, 0); { ocl::Kernel radix(radix_kernel, radix_kernel_length, "radix"); radix.compile(); + ocl::Kernel radix_count(radix_kernel, radix_kernel_length, "radix_count"); + radix_count.compile(); + + ocl::Kernel matrix_transpose(matrix_transpose_kernel, matrix_transpose_kernel_length, "matrix_transpose"); + matrix_transpose.compile(); + + ocl::Kernel prefix_sum(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum"); + prefix_sum.compile(); + timer t; for (int iter = 0; iter < benchmarkingIters; ++iter) { as_gpu.writeN(as.data(), n); t.restart();// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных + for (unsigned int shift = 0; shift < total_bit; shift += bit_per_iter) { + + radix_count.exec(gpu::WorkSize(work_size, n), as_gpu, counting_gpu, shift); - // TODO + unsigned int wg_n = 16; + matrix_transpose.exec(gpu::WorkSize(wg_n, wg_n, numbers_per_cnt_group, work_group_cnt), counting_gpu, prefix_sum_gpu, numbers_per_cnt_group, work_group_cnt); + + for (unsigned int k = 1; k < total_counting_size; k <<= 1) { + prefix_sum.exec(gpu::WorkSize(work_size, total_counting_size), prefix_sum_gpu, prefix_sum_tmp_gpu, k); + prefix_sum_tmp_gpu.swap(prefix_sum_gpu); + } + + radix.exec(gpu::WorkSize(work_size, n), as_gpu, bs_gpu, prefix_sum_gpu, counting_gpu, shift); + bs_gpu.swap(as_gpu); + } + t.nextLap(); } std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl; std::cout << "GPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl; @@ -76,6 +121,5 @@ int main(int argc, char **argv) { for (int i = 0; i < n; ++i) { EXPECT_THE_SAME(as[i], cpu_sorted[i], "GPU results should be equal to CPU results!"); } -*/ return 0; }