From 06e3deb83ae9f99c38eb26785e417ee2ca6a5223 Mon Sep 17 00:00:00 2001
From: AvvAL <cghjhfdshj@gmail.com>
Date: Mon, 30 Oct 2023 01:05:01 +0300
Subject: [PATCH] done

---
 CMakeLists.txt             |  4 ++-
 src/cl/matrix_transpose.cl | 26 +++++++++++++++++++
 src/cl/prefix_sum.cl       | 11 ++++++++
 src/cl/radix.cl            | 48 +++++++++++++++++++++++++++++++++--
 src/main_radix.cpp         | 52 +++++++++++++++++++++++++++++++++++---
 5 files changed, 134 insertions(+), 7 deletions(-)
 create mode 100644 src/cl/matrix_transpose.cl
 create mode 100644 src/cl/prefix_sum.cl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 909de69b..f9421647 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,5 +21,7 @@ endif()
 # Обратите внимание что это происходит на этапе компиляции, кроме того необходимо чтобы файл src/cl/aplusb_cl.h был перечислен среди исходников для компиляции при вызове add_executable
 
 convertIntoHeader(src/cl/radix.cl src/cl/radix_cl.h radix_kernel)
-add_executable(radix src/main_radix.cpp src/cl/radix_cl.h)
+convertIntoHeader(src/cl/prefix_sum.cl src/cl/prefix_sum_cl.h prefix_sum_kernel)
+convertIntoHeader(src/cl/matrix_transpose.cl src/cl/matrix_transpose.cl.h matrix_transpose_kernel)
+add_executable(radix src/main_radix.cpp src/cl/prefix_sum_cl.h src/cl/matrix_transpose.cl.h src/cl/radix_cl.h)
 target_link_libraries(radix libclew libgpu libutils)
diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl
new file mode 100644
index 00000000..90449634
--- /dev/null
+++ b/src/cl/matrix_transpose.cl
@@ -0,0 +1,26 @@
+#ifdef __CLION_IDE__
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
+#endif
+
+#define TS 16
+__kernel void matrix_transpose(__global float *a, __global float *at, unsigned int M, unsigned int K) {
+    const unsigned int gx = get_group_id(0) * TS;
+    const unsigned int gy = get_group_id(1) * TS;
+    const unsigned int lx = get_local_id(0);
+    const unsigned int ly = get_local_id(1);
+
+    __local float buf[TS][TS + 1];
+
+    if (gx + lx < M && gy + ly < K) {
+        buf[ly][lx] = a[(gy + ly) * M + (gx + lx)];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gx + ly < M && gy + lx < K) {
+        at[(gx + ly) * K + (gy + lx)] = buf[lx][ly];
+    }
+
+}
\ No newline at end of file
diff --git a/src/cl/prefix_sum.cl b/src/cl/prefix_sum.cl
new file mode 100644
index 00000000..145ab49f
--- /dev/null
+++ b/src/cl/prefix_sum.cl
@@ -0,0 +1,11 @@
+#ifdef __CLION_IDE__
+#include <libgpu/opencl/cl/clion_defines.cl>
+#endif
+
+__kernel void prefix_sum(__global unsigned int* as, __global unsigned int* res, unsigned int k) {
+    unsigned gid = get_global_id(0);
+    if (gid >= k) {
+        res[gid] = as[gid] + as[gid - k];
+    } else
+        res[gid] = as[gid];
+}
\ No newline at end of file
diff --git a/src/cl/radix.cl b/src/cl/radix.cl
index f8d09a80..44b1256e 100644
--- a/src/cl/radix.cl
+++ b/src/cl/radix.cl
@@ -1,3 +1,47 @@
-__kernel void radix(__global unsigned int *as) {
-    // TODO
+#ifdef __CLION_IDE__
+#include <libgpu/opencl/cl/clion_defines.cl>
+#include <stdio.h>
+#endif
+
+#define BITS_PER_ITER 4
+#define BITS_VALUE (1 << BITS_PER_ITER)
+
+__kernel void radix_count(__global unsigned int *as, __global unsigned int* counting, unsigned int shift) {
+    __local unsigned int count_local[BITS_VALUE];
+    int group_id = get_group_id(0);
+    int global_id = get_global_id(0);
+    int local_id = get_local_id(0);
+
+    if (local_id < BITS_VALUE) {
+        count_local[local_id] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    atomic_inc(&count_local[(as[global_id] >> shift) & (BITS_VALUE - 1)]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (local_id < BITS_VALUE) {
+        counting[group_id * BITS_VALUE + local_id] = count_local[local_id];
+    }
+}
+
+#define WORK_SIZE 128
+__kernel void radix(__global unsigned int *as, __global unsigned int* bs, __global unsigned int* prefix_sum, __global unsigned int* cnt, unsigned int shift) {
+    unsigned int global_id = get_global_id(0);
+    unsigned int local_id = get_local_id(0);
+    unsigned int group_id = get_group_id(0);
+    unsigned int group_cnt = get_num_groups(0);
+
+    __local unsigned int local_as[WORK_SIZE];
+
+    local_as[local_id] = (as[global_id] >> shift) & (BITS_VALUE - 1);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    unsigned int offset = 0;
+    unsigned int cur_value = local_as[local_id];
+    for (int i = 0; i < local_id; ++i) {
+        offset += cur_value == local_as[i] ? 1 : 0;
+    }
+    int prev_values = prefix_sum[group_cnt * cur_value + group_id] - cnt[group_id * BITS_VALUE + cur_value];
+    bs[prev_values + offset] = as[global_id];
 }
diff --git a/src/main_radix.cpp b/src/main_radix.cpp
index b5bbb311..cb8a6301 100644
--- a/src/main_radix.cpp
+++ b/src/main_radix.cpp
@@ -6,6 +6,8 @@
 
 // Этот файл будет сгенерирован автоматически в момент сборки - см. convertIntoHeader в CMakeLists.txt:18
 #include "cl/radix_cl.h"
+#include "cl/matrix_transpose.cl.h"
+#include "cl/prefix_sum_cl.h"
 
 #include <iostream>
 #include <stdexcept>
@@ -50,21 +52,64 @@ int main(int argc, char **argv) {
         std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "CPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
     }
-    /*
-    gpu::gpu_mem_32u as_gpu;
+
+    unsigned int work_size = 128;
+    unsigned int work_group_cnt = n / work_size;
+
+    unsigned int total_bit = 32;
+    unsigned int bit_per_iter = 4;
+
+    unsigned int numbers_per_cnt_group = 1 << bit_per_iter;
+    unsigned int total_counting_size = numbers_per_cnt_group * work_group_cnt;
+
+    gpu::gpu_mem_32u counting_gpu;
+    counting_gpu.resizeN(total_counting_size);
+
+    gpu::gpu_mem_32u prefix_sum_gpu;
+    prefix_sum_gpu.resizeN(total_counting_size);
+
+    gpu::gpu_mem_32u prefix_sum_tmp_gpu;
+    prefix_sum_tmp_gpu.resizeN(total_counting_size);
+
+    gpu::gpu_mem_32u as_gpu, bs_gpu;
     as_gpu.resizeN(n);
+    bs_gpu.resizeN(n);
 
+    std::vector<unsigned int> test(total_counting_size, 0);
     {
         ocl::Kernel radix(radix_kernel, radix_kernel_length, "radix");
         radix.compile();
 
+        ocl::Kernel radix_count(radix_kernel, radix_kernel_length, "radix_count");
+        radix_count.compile();
+
+        ocl::Kernel matrix_transpose(matrix_transpose_kernel, matrix_transpose_kernel_length, "matrix_transpose");
+        matrix_transpose.compile();
+
+        ocl::Kernel prefix_sum(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum");
+        prefix_sum.compile();
+
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
             as_gpu.writeN(as.data(), n);
 
             t.restart();// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
+            for (unsigned int shift = 0; shift < total_bit; shift += bit_per_iter) {
+
+                radix_count.exec(gpu::WorkSize(work_size, n), as_gpu, counting_gpu, shift);
 
-            // TODO
+                unsigned int wg_n = 16;
+                matrix_transpose.exec(gpu::WorkSize(wg_n, wg_n, numbers_per_cnt_group, work_group_cnt), counting_gpu, prefix_sum_gpu, numbers_per_cnt_group, work_group_cnt);
+
+                for (unsigned int k = 1; k < total_counting_size; k <<= 1) {
+                    prefix_sum.exec(gpu::WorkSize(work_size, total_counting_size), prefix_sum_gpu, prefix_sum_tmp_gpu, k);
+                    prefix_sum_tmp_gpu.swap(prefix_sum_gpu);
+                }
+
+                radix.exec(gpu::WorkSize(work_size, n), as_gpu, bs_gpu, prefix_sum_gpu, counting_gpu, shift);
+                bs_gpu.swap(as_gpu);
+            }
+            t.nextLap();
         }
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
@@ -76,6 +121,5 @@ int main(int argc, char **argv) {
     for (int i = 0; i < n; ++i) {
         EXPECT_THE_SAME(as[i], cpu_sorted[i], "GPU results should be equal to CPU results!");
     }
-*/
     return 0;
 }