Skip to content

Commit

Permalink
done
Browse files Browse the repository at this point in the history
  • Loading branch information
AvvALlV committed Oct 29, 2023
1 parent 2cbf150 commit 06e3deb
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 7 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,7 @@ endif()
# Обратите внимание что это происходит на этапе компиляции, кроме того необходимо чтобы файл src/cl/aplusb_cl.h был перечислен среди исходников для компиляции при вызове add_executable

convertIntoHeader(src/cl/radix.cl src/cl/radix_cl.h radix_kernel)
add_executable(radix src/main_radix.cpp src/cl/radix_cl.h)
convertIntoHeader(src/cl/prefix_sum.cl src/cl/prefix_sum_cl.h prefix_sum_kernel)
convertIntoHeader(src/cl/matrix_transpose.cl src/cl/matrix_transpose.cl.h matrix_transpose_kernel)
add_executable(radix src/main_radix.cpp src/cl/prefix_sum_cl.h src/cl/matrix_transpose.cl.h src/cl/radix_cl.h)
target_link_libraries(radix libclew libgpu libutils)
26 changes: 26 additions & 0 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#ifdef __CLION_IDE__

#include <libgpu/opencl/cl/clion_defines.cl>

#endif

#define TS 16
__kernel void matrix_transpose(__global float *a, __global float *at, unsigned int M, unsigned int K) {
const unsigned int gx = get_group_id(0) * TS;
const unsigned int gy = get_group_id(1) * TS;
const unsigned int lx = get_local_id(0);
const unsigned int ly = get_local_id(1);

__local float buf[TS][TS + 1];

if (gx + lx < M && gy + ly < K) {
buf[ly][lx] = a[(gy + ly) * M + (gx + lx)];
}

barrier(CLK_LOCAL_MEM_FENCE);

if (gx + ly < M && gy + lx < K) {
at[(gx + ly) * K + (gy + lx)] = buf[lx][ly];
}

}
11 changes: 11 additions & 0 deletions src/cl/prefix_sum.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>
#endif

__kernel void prefix_sum(__global unsigned int* as, __global unsigned int* res, unsigned int k) {
unsigned gid = get_global_id(0);
if (gid >= k) {
res[gid] = as[gid] + as[gid - k];
} else
res[gid] = as[gid];
}
48 changes: 46 additions & 2 deletions src/cl/radix.cl
Original file line number Diff line number Diff line change
@@ -1,3 +1,47 @@
__kernel void radix(__global unsigned int *as) {
// TODO
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>
#include <stdio.h>
#endif

#define BITS_PER_ITER 4
#define BITS_VALUE (1 << BITS_PER_ITER)

__kernel void radix_count(__global unsigned int *as, __global unsigned int* counting, unsigned int shift) {
__local unsigned int count_local[BITS_VALUE];
int group_id = get_group_id(0);
int global_id = get_global_id(0);
int local_id = get_local_id(0);

if (local_id < BITS_VALUE) {
count_local[local_id] = 0;
}

barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(&count_local[(as[global_id] >> shift) & (BITS_VALUE - 1)]);
barrier(CLK_LOCAL_MEM_FENCE);

if (local_id < BITS_VALUE) {
counting[group_id * BITS_VALUE + local_id] = count_local[local_id];
}
}

#define WORK_SIZE 128
__kernel void radix(__global unsigned int *as, __global unsigned int* bs, __global unsigned int* prefix_sum, __global unsigned int* cnt, unsigned int shift) {
unsigned int global_id = get_global_id(0);
unsigned int local_id = get_local_id(0);
unsigned int group_id = get_group_id(0);
unsigned int group_cnt = get_num_groups(0);

__local unsigned int local_as[WORK_SIZE];

local_as[local_id] = (as[global_id] >> shift) & (BITS_VALUE - 1);
barrier(CLK_LOCAL_MEM_FENCE);

unsigned int offset = 0;
unsigned int cur_value = local_as[local_id];
for (int i = 0; i < local_id; ++i) {
offset += cur_value == local_as[i] ? 1 : 0;
}
int prev_values = prefix_sum[group_cnt * cur_value + group_id] - cnt[group_id * BITS_VALUE + cur_value];
bs[prev_values + offset] = as[global_id];
}
52 changes: 48 additions & 4 deletions src/main_radix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

// Этот файл будет сгенерирован автоматически в момент сборки - см. convertIntoHeader в CMakeLists.txt:18
#include "cl/radix_cl.h"
#include "cl/matrix_transpose.cl.h"
#include "cl/prefix_sum_cl.h"

#include <iostream>
#include <stdexcept>
Expand Down Expand Up @@ -50,21 +52,64 @@ int main(int argc, char **argv) {
std::cout << "CPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "CPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
}
/*
gpu::gpu_mem_32u as_gpu;

unsigned int work_size = 128;
unsigned int work_group_cnt = n / work_size;

unsigned int total_bit = 32;
unsigned int bit_per_iter = 4;

unsigned int numbers_per_cnt_group = 1 << bit_per_iter;
unsigned int total_counting_size = numbers_per_cnt_group * work_group_cnt;

gpu::gpu_mem_32u counting_gpu;
counting_gpu.resizeN(total_counting_size);

gpu::gpu_mem_32u prefix_sum_gpu;
prefix_sum_gpu.resizeN(total_counting_size);

gpu::gpu_mem_32u prefix_sum_tmp_gpu;
prefix_sum_tmp_gpu.resizeN(total_counting_size);

gpu::gpu_mem_32u as_gpu, bs_gpu;
as_gpu.resizeN(n);
bs_gpu.resizeN(n);

std::vector<unsigned int> test(total_counting_size, 0);
{
ocl::Kernel radix(radix_kernel, radix_kernel_length, "radix");
radix.compile();

ocl::Kernel radix_count(radix_kernel, radix_kernel_length, "radix_count");
radix_count.compile();

ocl::Kernel matrix_transpose(matrix_transpose_kernel, matrix_transpose_kernel_length, "matrix_transpose");
matrix_transpose.compile();

ocl::Kernel prefix_sum(prefix_sum_kernel, prefix_sum_kernel_length, "prefix_sum");
prefix_sum.compile();

timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
as_gpu.writeN(as.data(), n);

t.restart();// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
for (unsigned int shift = 0; shift < total_bit; shift += bit_per_iter) {

radix_count.exec(gpu::WorkSize(work_size, n), as_gpu, counting_gpu, shift);

// TODO
unsigned int wg_n = 16;
matrix_transpose.exec(gpu::WorkSize(wg_n, wg_n, numbers_per_cnt_group, work_group_cnt), counting_gpu, prefix_sum_gpu, numbers_per_cnt_group, work_group_cnt);

for (unsigned int k = 1; k < total_counting_size; k <<= 1) {
prefix_sum.exec(gpu::WorkSize(work_size, total_counting_size), prefix_sum_gpu, prefix_sum_tmp_gpu, k);
prefix_sum_tmp_gpu.swap(prefix_sum_gpu);
}

radix.exec(gpu::WorkSize(work_size, n), as_gpu, bs_gpu, prefix_sum_gpu, counting_gpu, shift);
bs_gpu.swap(as_gpu);
}
t.nextLap();
}
std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << (n / 1000 / 1000) / t.lapAvg() << " millions/s" << std::endl;
Expand All @@ -76,6 +121,5 @@ int main(int argc, char **argv) {
for (int i = 0; i < n; ++i) {
EXPECT_THE_SAME(as[i], cpu_sorted[i], "GPU results should be equal to CPU results!");
}
*/
return 0;
}

0 comments on commit 06e3deb

Please sign in to comment.