Skip to content

Commit

Permalink
use formatter
Browse files (browse the repository at this point in the history)
  • Loading branch information
AvvALlV committed Oct 1, 2023
1 parent 2e15a2c commit 3e562f5
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 31 deletions.
16 changes: 12 additions & 4 deletions src/cl/matrix_multiplication.cl
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
#ifdef __CLION_IDE__

#include <libgpu/opencl/cl/clion_defines.cl>

#endif


#define TS 16
#define WPT 4
__kernel void matrix_multiplication(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) {
__kernel void
matrix_multiplication(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K,
unsigned N) {
int lx = get_local_id(0);
int ly = get_local_id(1);
int gx = get_global_id(0);
Expand All @@ -20,7 +24,7 @@ __kernel void matrix_multiplication(__global float* a, __global float* b, __glob
for (int tile_k = 0; tile_k * TS < K; ++tile_k) {
for (int w = 0; w < WPT; ++w) {
tileA[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? a[(gy + w * RTS) * K + tile_k * TS + lx] : 0;
tileB[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? b[(ly + tile_k * TS + w * RTS) * N + gx] : 0;
tileB[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? b[(ly + tile_k * TS + w * RTS) * N + gx] : 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < TS; ++k) {
Expand All @@ -36,7 +40,9 @@ __kernel void matrix_multiplication(__global float* a, __global float* b, __glob
}
}

__kernel void matrix_multiplication_local(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) {
__kernel void
matrix_multiplication_local(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K,
unsigned N) {
int gx = get_global_id(0);
int gy = get_global_id(1);
int lx = get_local_id(0);
Expand All @@ -62,7 +68,9 @@ __kernel void matrix_multiplication_local(__global float* a, __global float* b,
}


__kernel void matrix_multiplication_naive(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) {
__kernel void
matrix_multiplication_naive(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K,
unsigned N) {
int i = get_global_id(0);
int j = get_global_id(1);

Expand Down
5 changes: 3 additions & 2 deletions src/cl/matrix_transpose.cl
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
#ifdef __CLION_IDE__

#include <libgpu/opencl/cl/clion_defines.cl>

#endif

#define TS 16
__kernel void matrix_transpose(__global float* a, __global float* at, unsigned int M, unsigned int K)
{
__kernel void matrix_transpose(__global float *a, __global float *at, unsigned int M, unsigned int K) {
unsigned int gx = get_global_id(0);
unsigned int gy = get_global_id(1);
unsigned int lx = get_local_id(0);
Expand Down
28 changes: 14 additions & 14 deletions src/main_matrix_multiplication.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <string>

template<typename EXECUTOR_TYPE>
void runner(const std::string& kernel_name, EXECUTOR_TYPE executor, int benchmarkingIters, int gflops) {
void runner(const std::string &kernel_name, EXECUTOR_TYPE executor, int benchmarkingIters, int gflops) {
std::cout << kernel_name << std::endl;
ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, kernel_name);
matrix_multiplication_kernel.compile();
Expand All @@ -31,8 +31,7 @@ void runner(const std::string& kernel_name, EXECUTOR_TYPE executor, int benchmar
}


int main(int argc, char **argv)
{
int main(int argc, char **argv) {
gpu::Device device = gpu::chooseGPUDevice(argc, argv);

gpu::Context context;
Expand All @@ -43,13 +42,14 @@ int main(int argc, char **argv)
unsigned int M = 1024;
unsigned int K = 1024;
unsigned int N = 1024;
const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения
const size_t gflops =
((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения

std::vector<float> as(M*K, 0);
std::vector<float> bs(K*N, 0);
std::vector<float> cs(M*N, 0);
std::vector<float> as(M * K, 0);
std::vector<float> bs(K * N, 0);
std::vector<float> cs(M * N, 0);

FastRandom r(M+K+N);
FastRandom r(M + K + N);
for (unsigned int i = 0; i < as.size(); ++i) {
as[i] = r.nextf();
}
Expand Down Expand Up @@ -79,14 +79,14 @@ int main(int argc, char **argv)
const std::vector<float> cs_cpu_reference = cs;

gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu;
as_gpu.resizeN(M*K);
bs_gpu.resizeN(K*N);
cs_gpu.resizeN(M*N);
as_gpu.resizeN(M * K);
bs_gpu.resizeN(K * N);
cs_gpu.resizeN(M * N);

as_gpu.writeN(as.data(), M*K);
bs_gpu.writeN(bs.data(), K*N);
as_gpu.writeN(as.data(), M * K);
bs_gpu.writeN(bs.data(), K * N);

auto checker = [&] () {
auto checker = [&]() {
// Проверяем корректность результатов
double diff_sum = 0;
for (int i = 0; i < M * N; ++i) {
Expand Down
20 changes: 9 additions & 11 deletions src/main_matrix_transpose.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,11 +8,9 @@

#include <vector>
#include <iostream>
#include <stdexcept>


int main(int argc, char **argv)
{
int main(int argc, char **argv) {
gpu::Device device = gpu::chooseGPUDevice(argc, argv);

gpu::Context context;
Expand All @@ -23,21 +21,21 @@ int main(int argc, char **argv)
unsigned int M = 1024;
unsigned int K = 1024;

std::vector<float> as(M*K, 0);
std::vector<float> as_t(M*K, 0);
std::vector<float> as(M * K, 0);
std::vector<float> as_t(M * K, 0);

FastRandom r(M+K);
FastRandom r(M + K);
for (unsigned int i = 0; i < as.size(); ++i) {
as[i] = r.nextf();
}
std::cout << "Data generated for M=" << M << ", K=" << K << std::endl;


gpu::gpu_mem_32f as_gpu, as_t_gpu;
as_gpu.resizeN(M*K);
as_t_gpu.resizeN(K*M);
as_gpu.resizeN(M * K);
as_t_gpu.resizeN(K * M);

as_gpu.writeN(as.data(), M*K);
as_gpu.writeN(as.data(), M * K);

ocl::Kernel matrix_transpose_kernel(matrix_transpose, matrix_transpose_length, "matrix_transpose");
matrix_transpose_kernel.compile();
Expand All @@ -50,10 +48,10 @@ int main(int argc, char **argv)
t.nextLap();
}
std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << M*K/1000.0/1000.0 / t.lapAvg() << " millions/s" << std::endl;
std::cout << "GPU: " << M * K / 1000.0 / 1000.0 / t.lapAvg() << " millions/s" << std::endl;
}

as_t_gpu.readN(as_t.data(), M*K);
as_t_gpu.readN(as_t.data(), M * K);

// Проверяем корректность результатов
for (int j = 0; j < M; ++j) {
Expand Down

0 comments on commit 3e562f5

Please sign in to comment.