diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl index 000ee044..3d47b044 100644 --- a/src/cl/matrix_multiplication.cl +++ b/src/cl/matrix_multiplication.cl @@ -1,11 +1,15 @@ #ifdef __CLION_IDE__ + #include + #endif #define TS 16 #define WPT 4 -__kernel void matrix_multiplication(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) { +__kernel void +matrix_multiplication(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, + unsigned N) { int lx = get_local_id(0); int ly = get_local_id(1); int gx = get_global_id(0); @@ -20,7 +24,7 @@ __kernel void matrix_multiplication(__global float* a, __global float* b, __glob for (int tile_k = 0; tile_k * TS < K; ++tile_k) { for (int w = 0; w < WPT; ++w) { tileA[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? a[(gy + w * RTS) * K + tile_k * TS + lx] : 0; - tileB[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? b[(ly + tile_k * TS + w * RTS) * N + gx] : 0; + tileB[ly + w * RTS][lx] = gx < N && (gy + w * RTS) < M ? b[(ly + tile_k * TS + w * RTS) * N + gx] : 0; } barrier(CLK_LOCAL_MEM_FENCE); for (int k = 0; k < TS; ++k) { @@ -36,7 +40,9 @@ __kernel void matrix_multiplication(__global float* a, __global float* b, __glob } } -__kernel void matrix_multiplication_local(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) { +__kernel void +matrix_multiplication_local(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, + unsigned N) { int gx = get_global_id(0); int gy = get_global_id(1); int lx = get_local_id(0); @@ -62,7 +68,9 @@ __kernel void matrix_multiplication_local(__global float* a, __global float* b, } -__kernel void matrix_multiplication_naive(__global float* a, __global float* b, __global float* c, unsigned int M, unsigned int K, unsigned N) { +__kernel void +matrix_multiplication_naive(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, + unsigned N) { int i = get_global_id(0); int j = get_global_id(1); diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl index 88b176eb..d5cd85a7 100644 --- a/src/cl/matrix_transpose.cl +++ b/src/cl/matrix_transpose.cl @@ -1,10 +1,11 @@ #ifdef __CLION_IDE__ + #include + #endif #define TS 16 -__kernel void matrix_transpose(__global float* a, __global float* at, unsigned int M, unsigned int K) -{ +__kernel void matrix_transpose(__global float *a, __global float *at, unsigned int M, unsigned int K) { unsigned int gx = get_global_id(0); unsigned int gy = get_global_id(1); unsigned int lx = get_local_id(0); diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp index 379411fd..26b9b509 100644 --- a/src/main_matrix_multiplication.cpp +++ b/src/main_matrix_multiplication.cpp @@ -12,7 +12,7 @@ #include template -void runner(const std::string& kernel_name, EXECUTOR_TYPE executor, int benchmarkingIters, int gflops) { +void runner(const std::string &kernel_name, EXECUTOR_TYPE executor, int benchmarkingIters, int gflops) { std::cout << kernel_name << std::endl; ocl::Kernel matrix_multiplication_kernel(matrix_multiplication, matrix_multiplication_length, kernel_name); matrix_multiplication_kernel.compile(); @@ -31,8 +31,7 @@ void runner(const std::string& kernel_name, EXECUTOR_TYPE executor, int benchmar } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { gpu::Device device = gpu::chooseGPUDevice(argc, argv); gpu::Context context; @@ -43,13 +42,14 @@ int main(int argc, char **argv) unsigned int M = 1024; unsigned int K = 1024; unsigned int N = 1024; - const size_t gflops = ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения + const size_t gflops = + ((size_t) M * K * N * 2) / (1000 * 1000 * 1000); // умножить на два, т.к. операция сложения и умножения - std::vector as(M*K, 0); - std::vector bs(K*N, 0); - std::vector cs(M*N, 0); + std::vector as(M * K, 0); + std::vector bs(K * N, 0); + std::vector cs(M * N, 0); - FastRandom r(M+K+N); + FastRandom r(M + K + N); for (unsigned int i = 0; i < as.size(); ++i) { as[i] = r.nextf(); } @@ -79,14 +79,14 @@ int main(int argc, char **argv) const std::vector cs_cpu_reference = cs; gpu::gpu_mem_32f as_gpu, bs_gpu, cs_gpu; - as_gpu.resizeN(M*K); - bs_gpu.resizeN(K*N); - cs_gpu.resizeN(M*N); + as_gpu.resizeN(M * K); + bs_gpu.resizeN(K * N); + cs_gpu.resizeN(M * N); - as_gpu.writeN(as.data(), M*K); - bs_gpu.writeN(bs.data(), K*N); + as_gpu.writeN(as.data(), M * K); + bs_gpu.writeN(bs.data(), K * N); - auto checker = [&] () { + auto checker = [&]() { // Проверяем корректность результатов double diff_sum = 0; for (int i = 0; i < M * N; ++i) { diff --git a/src/main_matrix_transpose.cpp b/src/main_matrix_transpose.cpp index 181d3184..6f889eb7 100644 --- a/src/main_matrix_transpose.cpp +++ b/src/main_matrix_transpose.cpp @@ -8,11 +8,9 @@ #include #include -#include -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { gpu::Device device = gpu::chooseGPUDevice(argc, argv); gpu::Context context; @@ -23,10 +21,10 @@ int main(int argc, char **argv) unsigned int M = 1024; unsigned int K = 1024; - std::vector as(M*K, 0); - std::vector as_t(M*K, 0); + std::vector as(M * K, 0); + std::vector as_t(M * K, 0); - FastRandom r(M+K); + FastRandom r(M + K); for (unsigned int i = 0; i < as.size(); ++i) { as[i] = r.nextf(); } @@ -34,10 +32,10 @@ int main(int argc, char **argv) gpu::gpu_mem_32f as_gpu, as_t_gpu; - as_gpu.resizeN(M*K); - as_t_gpu.resizeN(K*M); + as_gpu.resizeN(M * K); + as_t_gpu.resizeN(K * M); - as_gpu.writeN(as.data(), M*K); + as_gpu.writeN(as.data(), M * K); ocl::Kernel matrix_transpose_kernel(matrix_transpose, matrix_transpose_length, "matrix_transpose"); matrix_transpose_kernel.compile(); @@ -50,10 +48,10 @@ int main(int argc, char **argv) t.nextLap(); } std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl; - std::cout << "GPU: " << M*K/1000.0/1000.0 / t.lapAvg() << " millions/s" << std::endl; + std::cout << "GPU: " << M * K / 1000.0 / 1000.0 / t.lapAvg() << " millions/s" << std::endl; } - as_t_gpu.readN(as_t.data(), M*K); + as_t_gpu.readN(as_t.data(), M * K); // Проверяем корректность результатов for (int j = 0; j < M; ++j) {