diff --git a/software/apps/baremetal/matmul_i32/main.c b/software/apps/baremetal/matmul_i32/main.c index 3713dcabe..b6744a49c 100644 --- a/software/apps/baremetal/matmul_i32/main.c +++ b/software/apps/baremetal/matmul_i32/main.c @@ -13,9 +13,10 @@ #include "runtime.h" #include "synchronization.h" +#include "data_matmul_i32.h" + #include "baremetal/mempool_checks.h" #include "baremetal/mempool_matmul_i32p.h" -#include "data_matmul_i32.h" int32_t l1_A[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); int32_t l1_B[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); diff --git a/software/apps/baremetal/matrix_mul/main.c b/software/apps/baremetal/matrix_mul/main.c index 094817929..a1b6626ac 100644 --- a/software/apps/baremetal/matrix_mul/main.c +++ b/software/apps/baremetal/matrix_mul/main.c @@ -7,7 +7,6 @@ #include #include -#include "baremetal/mempool_matmul_i32p.h" #include "encoding.h" #include "printf.h" #include "runtime.h" @@ -15,19 +14,21 @@ // Define Matrix dimensions: // C = AB with A=[MxN], B=[NxP], C=[MxP] -#define M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) -#define N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) -#define P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) +#define matrix_M (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) +#define matrix_N (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) +#define matrix_P (NUM_CORES >= 256 ? 128 : (NUM_CORES / 2)) // Specify how the matrices A and B should be initialized // The entries will follow this format: // a(i,j) = A_a*i + A_b*j + A_c // b(i,j) = B_a*i + B_b*j + B_c // The result will be the following matrix -// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * N -// + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * (N*(N-1))/2 -// + (A_b*B_a) * (N*(N-1)*(2*N-1))/6 -// Note: To keep the code simpler, we use indices that go from 0 to N-1 instead -// of 1 to N as the mathematicians do. Hence, for A, i=[0,M-1] j=[0,M-1] +// c(i,j) = (A_a*B_b*i*j + A_a*B_c*i + A_c*B_b*j + A_c*B_c) * matrix_N +// + (A_a*B_a*i + A_b*B_b*j + A_b*B_c + B_a*A_c) * +// (matrix_N*(matrix_N-1))/2 +// + (A_b*B_a) * (matrix_N*(matrix_N-1)*(2*matrix_N-1))/6 +// Note: To keep the code simpler, we use indices that go from 0 to matrix_N-1 +// instead of 1 to matrix_N as the mathematicians do. Hence, for A, +// i=[0,matrix_M-1] j=[0,matrix_M-1] #define A_a 1 #define A_b 1 #define A_c -32 @@ -37,10 +38,11 @@ // Enable verbose printing // #define VERBOSE +#include "baremetal/mempool_matmul_i32p.h" int32_t volatile init __attribute__((section(".l2"))) = 0; -int32_t a[M * N] __attribute__((section(".l1"))); -int32_t b[N * P] __attribute__((section(".l1"))); -int32_t c[M * P] __attribute__((section(".l1"))); +int32_t a[matrix_M * matrix_N] __attribute__((section(".l1"))); +int32_t b[matrix_N * matrix_P] __attribute__((section(".l1"))); +int32_t c[matrix_M * matrix_P] __attribute__((section(".l1"))); // Initialize the matrices in parallel void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, @@ -61,10 +63,13 @@ int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, // Parallelize over rows for (int32_t i = 0; i < (int32_t)num_rows; ++i) { for (int32_t j = 0; j < (int32_t)num_columns; ++j) { - int32_t lin = (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * N; - int32_t qua = - ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * (N * (N - 1))) / 2; - int32_t cub = ((ab * ba) * (N * (N - 1) * (2 * N - 1))) / 6; + int32_t lin = + (aa * bb * i * j + aa * bc * i + ac * bb * j + ac * bc) * matrix_N; + int32_t qua = ((aa * ba * i + ab * bb * j + ab * bc + ba * ac) * + (matrix_N * (matrix_N - 1))) / + 2; + int32_t cub = + ((ab * ba) * (matrix_N * (matrix_N - 1) * (2 * matrix_N - 1))) / 6; int32_t golden = lin + qua + cub; if (matrix[i * (int32_t)num_columns + j] != golden) { return (i + j) == 0 ? -1 : i * (int32_t)num_columns + j; @@ -100,14 +105,14 @@ int main() { // #endif // Initialize Matrices - init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores); + init_matrix(a, matrix_M, matrix_N, A_a, A_b, A_c, core_id, num_cores); + init_matrix(b, matrix_N, matrix_P, B_a, B_b, B_c, core_id, num_cores); #ifdef VERBOSE mempool_barrier(num_cores); if (core_id == 0) { - print_matrix(a, M, N); - print_matrix(b, N, P); + print_matrix(a, matrix_M, matrix_N); + print_matrix(b, matrix_N, matrix_P); } #endif @@ -121,20 +126,24 @@ int main() { mempool_start_benchmark(); switch (i) { case 0: - mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores); + mat_mul_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id, + num_cores); break; case 1: - mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores); + mat_mul_unrolled_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id, + num_cores); break; case 2: - mat_mul_asm_parallel(a, b, c, M, N, P, core_id, num_cores); + mat_mul_asm_parallel(a, b, c, matrix_M, matrix_N, matrix_P, core_id, + num_cores); break; case 3: - mat_mul_parallel_finegrained(a, b, c, M, N, P, core_id, num_cores); + mat_mul_parallel_finegrained(a, b, c, matrix_M, matrix_N, matrix_P, + core_id, num_cores); break; case 4: - mat_mul_unrolled_parallel_finegrained(a, b, c, M, N, P, core_id, - num_cores); + mat_mul_unrolled_parallel_finegrained(a, b, c, matrix_M, matrix_N, + matrix_P, core_id, num_cores); break; } mempool_stop_benchmark(); @@ -144,7 +153,8 @@ int main() { // Check result if (core_id == 0) { // printf("Duration: %d\n", cycles); - int error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + int error = + verify_matrix(c, matrix_M, matrix_P, A_a, A_b, A_c, B_a, B_b, B_c); if (error != 0) { printf("Error code %d\n", error); printf("c[%d]=%d\n", error, c[error]); @@ -154,7 +164,7 @@ int main() { #endif } else { // Wait for the approx amount it takes core 0 to verify the result - mempool_wait(M * P * 12); + mempool_wait(matrix_M * matrix_P * 12); } } @@ -163,7 +173,7 @@ int main() { #ifdef VERBOSE if (core_id == 0) { - print_matrix(c, M, P); + print_matrix(c, matrix_M, matrix_P); } mempool_barrier(num_cores); #endif diff --git a/software/apps/baremetal/tests/main.c b/software/apps/baremetal/tests/main.c index d835d28fb..622b35e34 100644 --- a/software/apps/baremetal/tests/main.c +++ b/software/apps/baremetal/tests/main.c @@ -7,7 +7,6 @@ #include #include -#include "baremetal/mempool_matmul_i32p.h" #include "encoding.h" #include "printf.h" #include "runtime.h" @@ -25,6 +24,8 @@ #define matrix_P (NUM_CORES) #endif +#include "baremetal/mempool_matmul_i32p.h" + int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1_prio"))); int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1_prio"))); int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1_prio"))); diff --git a/software/apps/matmul_i32_conflict_opt/main.c b/software/apps/matmul_i32_conflict_opt/main.c deleted file mode 100644 index c3d61b64b..000000000 --- a/software/apps/matmul_i32_conflict_opt/main.c +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Yichao Zhang, ETH Zurich -// Author: Samuel Riedel, ETH Zurich - -#include -#include - -#include "encoding.h" -#include "printf.h" -#include "runtime.h" -#include "synchronization.h" - -// Define Matrix dimensions: -// C = AB with A=[MxN], B=[NxP], C=[MxP] -#define matrix_M 128 -#define matrix_N 64 -#define matrix_P 128 - -// Define Benchmark Flag -#define SERIAL_MODE (0) -#define PARALLEL_MODE (1) -#define CONCURRENT_MODE (0) -#define NUM_PARALLEL_CORES (1024) - -// Define kernel include -#include "kernel/mat_mul.h" -#include "kernel/mat_mul_conflict_opt.h" - -// Define memory distributing -int32_t matrix_a[matrix_M * matrix_N] __attribute__((section(".l1"))); -int32_t matrix_b[matrix_N * matrix_P] __attribute__((section(".l1"))); -int32_t matrix_c[matrix_M * matrix_P] __attribute__((section(".l1"))); -#if (CONCURRENT_MODE == 1) -int32_t matrix_d[matrix_M * matrix_N] __attribute__((section(".l1"))); -int32_t matrix_e[matrix_N * matrix_P] __attribute__((section(".l1"))); -int32_t matrix_f[matrix_M * matrix_P] __attribute__((section(".l1"))); -#endif -int volatile error __attribute__((section(".l2"))); - -// Function init_matrix -void init_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - int32_t a, int32_t b, int32_t c, uint32_t core_id, - uint32_t num_cores) { - // How many rows/columns to split the matrix into - uint32_t const split = 4; - if (num_columns > num_rows) { - // Parallelize over columns - uint32_t const c_start = (num_rows / split) * (core_id % split); - uint32_t const c_end = (num_rows / split) * ((core_id % split) + 1); - for (uint32_t j = (core_id / split); j < num_columns; - j += (num_cores / split)) { - for (uint32_t i = c_start; i < c_end; ++i) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } else { - // Parallelize over rows - uint32_t const c_start = (num_columns / split) * (core_id % split); - uint32_t const c_end = (num_columns / split) * ((core_id % split) + 1); - for (uint32_t i = (core_id / split); i < num_rows; - i += (num_cores / split)) { - for (uint32_t j = c_start; j < c_end; ++j) { - matrix[i * num_columns + j] = a * (int32_t)i + b * (int32_t)j + c; - } - } - } -} - -// Function verify_matrix -int verify_matrix(int32_t *matrix, uint32_t num_rows, uint32_t num_columns, - uint32_t inner_dim, int32_t aa, int32_t ab, int32_t ac, - int32_t ba, int32_t bb, int32_t bc, uint32_t core_id, - uint32_t num_cores) { - // Convert to signed - int32_t n = (int32_t)inner_dim; - // Parallelize over rows - for (uint32_t i = core_id; i < num_rows; i += num_cores) { - for (uint32_t j = 0; j < num_columns; ++j) { - int32_t ii = (int32_t)i; - int32_t jj = (int32_t)j; - int32_t lin = - (aa * bb * ii * jj + aa * bc * ii + ac * bb * jj + ac * bc) * n; - int32_t qua = - ((aa * ba * ii + ab * bb * jj + ab * bc + ba * ac) * (n * (n - 1))) / - 2; - int32_t cub = ((ab * ba) * (n * (n - 1) * (2 * n - 1))) / 6; - int32_t golden = lin + qua + cub; - if (matrix[i * num_columns + j] != golden) { - return (i + j) == 0 ? -1 : (int)(i * num_columns + j); - } - matrix[i * num_columns + j] = 0; - } - } - return 0; -} - -// Function test_matrix_multiplication -int test_matrix_multiplication(int32_t *__restrict__ A, int32_t *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, - uint32_t P, uint32_t core_id, - uint32_t num_cores) { - int32_t const A_a = 1; - int32_t const A_b = 2; - int32_t const A_c = -32; - int32_t const B_a = 1; - int32_t const B_b = 1; - int32_t const B_c = 16; - - // Initialize Matrices - init_matrix(A, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(B, N, P, B_a, B_b, B_c, core_id, num_cores); -#if (CONCURRENT_MODE == 1) - init_matrix(D, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(E, N, P, B_a, B_b, B_c, core_id, num_cores); -#endif - mempool_barrier(num_cores); - -// Serial Benchmark -#if (SERIAL_MODE == 1) - if (core_id == 0) { - printf("Serial Calculation Start\n"); - mempool_start_benchmark(); - mat_mul_unrolled_4x4_serial(A, B, C, M, N, P); - mempool_stop_benchmark(); - printf("Calculation Finish\n"); - } -#endif - -// Parallel Benchmark -#if (PARALLEL_MODE == 1) - if (core_id == 0) { - printf("Parallel Calculation Start\n"); - } - mempool_barrier(num_cores); - - if (core_id < NUM_PARALLEL_CORES) { - mempool_start_benchmark(); - mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id, - NUM_PARALLEL_CORES); - mempool_start_benchmark(); - mempool_log_partial_barrier(2, core_id, NUM_PARALLEL_CORES); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -#endif - -// Concurrent Benchmark -#if (CONCURRENT_MODE == 1) - if (core_id == 0) { - printf("Concurrent Calculation Start\n"); - } - mempool_barrier(num_cores); - - if (core_id < 512) { - mempool_start_benchmark(); - mat_mul_unrolled_4x4_conflict_opt_parallel_asm(A, B, C, M, N, P, core_id, - 512); - mempool_start_benchmark(); - mempool_log_partial_barrier(2, core_id, 512); - mempool_stop_benchmark(); - } - if (core_id >= 512) { - uint32_t core_id_new = core_id - 512; - mempool_start_benchmark(); - mat_mul_unrolled_4x4_conflict_opt_parallel_asm(D, E, F, M, N, P, - core_id_new, 512); - mempool_start_benchmark(); - mempool_log_partial_barrier(2, core_id, 512); - mempool_stop_benchmark(); - } - mempool_barrier(num_cores); -#endif - - // Verify results - if (core_id == 0) { - printf("Start Verify Results\n"); - } - mempool_barrier(num_cores); - if (verify_matrix(C, M, P, N, A_a, A_b, A_c, B_a, B_b, B_c, core_id, - num_cores)) { - error = 1; - return -1; - } - return 0; -} - -// Main function block -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize barrier and synchronize - mempool_barrier_init(core_id); - - if (core_id == 0) { - error = 0; - } - - // Test the Matrix multiplication - test_matrix_multiplication(matrix_a, matrix_b, matrix_c, matrix_M, matrix_N, - matrix_P, core_id, num_cores); - // wait until all cores have finished - mempool_barrier(num_cores); - - return error; -} diff --git a/software/kernels/baremetal/mat_mul_conflict_opt.h b/software/kernels/baremetal/mat_mul_conflict_opt.h deleted file mode 100644 index 108f074a8..000000000 --- a/software/kernels/baremetal/mat_mul_conflict_opt.h +++ /dev/null @@ -1,828 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Yichao Zhang, ETH Zurich -// Author: Samuel Riedel, ETH Zurich - -/* This library implements the matrix multiplication in multiple different ways. - * The functions all follow the following format: - * - * A is an M x N matrix, B is a N x P matrix, and C is a M x P matrix - * C = AB - */ - -/* For parallel computation, general kernels support the power of 2 of matrix - * dimension; The max size for M and P should be M=P=<4096; The min size, it - * need to make sure "c_end - c_start >=4"; For mempool, the min matrix size is - * M=P=64; For terapool, the min matrix size is M=P=128; - */ - -void mat_mul_unrolled_4x2_serial(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, - uint32_t N, uint32_t P) { - // Parallelize by assigning each core one row - for (uint32_t i = 0; i < M; i += 2) { - for (uint32_t j = 0; j < P; j += 4) { - int32_t c00 = 0; - int32_t c01 = 0; - int32_t c02 = 0; - int32_t c03 = 0; - int32_t c10 = 0; - int32_t c11 = 0; - int32_t c12 = 0; - int32_t c13 = 0; - for (uint32_t k = 0; k < N; k += 2) { - // Explicitly load the values first to help with scheduling - int32_t val_a00 = A[(i + 0) * N + k + 0]; - int32_t val_a01 = A[(i + 0) * N + k + 1]; - int32_t val_a10 = A[(i + 1) * N + k + 0]; - int32_t val_a11 = A[(i + 1) * N + k + 1]; - - int32_t val_b00 = B[(k + 0) * P + j + 0]; - int32_t val_b01 = B[(k + 0) * P + j + 1]; - int32_t val_b02 = B[(k + 0) * P + j + 2]; - int32_t val_b03 = B[(k + 0) * P + j + 3]; - - int32_t val_b10 = B[(k + 1) * P + j + 0]; - int32_t val_b11 = B[(k + 1) * P + j + 1]; - int32_t val_b12 = B[(k + 1) * P + j + 2]; - int32_t val_b13 = B[(k + 1) * P + j + 3]; - - c00 += val_a00 * val_b00; - c00 += val_a01 * val_b10; - c01 += val_a00 * val_b01; - c01 += val_a01 * val_b11; - c02 += val_a00 * val_b02; - c02 += val_a01 * val_b12; - c03 += val_a00 * val_b03; - c03 += val_a01 * val_b13; - - c10 += val_a10 * val_b00; - c10 += val_a11 * val_b10; - c11 += val_a10 * val_b01; - c11 += val_a11 * val_b11; - c12 += val_a10 * val_b02; - c12 += val_a11 * val_b12; - c13 += val_a10 * val_b03; - c13 += val_a11 * val_b13; - } - C[(i + 0) * P + j + 0] = c00; - C[(i + 0) * P + j + 1] = c01; - C[(i + 0) * P + j + 2] = c02; - C[(i + 0) * P + j + 3] = c03; - C[(i + 1) * P + j + 0] = c10; - C[(i + 1) * P + j + 1] = c11; - C[(i + 1) * P + j + 2] = c12; - C[(i + 1) * P + j + 3] = c13; - } - } -} - -void mat_mul_unrolled_4x2_parallel(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, - uint32_t N, uint32_t P, uint32_t id, - uint32_t numThreads) { - // Parallelize by assigning each core one row - uint32_t const c = - numThreads / (M / 2); // How many columns to split the matrix into, best - // should be numThreads/(M/2); - uint32_t const c_start = (P / c) * (id % c); - uint32_t const c_end = (P / c) * ((id % c) + 1); - for (uint32_t i = 2 * (id / c); i < M; i += 2 * (numThreads / c)) { - for (uint32_t j = c_start; j < c_end; j += 4) { - int32_t c00 = 0; - int32_t c01 = 0; - int32_t c02 = 0; - int32_t c03 = 0; - int32_t c10 = 0; - int32_t c11 = 0; - int32_t c12 = 0; - int32_t c13 = 0; - for (uint32_t k = 0; k < N; k += 2) { - // Explicitly load the values first to help with scheduling - int32_t val_a00 = A[(i + 0) * N + k + 0]; - int32_t val_a01 = A[(i + 0) * N + k + 1]; - int32_t val_a10 = A[(i + 1) * N + k + 0]; - int32_t val_a11 = A[(i + 1) * N + k + 1]; - - int32_t val_b00 = B[(k + 0) * P + j + 0]; - int32_t val_b01 = B[(k + 0) * P + j + 1]; - int32_t val_b02 = B[(k + 0) * P + j + 2]; - int32_t val_b03 = B[(k + 0) * P + j + 3]; - - int32_t val_b10 = B[(k + 1) * P + j + 0]; - int32_t val_b11 = B[(k + 1) * P + j + 1]; - int32_t val_b12 = B[(k + 1) * P + j + 2]; - int32_t val_b13 = B[(k + 1) * P + j + 3]; - - c00 += val_a00 * val_b00; - c00 += val_a01 * val_b10; - c01 += val_a00 * val_b01; - c01 += val_a01 * val_b11; - c02 += val_a00 * val_b02; - c02 += val_a01 * val_b12; - c03 += val_a00 * val_b03; - c03 += val_a01 * val_b13; - - c10 += val_a10 * val_b00; - c10 += val_a11 * val_b10; - c11 += val_a10 * val_b01; - c11 += val_a11 * val_b11; - c12 += val_a10 * val_b02; - c12 += val_a11 * val_b12; - c13 += val_a10 * val_b03; - c13 += val_a11 * val_b13; - } - C[(i + 0) * P + j + 0] = c00; - C[(i + 0) * P + j + 1] = c01; - C[(i + 0) * P + j + 2] = c02; - C[(i + 0) * P + j + 3] = c03; - C[(i + 1) * P + j + 0] = c10; - C[(i + 1) * P + j + 1] = c11; - C[(i + 1) * P + j + 2] = c12; - C[(i + 1) * P + j + 3] = c13; - } - } -} - -void mat_mul_unrolled_4x4_serial(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, - uint32_t N, uint32_t P) { - // Parallelize by assigning each core one row - for (uint32_t i = 0; i < M; i += 4) { - for (uint32_t j = 0; j < P; j += 4) { - // Initialize 4x4 output tile - int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0; - int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0; - int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0; - int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0; - for (uint32_t k = 0; k < N; k += 1) { - // Explicitly load the values first to help with scheduling - int32_t b0 = B[k * P + j + 0]; - int32_t b1 = B[k * P + j + 1]; - int32_t b2 = B[k * P + j + 2]; - int32_t b3 = B[k * P + j + 3]; - // A could be local with scrambling - int32_t a0 = A[(i + 0) * N + k]; - int32_t a1 = A[(i + 1) * N + k]; - int32_t a2 = A[(i + 2) * N + k]; - int32_t a3 = A[(i + 3) * N + k]; - // Compute - c00 += a0 * b0; - c01 += a0 * b1; - c02 += a0 * b2; - c03 += a0 * b3; - c10 += a1 * b0; - c11 += a1 * b1; - c12 += a1 * b2; - c13 += a1 * b3; - c20 += a2 * b0; - c21 += a2 * b1; - c22 += a2 * b2; - c23 += a2 * b3; - c30 += a3 * b0; - c31 += a3 * b1; - c32 += a3 * b2; - c33 += a3 * b3; - } - // Store - C[(i + 0) * P + j + 0] = c00; - C[(i + 0) * P + j + 1] = c01; - C[(i + 0) * P + j + 2] = c02; - C[(i + 0) * P + j + 3] = c03; - C[(i + 1) * P + j + 0] = c10; - C[(i + 1) * P + j + 1] = c11; - C[(i + 1) * P + j + 2] = c12; - C[(i + 1) * P + j + 3] = c13; - C[(i + 2) * P + j + 0] = c20; - C[(i + 2) * P + j + 1] = c21; - C[(i + 2) * P + j + 2] = c22; - C[(i + 2) * P + j + 3] = c23; - C[(i + 3) * P + j + 0] = c30; - C[(i + 3) * P + j + 1] = c31; - C[(i + 3) * P + j + 2] = c32; - C[(i + 3) * P + j + 3] = c33; - } - } -} - -void mat_mul_unrolled_4x4_parallel(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, - uint32_t N, uint32_t P, uint32_t id, - uint32_t numThreads) { - // Parallelize by assigning each core one row - uint32_t const c = - numThreads / (M / 4); // How many columns to split the matrix into - uint32_t const c_start = (P / c) * (id % c); - uint32_t const c_end = (P / c) * ((id % c) + 1); - for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) { - for (uint32_t j = c_start; j < c_end; j += 4) { - // Initialize 4x4 output tile - int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0; - int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0; - int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0; - int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0; - for (uint32_t k = 0; k < N; k += 1) { - // Explicitly load the values first to help with scheduling - int32_t b0 = B[k * P + j + 0]; - int32_t b1 = B[k * P + j + 1]; - int32_t b2 = B[k * P + j + 2]; - int32_t b3 = B[k * P + j + 3]; - // A could be local with scrambling - int32_t a0 = A[(i + 0) * N + k]; - int32_t a1 = A[(i + 1) * N + k]; - int32_t a2 = A[(i + 2) * N + k]; - int32_t a3 = A[(i + 3) * N + k]; - // Compute - c00 += a0 * b0; - c01 += a0 * b1; - c02 += a0 * b2; - c03 += a0 * b3; - c10 += a1 * b0; - c11 += a1 * b1; - c12 += a1 * b2; - c13 += a1 * b3; - c20 += a2 * b0; - c21 += a2 * b1; - c22 += a2 * b2; - c23 += a2 * b3; - c30 += a3 * b0; - c31 += a3 * b1; - c32 += a3 * b2; - c33 += a3 * b3; - } - // Store - C[(i + 0) * P + j + 0] = c00; - C[(i + 0) * P + j + 1] = c01; - C[(i + 0) * P + j + 2] = c02; - C[(i + 0) * P + j + 3] = c03; - C[(i + 1) * P + j + 0] = c10; - C[(i + 1) * P + j + 1] = c11; - C[(i + 1) * P + j + 2] = c12; - C[(i + 1) * P + j + 3] = c13; - C[(i + 2) * P + j + 0] = c20; - C[(i + 2) * P + j + 1] = c21; - C[(i + 2) * P + j + 2] = c22; - C[(i + 2) * P + j + 3] = c23; - C[(i + 3) * P + j + 0] = c30; - C[(i + 3) * P + j + 1] = c31; - C[(i + 3) * P + j + 2] = c32; - C[(i + 3) * P + j + 3] = c33; - } - } -} - -void mat_mul_unrolled_4x4_conflict_opt_parallel(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, - uint32_t M, uint32_t N, - uint32_t P, uint32_t id, - uint32_t numThreads) { - - ///////////////////////////// - // Configuration // - ///////////////////////////// - // Parallelize by assigning each core one row - // How many cores per window - uint32_t c = numThreads / (M / 4); - if (numThreads * 4 < M) { - c = 1; - } - uint32_t const c_start = (P / c) * (id % c); - uint32_t const c_end = (P / c) * ((id % c) + 1); - - // For avoiding group conflict by same tile - // Each cores in the same tile should access to different groups - uint32_t group_bank_nums = 512; // MemPool = 256 - uint32_t tile_core_nums = 8; // MemPool = 4 - uint32_t jump_lines_A = group_bank_nums / N; // Used for i control - uint32_t jump_lines_B = group_bank_nums / P; // Used for k control - // Window size limit, min jump lines is 4 for MatrixA - if (jump_lines_A < 4) { - jump_lines_A = 4; - } - - ///////////////////////////// - // LOOP OFFSET // - ///////////////////////////// - // Outer Loop Control, for group access port conflict - uint32_t i_offset = jump_lines_A * (id % tile_core_nums); - // Inner Loop Incremental Control, for group access port conflict - uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums); - // Inner Loop Control - // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB) - uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr; - // Middle Loop Control, window jump for avoiding bank conflict - uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P; - uint32_t j_offset = (2 * (id / c)) / conflict_row; - - ///////////////////////////// - // LOOP CONTROL // - ///////////////////////////// - // Inner Round-Robin - if (k_offset >= N) { - k_offset = k_offset - N * (k_offset / N); - } - // Middle Round-Robin - uint32_t window_in_P = (P / c) / 4; - if (j_offset >= window_in_P) { - j_offset = j_offset - window_in_P * (j_offset / window_in_P); - } - // Outer Loop Control - uint32_t outer_loop_counter = 0; - uint32_t outer_loop_time = M / (4 * numThreads); - if (outer_loop_time < 1) { - outer_loop_time = 1; - } - uint32_t M_partition = M / outer_loop_time; - - ///////////////////////////// - // *LOOP START* // - ///////////////////////////// - for (uint32_t i_ori = 4 * (id / c); i_ori < M; - i_ori += 4 * (numThreads / c)) { - outer_loop_counter += 1; - uint32_t i = i_ori + i_offset; - // Round-Robin control, if offset lines > M, back to the first window - if (i >= M_partition * outer_loop_counter) { - i = i - M_partition * (i / (M_partition * outer_loop_counter)); - } - // Backup counter for mid-loop - uint32_t j_offset_counter = c_start + j_offset * 4; - uint32_t P_counter = c_end; - - Mid_loop: - for (uint32_t j = j_offset_counter; j < P_counter; j += 4) { - // Initialize 4x4 output tile - int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0; - int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0; - int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0; - int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0; - - // Backup the variables for restore and later use - uint32_t k_offset_counter = k_offset; - uint32_t N_counter = N; - - Inner_Loop: - for (uint32_t k = k_offset_counter; k < N_counter; k += 1) { - // Explicitly load the values first to help with scheduling - int32_t b0 = B[k * P + j + 0]; - int32_t b1 = B[k * P + j + 1]; - int32_t b2 = B[k * P + j + 2]; - int32_t b3 = B[k * P + j + 3]; - // A could be local with scrambling - int32_t a0 = A[(i + 0) * N + k]; - int32_t a1 = A[(i + 1) * N + k]; - int32_t a2 = A[(i + 2) * N + k]; - int32_t a3 = A[(i + 3) * N + k]; - // Compute - c00 += a0 * b0; - c01 += a0 * b1; - c02 += a0 * b2; - c03 += a0 * b3; - c10 += a1 * b0; - c11 += a1 * b1; - c12 += a1 * b2; - c13 += a1 * b3; - c20 += a2 * b0; - c21 += a2 * b1; - c22 += a2 * b2; - c23 += a2 * b3; - c30 += a3 * b0; - c31 += a3 * b1; - c32 += a3 * b2; - c33 += a3 * b3; - } - - // Pseudo-jump code to avoid complie inner-loop twice - // Complie twice will have scheduling issue due to register file limit. - if (k_offset_counter > 0) { - N_counter = k_offset; - k_offset_counter = 0; - goto Inner_Loop; - } - - // Store - C[(i + 0) * P + j + 0] = c00; - C[(i + 0) * P + j + 1] = c01; - C[(i + 0) * P + j + 2] = c02; - C[(i + 0) * P + j + 3] = c03; - C[(i + 1) * P + j + 0] = c10; - C[(i + 1) * P + j + 1] = c11; - C[(i + 1) * P + j + 2] = c12; - C[(i + 1) * P + j + 3] = c13; - C[(i + 2) * P + j + 0] = c20; - C[(i + 2) * P + j + 1] = c21; - C[(i + 2) * P + j + 2] = c22; - C[(i + 2) * P + j + 3] = c23; - C[(i + 3) * P + j + 0] = c30; - C[(i + 3) * P + j + 1] = c31; - C[(i + 3) * P + j + 2] = c32; - C[(i + 3) * P + j + 3] = c33; - } - - if (j_offset_counter != c_start) { - P_counter = j_offset_counter; - j_offset_counter = c_start; - goto Mid_loop; - } - } -} - -/*******************************/ -/* ASM CODE KERNEL START BELOW */ -/*******************************/ - -// Define immediate values that used in asm code. -#define N3 (matrix_M - 3) * 4 -#define N31 (-3 * matrix_N + 1) * 4 -#define P3 (matrix_P - 3) * 4 -#define P31 (-3 * matrix_N + 1) * 4 - -void mat_mul_unrolled_4x4_parallel_asm(int32_t const *__restrict__ A, - int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, - uint32_t N, uint32_t P, uint32_t id, - uint32_t numThreads) { - // Parallelize by assigning each tile one row - uint32_t c = numThreads / (M / 4); - if (numThreads * 4 < M) { - c = 1; - } - // numThreads / (M / 4); // How many columns to split the matrix into - uint32_t const c_start = (P / c) * (id % c); - uint32_t const c_end = (P / c) * ((id % c) + 1); - for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) { - for (uint32_t j = c_start; j < c_end; j += 4) { - // Address registers - int32_t const *addr_a = &A[i * N]; - int32_t const *addr_b = &B[j]; - int32_t const *end_b = &B[N * P + j]; - int32_t const *addr_c = &C[i * P + j]; - int32_t const N3_1_r = (-3 * (int32_t)N + 1) * 4; - int32_t const P_3_r = ((int32_t)P - 3) * 4; - - register int32_t k asm("x1") = (int32_t)end_b; - // x12 x13 x14 x15 - // - // x3 x16 x17 x18 x19 - // x4 x20 x21 x22 x23 - // x10 x24 x25 x26 x27 - // x11 x28 x29 x30 x31 - // - // - __asm__ volatile( - ".balign 16 \n\t" - // Outer loop: Initialize and preload. Execute this loop P times - // TODO arrange - "p.lw x3, %[N](%[addr_a]!) \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x4, %[N](%[addr_a]!) \n\t" - "p.lw x10, %[N](%[addr_a]!) \n\t" - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - // Initial computation + prefetching - "mul x16, x3, x12 \n\t" - "mul x17, x3, x13 \n\t" - "mul x18, x3, x14 \n\t" - "mul x19, x3, x15 \n\t" - "p.lw x3, %[N](%[addr_a]!) \n\t" - "mul x20, x4, x12 \n\t" - "mul x21, x4, x13 \n\t" - "mul x22, x4, x14 \n\t" - "mul x23, x4, x15 \n\t" - "p.lw x4, %[N](%[addr_a]!) \n\t" - "mul x24, x10, x12 \n\t" - "mul x25, x10, x13 \n\t" - "mul x26, x10, x14 \n\t" - "mul x27, x10, x15 \n\t" - "p.lw x10, %[N](%[addr_a]!) \n\t" - "mul x28, x11, x12 \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "mul x29, x11, x13 \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "mul x30, x11, x14 \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "mul x31, x11, x15 \n\t" - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - // Inner loop: Do this loop N times - "1: \n\t" - "p.mac x16, x3, x12 \n\t" - "p.mac x17, x3, x13 \n\t" - "p.mac x20, x4, x12 \n\t" - "p.mac x21, x4, x13 \n\t" - "p.mac x18, x3, x14 \n\t" - "p.mac x22, x4, x14 \n\t" - "p.mac x19, x3, x15 \n\t" - "p.lw x3, %[N](%[addr_a]!) \n\t" - "p.mac x23, x4, x15 \n\t" - "p.lw x4, %[N](%[addr_a]!) \n\t" - "p.mac x24, x10, x12 \n\t" - "p.mac x28, x11, x12 \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "p.mac x25, x10, x13 \n\t" - "p.mac x29, x11, x13 \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "p.mac x26, x10, x14 \n\t" - "p.mac x30, x11, x14 \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "p.mac x27, x10, x15 \n\t" - "p.mac x31, x11, x15 \n\t" - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x10, %[N](%[addr_a]!) \n\t" - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - "bne %[addr_b], x1, 1b \n\t" - // Loop done store - "p.mac x16, x3, x12 \n\t" - "p.mac x17, x3, x13 \n\t" - "p.mac x18, x3, x14 \n\t" - "p.sw x16, 4(%[addr_c]!) \n\t" - "p.mac x19, x3, x15 \n\t" - "p.sw x17, 4(%[addr_c]!) \n\t" - "p.mac x20, x4, x12 \n\t" - "p.sw x18, 4(%[addr_c]!) \n\t" - "p.mac x21, x4, x13 \n\t" - "p.sw x19, %[P_3](%[addr_c]!) \n\t" - "p.mac x22, x4, x14 \n\t" - "p.sw x20, 4(%[addr_c]!) \n\t" - "p.mac x23, x4, x15 \n\t" - "p.sw x21, 4(%[addr_c]!) \n\t" - "p.mac x24, x10, x12 \n\t" - "p.sw x22, 4(%[addr_c]!) \n\t" - "p.mac x25, x10, x13 \n\t" - "p.sw x23, %[P_3](%[addr_c]!) \n\t" - "p.mac x26, x10, x14 \n\t" - "p.sw x24, 4(%[addr_c]!) \n\t" - "p.mac x27, x10, x15 \n\t" - "p.sw x25, 4(%[addr_c]!) \n\t" - "p.mac x28, x11, x12 \n\t" - "p.sw x26, 4(%[addr_c]!) \n\t" - "p.mac x29, x11, x13 \n\t" - "p.sw x27, %[P_3](%[addr_c]!) \n\t" - "p.mac x30, x11, x14 \n\t" - "p.sw x28, 4(%[addr_c]!) \n\t" - "p.mac x31, x11, x15 \n\t" - "p.sw x29, 4(%[addr_c]!) \n\t" - "p.sw x30, 4(%[addr_c]!) \n\t" - "p.sw x31, %[P_3](%[addr_c]!) \n\t" - : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), - [addr_c] "+&r"(addr_c) // Outputs - : [N3_1] "r"(N3_1_r), [P_3] "r"(P_3_r), [x1] "r"(k), - [N] "I"(matrix_N * 4) // Inputs - : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", - "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", - "x27", "x28", "x29", "x30", "x31", "memory"); // Clobber - } - } -} - -void mat_mul_unrolled_4x4_conflict_opt_parallel_asm( - int32_t const *__restrict__ A, int32_t const *__restrict__ B, - int32_t *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, uint32_t id, - uint32_t numThreads) { - - ///////////////////////////// - // Configuration // - ///////////////////////////// - // Parallelize by assigning each core one row - // How many cores per window - uint32_t c = numThreads / (M / 4); - if (numThreads * 4 < M) { - c = 1; - } - uint32_t const c_start = (P / c) * (id % c); - uint32_t const c_end = (P / c) * ((id % c) + 1); - - // For avoiding group conflict by same tile - // Each cores in the same tile should access to different groups - uint32_t group_bank_nums = 512; // MemPool = 256 - uint32_t tile_core_nums = 8; // MemPool = 4 - uint32_t jump_lines_A = group_bank_nums / N; // Used for i control - uint32_t jump_lines_B = group_bank_nums / P; // Used for k control - // Window size limit, min jump lines is 4 for MatrixA - if (jump_lines_A < 4) { - jump_lines_A = 4; - } - - ///////////////////////////// - // LOOP OFFSET // - ///////////////////////////// - // Outer Loop Control, for group access port conflict - uint32_t i_offset = jump_lines_A * (id % tile_core_nums); - // Inner Loop Incremental Control, for group access port conflict - uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums); - // Inner Loop Control - // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB) - uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr; - // Middle Loop Control, window jump for avoiding bank conflict - uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P; - uint32_t j_offset = (2 * (id / c)) / conflict_row; - - ///////////////////////////// - // LOOP CONTROL // - ///////////////////////////// - // Inner Round-Robin - if (k_offset >= N) { - k_offset = k_offset - N * (k_offset / N); - } - // Middle Round-Robin - uint32_t window_in_P = (P / c) / 4; - if (j_offset >= window_in_P) { - j_offset = j_offset - window_in_P * (j_offset / window_in_P); - } - // Outer Loop Control - uint32_t outer_loop_counter = 0; - uint32_t outer_loop_time = M / (4 * numThreads); - if (outer_loop_time < 1) { - outer_loop_time = 1; - } - uint32_t M_partition = M / outer_loop_time; - - ///////////////////////////// - // *LOOP START* // - ///////////////////////////// - for (uint32_t i_ori = 4 * (id / c); i_ori < M; - i_ori += 4 * (numThreads / c)) { - outer_loop_counter += 1; - uint32_t i = i_ori + i_offset; - // Round-Robin control, if offset lines > M, back to the first window - if (i >= M_partition * outer_loop_counter) { - i = i - M_partition * (i / (M_partition * outer_loop_counter)); - } - // Backup counter for mid-loop - uint32_t j_offset_counter = c_start + j_offset * 4; - uint32_t P_counter = c_end; - - Mid_loop: - for (uint32_t j = j_offset_counter; j < P_counter; j += 4) { - // Address registers - int32_t const *addr_a_ori = &A[i * N]; - int32_t const *addr_b_ori = &B[j]; - int32_t const *addr_a = &A[i * N + k_offset]; - int32_t const *addr_b = &B[k_offset * P + j]; - int32_t const *end_b = &B[N * P + j]; - int32_t const *addr_c = &C[i * P + j]; - register int32_t k asm("x1") = (int32_t)end_b; - - __asm__ volatile( - ".balign 16 \n\t" - // Outer loop: Initialize and preload. Execute this loop P times - // TODO arrange - "add sp, sp, -8 \n\t" - "sw %[addr_b], 0(sp) \n\t" - "sw %[addr_a_ori], 4(sp) \n\t" - "p.lw x3, %[N](%[addr_a]!) \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x4, %[N](%[addr_a]!) \n\t" - "p.lw x10, %[N](%[addr_a]!) \n\t" - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - - // If reach endpoint, swap address - "bne %[addr_b], x1, init_comp \n\t" - "lw x1, 0(sp) \n\t" - "addi %[addr_a], %[addr_a_ori], 0 \n\t" - "addi %[addr_b], %[addr_b_ori], 0 \n\t" - "sw %[addr_b], 0(sp) \n\t" - - // Initial computation + prefetching - "init_comp: \n\t" - "mul x16, x3, x12 \n\t" - "mul x17, x3, x13 \n\t" - "mul x18, x3, x14 \n\t" - "mul x19, x3, x15 \n\t" - "p.lw x3, %[N](%[addr_a]!) \n\t" - "mul x20, x4, x12 \n\t" - "mul x21, x4, x13 \n\t" - "mul x22, x4, x14 \n\t" - "mul x23, x4, x15 \n\t" - "p.lw x4, %[N](%[addr_a]!) \n\t" - "mul x24, x10, x12 \n\t" - "mul x25, x10, x13 \n\t" - "mul x26, x10, x14 \n\t" - "mul x27, x10, x15 \n\t" - "p.lw x10, %[N](%[addr_a]!) \n\t" - "mul x28, x11, x12 \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "mul x29, x11, x13 \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "mul x30, x11, x14 \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "mul %[addr_a_ori], x11, x15 \n\t" // Use addr_a_ori instead of x31 - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - - // If reach endpoint, swap address - "bne %[addr_b], x1, inner_loop \n\t" - "sw %[addr_a_ori], 8(sp) \n\t" // backup x31 - "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori - "lw x1, 0(sp) \n\t" - "addi %[addr_a], %[addr_a_ori], 0 \n\t" - "addi %[addr_b], %[addr_b_ori], 0 \n\t" - "sw %[addr_b], 0(sp) \n\t" - "lw %[addr_a_ori], 8(sp) \n\t" // load back x31 - - // Inner loop: Do this loop N times - "inner_loop: \n\t" - "1: \n\t" - "p.mac x16, x3, x12 \n\t" - "p.mac x17, x3, x13 \n\t" - "p.mac x20, x4, x12 \n\t" - "p.mac x21, x4, x13 \n\t" - "p.mac x18, x3, x14 \n\t" - "p.mac x22, x4, x14 \n\t" - "p.mac x19, x3, x15 \n\t" - "p.lw x3, %[N](%[addr_a]!) \n\t" - "p.mac x23, x4, x15 \n\t" - "p.lw x4, %[N](%[addr_a]!) \n\t" - "p.mac x24, x10, x12 \n\t" - "p.mac x28, x11, x12 \n\t" - "p.lw x12, 4(%[addr_b]!) \n\t" - "p.mac x25, x10, x13 \n\t" - "p.mac x29, x11, x13 \n\t" - "p.lw x13, 4(%[addr_b]!) \n\t" - "p.mac x26, x10, x14 \n\t" - "p.mac x30, x11, x14 \n\t" - "p.lw x14, 4(%[addr_b]!) \n\t" - "p.mac x27, x10, x15 \n\t" - "p.mac %[addr_a_ori], x11, x15 \n\t" - "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 - "p.lw x10, %[N](%[addr_a]!) \n\t" - "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 - "bne %[addr_b], x1, 1b \n\t" - - // Case1: Loop done if k_offset = 0 - // Case2: Loop done when 2nd time to here - // Case3: If reach endpoint, swap address - "lw %[addr_b], 0(sp) \n\t" - "beq %[addr_b_ori], %[addr_b], store \n\t" - "sw %[addr_a_ori], 8(sp) \n\t" // backup x31 - "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori - "addi x1, %[addr_b], 0 \n\t" - "addi %[addr_a], %[addr_a_ori], 0 \n\t" - "addi %[addr_b], %[addr_b_ori], 0 \n\t" - "sw %[addr_b], 0(sp) \n\t" - "lw %[addr_a_ori], 8(sp) \n\t" // load back x31 - "j 1b \n\t" - - // Loop done store - "store: \n\t" - "p.mac x16, x3, x12 \n\t" - "p.mac x17, x3, x13 \n\t" - "p.mac x18, x3, x14 \n\t" - "p.sw x16, 4(%[addr_c]!) \n\t" - "p.mac x19, x3, x15 \n\t" - "p.sw x17, 4(%[addr_c]!) \n\t" - "p.mac x20, x4, x12 \n\t" - "p.sw x18, 4(%[addr_c]!) \n\t" - "p.mac x21, x4, x13 \n\t" - "p.sw x19, %[P_3](%[addr_c]!) \n\t" - "p.mac x22, x4, x14 \n\t" - "p.sw x20, 4(%[addr_c]!) \n\t" - "p.mac x23, x4, x15 \n\t" - "p.sw x21, 4(%[addr_c]!) \n\t" - "p.mac x24, x10, x12 \n\t" - "p.sw x22, 4(%[addr_c]!) \n\t" - "p.mac x25, x10, x13 \n\t" - "p.sw x23, %[P_3](%[addr_c]!) \n\t" - "p.mac x26, x10, x14 \n\t" - "p.sw x24, 4(%[addr_c]!) \n\t" - "p.mac x27, x10, x15 \n\t" - "p.sw x25, 4(%[addr_c]!) \n\t" - "p.mac x28, x11, x12 \n\t" - "p.sw x26, 4(%[addr_c]!) \n\t" - "p.mac x29, x11, x13 \n\t" - "p.sw x27, %[P_3](%[addr_c]!) \n\t" - "p.mac x30, x11, x14 \n\t" - "p.sw x28, 4(%[addr_c]!) \n\t" - "p.mac %[addr_a_ori], x11, x15 \n\t" - "p.sw x29, 4(%[addr_c]!) \n\t" - "p.sw x30, 4(%[addr_c]!) \n\t" - "p.sw %[addr_a_ori], %[P_3](%[addr_c]!) \n\t" - "add sp, sp, 8 \n\t" - : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), - [addr_c] "+&r"(addr_c), [addr_a_ori] "+&r"(addr_a_ori), - [addr_b_ori] "+&r"(addr_b_ori) // Outputs - : [N3_1] "r"(N31), [P_3] "I"(P3), [x1] "r"(k), - [N] "I"(matrix_N * 4) // Inputs - : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", - "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", - "x27", "x28", "x29", "x30", "memory"); // Clobber - } - if (j_offset_counter != c_start) { - P_counter = j_offset_counter; - j_offset_counter = c_start; - goto Mid_loop; - } - } -} diff --git a/software/kernels/baremetal/mempool_matmul_i32p.h b/software/kernels/baremetal/mempool_matmul_i32p.h index ac6a6647a..9eb86f32c 100644 --- a/software/kernels/baremetal/mempool_matmul_i32p.h +++ b/software/kernels/baremetal/mempool_matmul_i32p.h @@ -414,3 +414,620 @@ void matmul_unrolled_2x2_parallel_i32_xpulpv2(int32_t const *__restrict__ A, } } #endif + +void mat_mul_unrolled_4x4_parallel(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + int32_t *__restrict__ C, uint32_t M, + uint32_t N, uint32_t P, uint32_t id, + uint32_t numThreads) { + // Parallelize by assigning each core one row + uint32_t const c = + numThreads / (M / 4); // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (id % c); + uint32_t const c_end = (P / c) * ((id % c) + 1); + for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) { + for (uint32_t j = c_start; j < c_end; j += 4) { + // Initialize 4x4 output tile + int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0; + int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0; + int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0; + int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0; + for (uint32_t k = 0; k < N; k += 1) { + // Explicitly load the values first to help with scheduling + int32_t b0 = B[k * P + j + 0]; + int32_t b1 = B[k * P + j + 1]; + int32_t b2 = B[k * P + j + 2]; + int32_t b3 = B[k * P + j + 3]; + // A could be local with scrambling + int32_t a0 = A[(i + 0) * N + k]; + int32_t a1 = A[(i + 1) * N + k]; + int32_t a2 = A[(i + 2) * N + k]; + int32_t a3 = A[(i + 3) * N + k]; + // Compute + c00 += a0 * b0; + c01 += a0 * b1; + c02 += a0 * b2; + c03 += a0 * b3; + c10 += a1 * b0; + c11 += a1 * b1; + c12 += a1 * b2; + c13 += a1 * b3; + c20 += a2 * b0; + c21 += a2 * b1; + c22 += a2 * b2; + c23 += a2 * b3; + c30 += a3 * b0; + c31 += a3 * b1; + c32 += a3 * b2; + c33 += a3 * b3; + } + // Store + C[(i + 0) * P + j + 0] = c00; + C[(i + 0) * P + j + 1] = c01; + C[(i + 0) * P + j + 2] = c02; + C[(i + 0) * P + j + 3] = c03; + C[(i + 1) * P + j + 0] = c10; + C[(i + 1) * P + j + 1] = c11; + C[(i + 1) * P + j + 2] = c12; + C[(i + 1) * P + j + 3] = c13; + C[(i + 2) * P + j + 0] = c20; + C[(i + 2) * P + j + 1] = c21; + C[(i + 2) * P + j + 2] = c22; + C[(i + 2) * P + j + 3] = c23; + C[(i + 3) * P + j + 0] = c30; + C[(i + 3) * P + j + 1] = c31; + C[(i + 3) * P + j + 2] = c32; + C[(i + 3) * P + j + 3] = c33; + } + } +} + +void mat_mul_unrolled_4x4_conflict_opt_parallel(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + int32_t *__restrict__ C, + uint32_t M, uint32_t N, + uint32_t P, uint32_t id, + uint32_t numThreads) { + + ///////////////////////////// + // Configuration // + ///////////////////////////// + // Parallelize by assigning each core one row + // How many cores per window + uint32_t c = numThreads / (M / 4); + if (numThreads * 4 < M) { + c = 1; + } + uint32_t const c_start = (P / c) * (id % c); + uint32_t const c_end = (P / c) * ((id % c) + 1); + + // For avoiding group conflict by same tile + // Each cores in the same tile should access to different groups + uint32_t group_bank_nums = 512; // MemPool = 256 + uint32_t tile_core_nums = 8; // MemPool = 4 + uint32_t jump_lines_A = group_bank_nums / N; // Used for i control + uint32_t jump_lines_B = group_bank_nums / P; // Used for k control + // Window size limit, min jump lines is 4 for MatrixA + if (jump_lines_A < 4) { + jump_lines_A = 4; + } + + ///////////////////////////// + // LOOP OFFSET // + ///////////////////////////// + // Outer Loop Control, for group access port conflict + uint32_t i_offset = jump_lines_A * (id % tile_core_nums); + // Inner Loop Incremental Control, for group access port conflict + uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums); + // Inner Loop Control + // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB) + uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr; + // Middle Loop Control, window jump for avoiding bank conflict + uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P; + uint32_t j_offset = (2 * (id / c)) / conflict_row; + + ///////////////////////////// + // LOOP CONTROL // + ///////////////////////////// + // Inner Round-Robin + if (k_offset >= N) { + k_offset = k_offset - N * (k_offset / N); + } + // Middle Round-Robin + uint32_t window_in_P = (P / c) / 4; + if (j_offset >= window_in_P) { + j_offset = j_offset - window_in_P * (j_offset / window_in_P); + } + // Outer Loop Control + uint32_t outer_loop_counter = 0; + uint32_t outer_loop_time = M / (4 * numThreads); + if (outer_loop_time < 1) { + outer_loop_time = 1; + } + uint32_t M_partition = M / outer_loop_time; + + ///////////////////////////// + // *LOOP START* // + ///////////////////////////// + for (uint32_t i_ori = 4 * (id / c); i_ori < M; + i_ori += 4 * (numThreads / c)) { + outer_loop_counter += 1; + uint32_t i = i_ori + i_offset; + // Round-Robin control, if offset lines > M, back to the first window + if (i >= M_partition * outer_loop_counter) { + i = i - M_partition * (i / (M_partition * outer_loop_counter)); + } + // Backup counter for mid-loop + uint32_t j_offset_counter = c_start + j_offset * 4; + uint32_t P_counter = c_end; + + Mid_loop: + for (uint32_t j = j_offset_counter; j < P_counter; j += 4) { + // Initialize 4x4 output tile + int32_t c00 = 0, c01 = 0, c02 = 0, c03 = 0; + int32_t c10 = 0, c11 = 0, c12 = 0, c13 = 0; + int32_t c20 = 0, c21 = 0, c22 = 0, c23 = 0; + int32_t c30 = 0, c31 = 0, c32 = 0, c33 = 0; + + // Backup the variables for restore and later use + uint32_t k_offset_counter = k_offset; + uint32_t N_counter = N; + + Inner_Loop: + for (uint32_t k = k_offset_counter; k < N_counter; k += 1) { + // Explicitly load the values first to help with scheduling + int32_t b0 = B[k * P + j + 0]; + int32_t b1 = B[k * P + j + 1]; + int32_t b2 = B[k * P + j + 2]; + int32_t b3 = B[k * P + j + 3]; + // A could be local with scrambling + int32_t a0 = A[(i + 0) * N + k]; + int32_t a1 = A[(i + 1) * N + k]; + int32_t a2 = A[(i + 2) * N + k]; + int32_t a3 = A[(i + 3) * N + k]; + // Compute + c00 += a0 * b0; + c01 += a0 * b1; + c02 += a0 * b2; + c03 += a0 * b3; + c10 += a1 * b0; + c11 += a1 * b1; + c12 += a1 * b2; + c13 += a1 * b3; + c20 += a2 * b0; + c21 += a2 * b1; + c22 += a2 * b2; + c23 += a2 * b3; + c30 += a3 * b0; + c31 += a3 * b1; + c32 += a3 * b2; + c33 += a3 * b3; + } + + // Pseudo-jump code to avoid complie inner-loop twice + // Complie twice will have scheduling issue due to register file limit. + if (k_offset_counter > 0) { + N_counter = k_offset; + k_offset_counter = 0; + goto Inner_Loop; + } + + // Store + C[(i + 0) * P + j + 0] = c00; + C[(i + 0) * P + j + 1] = c01; + C[(i + 0) * P + j + 2] = c02; + C[(i + 0) * P + j + 3] = c03; + C[(i + 1) * P + j + 0] = c10; + C[(i + 1) * P + j + 1] = c11; + C[(i + 1) * P + j + 2] = c12; + C[(i + 1) * P + j + 3] = c13; + C[(i + 2) * P + j + 0] = c20; + C[(i + 2) * P + j + 1] = c21; + C[(i + 2) * P + j + 2] = c22; + C[(i + 2) * P + j + 3] = c23; + C[(i + 3) * P + j + 0] = c30; + C[(i + 3) * P + j + 1] = c31; + C[(i + 3) * P + j + 2] = c32; + C[(i + 3) * P + j + 3] = c33; + } + + if (j_offset_counter != c_start) { + P_counter = j_offset_counter; + j_offset_counter = c_start; + goto Mid_loop; + } + } +} + +/*******************************/ +/* ASM CODE KERNEL START BELOW */ +/*******************************/ + +// Define immediate values that used in asm code. +#define N3 (matrix_M - 3) * 4 +#define N31 (-3 * matrix_N + 1) * 4 +#define P3 (matrix_P - 3) * 4 +#define P31 (-3 * matrix_N + 1) * 4 + +void mat_mul_unrolled_4x4_parallel_asm(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + int32_t *__restrict__ C, uint32_t M, + uint32_t N, uint32_t P, uint32_t id, + uint32_t numThreads) { + // Parallelize by assigning each tile one row + uint32_t c = numThreads / (M / 4); + if (numThreads * 4 < M) { + c = 1; + } + // numThreads / (M / 4); // How many columns to split the matrix into + uint32_t const c_start = (P / c) * (id % c); + uint32_t const c_end = (P / c) * ((id % c) + 1); + for (uint32_t i = 4 * (id / c); i < M; i += 4 * (numThreads / c)) { + for (uint32_t j = c_start; j < c_end; j += 4) { + // Address registers + int32_t const *addr_a = &A[i * N]; + int32_t const *addr_b = &B[j]; + int32_t const *end_b = &B[N * P + j]; + int32_t const *addr_c = &C[i * P + j]; + int32_t const N3_1_r = (-3 * (int32_t)N + 1) * 4; + int32_t const P_3_r = ((int32_t)P - 3) * 4; + + register int32_t k asm("x1") = (int32_t)end_b; + // x12 x13 x14 x15 + // + // x3 x16 x17 x18 x19 + // x4 x20 x21 x22 x23 + // x10 x24 x25 x26 x27 + // x11 x28 x29 x30 x31 + // + // + __asm__ volatile( + ".balign 16 \n\t" + // Outer loop: Initialize and preload. Execute this loop P times + // TODO arrange + "p.lw x3, %[N](%[addr_a]!) \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x4, %[N](%[addr_a]!) \n\t" + "p.lw x10, %[N](%[addr_a]!) \n\t" + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + // Initial computation + prefetching + "mul x16, x3, x12 \n\t" + "mul x17, x3, x13 \n\t" + "mul x18, x3, x14 \n\t" + "mul x19, x3, x15 \n\t" + "p.lw x3, %[N](%[addr_a]!) \n\t" + "mul x20, x4, x12 \n\t" + "mul x21, x4, x13 \n\t" + "mul x22, x4, x14 \n\t" + "mul x23, x4, x15 \n\t" + "p.lw x4, %[N](%[addr_a]!) \n\t" + "mul x24, x10, x12 \n\t" + "mul x25, x10, x13 \n\t" + "mul x26, x10, x14 \n\t" + "mul x27, x10, x15 \n\t" + "p.lw x10, %[N](%[addr_a]!) \n\t" + "mul x28, x11, x12 \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "mul x29, x11, x13 \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "mul x30, x11, x14 \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "mul x31, x11, x15 \n\t" + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + // Inner loop: Do this loop N times + "1: \n\t" + "p.mac x16, x3, x12 \n\t" + "p.mac x17, x3, x13 \n\t" + "p.mac x20, x4, x12 \n\t" + "p.mac x21, x4, x13 \n\t" + "p.mac x18, x3, x14 \n\t" + "p.mac x22, x4, x14 \n\t" + "p.mac x19, x3, x15 \n\t" + "p.lw x3, %[N](%[addr_a]!) \n\t" + "p.mac x23, x4, x15 \n\t" + "p.lw x4, %[N](%[addr_a]!) \n\t" + "p.mac x24, x10, x12 \n\t" + "p.mac x28, x11, x12 \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "p.mac x25, x10, x13 \n\t" + "p.mac x29, x11, x13 \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "p.mac x26, x10, x14 \n\t" + "p.mac x30, x11, x14 \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "p.mac x27, x10, x15 \n\t" + "p.mac x31, x11, x15 \n\t" + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x10, %[N](%[addr_a]!) \n\t" + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + "bne %[addr_b], x1, 1b \n\t" + // Loop done store + "p.mac x16, x3, x12 \n\t" + "p.mac x17, x3, x13 \n\t" + "p.mac x18, x3, x14 \n\t" + "p.sw x16, 4(%[addr_c]!) \n\t" + "p.mac x19, x3, x15 \n\t" + "p.sw x17, 4(%[addr_c]!) \n\t" + "p.mac x20, x4, x12 \n\t" + "p.sw x18, 4(%[addr_c]!) \n\t" + "p.mac x21, x4, x13 \n\t" + "p.sw x19, %[P_3](%[addr_c]!) \n\t" + "p.mac x22, x4, x14 \n\t" + "p.sw x20, 4(%[addr_c]!) \n\t" + "p.mac x23, x4, x15 \n\t" + "p.sw x21, 4(%[addr_c]!) \n\t" + "p.mac x24, x10, x12 \n\t" + "p.sw x22, 4(%[addr_c]!) \n\t" + "p.mac x25, x10, x13 \n\t" + "p.sw x23, %[P_3](%[addr_c]!) \n\t" + "p.mac x26, x10, x14 \n\t" + "p.sw x24, 4(%[addr_c]!) \n\t" + "p.mac x27, x10, x15 \n\t" + "p.sw x25, 4(%[addr_c]!) \n\t" + "p.mac x28, x11, x12 \n\t" + "p.sw x26, 4(%[addr_c]!) \n\t" + "p.mac x29, x11, x13 \n\t" + "p.sw x27, %[P_3](%[addr_c]!) \n\t" + "p.mac x30, x11, x14 \n\t" + "p.sw x28, 4(%[addr_c]!) \n\t" + "p.mac x31, x11, x15 \n\t" + "p.sw x29, 4(%[addr_c]!) \n\t" + "p.sw x30, 4(%[addr_c]!) \n\t" + "p.sw x31, %[P_3](%[addr_c]!) \n\t" + : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), + [addr_c] "+&r"(addr_c) // Outputs + : [N3_1] "r"(N3_1_r), [P_3] "r"(P_3_r), [x1] "r"(k), + [N] "I"(matrix_N * 4) // Inputs + : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28", "x29", "x30", "x31", "memory"); // Clobber + } + } +} + +void mat_mul_unrolled_4x4_conflict_opt_parallel_asm( + int32_t const *__restrict__ A, int32_t const *__restrict__ B, + int32_t *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, uint32_t id, + uint32_t numThreads) { + + ///////////////////////////// + // Configuration // + ///////////////////////////// + // Parallelize by assigning each core one row + // How many cores per window + uint32_t c = numThreads / (M / 4); + if (numThreads * 4 < M) { + c = 1; + } + uint32_t const c_start = (P / c) * (id % c); + uint32_t const c_end = (P / c) * ((id % c) + 1); + + // For avoiding group conflict by same tile + // Each cores in the same tile should access to different groups + uint32_t group_bank_nums = 512; // MemPool = 256 + uint32_t tile_core_nums = 8; // MemPool = 4 + uint32_t jump_lines_A = group_bank_nums / N; // Used for i control + uint32_t jump_lines_B = group_bank_nums / P; // Used for k control + // Window size limit, min jump lines is 4 for MatrixA + if (jump_lines_A < 4) { + jump_lines_A = 4; + } + + ///////////////////////////// + // LOOP OFFSET // + ///////////////////////////// + // Outer Loop Control, for group access port conflict + uint32_t i_offset = jump_lines_A * (id % tile_core_nums); + // Inner Loop Incremental Control, for group access port conflict + uint32_t k_offset_incr = jump_lines_B * (id % tile_core_nums); + // Inner Loop Control + // k_offset = (Core offset) + (Window offset) + (Group offset from MatrixB) + uint32_t k_offset = (id % c) + (2 * (id / c)) + k_offset_incr; + // Middle Loop Control, window jump for avoiding bank conflict + uint32_t conflict_row = (group_bank_nums * tile_core_nums) / P; + uint32_t j_offset = (2 * (id / c)) / conflict_row; + + ///////////////////////////// + // LOOP CONTROL // + ///////////////////////////// + // Inner Round-Robin + if (k_offset >= N) { + k_offset = k_offset - N * (k_offset / N); + } + // Middle Round-Robin + uint32_t window_in_P = (P / c) / 4; + if (j_offset >= window_in_P) { + j_offset = j_offset - window_in_P * (j_offset / window_in_P); + } + // Outer Loop Control + uint32_t outer_loop_counter = 0; + uint32_t outer_loop_time = M / (4 * numThreads); + if (outer_loop_time < 1) { + outer_loop_time = 1; + } + uint32_t M_partition = M / outer_loop_time; + + ///////////////////////////// + // *LOOP START* // + ///////////////////////////// + for (uint32_t i_ori = 4 * (id / c); i_ori < M; + i_ori += 4 * (numThreads / c)) { + outer_loop_counter += 1; + uint32_t i = i_ori + i_offset; + // Round-Robin control, if offset lines > M, back to the first window + if (i >= M_partition * outer_loop_counter) { + i = i - M_partition * (i / (M_partition * outer_loop_counter)); + } + // Backup counter for mid-loop + uint32_t j_offset_counter = c_start + j_offset * 4; + uint32_t P_counter = c_end; + + Mid_loop: + for (uint32_t j = j_offset_counter; j < P_counter; j += 4) { + // Address registers + int32_t const *addr_a_ori = &A[i * N]; + int32_t const *addr_b_ori = &B[j]; + int32_t const *addr_a = &A[i * N + k_offset]; + int32_t const *addr_b = &B[k_offset * P + j]; + int32_t const *end_b = &B[N * P + j]; + int32_t const *addr_c = &C[i * P + j]; + register int32_t k asm("x1") = (int32_t)end_b; + + __asm__ volatile( + ".balign 16 \n\t" + // Outer loop: Initialize and preload. Execute this loop P times + // TODO arrange + "add sp, sp, -8 \n\t" + "sw %[addr_b], 0(sp) \n\t" + "sw %[addr_a_ori], 4(sp) \n\t" + "p.lw x3, %[N](%[addr_a]!) \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x4, %[N](%[addr_a]!) \n\t" + "p.lw x10, %[N](%[addr_a]!) \n\t" + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + + // If reach endpoint, swap address + "bne %[addr_b], x1, init_comp \n\t" + "lw x1, 0(sp) \n\t" + "addi %[addr_a], %[addr_a_ori], 0 \n\t" + "addi %[addr_b], %[addr_b_ori], 0 \n\t" + "sw %[addr_b], 0(sp) \n\t" + + // Initial computation + prefetching + "init_comp: \n\t" + "mul x16, x3, x12 \n\t" + "mul x17, x3, x13 \n\t" + "mul x18, x3, x14 \n\t" + "mul x19, x3, x15 \n\t" + "p.lw x3, %[N](%[addr_a]!) \n\t" + "mul x20, x4, x12 \n\t" + "mul x21, x4, x13 \n\t" + "mul x22, x4, x14 \n\t" + "mul x23, x4, x15 \n\t" + "p.lw x4, %[N](%[addr_a]!) \n\t" + "mul x24, x10, x12 \n\t" + "mul x25, x10, x13 \n\t" + "mul x26, x10, x14 \n\t" + "mul x27, x10, x15 \n\t" + "p.lw x10, %[N](%[addr_a]!) \n\t" + "mul x28, x11, x12 \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "mul x29, x11, x13 \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "mul x30, x11, x14 \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "mul %[addr_a_ori], x11, x15 \n\t" // Use addr_a_ori instead of x31 + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + + // If reach endpoint, swap address + "bne %[addr_b], x1, inner_loop \n\t" + "sw %[addr_a_ori], 8(sp) \n\t" // backup x31 + "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori + "lw x1, 0(sp) \n\t" + "addi %[addr_a], %[addr_a_ori], 0 \n\t" + "addi %[addr_b], %[addr_b_ori], 0 \n\t" + "sw %[addr_b], 0(sp) \n\t" + "lw %[addr_a_ori], 8(sp) \n\t" // load back x31 + + // Inner loop: Do this loop N times + "inner_loop: \n\t" + "1: \n\t" + "p.mac x16, x3, x12 \n\t" + "p.mac x17, x3, x13 \n\t" + "p.mac x20, x4, x12 \n\t" + "p.mac x21, x4, x13 \n\t" + "p.mac x18, x3, x14 \n\t" + "p.mac x22, x4, x14 \n\t" + "p.mac x19, x3, x15 \n\t" + "p.lw x3, %[N](%[addr_a]!) \n\t" + "p.mac x23, x4, x15 \n\t" + "p.lw x4, %[N](%[addr_a]!) \n\t" + "p.mac x24, x10, x12 \n\t" + "p.mac x28, x11, x12 \n\t" + "p.lw x12, 4(%[addr_b]!) \n\t" + "p.mac x25, x10, x13 \n\t" + "p.mac x29, x11, x13 \n\t" + "p.lw x13, 4(%[addr_b]!) \n\t" + "p.mac x26, x10, x14 \n\t" + "p.mac x30, x11, x14 \n\t" + "p.lw x14, 4(%[addr_b]!) \n\t" + "p.mac x27, x10, x15 \n\t" + "p.mac %[addr_a_ori], x11, x15 \n\t" + "p.lw x15, %[P_3](%[addr_b]!) \n\t" // Increment by P-3 + "p.lw x10, %[N](%[addr_a]!) \n\t" + "p.lw x11, %[N3_1](%[addr_a]!) \n\t" // Increment by -3N+1 + "bne %[addr_b], x1, 1b \n\t" + + // Case1: Loop done if k_offset = 0 + // Case2: Loop done when 2nd time to here + // Case3: If reach endpoint, swap address + "lw %[addr_b], 0(sp) \n\t" + "beq %[addr_b_ori], %[addr_b], store \n\t" + "sw %[addr_a_ori], 8(sp) \n\t" // backup x31 + "lw %[addr_a_ori], 4(sp) \n\t" // load back addr_a_ori + "addi x1, %[addr_b], 0 \n\t" + "addi %[addr_a], %[addr_a_ori], 0 \n\t" + "addi %[addr_b], %[addr_b_ori], 0 \n\t" + "sw %[addr_b], 0(sp) \n\t" + "lw %[addr_a_ori], 8(sp) \n\t" // load back x31 + "j 1b \n\t" + + // Loop done store + "store: \n\t" + "p.mac x16, x3, x12 \n\t" + "p.mac x17, x3, x13 \n\t" + "p.mac x18, x3, x14 \n\t" + "p.sw x16, 4(%[addr_c]!) \n\t" + "p.mac x19, x3, x15 \n\t" + "p.sw x17, 4(%[addr_c]!) \n\t" + "p.mac x20, x4, x12 \n\t" + "p.sw x18, 4(%[addr_c]!) \n\t" + "p.mac x21, x4, x13 \n\t" + "p.sw x19, %[P_3](%[addr_c]!) \n\t" + "p.mac x22, x4, x14 \n\t" + "p.sw x20, 4(%[addr_c]!) \n\t" + "p.mac x23, x4, x15 \n\t" + "p.sw x21, 4(%[addr_c]!) \n\t" + "p.mac x24, x10, x12 \n\t" + "p.sw x22, 4(%[addr_c]!) \n\t" + "p.mac x25, x10, x13 \n\t" + "p.sw x23, %[P_3](%[addr_c]!) \n\t" + "p.mac x26, x10, x14 \n\t" + "p.sw x24, 4(%[addr_c]!) \n\t" + "p.mac x27, x10, x15 \n\t" + "p.sw x25, 4(%[addr_c]!) \n\t" + "p.mac x28, x11, x12 \n\t" + "p.sw x26, 4(%[addr_c]!) \n\t" + "p.mac x29, x11, x13 \n\t" + "p.sw x27, %[P_3](%[addr_c]!) \n\t" + "p.mac x30, x11, x14 \n\t" + "p.sw x28, 4(%[addr_c]!) \n\t" + "p.mac %[addr_a_ori], x11, x15 \n\t" + "p.sw x29, 4(%[addr_c]!) \n\t" + "p.sw x30, 4(%[addr_c]!) \n\t" + "p.sw %[addr_a_ori], %[P_3](%[addr_c]!) \n\t" + "add sp, sp, 8 \n\t" + : [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), + [addr_c] "+&r"(addr_c), [addr_a_ori] "+&r"(addr_a_ori), + [addr_b_ori] "+&r"(addr_b_ori) // Outputs + : [N3_1] "r"(N31), [P_3] "I"(P3), [x1] "r"(k), + [N] "I"(matrix_N * 4) // Inputs + : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28", "x29", "x30", "memory"); // Clobber + } + if (j_offset_counter != c_start) { + P_counter = j_offset_counter; + j_offset_counter = c_start; + goto Mid_loop; + } + } +}