diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h index f88768dd59..cd15bc852e 100644 --- a/sw/apps/covariance/src/args.h +++ b/sw/apps/covariance/src/args.h @@ -8,7 +8,8 @@ #include typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat,double *cov); + double inv_n_m1, double *data, double *datat, + double *cov); typedef struct { uint32_t m; diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h index 53944e6cad..cdeb427bfb 100644 --- a/sw/apps/covariance/src/covariance.h +++ b/sw/apps/covariance/src/covariance.h @@ -11,15 +11,13 @@ #define DOUBLE_BUFFER 1 -void covariance_naive(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); // Center data for (uint32_t i = offset; i < m; i += stride) { - // Calculate row mean double data_mean = 0.0; double datat_mean = 0.0; @@ -44,15 +42,13 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n, syrk_naive(m, n, inv_n_m1, data, datat, 0, cov); } -void covariance_baseline(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); // Center data for (uint32_t i = offset; i < m; i += stride) { - // Calculate row mean double data_mean = 0.0; double datat_mean = 0.0; @@ -77,9 +73,8 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n, syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov); } -void covariance_opt(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -97,14 +92,14 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // ft0.push(data[i * n + j]) // ft1.push(datat[i * n + j]) const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)}; - const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), - 0, sizeof(double) * n * stride * unroll0}; - snrt_ssr_loop_4d(SNRT_SSR_DM0, - ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], - ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); - snrt_ssr_loop_4d(SNRT_SSR_DM1, - ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], - ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); + const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0, + sizeof(double) * n * stride * unroll0}; + snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2], + ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2], + ssr01_i[3]); + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2], + ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2], + ssr01_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, 1); // Configure ft2 to store data and datat elements // for (i1 = offset; i1 < m; i1 += stride * unroll0) @@ -115,11 +110,9 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // datat[i * n + j] = ft2.pop() const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)}; const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data, - sizeof(double) * n * stride, - sizeof(double), + sizeof(double) * n * stride, sizeof(double), sizeof(double) * n * stride * unroll0}; - snrt_ssr_loop_4d(SNRT_SSR_DM2, - ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3], + snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3], ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]); // SSR start address need to be configured each time @@ -130,21 +123,20 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // Center data for (uint32_t i = offset; i < m; i += stride * unroll0) { - // Calculate row means double m[2 * unroll0]; - m[0] = 0.0; // mean(data[i]) - m[1] = 0.0; // mean(datat[i]) - m[2] = 0.0; // mean(data[i + stride]) - m[3] = 0.0; // mean(datat[i + stride]) + m[0] = 0.0; // mean(data[i]) + m[1] = 0.0; // mean(datat[i]) + m[2] = 0.0; // mean(data[i + stride]) + m[3] = 0.0; // mean(datat[i + stride]) asm volatile( "frep.o %[n_frep], %[n_insn], 0, 0 \n" "fadd.d %[m0], ft0, %[m0] \n" "fadd.d %[m1], ft1, %[m1] \n" "fadd.d %[m2], ft0, %[m2] \n" "fadd.d %[m3], ft1, %[m3] \n" - : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), - [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]), + [ m3 ] "+f"(m[3]) : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) : "ft0", "ft1", "ft2"); m[0] *= inv_n; @@ -161,8 +153,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, "fsub.d ft2, ft1, %[m1] \n" "fsub.d ft2, ft0, %[m2] \n" "fsub.d ft2, ft1, %[m3] \n" - : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), - [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]), + [ m3 ] "+f"(m[3]) : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) : "ft0", "ft1", "ft2"); } @@ -190,16 +182,16 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // ft0.push(a[i * n + k]) // ft1.push(at[j * n + k]) const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride}; - const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; - snrt_ssr_loop_3d(SNRT_SSR_DM0, - ssr0_b[1], ssr0_b[2], ssr0_b[3], - ssr0_i[1], ssr0_i[2], ssr0_i[3]); + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], + ssr0_i[2], ssr0_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, unroll1); const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride}; - const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0}; - snrt_ssr_loop_4d(SNRT_SSR_DM1, - ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll1 * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], + ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); // SSR start address need to be configured each time snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n); @@ -208,7 +200,6 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll1) { - double acc[unroll1]; acc[0] = 0; acc[1] = 0; @@ -227,8 +218,10 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, "fmul.d %[b3], %[acc3], %[alpha] \n" : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]), - [ b0 ] "=f"(cov[i * m + j + 0]), [ b1 ] "=f"(cov[i * m + j + 1]), - [ b2 ] "=f"(cov[i * m + j + 2]), [ b3 ] "=f"(cov[i * m + j + 3]) + [ b0 ] "=f"(cov[i * m + j + 0]), + [ b1 ] "=f"(cov[i * m + j + 1]), + [ b2 ] "=f"(cov[i * m + j + 2]), + [ b3 ] "=f"(cov[i * m + j + 3]) : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1), [ alpha ] "f"(inv_n_m1) : "ft0", "ft1", "ft2"); @@ -241,8 +234,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, void covariance_job(covariance_args_t *args) { uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; - uint64_t local_a0_addr, local_at0_addr, local_b0_addr, - local_a1_addr, local_at1_addr, local_b1_addr; + uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr, + local_at1_addr, local_b1_addr; double *local_a[2]; double *local_at[2]; double *local_b[2]; @@ -287,12 +280,13 @@ void covariance_job(covariance_args_t *args) { // Calculate number of iterations sb_iterations = args->m_tiles * args->m_tiles; - if (DOUBLE_BUFFER) iterations = sb_iterations + 2; - else iterations = sb_iterations; + if (DOUBLE_BUFFER) + iterations = sb_iterations + 2; + else + iterations = sb_iterations; // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA in if (!DOUBLE_BUFFER || (i < sb_iterations)) { @@ -305,18 +299,10 @@ void covariance_job(covariance_args_t *args) { i_col = i_dma_in % args->m_tiles; // Copy job operands in TCDM - snrt_dma_load_1d_tile( - local_a[buff_idx], - args->data, - i_row, - a_tile_size, - sizeof(double)); - snrt_dma_load_1d_tile( - local_at[buff_idx], - args->data, - i_col, - a_tile_size, - sizeof(double)); + snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col, + a_tile_size, sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); @@ -343,15 +329,9 @@ void covariance_job(covariance_args_t *args) { i_col = i_dma_out % args->m_tiles; // Copy job outputs from TCDM - snrt_dma_store_2d_tile( - args->cov, - local_b[buff_idx], - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c index 3c9d225a83..112ead3337 100644 --- a/sw/apps/covariance/src/main.c +++ b/sw/apps/covariance/src/main.c @@ -10,7 +10,6 @@ #include "data.h" int main() { - covariance_job(&args); return 0; diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h index 0efe3a2b49..c5d5428522 100644 --- a/sw/blas/axpy/src/args.h +++ b/sw/blas/axpy/src/args.h @@ -5,14 +5,15 @@ #pragma once #include -typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, double* z); +typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, + double* z); typedef struct { uint32_t n; double a; - double *x; - double *y; - double *z; + double* x; + double* y; + double* z; uint32_t n_tiles; axpy_fp_t funcptr; } axpy_args_t; diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h index c5df546ab5..8ded48167a 100644 --- a/sw/blas/axpy/src/axpy.h +++ b/sw/blas/axpy/src/axpy.h @@ -11,7 +11,8 @@ #define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT) #define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT) -static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_naive(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; @@ -22,28 +23,27 @@ static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double snrt_fpu_fence(); } -static inline void axpy_fma(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_fma(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) { - asm volatile ( - "fmadd.d %[z], %[a], %[x], %[y] \n" - : [ z ]"=f"(z[i]) - : [ a ]"f"(a), [ x ]"f"(x[i]), [ y ]"f"(y[i]) - ); + asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n" + : [ z ] "=f"(z[i]) + : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i])); } snrt_fpu_fence(); } -static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_opt(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; - snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, - frac, + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, snrt_cluster_compute_core_num() * sizeof(double)); snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset); @@ -57,24 +57,22 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* "fmadd.d ft2, %[a], ft0, ft1\n" : : [ n_frep ] "r"(frac - 1), [ a ] "f"(a) - : "ft0", "ft1", "ft2", "memory" - ); - + : "ft0", "ft1", "ft2", "memory"); + snrt_fpu_fence(); snrt_ssr_disable(); } static inline void axpy_job(axpy_args_t *args) { uint32_t frac, offset, size; - uint64_t local_x0_addr, local_y0_addr, local_z0_addr, - local_x1_addr, local_y1_addr, local_z1_addr; + uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr, + local_y1_addr, local_z1_addr; double *local_x[2]; double *local_y[2]; double *local_z[2]; double *remote_x, *remote_y, *remote_z; uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx; - #ifndef JOB_ARGS_PRELOADED // Allocate space for job arguments in TCDM axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next(); @@ -102,8 +100,10 @@ static inline void axpy_job(axpy_args_t *args) { local_z[0] = (double *)local_z0_addr; if (DOUBLE_BUFFER) { local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size); - local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT; - local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT; + local_y1_addr = + ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT; + local_z1_addr = + ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT; local_x[1] = (double *)local_x1_addr; local_y[1] = (double *)local_y1_addr; local_z[1] = (double *)local_z1_addr; @@ -115,7 +115,6 @@ static inline void axpy_job(axpy_args_t *args) { // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA in if (!DOUBLE_BUFFER || (i < args->n_tiles)) { @@ -176,7 +175,8 @@ static inline void axpy_job(axpy_args_t *args) { // Perform tile computation axpy_fp_t fp = args->funcptr; - fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]); + fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], + local_z[buff_idx]); snrt_mcycle(); } diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c index 83cb58ae8c..e0389d25d4 100644 --- a/sw/blas/axpy/src/main.c +++ b/sw/blas/axpy/src/main.c @@ -8,7 +8,6 @@ #include "data.h" int main() { - axpy_job(&args); // TODO: currently only works for single cluster otherwise need to diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py index 05cd2f0381..9b4959fca4 100755 --- a/sw/blas/syrk/scripts/datagen.py +++ b/sw/blas/syrk/scripts/datagen.py @@ -8,12 +8,12 @@ import numpy as np from snitch.util.sim import data_utils -from snitch.util.sim.data_utils import format_array_definition, format_array_declaration, \ - format_struct_definition, DataGen +from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen DOUBLE_BUFFER = True + class SyrkDataGen(DataGen): # Function pointers to alternative implementations @@ -55,7 +55,6 @@ def emit_header(self, **kwargs): A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100 - C_out = self.golden_model(alpha, A, beta, C_in) A = A.flatten() C_in = C_in.flatten() diff --git a/sw/blas/syrk/src/args.h b/sw/blas/syrk/src/args.h index 6bb58e00ec..24342d3e3a 100644 --- a/sw/blas/syrk/src/args.h +++ b/sw/blas/syrk/src/args.h @@ -8,7 +8,7 @@ #include typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a, - double *at, double beta, double *b); + double *at, double beta, double *b); typedef struct { uint32_t m; diff --git a/sw/blas/syrk/src/main.c b/sw/blas/syrk/src/main.c index 9f1ad7163d..f8c09ae4f4 100644 --- a/sw/blas/syrk/src/main.c +++ b/sw/blas/syrk/src/main.c @@ -10,7 +10,6 @@ #include "data.h" int main() { - syrk_job(&args); return 0; diff --git a/sw/blas/syrk/src/syrk.h b/sw/blas/syrk/src/syrk.h index 9494f2777c..718ad7fe90 100644 --- a/sw/blas/syrk/src/syrk.h +++ b/sw/blas/syrk/src/syrk.h @@ -39,7 +39,6 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll1) { - double acc[4]; acc[0] = 0; acc[1] = 0; @@ -66,28 +65,26 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n" : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) - : [ a0 ] "f"(a[i * n + k + 0]), - [ a1 ] "f"(a[i * n + k + 1]), - [ a2 ] "f"(a[i * n + k + 2]), - [ a3 ] "f"(a[i * n + k + 3]), - [ at0 ] "f"(at[(j + 0) * n + k]), - [ at1 ] "f"(at[(j + 1) * n + k]), - [ at2 ] "f"(at[(j + 2) * n + k]), - [ at3 ] "f"(at[(j + 3) * n + k]), - [ at4 ] "f"(at[(j + 0) * n + k + 1]), - [ at5 ] "f"(at[(j + 1) * n + k + 1]), - [ at6 ] "f"(at[(j + 2) * n + k + 1]), - [ at7 ] "f"(at[(j + 3) * n + k + 1]), - [ at8 ] "f"(at[(j + 0) * n + k + 2]), - [ at9 ] "f"(at[(j + 1) * n + k + 2]), - [ at10 ] "f"(at[(j + 2) * n + k + 2]), - [ at11 ] "f"(at[(j + 3) * n + k + 2]), - [ at12 ] "f"(at[(j + 0) * n + k + 3]), - [ at13 ] "f"(at[(j + 1) * n + k + 3]), - [ at14 ] "f"(at[(j + 2) * n + k + 3]), - [ at15 ] "f"(at[(j + 3) * n + k + 3]) : - ); + [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ at0 ] "f"(at[(j + 0) * n + k]), + [ at1 ] "f"(at[(j + 1) * n + k]), + [ at2 ] "f"(at[(j + 2) * n + k]), + [ at3 ] "f"(at[(j + 3) * n + k]), + [ at4 ] "f"(at[(j + 0) * n + k + 1]), + [ at5 ] "f"(at[(j + 1) * n + k + 1]), + [ at6 ] "f"(at[(j + 2) * n + k + 1]), + [ at7 ] "f"(at[(j + 3) * n + k + 1]), + [ at8 ] "f"(at[(j + 0) * n + k + 2]), + [ at9 ] "f"(at[(j + 1) * n + k + 2]), + [ at10 ] "f"(at[(j + 2) * n + k + 2]), + [ at11 ] "f"(at[(j + 3) * n + k + 2]), + [ at12 ] "f"(at[(j + 0) * n + k + 3]), + [ at13 ] "f"(at[(j + 1) * n + k + 3]), + [ at14 ] "f"(at[(j + 2) * n + k + 3]), + [ at15 ] "f"(at[(j + 3) * n + k + 3]) + :); } c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta); @@ -122,15 +119,16 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, // ft0.push(a[i * n + k]) // ft1.push(at[j * n + k]) const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; - const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], ssr0_i[2], ssr0_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, unroll); const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; - const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll * n * sizeof(double), 0}; snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], - ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], - ssr1_i[3]); + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); setup_ssr = 0; } @@ -141,7 +139,6 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll) { - double acc[unroll]; acc[0] = 0; acc[1] = 0; @@ -178,8 +175,8 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, void syrk_job(syrk_args_t *args) { uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes; - uint64_t local_a0_addr, local_at0_addr, local_c0_addr, - local_a1_addr, local_at1_addr, local_c1_addr; + uint64_t local_a0_addr, local_at0_addr, local_c0_addr, local_a1_addr, + local_at1_addr, local_c1_addr; double *local_a[2]; double *local_at[2]; double *local_c[2]; @@ -227,7 +224,6 @@ void syrk_job(syrk_args_t *args) { // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA out // (out before in to avoid overwriting data) @@ -241,15 +237,8 @@ void syrk_job(syrk_args_t *args) { i_col = i_dma_out % args->m_tiles; // Copy job outputs from TCDM - snrt_dma_store_2d_tile( - args->c, - local_c[buff_idx], - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_store_2d_tile(args->c, local_c[buff_idx], i_row, i_col, + m_frac, m_frac, args->m, sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); @@ -266,28 +255,14 @@ void syrk_job(syrk_args_t *args) { i_col = i_dma_in % args->m_tiles; // Copy job operands in TCDM - snrt_dma_load_1d_tile( - local_a[buff_idx], - args->a, - i_row, - a_tile_size, - sizeof(double)); - snrt_dma_load_1d_tile( - local_at[buff_idx], - args->a, - i_col, - a_tile_size, - sizeof(double)); + snrt_dma_load_1d_tile(local_a[buff_idx], args->a, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->a, i_col, + a_tile_size, sizeof(double)); if (args->funcptr == syrk_opt || args->beta != 0) { - snrt_dma_load_2d_tile( - local_c[buff_idx], - args->c, - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_load_2d_tile(local_c[buff_idx], args->c, i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); } snrt_dma_wait_all();