From 6d6a94af3fef8b5704e878de14577495e5fc6fb1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 12 Aug 2024 10:35:40 +0200 Subject: [PATCH 01/19] sw/blas/axpy: Add multiple impls and optimize TCDM placement --- sw/blas/axpy/data/params.json | 3 +- sw/blas/axpy/scripts/datagen.py | 52 +++++++++++----- sw/blas/axpy/src/args.h | 17 +++++ sw/blas/axpy/src/axpy.h | 107 +++++++++++++++++++++++++++----- sw/blas/axpy/src/main.c | 49 ++------------- 5 files changed, 155 insertions(+), 73 deletions(-) create mode 100644 sw/blas/axpy/src/args.h diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json index 2f8f5871c..ba0e9b476 100644 --- a/sw/blas/axpy/data/params.json +++ b/sw/blas/axpy/data/params.json @@ -3,5 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 { - n: 384 + "n": 384, + "funcptr": "axpy_opt" } diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py index 117495391..48d84bce3 100755 --- a/sw/blas/axpy/scripts/datagen.py +++ b/sw/blas/axpy/scripts/datagen.py @@ -9,7 +9,7 @@ import sys from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_array_declaration, format_ifdef_wrapper, DataGen + format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen class AxpyDataGen(DataGen): @@ -19,29 +19,53 @@ class AxpyDataGen(DataGen): # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 + # Function pointers to alternative implementations + FUNCPTRS = ["axpy_naive", "axpy_fma", "axpy_opt"] def golden_model(self, a, x, y): return a*x + y + def validate_config(self, **kwargs): + assert (kwargs['n'] % 8) == 0, "n must be an integer multiple of the number of cores" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + # Note: doesn't account for double buffering + vec_size = kwargs['n'] * 8 + total_size = 3 * vec_size + data_utils.validate_tcdm_footprint(total_size) + def emit_header(self, **kwargs): header = [super().emit_header()] - n = kwargs['n'] - a = np.random.uniform(self.MIN, self.MAX, 1) - x = np.random.uniform(self.MIN, self.MAX, n) - y = np.random.uniform(self.MIN, self.MAX, n) + self.validate_config(**kwargs) + + a = np.random.uniform(self.MIN, self.MAX, 1)[0] + x = np.random.uniform(self.MIN, self.MAX, kwargs['n']) + y = np.random.uniform(self.MIN, self.MAX, kwargs['n']) g = self.golden_model(a, x, y) - assert (n % 8) == 0, "n must be an integer multiple of the number of cores" + x_uid = 'x' + y_uid = 'y' + z_uid = 'z' + + cfg = { + 'n': kwargs['n'], + 'a': a, + 'x': x_uid, + 'y': y_uid, + 'z': z_uid, + 'funcptr': kwargs['funcptr'] + } - header += [format_scalar_definition('const uint32_t', 'n', n)] - header += [format_scalar_definition('const double', 'a', a[0])] - header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] + header += [format_scalar_definition('const double', 'a', a)] + header += [format_array_definition('double', x_uid, x, + 
alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] + header += [format_array_definition('double', y_uid, y, + alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] + header += [format_array_declaration('double', z_uid, x.shape, + alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] + header += [format_struct_definition('axpy_args_t', 'args', cfg)] result_def = format_array_definition('double', 'g', g) header += [format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h new file mode 100644 index 000000000..aeaa76745 --- /dev/null +++ b/sw/blas/axpy/src/args.h @@ -0,0 +1,17 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, double* z); + +typedef struct { + uint32_t n; + double a; + double *x; + double *y; + double *z; + axpy_fp_t funcptr; +} axpy_args_t; diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h index e8f5ae6c0..d5ded81af 100644 --- a/sw/blas/axpy/src/axpy.h +++ b/sw/blas/axpy/src/axpy.h @@ -2,28 +2,47 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 +#include "args.h" #include "snrt.h" -inline void axpy(uint32_t n, double a, double* x, double* y, double* z) { +#define BANK_ALIGNMENT 8 +#define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT) +#define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT) + +static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); - int offset = core_idx * frac; + int offset = core_idx; + + for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) { + z[i] = a * x[i] + y[i]; + } + snrt_fpu_fence(); +} -#ifndef XSSR +static inline void axpy_fma(uint32_t n, double a, double* x, double* y, double* z) { + int core_idx = snrt_cluster_core_idx(); + int frac = n / snrt_cluster_compute_core_num(); + int offset = core_idx; - for (int i = 0; i < frac; i++) { - z[offset] = a * x[offset] + y[offset]; - offset++; + for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) { + asm volatile ( + "fmadd.d %[z], %[a], %[x], %[y] \n" + : [ z ]"=f"(z[i]) + : [ a ]"f"(a), [ x ]"f"(x[i]), [ y ]"f"(y[i]) + ); } snrt_fpu_fence(); +} -#else +static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) { + int core_idx = snrt_cluster_core_idx(); + int frac = n / snrt_cluster_compute_core_num(); + int offset = core_idx; - // TODO(colluca): revert once Banshee supports SNRT_SSR_DM_ALL - // snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, sizeof(double)); - snrt_ssr_loop_1d(SNRT_SSR_DM0, frac, sizeof(double)); - snrt_ssr_loop_1d(SNRT_SSR_DM1, frac, sizeof(double)); - snrt_ssr_loop_1d(SNRT_SSR_DM2, frac, sizeof(double)); + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, + frac, + snrt_cluster_compute_core_num() * sizeof(double)); snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset); snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y + offset); @@ -36,10 +55,70 @@ inline void axpy(uint32_t n, double a, double* x, double* y, double* z) { "fmadd.d ft2, %[a], ft0, 
ft1\n" : : [ n_frep ] "r"(frac - 1), [ a ] "f"(a) - : "ft0", "ft1", "ft2", "memory"); - + : "ft0", "ft1", "ft2", "memory" + ); + snrt_fpu_fence(); snrt_ssr_disable(); +} +static inline void axpy_job(axpy_args_t *args) { + uint64_t local_x_addr, local_y_addr, local_z_addr; + double *local_x, *local_y, *local_z; + double *remote_x, *remote_y, *remote_z; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(axpy_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; #endif + + // Calculate size and pointers for each cluster + uint32_t frac = args->n / snrt_cluster_num(); + uint32_t offset = frac * snrt_cluster_idx(); + remote_x = args->x + offset; + remote_y = args->y + offset; + remote_z = args->z + offset; + + // Allocate space for job operands in TCDM + // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. 
+ local_x_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t)); + local_y_addr = ALIGN_UP_TCDM(local_x_addr + frac * sizeof(double)) + 8 * BANK_ALIGNMENT; + local_z_addr = ALIGN_UP_TCDM(local_y_addr + frac * sizeof(double)) + 16 * BANK_ALIGNMENT; + local_x = (double *)local_x_addr; + local_y = (double *)local_y_addr; + local_z = (double *)local_z_addr; + + // Copy job operands in TCDM + if (snrt_is_dm_core()) { + size_t size = frac * sizeof(double); + snrt_dma_start_1d(local_x, remote_x, size); + snrt_dma_start_1d(local_y, remote_y, size); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Compute + if (!snrt_is_dm_core()) { + axpy_fp_t fp = args->funcptr; + uint32_t start_cycle = snrt_mcycle(); + fp(frac, args->a, local_x, local_y, local_z); + uint32_t end_cycle = snrt_mcycle(); + } + snrt_cluster_hw_barrier(); + + // Copy data out of TCDM + if (snrt_is_dm_core()) { + size_t size = frac * sizeof(double); + snrt_dma_start_1d(remote_z, local_z, size); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); } diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c index 22f3dd129..83cb58ae8 100644 --- a/sw/blas/axpy/src/main.c +++ b/sw/blas/axpy/src/main.c @@ -4,64 +4,25 @@ #include "snrt.h" -#define XSSR #include "axpy.h" #include "data.h" int main() { - double *local_x, *local_y, *local_z; - double *remote_x, *remote_y, *remote_z; - // Calculate size and pointers for each cluster - uint32_t frac = n / snrt_cluster_num(); - uint32_t offset = frac * snrt_cluster_idx(); - remote_x = x + offset; - remote_y = y + offset; - remote_z = z + offset; - - // Allocate space in TCDM - local_x = (double *)snrt_l1_next(); - local_y = local_x + frac; - local_z = local_y + frac; - - // Copy data in TCDM - if (snrt_is_dm_core()) { - size_t size = frac * sizeof(double); - snrt_dma_start_1d(local_x, remote_x, size); - snrt_dma_start_1d(local_y, remote_y, size); - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - // Compute - if (!snrt_is_dm_core()) { 
- uint32_t start_cycle = snrt_mcycle(); - axpy(frac, a, local_x, local_y, local_z); - uint32_t end_cycle = snrt_mcycle(); - } - - snrt_cluster_hw_barrier(); - - // Copy data out of TCDM - if (snrt_is_dm_core()) { - size_t size = frac * sizeof(double); - snrt_dma_start_1d(remote_z, local_z, size); - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); + axpy_job(&args); // TODO: currently only works for single cluster otherwise need to // synchronize all cores here #ifdef BIST + uint32_t n = args.n; + double* z = args.z; uint32_t nerr = n; // Check computation is correct if (snrt_global_core_idx() == 0) { for (int i = 0; i < n; i++) { - if (local_z[i] == g[i]) nerr--; - printf("%d %d\n", local_z[i], g[i]); + if (z[i] == g[i]) nerr--; + printf("%d %d\n", z[i], g[i]); } } From ca26488027d90e83416a7a1c1a8a10189c50f260 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 12 Aug 2024 11:03:58 +0200 Subject: [PATCH 02/19] sw/blas/axpy: Support multiple tiles --- sw/blas/axpy/data/params.json | 1 + sw/blas/axpy/scripts/datagen.py | 7 +++- sw/blas/axpy/src/args.h | 1 + sw/blas/axpy/src/axpy.h | 72 +++++++++++++++++++-------------- 4 files changed, 49 insertions(+), 32 deletions(-) diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json index ba0e9b476..a4fa15275 100644 --- a/sw/blas/axpy/data/params.json +++ b/sw/blas/axpy/data/params.json @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 { + "n_tiles": 3, "n": 384, "funcptr": "axpy_opt" } diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py index 48d84bce3..af91d886d 100755 --- a/sw/blas/axpy/scripts/datagen.py +++ b/sw/blas/axpy/scripts/datagen.py @@ -26,12 +26,14 @@ def golden_model(self, a, x, y): return a*x + y def validate_config(self, **kwargs): - assert (kwargs['n'] % 8) == 0, "n must be an integer multiple of the number of cores" + assert kwargs['n'] % kwargs['n_tiles'] == 0, "n must be an integer multiple of n_tiles" + n_per_tile = kwargs['n'] // 
kwargs['n_tiles'] + assert (n_per_tile % 8) == 0, "n must be an integer multiple of the number of cores" assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" # Calculate total TCDM occupation # Note: doesn't account for double buffering - vec_size = kwargs['n'] * 8 + vec_size = n_per_tile * 8 total_size = 3 * vec_size data_utils.validate_tcdm_footprint(total_size) @@ -55,6 +57,7 @@ def emit_header(self, **kwargs): 'x': x_uid, 'y': y_uid, 'z': z_uid, + 'n_tiles': kwargs['n_tiles'], 'funcptr': kwargs['funcptr'] } diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h index aeaa76745..0efe3a2b4 100644 --- a/sw/blas/axpy/src/args.h +++ b/sw/blas/axpy/src/args.h @@ -13,5 +13,6 @@ typedef struct { double *x; double *y; double *z; + uint32_t n_tiles; axpy_fp_t funcptr; } axpy_args_t; diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h index d5ded81af..a1fc7cda6 100644 --- a/sw/blas/axpy/src/axpy.h +++ b/sw/blas/axpy/src/axpy.h @@ -63,6 +63,7 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* } static inline void axpy_job(axpy_args_t *args) { + uint32_t frac, offset, size; uint64_t local_x_addr, local_y_addr, local_z_addr; double *local_x, *local_y, *local_z; double *remote_x, *remote_y, *remote_z; @@ -80,12 +81,8 @@ static inline void axpy_job(axpy_args_t *args) { args = local_args; #endif - // Calculate size and pointers for each cluster - uint32_t frac = args->n / snrt_cluster_num(); - uint32_t offset = frac * snrt_cluster_idx(); - remote_x = args->x + offset; - remote_y = args->y + offset; - remote_z = args->z + offset; + // Calculate size of each tile + frac = args->n / args->n_tiles; // Allocate space for job operands in TCDM // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. 
@@ -96,29 +93,44 @@ static inline void axpy_job(axpy_args_t *args) { local_y = (double *)local_y_addr; local_z = (double *)local_z_addr; - // Copy job operands in TCDM - if (snrt_is_dm_core()) { - size_t size = frac * sizeof(double); - snrt_dma_start_1d(local_x, remote_x, size); - snrt_dma_start_1d(local_y, remote_y, size); - snrt_dma_wait_all(); - } - snrt_cluster_hw_barrier(); - - // Compute - if (!snrt_is_dm_core()) { - axpy_fp_t fp = args->funcptr; - uint32_t start_cycle = snrt_mcycle(); - fp(frac, args->a, local_x, local_y, local_z); - uint32_t end_cycle = snrt_mcycle(); + // Iterate over multiple tiles + for (int i = 0; i < args->n_tiles; i++) { + + // DMA in + if (snrt_is_dm_core()) { + + // Calculate size and pointers to current tile + size = frac * sizeof(double); + offset = i * frac; + remote_x = args->x + offset; + remote_y = args->y + offset; + + // Copy job operands in TCDM + snrt_dma_start_1d(local_x, remote_x, size); + snrt_dma_start_1d(local_y, remote_y, size); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Compute + if (!snrt_is_dm_core()) { + axpy_fp_t fp = args->funcptr; + uint32_t start_cycle = snrt_mcycle(); + fp(frac, args->a, local_x, local_y, local_z); + uint32_t end_cycle = snrt_mcycle(); + } + snrt_cluster_hw_barrier(); + + // DMA out + if (snrt_is_dm_core()) { + + // Calculate pointers to current tile + remote_z = args->z + offset; + + // Copy job outputs from TCDM + snrt_dma_start_1d(remote_z, local_z, size); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); } - snrt_cluster_hw_barrier(); - - // Copy data out of TCDM - if (snrt_is_dm_core()) { - size_t size = frac * sizeof(double); - snrt_dma_start_1d(remote_z, local_z, size); - snrt_dma_wait_all(); - } - snrt_cluster_hw_barrier(); } From 7810bf786ec637f32af1768902a70edb6f8bcb62 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 12 Aug 2024 17:58:51 +0200 Subject: [PATCH 03/19] sw/blas/axpy: Add double buffering --- sw/blas/axpy/scripts/datagen.py | 5 +- 
sw/blas/axpy/src/axpy.h | 127 +++++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 38 deletions(-) diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py index af91d886d..cf6795667 100755 --- a/sw/blas/axpy/scripts/datagen.py +++ b/sw/blas/axpy/scripts/datagen.py @@ -8,6 +8,7 @@ import numpy as np import sys +from snitch.util.sim import data_utils from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen @@ -32,9 +33,9 @@ def validate_config(self, **kwargs): assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" # Calculate total TCDM occupation - # Note: doesn't account for double buffering + # Note: doesn't account for gaps created by data alignment vec_size = n_per_tile * 8 - total_size = 3 * vec_size + total_size = 2 * 3 * vec_size data_utils.validate_tcdm_footprint(total_size) def emit_header(self, **kwargs): diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h index a1fc7cda6..c5df546ab 100644 --- a/sw/blas/axpy/src/axpy.h +++ b/sw/blas/axpy/src/axpy.h @@ -5,6 +5,8 @@ #include "args.h" #include "snrt.h" +#define DOUBLE_BUFFER 1 + #define BANK_ALIGNMENT 8 #define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT) #define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT) @@ -64,9 +66,14 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* static inline void axpy_job(axpy_args_t *args) { uint32_t frac, offset, size; - uint64_t local_x_addr, local_y_addr, local_z_addr; - double *local_x, *local_y, *local_z; + uint64_t local_x0_addr, local_y0_addr, local_z0_addr, + local_x1_addr, local_y1_addr, local_z1_addr; + double *local_x[2]; + double *local_y[2]; + double *local_z[2]; double *remote_x, *remote_y, *remote_z; + uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx; + #ifndef JOB_ARGS_PRELOADED // Allocate space for job arguments in 
TCDM @@ -83,54 +90,102 @@ static inline void axpy_job(axpy_args_t *args) { // Calculate size of each tile frac = args->n / args->n_tiles; + size = frac * sizeof(double); // Allocate space for job operands in TCDM // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. - local_x_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t)); - local_y_addr = ALIGN_UP_TCDM(local_x_addr + frac * sizeof(double)) + 8 * BANK_ALIGNMENT; - local_z_addr = ALIGN_UP_TCDM(local_y_addr + frac * sizeof(double)) + 16 * BANK_ALIGNMENT; - local_x = (double *)local_x_addr; - local_y = (double *)local_y_addr; - local_z = (double *)local_z_addr; + local_x0_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t)); + local_y0_addr = ALIGN_UP_TCDM(local_x0_addr + size) + 8 * BANK_ALIGNMENT; + local_z0_addr = ALIGN_UP_TCDM(local_y0_addr + size) + 16 * BANK_ALIGNMENT; + local_x[0] = (double *)local_x0_addr; + local_y[0] = (double *)local_y0_addr; + local_z[0] = (double *)local_z0_addr; + if (DOUBLE_BUFFER) { + local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size); + local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT; + local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT; + local_x[1] = (double *)local_x1_addr; + local_y[1] = (double *)local_y1_addr; + local_z[1] = (double *)local_z1_addr; + } - // Iterate over multiple tiles - for (int i = 0; i < args->n_tiles; i++) { + // Calculate number of iterations + iterations = args->n_tiles; + if (DOUBLE_BUFFER) iterations += 2; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { - // DMA in if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < args->n_tiles)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? 
i_dma_in % 2 : 0; + + // Calculate size and pointers to current tile + offset = i_dma_in * frac; + remote_x = args->x + offset; + remote_y = args->y + offset; + + // Copy job operands in TCDM + snrt_dma_start_1d(local_x[buff_idx], remote_x, size); + snrt_dma_start_1d(local_y[buff_idx], remote_y, size); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); - // Calculate size and pointers to current tile - size = frac * sizeof(double); - offset = i * frac; - remote_x = args->x + offset; - remote_y = args->y + offset; + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0; - // Copy job operands in TCDM - snrt_dma_start_1d(local_x, remote_x, size); - snrt_dma_start_1d(local_y, remote_y, size); - snrt_dma_wait_all(); + // Calculate pointers to current tile + offset = i_dma_out * frac; + remote_z = args->z + offset; + + // Copy job outputs from TCDM + snrt_dma_start_1d(remote_z, local_z[buff_idx], size); + snrt_dma_wait_all(); + + snrt_mcycle(); + } } - snrt_cluster_hw_barrier(); // Compute - if (!snrt_is_dm_core()) { - axpy_fp_t fp = args->funcptr; - uint32_t start_cycle = snrt_mcycle(); - fp(frac, args->a, local_x, local_y, local_z); - uint32_t end_cycle = snrt_mcycle(); - } - snrt_cluster_hw_barrier(); + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); - // DMA out - if (snrt_is_dm_core()) { + if (!DOUBLE_BUFFER || (i > 0 && i < (args->n_tiles + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? 
i_compute % 2 : 0; + + // Perform tile computation + axpy_fp_t fp = args->funcptr; + fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]); - // Calculate pointers to current tile - remote_z = args->z + offset; - - // Copy job outputs from TCDM - snrt_dma_start_1d(remote_z, local_z, size); - snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); } + + // Synchronize cores after every iteration snrt_cluster_hw_barrier(); } } From c91cb215efc8c0a997d7b34c99d6d0f70a56e862 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 13 Aug 2024 18:17:25 +0200 Subject: [PATCH 04/19] sw: Add `ata` kernel --- sw/apps/ata/.gitignore | 1 + sw/apps/ata/data/params.json | 10 + sw/apps/ata/scripts/datagen.py | 72 ++++++ sw/apps/ata/scripts/verify.py | 44 ++++ sw/apps/ata/src/args.h | 19 ++ sw/apps/ata/src/ata.h | 277 +++++++++++++++++++++++ sw/apps/ata/src/main.c | 17 ++ target/snitch_cluster/sw.mk | 1 + target/snitch_cluster/sw/apps/ata/app.mk | 13 ++ 9 files changed, 454 insertions(+) create mode 100644 sw/apps/ata/.gitignore create mode 100644 sw/apps/ata/data/params.json create mode 100755 sw/apps/ata/scripts/datagen.py create mode 100755 sw/apps/ata/scripts/verify.py create mode 100644 sw/apps/ata/src/args.h create mode 100644 sw/apps/ata/src/ata.h create mode 100644 sw/apps/ata/src/main.c create mode 100644 target/snitch_cluster/sw/apps/ata/app.mk diff --git a/sw/apps/ata/.gitignore b/sw/apps/ata/.gitignore new file mode 100644 index 000000000..8485f615e --- /dev/null +++ b/sw/apps/ata/.gitignore @@ -0,0 +1 @@ +data/data.h \ No newline at end of file diff --git a/sw/apps/ata/data/params.json b/sw/apps/ata/data/params.json new file mode 100644 index 000000000..1db35db08 --- /dev/null +++ b/sw/apps/ata/data/params.json @@ -0,0 +1,10 @@ +// Copyright 2024 ETH Zurich and University of Bologna. 
+// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "m": 16, + "n": 4, + "m_tiles": 2, + "funcptr": "ata_opt" +} diff --git a/sw/apps/ata/scripts/datagen.py b/sw/apps/ata/scripts/datagen.py new file mode 100755 index 000000000..11978b918 --- /dev/null +++ b/sw/apps/ata/scripts/datagen.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np + +from snitch.util.sim import data_utils +from snitch.util.sim.data_utils import format_array_definition, format_array_declaration, \ + format_struct_definition, DataGen + + +DOUBLE_BUFFER = True + +class AtaDataGen(DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["ata_baseline", "ata_opt"] + + def golden_model(self, A): + return np.matmul(A, A.transpose()) + + def validate(self, **kwargs): + assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" + m_frac = kwargs['m'] / kwargs['m_tiles'] + assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores" + assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = m_frac * kwargs['n'] * 8 + b_tile_size = m_frac * m_frac * 8 + total_size = 2 * a_tile_size + b_tile_size + if DOUBLE_BUFFER: + total_size *= 2 + data_utils.validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 + B = self.golden_model(A) + + A = A.flatten() + B = B.flatten() + + A_uid = 'A' + B_uid = 'B' + + cfg = { + 'm': 
kwargs['m'], + 'n': kwargs['n'], + 'a': A_uid, + 'b': B_uid, + 'm_tiles': kwargs['m_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [format_array_definition('double', A_uid, A)] + header += [format_array_declaration('double', B_uid, B.shape)] + header += [format_struct_definition('ata_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + AtaDataGen().main() diff --git a/sw/apps/ata/scripts/verify.py b/sw/apps/ata/scripts/verify.py new file mode 100755 index 000000000..1c6b50747 --- /dev/null +++ b/sw/apps/ata/scripts/verify.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import numpy as np +import sys +from datagen import AtaDataGen + +from snitch.util.sim.verif_utils import Verifier + + +class AtaVerifier(Verifier): + + OUTPUT_UIDS = ['B'] + + def __init__(self): + super().__init__() + self.func_args = { + 'm': 'I', + 'n': 'I', + 'A': 'I', + 'B': 'I', + 'm_tiles': 'I', + 'funcptr': 'I' + } + self.func_args = self.get_input_from_symbol('args', self.func_args) + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + + def get_expected_results(self): + A = self.get_input_from_symbol('A', 'double') + A = np.reshape(A, (self.func_args['m'], self.func_args['n'])) + return AtaDataGen().golden_model(A).flatten() + + def check_results(self, *args): + return super().check_results(*args, rtol=1e-10) + + +if __name__ == "__main__": + sys.exit(AtaVerifier().main()) diff --git a/sw/apps/ata/src/args.h b/sw/apps/ata/src/args.h new file mode 100644 index 000000000..520693e22 --- /dev/null +++ b/sw/apps/ata/src/args.h @@ -0,0 +1,19 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include + +typedef void (*ata_fp_t)(uint32_t m, uint32_t n, double *a, double *at,double *b); + +typedef struct { + uint32_t m; + uint32_t n; + double *a; + double *b; + uint32_t m_tiles; + ata_fp_t funcptr; +} ata_args_t; diff --git a/sw/apps/ata/src/ata.h b/sw/apps/ata/src/ata.h new file mode 100644 index 000000000..0e33ea5ff --- /dev/null +++ b/sw/apps/ata/src/ata.h @@ -0,0 +1,277 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +#define DOUBLE_BUFFER 1 + +__thread int setup_ssr = 1; + +void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + b[i * m + j] = 0; + for (uint32_t k = 0; k < n; k++) { + b[i * m + j] += a[i * n + k] * at[j * n + k]; + } + } + } +} + +void ata_baseline(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 8; + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + + double acc = 0; + + for (uint32_t k = 0; k < n; k += unroll) { + asm volatile( + "fmadd.d %[acc], %[a0], %[at0], %[acc] \n" + "fmadd.d %[acc], %[a1], %[at1], %[acc] \n" + "fmadd.d %[acc], %[a2], %[at2], %[acc] \n" + "fmadd.d %[acc], %[a3], %[at3], %[acc] \n" + "fmadd.d %[acc], %[a4], %[at4], %[acc] \n" + "fmadd.d %[acc], %[a5], %[at5], %[acc] \n" + "fmadd.d %[acc], %[a6], %[at6], %[acc] \n" + 
"fmadd.d %[acc], %[a7], %[at7], %[acc] \n" + : [ acc ] "+f"(acc) + : [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ a4 ] "f"(a[i * n + k + 4]), [ a5 ] "f"(a[i * n + k + 5]), + [ a6 ] "f"(a[i * n + k + 6]), [ a7 ] "f"(a[i * n + k + 7]), + [ at0 ] "f"(at[j * n + k + 0]), [ at1 ] "f"(at[j * n + k + 1]), + [ at2 ] "f"(at[j * n + k + 2]), [ at3 ] "f"(at[j * n + k + 3]), + [ at4 ] "f"(at[j * n + k + 4]), [ at5 ] "f"(at[j * n + k + 5]), + [ at6 ] "f"(at[j * n + k + 6]), [ at7 ] "f"(at[j * n + k + 7]) + : + ); + } + + b[i * m + j] = acc; + } + } +} + +void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and At + // for (i = offset; i < m; i += stride) + // for (j1 = 0; j1 < m; j1 += unroll) + // for (k = 0; k < n; k++) + // for (j0 = 0; j0 < unroll; j0++) + // j = j1 + j0 + // ft0.push(a[i * n + k]) + // ft1.push(at[j * n + k]) + const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], + ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, a + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, 
SNRT_SSR_4D, at); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll) { + + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[b0], ft0, ft1, %[b0] \n" + "fmadd.d %[b1], ft0, ft1, %[b1] \n" + "fmadd.d %[b2], ft0, ft1, %[b2] \n" + "fmadd.d %[b3], ft0, ft1, %[b3] \n" + : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]), + [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3]) + : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll) + : "ft0", "ft1", "ft2"); + + b[i * m + j + 0] = acc[0]; + b[i * m + j + 1] = acc[1]; + b[i * m + j + 2] = acc[2]; + b[i * m + j + 3] = acc[3]; + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void ata_job(ata_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_b0_addr, + local_a1_addr, local_at1_addr, local_b1_addr; + double *local_a[2]; + double *local_at[2]; + double *local_b[2]; + uint32_t iterations, sb_iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + ata_args_t *local_args = (ata_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(ata_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + m_frac = args->m / args->m_tiles; + a_tile_size = args->n * m_frac; + b_tile_size = m_frac * m_frac; + a_tile_bytes = a_tile_size * sizeof(double); + b_tile_bytes = b_tile_size * sizeof(double); + + // Allocate space for job operands in TCDM + // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. 
+ local_a0_addr = (uint64_t)args + sizeof(ata_args_t); + local_at0_addr = local_a0_addr + a_tile_bytes; + local_b0_addr = local_at0_addr + a_tile_bytes; + local_a[0] = (double *)local_a0_addr; + local_at[0] = (double *)local_at0_addr; + local_b[0] = (double *)local_b0_addr; + if (DOUBLE_BUFFER) { + local_a1_addr = local_b0_addr + b_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_b1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_b[1] = (double *)local_b1_addr; + } + + // Calculate number of iterations + sb_iterations = args->m_tiles * args->m_tiles; + if (DOUBLE_BUFFER) iterations = sb_iterations + 2; + else iterations = sb_iterations; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + + if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < sb_iterations)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + i_row = i_dma_in / args->m_tiles; + i_col = i_dma_in % args->m_tiles; + + // Copy job operands in TCDM + snrt_dma_load_1d_tile( + local_a[buff_idx], + args->a, + i_row, + a_tile_size, + sizeof(double)); + snrt_dma_load_1d_tile( + local_at[buff_idx], + args->a, + i_col, + a_tile_size, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ? 
i_dma_out % 2 : 0; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile( + args->b, + local_b[buff_idx], + i_row, + i_col, + m_frac, + m_frac, + args->m, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0; + + // Perform tile computation + ata_fp_t fp = args->funcptr; + fp(m_frac, args->n, local_a[buff_idx], + local_at[buff_idx], local_b[buff_idx]); + + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + } + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/ata/src/main.c b/sw/apps/ata/src/main.c new file mode 100644 index 000000000..c8df4bea9 --- /dev/null +++ b/sw/apps/ata/src/main.c @@ -0,0 +1,17 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "ata.h" +#include "data.h" + +int main() { + + ata_job(&args); + + return 0; +} diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index ca8246124..0a1e4c00c 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -63,6 +63,7 @@ APPS += sw/apps/dnn/concat APPS += sw/apps/dnn/fused_concat_linear APPS += sw/apps/dnn/transpose APPS += sw/apps/montecarlo/pi_estimation +APPS += sw/apps/ata APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance diff --git a/target/snitch_cluster/sw/apps/ata/app.mk b/target/snitch_cluster/sw/apps/ata/app.mk new file mode 100644 index 000000000..af63400b4 --- /dev/null +++ b/target/snitch_cluster/sw/apps/ata/app.mk @@ -0,0 +1,13 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := ata +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build +SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +SRCS := $(SRC_DIR)/main.c + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk From f63c6c9f41ee7fd02230880f3965d26dca4c1617 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 14 Aug 2024 00:25:29 +0200 Subject: [PATCH 05/19] sw: Add optimized covariance kernel --- sw/apps/covariance/data/params.json | 6 +- sw/apps/covariance/roi.json | 36 +++ sw/apps/covariance/scripts/datagen.py | 61 +++-- sw/apps/covariance/scripts/verify.py | 20 +- sw/apps/covariance/src/args.h | 22 ++ sw/apps/covariance/src/covariance.h | 375 +++++++++++++++++++++++--- sw/apps/covariance/src/main.c | 51 +--- 7 files changed, 471 insertions(+), 100 deletions(-) create mode 100644 sw/apps/covariance/roi.json create mode 100644 sw/apps/covariance/src/args.h diff --git a/sw/apps/covariance/data/params.json 
b/sw/apps/covariance/data/params.json index 9e89d9f85..5ae088d97 100644 --- a/sw/apps/covariance/data/params.json +++ b/sw/apps/covariance/data/params.json @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 { - M: 16, - N: 8 + "m": 32, + "n": 2, + "m_tiles": 2, + "funcptr": "covariance_opt" } diff --git a/sw/apps/covariance/roi.json b/sw/apps/covariance/roi.json new file mode 100644 index 000000000..757a2ce6d --- /dev/null +++ b/sw/apps/covariance/roi.json @@ -0,0 +1,36 @@ +[ + <% DOUBLE_BUFFER = 1 %> + <% N_TILES = 4 %> + + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{j}'}", + "roi": [ + % for i in range(0, N_TILES): + {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, + % endfor + ] + }, + % endfor + + // DMA core + { + "thread": "hart_8", + "roi": [ + % if not DOUBLE_BUFFER: + % for i in range(0, N_TILES): + {"idx": ${4 * i + 1}, "label": "${f'tile_{i}_in'}"}, + {"idx": ${4 * i + 3}, "label": "${f'tile_{i}_out'}"}, + % endfor + % else: + {"idx": 1, "label": "tile_0_in"}, + % for i in range(1, N_TILES): + {"idx": ${4 * (i - 1) + 3}, "label": "${f'tile_{i}_in'}"}, + {"idx": ${4 * (i - 1) + 5}, "label": "${f'tile_{i-1}_out'}"}, + % endfor + {"idx": ${4 * (i - 1) + 7}, "label": "${f'tile_{N_TILES-1}_out'}"}, + % endif + ] + } +] \ No newline at end of file diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py index 44e20d55e..165fc30a5 100755 --- a/sw/apps/covariance/scripts/datagen.py +++ b/sw/apps/covariance/scripts/datagen.py @@ -8,38 +8,67 @@ import numpy as np -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_array_declaration, format_ifdef_wrapper, DataGen +from snitch.util.sim import data_utils +from snitch.util.sim.data_utils import format_array_definition, \ + format_array_declaration, format_struct_definition, DataGen -# AXI splits bursts crossing 4KB address boundaries.
To minimize -# the occurrence of these splits the data should be aligned to 4KB -BURST_ALIGNMENT = 4096 +DOUBLE_BUFFER = True class CovarianceDataGen(DataGen): + # Function pointers to alternative implementations + FUNCPTRS = ["covariance_naive", "covariance_opt"] + def golden_model(self, data): return np.cov(data, rowvar=False) + def validate(self, **kwargs): + assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" + m_per_tile = kwargs['m'] / kwargs['m_tiles'] + assert (m_per_tile % 8) == 0, "m_per_tile must be an integer multiple of the number of cores" + assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4" + m_per_core = m_per_tile / 8 + assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of the unroll0 = 2" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = m_per_tile * kwargs['n'] * 8 + b_tile_size = m_per_tile * m_per_tile * 8 + total_size = 2 * a_tile_size + b_tile_size + if DOUBLE_BUFFER: + total_size *= 2 + data_utils.validate_tcdm_footprint(total_size) + def emit_header(self, **kwargs): header = [super().emit_header()] - M, N = kwargs['M'], kwargs['N'] - data = np.random.randint(-200, 100, size=(N, M)) - cov = self.golden_model(data) + self.validate(**kwargs) - assert (M % 8) == 0, "M must be an integer multiple of the number of cores" + data = np.random.randint(-200, 100, size=(kwargs['n'], kwargs['m'])) + cov = self.golden_model(data) - data = data.flatten() + data = data.transpose().flatten() cov = cov.flatten() - header += [format_scalar_definition('uint32_t', 'M', M)] - header += [format_scalar_definition('uint32_t', 'N', N)] - header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)] - header += [format_array_declaration('double', 'cov', cov.shape, alignment=BURST_ALIGNMENT)] - result_def = format_array_definition('double', 'golden', cov, 
alignment=BURST_ALIGNMENT) - header += [format_ifdef_wrapper('BIST', result_def)] + data_uid = 'data' + cov_uid = 'cov' + + cfg = { + 'm': kwargs['m'], + 'n': kwargs['n'], + 'inv_n': 1 / kwargs['n'], + 'inv_n_m1': 1 / (kwargs['n'] - 1), + 'data': data_uid, + 'cov': cov_uid, + 'm_tiles': kwargs['m_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [format_array_definition('double', data_uid, data)] + header += [format_array_declaration('double', cov_uid, cov.shape)] + header += [format_struct_definition('covariance_args_t', 'args', cfg)] header = '\n\n'.join(header) return header diff --git a/sw/apps/covariance/scripts/verify.py b/sw/apps/covariance/scripts/verify.py index 4c5b0cdd1..a390d83d1 100755 --- a/sw/apps/covariance/scripts/verify.py +++ b/sw/apps/covariance/scripts/verify.py @@ -16,14 +16,26 @@ class CovarianceVerifier(Verifier): OUTPUT_UIDS = ['cov'] + def __init__(self): + super().__init__() + self.func_args = { + 'm': 'I', + 'n': 'I', + 'inv_n': 'd', + 'inv_n_m1': 'd', + 'data': 'I', + 'cov': 'I', + 'm_tiles': 'I', + 'funcptr': 'I' + } + self.func_args = self.get_input_from_symbol('args', self.func_args) + def get_actual_results(self): - return self.get_output_from_symbol('cov', 'double') + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') def get_expected_results(self): - M = self.get_input_from_symbol('M', 'uint32_t')[0] - N = self.get_input_from_symbol('N', 'uint32_t')[0] data = self.get_input_from_symbol('data', 'double') - data = np.reshape(data, (N, M)) + data = np.reshape(data, (self.func_args['m'], self.func_args['n'])).transpose() return CovarianceDataGen().golden_model(data).flatten() def check_results(self, *args): diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h new file mode 100644 index 000000000..f88768dd5 --- /dev/null +++ b/sw/apps/covariance/src/args.h @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. 
// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include <stdint.h> + +typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n, + double inv_n_m1, double *data, double *datat,double *cov); + +typedef struct { + uint32_t m; + uint32_t n; + double inv_n; + double inv_n_m1; + double *data; + double *cov; + uint32_t m_tiles; + covariance_fp_t funcptr; +} covariance_args_t; diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h index fec79d195..41c33a93b 100644 --- a/sw/apps/covariance/src/covariance.h +++ b/sw/apps/covariance/src/covariance.h @@ -5,50 +5,359 @@ // Author: Jose Pedro Castro Fonseca // Luca Colagrande -#include +#include "args.h" #include "snrt.h" -void kernel_covariance(uint32_t N, uint32_t M, double *data, double *cov) { - int i1, i, j, k; - int core_range, core_offset; - - // Compute deviations - if (snrt_is_compute_core()) { - // Distribute different attributes to the different cores - core_range = M / snrt_cluster_compute_core_num(); - core_offset = snrt_cluster_core_idx() * core_range; - for (i1 = 0; i1 < core_range; i1++) { - i = core_offset + i1; - - // Calculate mean vector - double mean = 0.0; - for (k = 0; k < N; k++) { - mean += data[k * M + i]; - } - mean = mean / N; +#define DOUBLE_BUFFER 1 + +void covariance_naive(uint32_t m, uint32_t n, double inv_n, + double inv_n_m1, double *data, double *datat, + double *cov) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Center data + for (uint32_t i = offset; i < m; i += stride) { + + // Calculate row mean + double data_mean = 0.0; + double datat_mean = 0.0; + for (uint32_t j = 0; j < n; j++) { + data_mean += data[i * n + j]; + datat_mean += datat[i * n + j]; + } + data_mean = data_mean * inv_n; + datat_mean = datat_mean * inv_n; + + // Center row around zero + for (uint32_t j = 0; j < n;
j++) { + data[i * n + j] -= data_mean; + datat[i * n + j] -= datat_mean; + } + } - // Standardize data to zero mean - for (k = 0; k < N; k++) { - data[k * M + i] -= mean; + snrt_fpu_fence(); + snrt_cluster_hw_barrier(); + + // Compute covariance matrix + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + cov[i * m + j] = 0.0; + for (uint32_t k = 0; k < n; k++) { + cov[i * m + j] += data[i * n + k] * datat[j * n + k]; } + cov[i * m + j] *= inv_n_m1; } + } +} + +void covariance_opt(uint32_t m, uint32_t n, double inv_n, + double inv_n_m1, double *data, double *datat, + double *cov) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll0 = 2; + + // Configure ft0 and ft1 to load data and datat elements + // for (k = 0; k < 2; k++) + // for (i1 = offset; i1 < m; i1 += stride * unroll0) + // for (j = 0; j < n; j++) + // for (i0 = 0; i0 < unroll0; i0++) + // i = i1 + i0 * stride + // ft0.push(data[i * n + j]) + // ft1.push(datat[i * n + j]) + const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)}; + const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), + 0, sizeof(double) * n * stride * unroll0}; + snrt_ssr_loop_4d(SNRT_SSR_DM0, + ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], + ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); + snrt_ssr_loop_4d(SNRT_SSR_DM1, + ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], + ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); + // Configure ft2 to store data and datat elements + // for (i1 = offset; i1 < m; i1 += stride * unroll0) + // for (j = 0; j < n; j++) + // for (i0 = 0; i0 < unroll0; i0++) + // i = i1 + i0 * stride + // data[i * n + j] = ft2.pop() + // datat[i * n + j] = ft2.pop() + const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)}; + const uint32_t 
ssr2_i[4] = {(uint32_t)datat - (uint32_t)data, + sizeof(double) * n * stride, + sizeof(double), + sizeof(double) * n * stride * unroll0}; + snrt_ssr_loop_4d(SNRT_SSR_DM2, + ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3], + ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]); + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat + offset * n); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_4D, data + offset * n); + snrt_ssr_enable(); + + // Center data + for (uint32_t i = offset; i < m; i += stride * unroll0) { + + // Calculate row means + double m[2 * unroll0]; + m[0] = 0.0; // mean(data[i]) + m[1] = 0.0; // mean(datat[i]) + m[2] = 0.0; // mean(data[i + stride]) + m[3] = 0.0; // mean(datat[i + stride]) + asm volatile( + "frep.o %[n_frep], %[n_insn], 0, 0 \n" + "fadd.d %[m0], ft0, %[m0] \n" + "fadd.d %[m1], ft1, %[m1] \n" + "fadd.d %[m2], ft0, %[m2] \n" + "fadd.d %[m3], ft1, %[m3] \n" + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), + [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) + : "ft0", "ft1", "ft2"); + m[0] *= inv_n; + m[1] *= inv_n; + m[2] *= inv_n; + m[3] *= inv_n; + snrt_fpu_fence(); + + // Center row around zero + asm volatile( + "frep.o %[n_frep], %[n_insn], 0, 0 \n" + "fsub.d ft2, ft0, %[m0] \n" + "fsub.d ft2, ft1, %[m1] \n" + "fsub.d ft2, ft0, %[m2] \n" + "fsub.d ft2, ft1, %[m3] \n" + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), + [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) + : "ft0", "ft1", "ft2"); } + snrt_ssr_disable(); + + snrt_fpu_fence(); snrt_cluster_hw_barrier(); - // Compute covariance - if (snrt_is_compute_core()) { - for (i1 = 0; i1 < core_range; i1++) { - i = core_offset + i1; - for (j = 0; j <= i; j++) { - double tmp = 0.0; - for (k = 0; k < N; k++) { - tmp += data[k * M + i] * data[k * M + j]; - } - cov[i * M + j] = tmp / (N - 1); - cov[j * M + i] = cov[i * M + j]; 
+ // The following is taken from the AtA kernel, apart from the normalization + // by 1/(n - 1). + // Here data stands for A and datat for At. + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll1 = 4; + + // Configure ft0 and ft1 to load A and At + // for (i = offset; i < m; i += stride) + // for (j1 = 0; j1 < m; j1 += unroll1) + // for (k = 0; k < n; k++) + // for (j0 = 0; j0 < unroll1; j0++) + // j = j1 + j0 + // ft0.push(a[i * n + k]) + // ft1.push(at[j * n + k]) + const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, + ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll1); + const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, + ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], + ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll1) { + + double acc[unroll1]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll1], 0, 0 \n" + "fmadd.d %[b0], ft0, ft1, %[b0] \n" + "fmadd.d %[b1], ft0, ft1, %[b1] \n" + "fmadd.d %[b2], ft0, ft1, %[b2] \n" + "fmadd.d %[b3], ft0, ft1, %[b3] \n" + : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]), + [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3]) + : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1) + : "ft0", "ft1", "ft2"); + + snrt_ssr_disable(); + + cov[i * m + j + 0] = acc[0] * inv_n_m1; + cov[i * m + j + 
1] = acc[1] * inv_n_m1; + cov[i * m + j + 2] = acc[2] * inv_n_m1; + cov[i * m + j + 3] = acc[3] * inv_n_m1; + + snrt_ssr_enable(); + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void covariance_job(covariance_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_b0_addr, + local_a1_addr, local_at1_addr, local_b1_addr; + double *local_a[2]; + double *local_at[2]; + double *local_b[2]; + uint32_t iterations, sb_iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + covariance_args_t *local_args = (covariance_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(covariance_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + m_frac = args->m / args->m_tiles; + a_tile_size = args->n * m_frac; + b_tile_size = m_frac * m_frac; + a_tile_bytes = a_tile_size * sizeof(double); + b_tile_bytes = b_tile_size * sizeof(double); + + // Allocate space for job operands in TCDM + // Buffers are placed contiguously after the job arguments: a, at, then b.
+ local_a0_addr = (uint64_t)args + sizeof(covariance_args_t); + local_at0_addr = local_a0_addr + a_tile_bytes; + local_b0_addr = local_at0_addr + a_tile_bytes; + local_a[0] = (double *)local_a0_addr; + local_at[0] = (double *)local_at0_addr; + local_b[0] = (double *)local_b0_addr; + if (DOUBLE_BUFFER) { + local_a1_addr = local_b0_addr + b_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_b1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_b[1] = (double *)local_b1_addr; + } + + // Calculate number of iterations + sb_iterations = args->m_tiles * args->m_tiles; + if (DOUBLE_BUFFER) iterations = sb_iterations + 2; + else iterations = sb_iterations; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + + if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < sb_iterations)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + i_row = i_dma_in / args->m_tiles; + i_col = i_dma_in % args->m_tiles; + + // Copy job operands in TCDM + snrt_dma_load_1d_tile( + local_a[buff_idx], + args->data, + i_row, + a_tile_size, + sizeof(double)); + snrt_dma_load_1d_tile( + local_at[buff_idx], + args->data, + i_col, + a_tile_size, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // Additional barrier required to synchronize the compute cores + // among them after the data centering phase + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) + snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ? 
i_dma_out % 2 : 0; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile( + args->cov, + local_b[buff_idx], + i_row, + i_col, + m_frac, + m_frac, + args->m, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0; + + // Perform tile computation + covariance_fp_t fp = args->funcptr; + fp(m_frac, args->n, args->inv_n, args->inv_n_m1, + local_a[buff_idx], local_at[buff_idx], local_b[buff_idx]); + + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); } + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); } } diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c index 26b151393..3c9d225a8 100644 --- a/sw/apps/covariance/src/main.c +++ b/sw/apps/covariance/src/main.c @@ -1,56 +1,17 @@ -// Copyright 2023 ETH Zurich and University of Bologna. +// Copyright 2024 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 // -// Author: Jose Pedro Castro Fonseca -// Luca Colagrande +// Author: Luca Colagrande + +#include "snrt.h" #include "covariance.h" #include "data.h" -#define MAX_ERROR 1e-10 - int main() { - uint32_t nerr = 0; - double *local_mean; - double *local_cov; - double *local_data; - double diff; - - local_data = snrt_l1_next(); - local_cov = local_data + N * M; - - // Initialize input matrix - if (snrt_is_dm_core()) { - snrt_dma_start_1d(local_data, data, sizeof(double) * N * M); - snrt_dma_wait_all(); - } - snrt_cluster_hw_barrier(); - - // Perform Computations - kernel_covariance(N, M, local_data, local_cov); - snrt_cluster_hw_barrier(); - - // Writeback outputs - if (snrt_is_dm_core()) { - snrt_dma_start_1d(cov, local_cov, sizeof(double) * M * M); - snrt_dma_wait_all(); - } - snrt_cluster_hw_barrier(); -#ifdef BIST - // Check computation is correct - if (snrt_cluster_core_idx() == 0) { - for (int i = 0; i < M; i++) { - for (int j = 0; j < M; j++) { - diff = fabs(golden[i * M + j] - local_cov[i * M + j]); - if (diff > MAX_ERROR) { - nerr++; - } - } - } - } -#endif + covariance_job(&args); - return nerr; + return 0; } From 601e3958e33d470804e3455ccd4f27984f9ecda2 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 09:35:23 +0200 Subject: [PATCH 06/19] gen_trace.py: Do not return on exception Ensures that performance metrics are dumped even if the simulation didn't terminate successfully. --- util/trace/gen_trace.py | 1 - 1 file changed, 1 deletion(-) diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index db094ad7e..0fab642e0 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -1145,7 +1145,6 @@ def main(): message += 'line {lineno}.' 
print(traceback.format_exc(), file=sys.stderr) print(message, file=sys.stderr) - return 1 else: break # Nothing more in pipe, EOF perf_metrics[-1]['tend'] = time_info[0] / 1000 From 7de5657efa8a300e1dbeb12169c53d4c81c57f85 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 09:36:50 +0200 Subject: [PATCH 07/19] target: Delete performance dumps on `clean-traces` --- target/common/common.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/target/common/common.mk b/target/common/common.mk index 70afd80c2..995e80ba0 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -203,6 +203,7 @@ SNITCH_DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null) SNITCH_TXT_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.txt/g')) SNITCH_ANNOTATED_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.s/g')) SNITCH_PERF_DUMPS = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +DMA_PERF_DUMPS = $(LOGS_DIR)/dma_*_perf.json TXT_TRACES += $(SNITCH_TXT_TRACES) ANNOTATED_TRACES += $(SNITCH_ANNOTATED_TRACES) @@ -219,7 +220,7 @@ annotate: $(ANNOTATED_TRACES) perf: $(JOINT_PERF_DUMP) visual-trace: $(VISUAL_TRACE) clean-traces: - rm -f $(TXT_TRACES) + rm -f $(TXT_TRACES) $(SNITCH_PERF_DUMPS) $(DMA_PERF_DUMPS) clean-annotate: rm -f $(ANNOTATED_TRACES) clean-perf: From ff3c3e3cfbebbdcd059cd4f21b2214a2f199f6c1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 14:33:54 +0200 Subject: [PATCH 08/19] ata: Generalize and optimize --- sw/apps/ata/scripts/datagen.py | 10 +-- sw/apps/ata/scripts/verify.py | 3 +- sw/apps/ata/src/args.h | 4 +- sw/apps/ata/src/ata.h | 110 +++++++++++++++++++++------------ 4 files changed, 82 insertions(+), 45 deletions(-) diff --git a/sw/apps/ata/scripts/datagen.py b/sw/apps/ata/scripts/datagen.py index 11978b918..f6474f2e6 100755 --- a/sw/apps/ata/scripts/datagen.py +++ b/sw/apps/ata/scripts/datagen.py @@ -17,10 +17,10 @@ 
class AtaDataGen(DataGen): # Function pointers to alternative implementations - FUNCPTRS = ["ata_baseline", "ata_opt"] + FUNCPTRS = ["ata_naive", "ata_baseline", "ata_opt"] - def golden_model(self, A): - return np.matmul(A, A.transpose()) + def golden_model(self, alpha, A): + return alpha * np.matmul(A, A.transpose()) def validate(self, **kwargs): assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" @@ -43,7 +43,8 @@ def emit_header(self, **kwargs): self.validate(**kwargs) A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 - B = self.golden_model(A) + alpha = np.random.randint(-200, 100)/100 + B = self.golden_model(alpha, A) A = A.flatten() B = B.flatten() @@ -52,6 +53,7 @@ def emit_header(self, **kwargs): B_uid = 'B' cfg = { + 'alpha': alpha, 'm': kwargs['m'], 'n': kwargs['n'], 'a': A_uid, diff --git a/sw/apps/ata/scripts/verify.py b/sw/apps/ata/scripts/verify.py index 1c6b50747..206af870a 100755 --- a/sw/apps/ata/scripts/verify.py +++ b/sw/apps/ata/scripts/verify.py @@ -19,6 +19,7 @@ class AtaVerifier(Verifier): def __init__(self): super().__init__() self.func_args = { + 'alpha': 'd', 'm': 'I', 'n': 'I', 'A': 'I', @@ -34,7 +35,7 @@ def get_actual_results(self): def get_expected_results(self): A = self.get_input_from_symbol('A', 'double') A = np.reshape(A, (self.func_args['m'], self.func_args['n'])) - return AtaDataGen().golden_model(A).flatten() + return AtaDataGen().golden_model(self.func_args['alpha'], A).flatten() def check_results(self, *args): return super().check_results(*args, rtol=1e-10) diff --git a/sw/apps/ata/src/args.h b/sw/apps/ata/src/args.h index 520693e22..f65a6a13f 100644 --- a/sw/apps/ata/src/args.h +++ b/sw/apps/ata/src/args.h @@ -7,9 +7,11 @@ #pragma once #include -typedef void (*ata_fp_t)(uint32_t m, uint32_t n, double *a, double *at,double *b); +typedef void (*ata_fp_t)(double alpha, uint32_t m, uint32_t n, double *a, + double *at, double *b); typedef struct { + double alpha; uint32_t 
m; uint32_t n; double *a; diff --git a/sw/apps/ata/src/ata.h b/sw/apps/ata/src/ata.h index 0e33ea5ff..8673353a4 100644 --- a/sw/apps/ata/src/ata.h +++ b/sw/apps/ata/src/ata.h @@ -11,7 +11,7 @@ __thread int setup_ssr = 1; -void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) { +void ata_naive(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -21,53 +21,83 @@ void ata_naive(uint32_t m, uint32_t n, double *a, double *at, double *b) { for (uint32_t k = 0; k < n; k++) { b[i * m + j] += a[i * n + k] * at[j * n + k]; } + b[i * m + j] *= alpha; } } } -void ata_baseline(uint32_t m, uint32_t n, double *a, double *at, double *b) { +void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); - // Unrolling factor of innermost loop + // Unrolling factors // Note: changes must be reflected in the inline assembly code // and datagen script - const uint32_t unroll = 8; + const uint32_t unroll1 = 4; + const uint32_t unroll0 = 4; for (uint32_t i = offset; i < m; i += stride) { - for (uint32_t j = 0; j < m; j++) { + for (uint32_t j = 0; j < m; j += unroll1) { - double acc = 0; + double acc[4]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; - for (uint32_t k = 0; k < n; k += unroll) { + for (uint32_t k = 0; k < n; k += unroll0) { asm volatile( - "fmadd.d %[acc], %[a0], %[at0], %[acc] \n" - "fmadd.d %[acc], %[a1], %[at1], %[acc] \n" - "fmadd.d %[acc], %[a2], %[at2], %[acc] \n" - "fmadd.d %[acc], %[a3], %[at3], %[acc] \n" - "fmadd.d %[acc], %[a4], %[at4], %[acc] \n" - "fmadd.d %[acc], %[a5], %[at5], %[acc] \n" - "fmadd.d %[acc], %[a6], %[at6], %[acc] \n" - "fmadd.d %[acc], %[a7], %[at7], %[acc] \n" - : [ acc ] "+f"(acc) - : [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), - [ a2 ] "f"(a[i * n + k + 2]), [ 
a3 ] "f"(a[i * n + k + 3]), - [ a4 ] "f"(a[i * n + k + 4]), [ a5 ] "f"(a[i * n + k + 5]), - [ a6 ] "f"(a[i * n + k + 6]), [ a7 ] "f"(a[i * n + k + 7]), - [ at0 ] "f"(at[j * n + k + 0]), [ at1 ] "f"(at[j * n + k + 1]), - [ at2 ] "f"(at[j * n + k + 2]), [ at3 ] "f"(at[j * n + k + 3]), - [ at4 ] "f"(at[j * n + k + 4]), [ at5 ] "f"(at[j * n + k + 5]), - [ at6 ] "f"(at[j * n + k + 6]), [ at7 ] "f"(at[j * n + k + 7]) + "fmadd.d %[acc0], %[a0], %[at0], %[acc0] \n" + "fmadd.d %[acc1], %[a0], %[at1], %[acc1] \n" + "fmadd.d %[acc2], %[a0], %[at2], %[acc2] \n" + "fmadd.d %[acc3], %[a0], %[at3], %[acc3] \n" + "fmadd.d %[acc0], %[a1], %[at4], %[acc0] \n" + "fmadd.d %[acc1], %[a1], %[at5], %[acc1] \n" + "fmadd.d %[acc2], %[a1], %[at6], %[acc2] \n" + "fmadd.d %[acc3], %[a1], %[at7], %[acc3] \n" + "fmadd.d %[acc0], %[a2], %[at8], %[acc0] \n" + "fmadd.d %[acc1], %[a2], %[at9], %[acc1] \n" + "fmadd.d %[acc2], %[a2], %[at10], %[acc2] \n" + "fmadd.d %[acc3], %[a2], %[at11], %[acc3] \n" + "fmadd.d %[acc0], %[a3], %[at12], %[acc0] \n" + "fmadd.d %[acc1], %[a3], %[at13], %[acc1] \n" + "fmadd.d %[acc2], %[a3], %[at14], %[acc2] \n" + "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ a0 ] "f"(a[i * n + k + 0]), + [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), + [ a3 ] "f"(a[i * n + k + 3]), + [ at0 ] "f"(at[(j + 0) * n + k]), + [ at1 ] "f"(at[(j + 1) * n + k]), + [ at2 ] "f"(at[(j + 2) * n + k]), + [ at3 ] "f"(at[(j + 3) * n + k]), + [ at4 ] "f"(at[(j + 0) * n + k + 1]), + [ at5 ] "f"(at[(j + 1) * n + k + 1]), + [ at6 ] "f"(at[(j + 2) * n + k + 1]), + [ at7 ] "f"(at[(j + 3) * n + k + 1]), + [ at8 ] "f"(at[(j + 0) * n + k + 2]), + [ at9 ] "f"(at[(j + 1) * n + k + 2]), + [ at10 ] "f"(at[(j + 2) * n + k + 2]), + [ at11 ] "f"(at[(j + 3) * n + k + 2]), + [ at12 ] "f"(at[(j + 0) * n + k + 3]), + [ at13 ] "f"(at[(j + 1) * n + k + 3]), + [ at14 ] "f"(at[(j + 2) * n + k + 3]), + [ at15 ] 
"f"(at[(j + 3) * n + k + 3]) : ); } - b[i * m + j] = acc; + b[i * m + j + 0] = alpha * acc[0]; + b[i * m + j + 1] = alpha * acc[1]; + b[i * m + j + 2] = alpha * acc[2]; + b[i * m + j + 3] = alpha * acc[3]; } } } -void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) { +void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -114,19 +144,21 @@ void ata_opt(uint32_t m, uint32_t n, double *a, double *at, double *b) { asm volatile( "frep.o %[n_frep], %[unroll], 0, 0 \n" - "fmadd.d %[b0], ft0, ft1, %[b0] \n" - "fmadd.d %[b1], ft0, ft1, %[b1] \n" - "fmadd.d %[b2], ft0, ft1, %[b2] \n" - "fmadd.d %[b3], ft0, ft1, %[b3] \n" - : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]), - [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3]) - : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll) + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" + "fmul.d %[b0], %[acc0], %[alpha] \n" + "fmul.d %[b1], %[acc1], %[alpha] \n" + "fmul.d %[b2], %[acc2], %[alpha] \n" + "fmul.d %[b3], %[acc3], %[alpha] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]), + [ b0 ] "=f"(b[i * m + j + 0]), [ b1 ] "=f"(b[i * m + j + 1]), + [ b2 ] "=f"(b[i * m + j + 2]), [ b3 ] "=f"(b[i * m + j + 3]) + : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll), + [ alpha ] "f"(alpha) : "ft0", "ft1", "ft2"); - - b[i * m + j + 0] = acc[0]; - b[i * m + j + 1] = acc[1]; - b[i * m + j + 2] = acc[2]; - b[i * m + j + 3] = acc[3]; } } @@ -262,7 +294,7 @@ void ata_job(ata_args_t *args) { // Perform tile computation ata_fp_t fp = args->funcptr; - fp(m_frac, args->n, local_a[buff_idx], + fp(args->alpha, m_frac, args->n, local_a[buff_idx], local_at[buff_idx], local_b[buff_idx]); snrt_mcycle(); From 018f77aa6766750e003fc661c55005d55f820354 Mon Sep 17 
00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 14:34:53 +0200 Subject: [PATCH 09/19] sw: Allow apps to extend `INCDIRS` --- sw/apps/common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sw/apps/common.mk b/sw/apps/common.mk index 89f5da9f6..6bdc85984 100644 --- a/sw/apps/common.mk +++ b/sw/apps/common.mk @@ -13,7 +13,7 @@ DATA_H := $($(APP)_BUILD_DIR)/data.h DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py $(APP)_HEADERS := $(DATA_H) -$(APP)_INCDIRS := $(dir $(DATA_H)) $(SRC_DIR) +$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR) $(dir $(DATA_H)): mkdir -p $@ From 3e975e9dbc75d3e76b95c8698644057aa7371c3f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 14:36:32 +0200 Subject: [PATCH 10/19] covariance: Fix bug and optimize baseline --- sw/apps/covariance/roi.json | 36 --------- sw/apps/covariance/scripts/datagen.py | 9 ++- sw/apps/covariance/src/covariance.h | 76 ++++++++++++------- .../snitch_cluster/sw/apps/covariance/app.mk | 1 + 4 files changed, 57 insertions(+), 65 deletions(-) delete mode 100644 sw/apps/covariance/roi.json diff --git a/sw/apps/covariance/roi.json b/sw/apps/covariance/roi.json deleted file mode 100644 index 757a2ce6d..000000000 --- a/sw/apps/covariance/roi.json +++ /dev/null @@ -1,36 +0,0 @@ -[ - <% DOUBLE_BUFFER = 1 %> - <% N_TILES = 4 %> - - // Compute cores - % for j in range(0, 8): - { - "thread": "${f'hart_{j}'}", - "roi": [ - % for i in range(0, N_TILES): - {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, - % endfor - ] - }, - % endfor - - // DMA core - { - "thread": "hart_8", - "roi": [ - % if not DOUBLE_BUFFER: - % for i in range(0, N_TILES): - {"idx": ${4 * i + 1}, "label": "${f'tile_{i}_in'}"}, - {"idx": ${4 * i + 3}, "label": "${f'tile_{i}_out'}"}, - % endfor - % else: - {"idx": 1, "label": "tile_0_in"}, - % for i in range(1, N_TILES): - {"idx": ${4 * (i - 1) + 3}, "label": "${f'tile_{i}_in'}"}, - {"idx": ${4 * (i - 1) + 5}, "label": "${f'tile_{i-1}_out'}"}, - % endfor - {"idx": ${4 * (i - 1) 
+ 7}, "label": "tile_15_out"}, - % endif - ] - } -] \ No newline at end of file diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py index 165fc30a5..c3b7cd8b3 100755 --- a/sw/apps/covariance/scripts/datagen.py +++ b/sw/apps/covariance/scripts/datagen.py @@ -12,6 +12,7 @@ from snitch.util.sim.data_utils import format_array_definition, \ format_array_declaration, format_struct_definition, DataGen +np.random.seed(42) DOUBLE_BUFFER = True @@ -19,17 +20,19 @@ class CovarianceDataGen(DataGen): # Function pointers to alternative implementations - FUNCPTRS = ["covariance_naive", "covariance_opt"] + FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"] def golden_model(self, data): return np.cov(data, rowvar=False) def validate(self, **kwargs): + n_cores = 8 assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" m_per_tile = kwargs['m'] / kwargs['m_tiles'] - assert (m_per_tile % 8) == 0, "m_per_tile must be an integer multiple of the number of cores" + assert (m_per_tile % n_cores) == 0, \ + "m_per_tile must be an integer multiple of the number of cores" assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4" - m_per_core = m_per_tile / 8 + m_per_core = m_per_tile / n_cores assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of the unroll0 = 2" assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h index 41c33a93b..29c6aec69 100644 --- a/sw/apps/covariance/src/covariance.h +++ b/sw/apps/covariance/src/covariance.h @@ -7,6 +7,7 @@ #include "args.h" #include "snrt.h" +#include "ata.h" #define DOUBLE_BUFFER 1 @@ -40,15 +41,40 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n, snrt_cluster_hw_barrier(); // Compute covariance matrix + ata_naive(inv_n_m1, m, n, data, datat, cov); +} + +void 
covariance_baseline(uint32_t m, uint32_t n, double inv_n, + double inv_n_m1, double *data, double *datat, + double *cov) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Center data for (uint32_t i = offset; i < m; i += stride) { - for (uint32_t j = 0; j < m; j++) { - cov[i * m + j] = 0.0; - for (uint32_t k = 0; k < n; k++) { - cov[i * m + j] += data[i * n + k] * datat[j * n + k]; - } - cov[i * m + j] *= inv_n_m1; + + // Calculate row mean + double data_mean = 0.0; + double datat_mean = 0.0; + for (uint32_t j = 0; j < n; j++) { + data_mean += data[i * n + j]; + datat_mean += datat[i * n + j]; + } + data_mean = data_mean * inv_n; + datat_mean = datat_mean * inv_n; + + // Center row around zero + for (uint32_t j = 0; j < n; j++) { + data[i * n + j] -= data_mean; + datat[i * n + j] -= datat_mean; } } + + snrt_fpu_fence(); + snrt_cluster_hw_barrier(); + + // Compute covariance matrix + ata_baseline(inv_n_m1, m, n, data, datat, cov); } void covariance_opt(uint32_t m, uint32_t n, double inv_n, @@ -79,6 +105,7 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, 1); // Configure ft2 to store data and datat elements // for (i1 = offset; i1 < m; i1 += stride * unroll0) // for (j = 0; j < n; j++) @@ -145,8 +172,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, snrt_fpu_fence(); snrt_cluster_hw_barrier(); - // The following is taken from the AtA kernel, apart from the normalization - // by 1/(n - 1). + // The following is taken from the AtA kernel, where alpha is set to + // the factor 1/(n - 1). // Here data stands for A and datat for At. 
// Unrolling factor of innermost loop @@ -175,7 +202,7 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); // SSR start address need to be configured each time - snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n); + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n); snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat); snrt_ssr_enable(); @@ -190,23 +217,21 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, asm volatile( "frep.o %[n_frep], %[unroll1], 0, 0 \n" - "fmadd.d %[b0], ft0, ft1, %[b0] \n" - "fmadd.d %[b1], ft0, ft1, %[b1] \n" - "fmadd.d %[b2], ft0, ft1, %[b2] \n" - "fmadd.d %[b3], ft0, ft1, %[b3] \n" - : [ b0 ] "+f"(acc[0]), [ b1 ] "+f"(acc[1]), - [ b2 ] "+f"(acc[2]), [ b3 ] "+f"(acc[3]) - : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1) + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" + "fmul.d %[b0], %[acc0], %[alpha] \n" + "fmul.d %[b1], %[acc1], %[alpha] \n" + "fmul.d %[b2], %[acc2], %[alpha] \n" + "fmul.d %[b3], %[acc3], %[alpha] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]), + [ b0 ] "=f"(cov[i * m + j + 0]), [ b1 ] "=f"(cov[i * m + j + 1]), + [ b2 ] "=f"(cov[i * m + j + 2]), [ b3 ] "=f"(cov[i * m + j + 3]) + : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1), + [ alpha ] "f"(inv_n_m1) : "ft0", "ft1", "ft2"); - - snrt_ssr_disable(); - - cov[i * m + j + 0] = acc[0] * inv_n_m1; - cov[i * m + j + 1] = acc[1] * inv_n_m1; - cov[i * m + j + 2] = acc[2] * inv_n_m1; - cov[i * m + j + 3] = acc[3] * inv_n_m1; - - snrt_ssr_enable(); } } @@ -245,7 +270,6 @@ void covariance_job(covariance_args_t *args) { b_tile_bytes = b_tile_size * sizeof(double); // Allocate space for job operands in TCDM - // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. 
local_a0_addr = (uint64_t)args + sizeof(covariance_args_t); local_at0_addr = local_a0_addr + a_tile_bytes; local_b0_addr = local_at0_addr + a_tile_bytes; diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk index c177a9d61..005791c79 100644 --- a/target/snitch_cluster/sw/apps/covariance/app.mk +++ b/target/snitch_cluster/sw/apps/covariance/app.mk @@ -8,6 +8,7 @@ APP := covariance $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build SRC_DIR := $(ROOT)/sw/apps/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/apps/ata/src/ include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk From 5e663d7e25cc410083ecb4c0b540328eae1e6f04 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 18:10:00 +0200 Subject: [PATCH 11/19] sw: Replace AtA kernel with syrk --- sw/apps/covariance/src/covariance.h | 6 +- sw/blas/blas.h | 14 ++ sw/blas/gemm/src/gemm.h | 13 -- sw/blas/gemm/src/main.c | 2 +- sw/{apps/ata => blas/syrk}/.gitignore | 0 sw/{apps/ata => blas/syrk}/data/params.json | 10 +- sw/{apps/ata => blas/syrk}/scripts/datagen.py | 46 +++-- sw/{apps/ata => blas/syrk}/scripts/verify.py | 20 +- sw/{apps/ata => blas/syrk}/src/args.h | 13 +- sw/{apps/ata => blas/syrk}/src/main.c | 4 +- .../ata/src/ata.h => blas/syrk/src/syrk.h} | 186 +++++++++--------- target/snitch_cluster/sw.mk | 2 +- .../snitch_cluster/sw/apps/blas/gemm/app.mk | 1 + .../sw/apps/{ata => blas/syrk}/app.mk | 7 +- .../snitch_cluster/sw/apps/covariance/app.mk | 2 +- 15 files changed, 180 insertions(+), 146 deletions(-) rename sw/{apps/ata => blas/syrk}/.gitignore (100%) rename sw/{apps/ata => blas/syrk}/data/params.json (63%) rename sw/{apps/ata => blas/syrk}/scripts/datagen.py (56%) rename sw/{apps/ata => blas/syrk}/scripts/verify.py (70%) rename sw/{apps/ata => blas/syrk}/src/args.h (66%) rename sw/{apps/ata => blas/syrk}/src/main.c (88%) rename sw/{apps/ata/src/ata.h => 
blas/syrk/src/syrk.h} (69%) rename target/snitch_cluster/sw/apps/{ata => blas/syrk}/app.mk (65%) diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h index 29c6aec69..53944e6ca 100644 --- a/sw/apps/covariance/src/covariance.h +++ b/sw/apps/covariance/src/covariance.h @@ -6,8 +6,8 @@ // Luca Colagrande #include "args.h" +#include "blas.h" #include "snrt.h" -#include "ata.h" #define DOUBLE_BUFFER 1 @@ -41,7 +41,7 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n, snrt_cluster_hw_barrier(); // Compute covariance matrix - ata_naive(inv_n_m1, m, n, data, datat, cov); + syrk_naive(m, n, inv_n_m1, data, datat, 0, cov); } void covariance_baseline(uint32_t m, uint32_t n, double inv_n, @@ -74,7 +74,7 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n, snrt_cluster_hw_barrier(); // Compute covariance matrix - ata_baseline(inv_n_m1, m, n, data, datat, cov); + syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov); } void covariance_opt(uint32_t m, uint32_t n, double inv_n, diff --git a/sw/blas/blas.h b/sw/blas/blas.h index 33c29e175..69005ccb7 100644 --- a/sw/blas/blas.h +++ b/sw/blas/blas.h @@ -4,6 +4,20 @@ #pragma once +// Floating-point multiplications by zero cannot be optimized as in some +// edge cases they do not yield zero: +// - 0f * NaN = NaN +// - 0f * INFINITY == NaN +// Thus in order to optimize it, we need to test for zero. You can use this +// function for free when `multiplier` is a constant. 
+static inline double multiply_opt(double multiplicand, double multiplier) { + if (multiplier) + return multiplicand * multiplier; + else + return 0; +} + #include "axpy/src/axpy.h" #include "dot/src/dot.h" #include "gemm/src/gemm.h" +#include "syrk/src/syrk.h" diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index a480379a9..1a73aedf8 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -13,19 +13,6 @@ #pragma once -// Floating-point multiplications by zero cannot be optimized as in some -// edge cases they do not yield zero: -// - 0f * NaN = NaN -// - 0f * INFINITY == NaN -// Thus in order to optimize it, we need to test for zero. You can use this -// function for free when `multiplier` is a constant. -static inline double multiply_opt(double multiplicand, double multiplier) { - if (multiplier) - return multiplicand * multiplier; - else - return 0; -} - #include "gemm_fp16.h" #include "gemm_fp32.h" #include "gemm_fp64.h" diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c index 17f3936b0..9760000c6 100644 --- a/sw/blas/gemm/src/main.c +++ b/sw/blas/gemm/src/main.c @@ -9,7 +9,7 @@ #include #include -#include "gemm.h" +#include "blas.h" #include "data.h" #include "snrt.h" diff --git a/sw/apps/ata/.gitignore b/sw/blas/syrk/.gitignore similarity index 100% rename from sw/apps/ata/.gitignore rename to sw/blas/syrk/.gitignore diff --git a/sw/apps/ata/data/params.json b/sw/blas/syrk/data/params.json similarity index 63% rename from sw/apps/ata/data/params.json rename to sw/blas/syrk/data/params.json index 1db35db08..492d8e0cc 100644 --- a/sw/apps/ata/data/params.json +++ b/sw/blas/syrk/data/params.json @@ -3,8 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 { - "m": 16, - "n": 4, - "m_tiles": 2, - "funcptr": "ata_opt" + "m": 8, + "n": 2, + "alpha": 1.5, + "beta": 3.2, + "m_tiles": 1, + "funcptr": "syrk_opt" } diff --git a/sw/apps/ata/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py similarity index 56% rename from 
sw/apps/ata/scripts/datagen.py rename to sw/blas/syrk/scripts/datagen.py index f6474f2e6..05cd2f038 100755 --- a/sw/apps/ata/scripts/datagen.py +++ b/sw/blas/syrk/scripts/datagen.py @@ -14,25 +14,27 @@ DOUBLE_BUFFER = True -class AtaDataGen(DataGen): +class SyrkDataGen(DataGen): # Function pointers to alternative implementations - FUNCPTRS = ["ata_naive", "ata_baseline", "ata_opt"] + FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"] - def golden_model(self, alpha, A): - return alpha * np.matmul(A, A.transpose()) + def golden_model(self, alpha, A, beta, C): + return alpha * np.matmul(A, A.transpose()) + beta * C def validate(self, **kwargs): + n_cores = 8 assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" m_frac = kwargs['m'] / kwargs['m_tiles'] - assert (m_frac % 8) == 0, "m_frac must be an integer multiple of the number of cores" - assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" + assert (m_frac % n_cores) == 0, "m_frac must be an integer multiple of the number of cores" + if kwargs['funcptr'] != "syrk_naive": + assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" # Calculate total TCDM occupation a_tile_size = m_frac * kwargs['n'] * 8 - b_tile_size = m_frac * m_frac * 8 - total_size = 2 * a_tile_size + b_tile_size + c_tile_size = m_frac * m_frac * 8 + total_size = 2 * a_tile_size + c_tile_size if DOUBLE_BUFFER: total_size *= 2 data_utils.validate_tcdm_footprint(total_size) @@ -42,33 +44,43 @@ def emit_header(self, **kwargs): self.validate(**kwargs) + if 'alpha' in kwargs: + alpha = kwargs['alpha'] + else: + alpha = np.random.randint(-200, 100)/100 + if 'beta' in kwargs: + beta = kwargs['beta'] + else: + beta = np.random.randint(-200, 100)/100 + A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 - alpha = np.random.randint(-200, 100)/100 - B 
= self.golden_model(alpha, A) + C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100 + C_out = self.golden_model(alpha, A, beta, C_in) A = A.flatten() - B = B.flatten() + C_in = C_in.flatten() A_uid = 'A' - B_uid = 'B' + C_uid = 'C' cfg = { - 'alpha': alpha, 'm': kwargs['m'], 'n': kwargs['n'], + 'alpha': alpha, + 'beta': beta, 'a': A_uid, - 'b': B_uid, + 'c': C_uid, 'm_tiles': kwargs['m_tiles'], 'funcptr': kwargs['funcptr'] } header += [format_array_definition('double', A_uid, A)] - header += [format_array_declaration('double', B_uid, B.shape)] - header += [format_struct_definition('ata_args_t', 'args', cfg)] + header += [format_array_definition('double', C_uid, C_in)] + header += [format_struct_definition('syrk_args_t', 'args', cfg)] header = '\n\n'.join(header) return header if __name__ == '__main__': - AtaDataGen().main() + SyrkDataGen().main() diff --git a/sw/apps/ata/scripts/verify.py b/sw/blas/syrk/scripts/verify.py similarity index 70% rename from sw/apps/ata/scripts/verify.py rename to sw/blas/syrk/scripts/verify.py index 206af870a..0624156cb 100755 --- a/sw/apps/ata/scripts/verify.py +++ b/sw/blas/syrk/scripts/verify.py @@ -7,23 +7,24 @@ import numpy as np import sys -from datagen import AtaDataGen +from datagen import SyrkDataGen from snitch.util.sim.verif_utils import Verifier -class AtaVerifier(Verifier): +class SyrkVerifier(Verifier): - OUTPUT_UIDS = ['B'] + OUTPUT_UIDS = ['C'] def __init__(self): super().__init__() self.func_args = { - 'alpha': 'd', 'm': 'I', 'n': 'I', + 'alpha': 'd', + 'beta': 'd', 'A': 'I', - 'B': 'I', + 'C': 'I', 'm_tiles': 'I', 'funcptr': 'I' } @@ -34,12 +35,17 @@ def get_actual_results(self): def get_expected_results(self): A = self.get_input_from_symbol('A', 'double') + C = self.get_input_from_symbol('C', 'double') A = np.reshape(A, (self.func_args['m'], self.func_args['n'])) - return AtaDataGen().golden_model(self.func_args['alpha'], A).flatten() + C = np.reshape(C, (self.func_args['m'], 
self.func_args['m'])) + return SyrkDataGen().golden_model( + self.func_args['alpha'], A, + self.func_args['beta'], C + ).flatten() def check_results(self, *args): return super().check_results(*args, rtol=1e-10) if __name__ == "__main__": - sys.exit(AtaVerifier().main()) + sys.exit(SyrkVerifier().main()) diff --git a/sw/apps/ata/src/args.h b/sw/blas/syrk/src/args.h similarity index 66% rename from sw/apps/ata/src/args.h rename to sw/blas/syrk/src/args.h index f65a6a13f..6bb58e00e 100644 --- a/sw/apps/ata/src/args.h +++ b/sw/blas/syrk/src/args.h @@ -7,15 +7,16 @@ #pragma once #include -typedef void (*ata_fp_t)(double alpha, uint32_t m, uint32_t n, double *a, - double *at, double *b); +typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a, + double *at, double beta, double *b); typedef struct { - double alpha; uint32_t m; uint32_t n; + double alpha; + double beta; double *a; - double *b; + double *c; uint32_t m_tiles; - ata_fp_t funcptr; -} ata_args_t; + syrk_fp_t funcptr; +} syrk_args_t; diff --git a/sw/apps/ata/src/main.c b/sw/blas/syrk/src/main.c similarity index 88% rename from sw/apps/ata/src/main.c rename to sw/blas/syrk/src/main.c index c8df4bea9..9f1ad7163 100644 --- a/sw/apps/ata/src/main.c +++ b/sw/blas/syrk/src/main.c @@ -6,12 +6,12 @@ #include "snrt.h" -#include "ata.h" +#include "blas.h" #include "data.h" int main() { - ata_job(&args); + syrk_job(&args); return 0; } diff --git a/sw/apps/ata/src/ata.h b/sw/blas/syrk/src/syrk.h similarity index 69% rename from sw/apps/ata/src/ata.h rename to sw/blas/syrk/src/syrk.h index 8673353a4..9494f2777 100644 --- a/sw/apps/ata/src/ata.h +++ b/sw/blas/syrk/src/syrk.h @@ -7,26 +7,27 @@ #include "args.h" #include "snrt.h" -#define DOUBLE_BUFFER 1 - __thread int setup_ssr = 1; -void ata_naive(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { +void syrk_naive(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { uint32_t offset = 
snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j++) { - b[i * m + j] = 0; + double acc = 0; for (uint32_t k = 0; k < n; k++) { - b[i * m + j] += a[i * n + k] * at[j * n + k]; + acc += a[i * n + k] * at[j * n + k]; } - b[i * m + j] *= alpha; + c[i * m + j] = multiply_opt(c[i * m + j], beta); + c[i * m + j] += alpha * acc; } } } -void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { +void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -89,15 +90,20 @@ void ata_baseline(double alpha, uint32_t m, uint32_t n, double *a, double *at, d ); } - b[i * m + j + 0] = alpha * acc[0]; - b[i * m + j + 1] = alpha * acc[1]; - b[i * m + j + 2] = alpha * acc[2]; - b[i * m + j + 3] = alpha * acc[3]; + c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta); + c[i * m + j + 1] = multiply_opt(c[i * m + j + 1], beta); + c[i * m + j + 2] = multiply_opt(c[i * m + j + 2], beta); + c[i * m + j + 3] = multiply_opt(c[i * m + j + 3], beta); + c[i * m + j + 0] += alpha * acc[0]; + c[i * m + j + 1] += alpha * acc[1]; + c[i * m + j + 2] += alpha * acc[2]; + c[i * m + j + 3] += alpha * acc[3]; } } } -void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double *b) { +void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -148,16 +154,20 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" - "fmul.d %[b0], %[acc0], %[alpha] \n" - "fmul.d %[b1], %[acc1], %[alpha] \n" - "fmul.d %[b2], %[acc2], %[alpha] 
\n" - "fmul.d %[b3], %[acc3], %[alpha] \n" - : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), - [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]), - [ b0 ] "=f"(b[i * m + j + 0]), [ b1 ] "=f"(b[i * m + j + 1]), - [ b2 ] "=f"(b[i * m + j + 2]), [ b3 ] "=f"(b[i * m + j + 3]) + "fmul.d %[acc0], %[acc0], %[alpha] \n" + "fmul.d %[acc1], %[acc1], %[alpha] \n" + "fmul.d %[acc2], %[acc2], %[alpha] \n" + "fmul.d %[acc3], %[acc3], %[alpha] \n" + "fmadd.d %[c0], %[c0], %[beta], %[acc0] \n" + "fmadd.d %[c1], %[c1], %[beta], %[acc1] \n" + "fmadd.d %[c2], %[c2], %[beta], %[acc2] \n" + "fmadd.d %[c3], %[c3], %[beta], %[acc3] \n" + : [ c0 ] "+f"(c[i * m + j + 0]), [ c1 ] "+f"(c[i * m + j + 1]), + [ c2 ] "+f"(c[i * m + j + 2]), [ c3 ] "+f"(c[i * m + j + 3]), + [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll), - [ alpha ] "f"(alpha) + [ alpha ] "f"(alpha), [ beta ] "f"(beta) : "ft0", "ft1", "ft2"); } } @@ -166,23 +176,23 @@ void ata_opt(double alpha, uint32_t m, uint32_t n, double *a, double *at, double snrt_fpu_fence(); } -void ata_job(ata_args_t *args) { - uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; - uint64_t local_a0_addr, local_at0_addr, local_b0_addr, - local_a1_addr, local_at1_addr, local_b1_addr; +void syrk_job(syrk_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_c0_addr, + local_a1_addr, local_at1_addr, local_c1_addr; double *local_a[2]; double *local_at[2]; - double *local_b[2]; - uint32_t iterations, sb_iterations; + double *local_c[2]; + uint32_t n_tiles, iterations; uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; #ifndef JOB_ARGS_PRELOADED // Allocate space for job arguments in TCDM - ata_args_t *local_args = (ata_args_t *)snrt_l1_next(); + syrk_args_t *local_args = (syrk_args_t *)snrt_l1_next(); // Copy job arguments to TCDM if (snrt_is_dm_core()) { - 
snrt_dma_start_1d(local_args, args, sizeof(ata_args_t)); + snrt_dma_start_1d(local_args, args, sizeof(syrk_args_t)); snrt_dma_wait_all(); } snrt_cluster_hw_barrier(); @@ -192,43 +202,66 @@ void ata_job(ata_args_t *args) { // Calculate size of each tile m_frac = args->m / args->m_tiles; a_tile_size = args->n * m_frac; - b_tile_size = m_frac * m_frac; + c_tile_size = m_frac * m_frac; a_tile_bytes = a_tile_size * sizeof(double); - b_tile_bytes = b_tile_size * sizeof(double); + c_tile_bytes = c_tile_size * sizeof(double); // Allocate space for job operands in TCDM // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th. - local_a0_addr = (uint64_t)args + sizeof(ata_args_t); + local_a0_addr = (uint64_t)args + sizeof(syrk_args_t); local_at0_addr = local_a0_addr + a_tile_bytes; - local_b0_addr = local_at0_addr + a_tile_bytes; + local_c0_addr = local_at0_addr + a_tile_bytes; local_a[0] = (double *)local_a0_addr; local_at[0] = (double *)local_at0_addr; - local_b[0] = (double *)local_b0_addr; - if (DOUBLE_BUFFER) { - local_a1_addr = local_b0_addr + b_tile_bytes; - local_at1_addr = local_a1_addr + a_tile_bytes; - local_b1_addr = local_at1_addr + a_tile_bytes; - local_a[1] = (double *)local_a1_addr; - local_at[1] = (double *)local_at1_addr; - local_b[1] = (double *)local_b1_addr; - } + local_c[0] = (double *)local_c0_addr; + local_a1_addr = local_c0_addr + c_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_c1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_c[1] = (double *)local_c1_addr; // Calculate number of iterations - sb_iterations = args->m_tiles * args->m_tiles; - if (DOUBLE_BUFFER) iterations = sb_iterations + 2; - else iterations = sb_iterations; + n_tiles = args->m_tiles * args->m_tiles; + iterations = n_tiles + 2; // Iterate over all tiles for (i = 0; i < iterations; i++) { if (snrt_is_dm_core()) { + // DMA out + // (out before in to avoid overwriting 
data) + if (i > 1) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = i - 2; + buff_idx = i_dma_out % 2; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile( + args->c, + local_c[buff_idx], + i_row, + i_col, + m_frac, + m_frac, + args->m, + sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + // DMA in - if (!DOUBLE_BUFFER || (i < sb_iterations)) { + if (i < n_tiles) { snrt_mcycle(); // Compute tile and buffer indices i_dma_in = i; - buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + buff_idx = i_dma_in % 2; i_row = i_dma_in / args->m_tiles; i_col = i_dma_in % args->m_tiles; @@ -245,35 +278,17 @@ void ata_job(ata_args_t *args) { i_col, a_tile_size, sizeof(double)); - snrt_dma_wait_all(); - - snrt_mcycle(); - } - - // Additional barriers required when not double buffering - if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); - if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); - - // DMA out - if (!DOUBLE_BUFFER || (i > 1)) { - snrt_mcycle(); - - // Compute tile and buffer indices - i_dma_out = DOUBLE_BUFFER ? i - 2 : i; - buff_idx = DOUBLE_BUFFER ? 
i_dma_out % 2 : 0; - i_row = i_dma_out / args->m_tiles; - i_col = i_dma_out % args->m_tiles; - - // Copy job outputs from TCDM - snrt_dma_store_2d_tile( - args->b, - local_b[buff_idx], - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + if (args->funcptr == syrk_opt || args->beta != 0) { + snrt_dma_load_2d_tile( + local_c[buff_idx], + args->c, + i_row, + i_col, + m_frac, + m_frac, + args->m, + sizeof(double)); + } snrt_dma_wait_all(); snrt_mcycle(); @@ -282,27 +297,22 @@ void ata_job(ata_args_t *args) { // Compute if (snrt_is_compute_core()) { - // Additional barrier required when not double buffering - if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); - - if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + if (i > 0 && i < (n_tiles + 1)) { snrt_mcycle(); // Compute tile and buffer indices - i_compute = DOUBLE_BUFFER ? i - 1 : i; - buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0; + i_compute = i - 1; + buff_idx = i_compute % 2; // Perform tile computation - ata_fp_t fp = args->funcptr; - fp(args->alpha, m_frac, args->n, local_a[buff_idx], - local_at[buff_idx], local_b[buff_idx]); + syrk_fp_t fp = args->funcptr; + fp(m_frac, args->n, args->alpha, local_a[buff_idx], + local_at[buff_idx], args->beta, local_c[buff_idx]); snrt_mcycle(); } - - // Additional barrier required when not double buffering - if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); } + // Synchronize cores after every iteration snrt_cluster_hw_barrier(); } diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index 0a1e4c00c..674ea2cad 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -51,6 +51,7 @@ APPS = sw/apps/nop APPS += sw/apps/blas/axpy APPS += sw/apps/blas/gemm APPS += sw/apps/blas/dot +APPS += sw/apps/blas/syrk APPS += sw/apps/dnn/batchnorm APPS += sw/apps/dnn/conv2d APPS += sw/apps/dnn/fusedconv @@ -63,7 +64,6 @@ APPS += sw/apps/dnn/concat APPS += sw/apps/dnn/fused_concat_linear APPS += sw/apps/dnn/transpose APPS += 
sw/apps/montecarlo/pi_estimation -APPS += sw/apps/ata APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance diff --git a/target/snitch_cluster/sw/apps/blas/gemm/app.mk b/target/snitch_cluster/sw/apps/blas/gemm/app.mk index 5d2b54068..f50f6d21c 100644 --- a/target/snitch_cluster/sw/apps/blas/gemm/app.mk +++ b/target/snitch_cluster/sw/apps/blas/gemm/app.mk @@ -8,6 +8,7 @@ APP := gemm $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build SRC_DIR := $(ROOT)/sw/blas/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/ata/app.mk b/target/snitch_cluster/sw/apps/blas/syrk/app.mk similarity index 65% rename from target/snitch_cluster/sw/apps/ata/app.mk rename to target/snitch_cluster/sw/apps/blas/syrk/app.mk index af63400b4..c0fd05044 100644 --- a/target/snitch_cluster/sw/apps/ata/app.mk +++ b/target/snitch_cluster/sw/apps/blas/syrk/app.mk @@ -4,10 +4,11 @@ # # Luca Colagrande -APP := ata -$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build -SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +APP := syrk +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build +SRC_DIR := $(ROOT)/sw/blas/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk index 005791c79..e985e671e 100644 --- a/target/snitch_cluster/sw/apps/covariance/app.mk +++ b/target/snitch_cluster/sw/apps/covariance/app.mk @@ -8,7 +8,7 @@ APP := covariance $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build SRC_DIR := $(ROOT)/sw/apps/$(APP)/src SRCS := $(SRC_DIR)/main.c -$(APP)_INCDIRS := $(ROOT)/sw/apps/ata/src/ +$(APP)_INCDIRS := $(ROOT)/sw/blas/ include 
$(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk From 33f49db515ba6be55e5e4851808350f679ebe309 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 18:13:21 +0200 Subject: [PATCH 12/19] ci: Add covariance and syrk --- target/snitch_cluster/sw/fdiv.yaml | 2 -- target/snitch_cluster/sw/run.yaml | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml index a8b5f3930..d6b7aea3b 100644 --- a/target/snitch_cluster/sw/fdiv.yaml +++ b/target/snitch_cluster/sw/fdiv.yaml @@ -13,5 +13,3 @@ runs: cmd: [../../../sw/dnn/flashattention_2/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/correlation/build/correlation.elf cmd: [../../../sw/apps/correlation/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/covariance/build/covariance.elf - cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 7a5a55a4c..ab302f7c3 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -80,6 +80,8 @@ runs: cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/blas/dot/build/dot.elf cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/blas/syrk/build/syrk.elf + cmd: [../../../sw/blas/syrk/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/dnn/batchnorm/build/batchnorm.elf - elf: apps/dnn/maxpool/build/maxpool.elf # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results @@ -95,3 +97,5 @@ runs: - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf # - elf: apps/atax/build/atax.elf # cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/covariance/build/covariance.elf + cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] From 55848dc4d25c4922fa10c785b02a877eade1a0db Mon Sep 17 
00:00:00 2001 From: Luca Colagrande Date: Mon, 19 Aug 2024 18:27:40 +0200 Subject: [PATCH 13/19] ci: Correct linting --- sw/apps/covariance/src/args.h | 3 +- sw/apps/covariance/src/covariance.h | 118 ++++++++++++---------------- sw/apps/covariance/src/main.c | 1 - sw/blas/axpy/src/args.h | 9 ++- sw/blas/axpy/src/axpy.h | 40 +++++----- sw/blas/axpy/src/main.c | 1 - sw/blas/syrk/scripts/datagen.py | 5 +- sw/blas/syrk/src/args.h | 2 +- sw/blas/syrk/src/main.c | 1 - sw/blas/syrk/src/syrk.h | 95 +++++++++------------- 10 files changed, 114 insertions(+), 161 deletions(-) diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h index f88768dd5..cd15bc852 100644 --- a/sw/apps/covariance/src/args.h +++ b/sw/apps/covariance/src/args.h @@ -8,7 +8,8 @@ #include typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat,double *cov); + double inv_n_m1, double *data, double *datat, + double *cov); typedef struct { uint32_t m; diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h index 53944e6ca..cdeb427bf 100644 --- a/sw/apps/covariance/src/covariance.h +++ b/sw/apps/covariance/src/covariance.h @@ -11,15 +11,13 @@ #define DOUBLE_BUFFER 1 -void covariance_naive(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); // Center data for (uint32_t i = offset; i < m; i += stride) { - // Calculate row mean double data_mean = 0.0; double datat_mean = 0.0; @@ -44,15 +42,13 @@ void covariance_naive(uint32_t m, uint32_t n, double inv_n, syrk_naive(m, n, inv_n_m1, data, datat, 0, cov); } -void covariance_baseline(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void 
covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); // Center data for (uint32_t i = offset; i < m; i += stride) { - // Calculate row mean double data_mean = 0.0; double datat_mean = 0.0; @@ -77,9 +73,8 @@ void covariance_baseline(uint32_t m, uint32_t n, double inv_n, syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov); } -void covariance_opt(uint32_t m, uint32_t n, double inv_n, - double inv_n_m1, double *data, double *datat, - double *cov) { +void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1, + double *data, double *datat, double *cov) { uint32_t offset = snrt_cluster_core_idx(); uint32_t stride = snrt_cluster_compute_core_num(); @@ -97,14 +92,14 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // ft0.push(data[i * n + j]) // ft1.push(datat[i * n + j]) const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)}; - const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), - 0, sizeof(double) * n * stride * unroll0}; - snrt_ssr_loop_4d(SNRT_SSR_DM0, - ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], - ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); - snrt_ssr_loop_4d(SNRT_SSR_DM1, - ssr01_b[0], ssr01_b[1], ssr01_b[2], ssr01_b[3], - ssr01_i[0], ssr01_i[1], ssr01_i[2], ssr01_i[3]); + const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0, + sizeof(double) * n * stride * unroll0}; + snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2], + ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2], + ssr01_i[3]); + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2], + ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2], + ssr01_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, 1); // Configure ft2 to store data and datat elements // for (i1 = offset; i1 < m; i1 += stride * unroll0) @@ -115,11 +110,9 @@ void covariance_opt(uint32_t m, 
uint32_t n, double inv_n, // datat[i * n + j] = ft2.pop() const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)}; const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data, - sizeof(double) * n * stride, - sizeof(double), + sizeof(double) * n * stride, sizeof(double), sizeof(double) * n * stride * unroll0}; - snrt_ssr_loop_4d(SNRT_SSR_DM2, - ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3], + snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3], ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]); // SSR start address need to be configured each time @@ -130,21 +123,20 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // Center data for (uint32_t i = offset; i < m; i += stride * unroll0) { - // Calculate row means double m[2 * unroll0]; - m[0] = 0.0; // mean(data[i]) - m[1] = 0.0; // mean(datat[i]) - m[2] = 0.0; // mean(data[i + stride]) - m[3] = 0.0; // mean(datat[i + stride]) + m[0] = 0.0; // mean(data[i]) + m[1] = 0.0; // mean(datat[i]) + m[2] = 0.0; // mean(data[i + stride]) + m[3] = 0.0; // mean(datat[i + stride]) asm volatile( "frep.o %[n_frep], %[n_insn], 0, 0 \n" "fadd.d %[m0], ft0, %[m0] \n" "fadd.d %[m1], ft1, %[m1] \n" "fadd.d %[m2], ft0, %[m2] \n" "fadd.d %[m3], ft1, %[m3] \n" - : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), - [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]), + [ m3 ] "+f"(m[3]) : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) : "ft0", "ft1", "ft2"); m[0] *= inv_n; @@ -161,8 +153,8 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, "fsub.d ft2, ft1, %[m1] \n" "fsub.d ft2, ft0, %[m2] \n" "fsub.d ft2, ft1, %[m3] \n" - : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), - [ m2 ] "+f"(m[2]), [ m3 ] "+f"(m[3]) + : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]), + [ m3 ] "+f"(m[3]) : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0) : "ft0", "ft1", "ft2"); } @@ -190,16 +182,16 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, // ft0.push(a[i * n + 
k]) // ft1.push(at[j * n + k]) const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride}; - const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; - snrt_ssr_loop_3d(SNRT_SSR_DM0, - ssr0_b[1], ssr0_b[2], ssr0_b[3], - ssr0_i[1], ssr0_i[2], ssr0_i[3]); + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], + ssr0_i[2], ssr0_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, unroll1); const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride}; - const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll1 * n * sizeof(double), 0}; - snrt_ssr_loop_4d(SNRT_SSR_DM1, - ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], - ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll1 * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3], + ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); // SSR start address need to be configured each time snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n); @@ -208,7 +200,6 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll1) { - double acc[unroll1]; acc[0] = 0; acc[1] = 0; @@ -227,8 +218,10 @@ void covariance_opt(uint32_t m, uint32_t n, double inv_n, "fmul.d %[b3], %[acc3], %[alpha] \n" : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]), - [ b0 ] "=f"(cov[i * m + j + 0]), [ b1 ] "=f"(cov[i * m + j + 1]), - [ b2 ] "=f"(cov[i * m + j + 2]), [ b3 ] "=f"(cov[i * m + j + 3]) + [ b0 ] "=f"(cov[i * m + j + 0]), + [ b1 ] "=f"(cov[i * m + j + 1]), + [ b2 ] "=f"(cov[i * m + j + 2]), + [ b3 ] "=f"(cov[i * m + j + 3]) : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1), [ alpha ] "f"(inv_n_m1) : "ft0", "ft1", "ft2"); @@ -241,8 +234,8 @@ void covariance_opt(uint32_t m, 
uint32_t n, double inv_n, void covariance_job(covariance_args_t *args) { uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes; - uint64_t local_a0_addr, local_at0_addr, local_b0_addr, - local_a1_addr, local_at1_addr, local_b1_addr; + uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr, + local_at1_addr, local_b1_addr; double *local_a[2]; double *local_at[2]; double *local_b[2]; @@ -287,12 +280,13 @@ void covariance_job(covariance_args_t *args) { // Calculate number of iterations sb_iterations = args->m_tiles * args->m_tiles; - if (DOUBLE_BUFFER) iterations = sb_iterations + 2; - else iterations = sb_iterations; + if (DOUBLE_BUFFER) + iterations = sb_iterations + 2; + else + iterations = sb_iterations; // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA in if (!DOUBLE_BUFFER || (i < sb_iterations)) { @@ -305,18 +299,10 @@ void covariance_job(covariance_args_t *args) { i_col = i_dma_in % args->m_tiles; // Copy job operands in TCDM - snrt_dma_load_1d_tile( - local_a[buff_idx], - args->data, - i_row, - a_tile_size, - sizeof(double)); - snrt_dma_load_1d_tile( - local_at[buff_idx], - args->data, - i_col, - a_tile_size, - sizeof(double)); + snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col, + a_tile_size, sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); @@ -343,15 +329,9 @@ void covariance_job(covariance_args_t *args) { i_col = i_dma_out % args->m_tiles; // Copy job outputs from TCDM - snrt_dma_store_2d_tile( - args->cov, - local_b[buff_idx], - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c index 3c9d225a8..112ead333 100644 --- 
a/sw/apps/covariance/src/main.c +++ b/sw/apps/covariance/src/main.c @@ -10,7 +10,6 @@ #include "data.h" int main() { - covariance_job(&args); return 0; diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h index 0efe3a2b4..c5d542852 100644 --- a/sw/blas/axpy/src/args.h +++ b/sw/blas/axpy/src/args.h @@ -5,14 +5,15 @@ #pragma once #include -typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, double* z); +typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y, + double* z); typedef struct { uint32_t n; double a; - double *x; - double *y; - double *z; + double* x; + double* y; + double* z; uint32_t n_tiles; axpy_fp_t funcptr; } axpy_args_t; diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h index c5df546ab..8ded48167 100644 --- a/sw/blas/axpy/src/axpy.h +++ b/sw/blas/axpy/src/axpy.h @@ -11,7 +11,8 @@ #define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT) #define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT) -static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_naive(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; @@ -22,28 +23,27 @@ static inline void axpy_naive(uint32_t n, double a, double* x, double* y, double snrt_fpu_fence(); } -static inline void axpy_fma(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_fma(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) { - asm volatile ( - "fmadd.d %[z], %[a], %[x], %[y] \n" - : [ z ]"=f"(z[i]) - : [ a ]"f"(a), [ x ]"f"(x[i]), [ y ]"f"(y[i]) - ); + asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n" + : [ z ] "=f"(z[i]) + : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i])); } 
snrt_fpu_fence(); } -static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* z) { +static inline void axpy_opt(uint32_t n, double a, double *x, double *y, + double *z) { int core_idx = snrt_cluster_core_idx(); int frac = n / snrt_cluster_compute_core_num(); int offset = core_idx; - snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, - frac, + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, snrt_cluster_compute_core_num() * sizeof(double)); snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset); @@ -57,24 +57,22 @@ static inline void axpy_opt(uint32_t n, double a, double* x, double* y, double* "fmadd.d ft2, %[a], ft0, ft1\n" : : [ n_frep ] "r"(frac - 1), [ a ] "f"(a) - : "ft0", "ft1", "ft2", "memory" - ); - + : "ft0", "ft1", "ft2", "memory"); + snrt_fpu_fence(); snrt_ssr_disable(); } static inline void axpy_job(axpy_args_t *args) { uint32_t frac, offset, size; - uint64_t local_x0_addr, local_y0_addr, local_z0_addr, - local_x1_addr, local_y1_addr, local_z1_addr; + uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr, + local_y1_addr, local_z1_addr; double *local_x[2]; double *local_y[2]; double *local_z[2]; double *remote_x, *remote_y, *remote_z; uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx; - #ifndef JOB_ARGS_PRELOADED // Allocate space for job arguments in TCDM axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next(); @@ -102,8 +100,10 @@ static inline void axpy_job(axpy_args_t *args) { local_z[0] = (double *)local_z0_addr; if (DOUBLE_BUFFER) { local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size); - local_y1_addr = ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT; - local_z1_addr = ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT; + local_y1_addr = + ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT; + local_z1_addr = + ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT; local_x[1] = (double *)local_x1_addr; local_y[1] = (double *)local_y1_addr; local_z[1] = (double *)local_z1_addr; @@ -115,7 +115,6 @@ 
static inline void axpy_job(axpy_args_t *args) { // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA in if (!DOUBLE_BUFFER || (i < args->n_tiles)) { @@ -176,7 +175,8 @@ static inline void axpy_job(axpy_args_t *args) { // Perform tile computation axpy_fp_t fp = args->funcptr; - fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], local_z[buff_idx]); + fp(frac, args->a, local_x[buff_idx], local_y[buff_idx], + local_z[buff_idx]); snrt_mcycle(); } diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c index 83cb58ae8..e0389d25d 100644 --- a/sw/blas/axpy/src/main.c +++ b/sw/blas/axpy/src/main.c @@ -8,7 +8,6 @@ #include "data.h" int main() { - axpy_job(&args); // TODO: currently only works for single cluster otherwise need to diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py index 05cd2f038..9b4959fca 100755 --- a/sw/blas/syrk/scripts/datagen.py +++ b/sw/blas/syrk/scripts/datagen.py @@ -8,12 +8,12 @@ import numpy as np from snitch.util.sim import data_utils -from snitch.util.sim.data_utils import format_array_definition, format_array_declaration, \ - format_struct_definition, DataGen +from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen DOUBLE_BUFFER = True + class SyrkDataGen(DataGen): # Function pointers to alternative implementations @@ -55,7 +55,6 @@ def emit_header(self, **kwargs): A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100 - C_out = self.golden_model(alpha, A, beta, C_in) A = A.flatten() C_in = C_in.flatten() diff --git a/sw/blas/syrk/src/args.h b/sw/blas/syrk/src/args.h index 6bb58e00e..24342d3e3 100644 --- a/sw/blas/syrk/src/args.h +++ b/sw/blas/syrk/src/args.h @@ -8,7 +8,7 @@ #include typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a, - double *at, double beta, double *b); + double *at, double beta, double *b); 
typedef struct { uint32_t m; diff --git a/sw/blas/syrk/src/main.c b/sw/blas/syrk/src/main.c index 9f1ad7163..f8c09ae4f 100644 --- a/sw/blas/syrk/src/main.c +++ b/sw/blas/syrk/src/main.c @@ -10,7 +10,6 @@ #include "data.h" int main() { - syrk_job(&args); return 0; diff --git a/sw/blas/syrk/src/syrk.h b/sw/blas/syrk/src/syrk.h index 9494f2777..718ad7fe9 100644 --- a/sw/blas/syrk/src/syrk.h +++ b/sw/blas/syrk/src/syrk.h @@ -39,7 +39,6 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll1) { - double acc[4]; acc[0] = 0; acc[1] = 0; @@ -66,28 +65,26 @@ void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n" : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) - : [ a0 ] "f"(a[i * n + k + 0]), - [ a1 ] "f"(a[i * n + k + 1]), - [ a2 ] "f"(a[i * n + k + 2]), - [ a3 ] "f"(a[i * n + k + 3]), - [ at0 ] "f"(at[(j + 0) * n + k]), - [ at1 ] "f"(at[(j + 1) * n + k]), - [ at2 ] "f"(at[(j + 2) * n + k]), - [ at3 ] "f"(at[(j + 3) * n + k]), - [ at4 ] "f"(at[(j + 0) * n + k + 1]), - [ at5 ] "f"(at[(j + 1) * n + k + 1]), - [ at6 ] "f"(at[(j + 2) * n + k + 1]), - [ at7 ] "f"(at[(j + 3) * n + k + 1]), - [ at8 ] "f"(at[(j + 0) * n + k + 2]), - [ at9 ] "f"(at[(j + 1) * n + k + 2]), - [ at10 ] "f"(at[(j + 2) * n + k + 2]), - [ at11 ] "f"(at[(j + 3) * n + k + 2]), - [ at12 ] "f"(at[(j + 0) * n + k + 3]), - [ at13 ] "f"(at[(j + 1) * n + k + 3]), - [ at14 ] "f"(at[(j + 2) * n + k + 3]), - [ at15 ] "f"(at[(j + 3) * n + k + 3]) : - ); + [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ at0 ] "f"(at[(j + 0) * n + k]), + [ at1 ] "f"(at[(j + 1) * n + k]), + [ at2 ] "f"(at[(j + 2) * n + k]), + [ at3 ] "f"(at[(j + 3) * n + k]), + [ at4 ] "f"(at[(j + 0) * n + k + 1]), + [ at5 ] "f"(at[(j + 1) * n + k + 
1]), + [ at6 ] "f"(at[(j + 2) * n + k + 1]), + [ at7 ] "f"(at[(j + 3) * n + k + 1]), + [ at8 ] "f"(at[(j + 0) * n + k + 2]), + [ at9 ] "f"(at[(j + 1) * n + k + 2]), + [ at10 ] "f"(at[(j + 2) * n + k + 2]), + [ at11 ] "f"(at[(j + 3) * n + k + 2]), + [ at12 ] "f"(at[(j + 0) * n + k + 3]), + [ at13 ] "f"(at[(j + 1) * n + k + 3]), + [ at14 ] "f"(at[(j + 2) * n + k + 3]), + [ at15 ] "f"(at[(j + 3) * n + k + 3]) + :); } c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta); @@ -122,15 +119,16 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, // ft0.push(a[i * n + k]) // ft1.push(at[j * n + k]) const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; - const uint32_t ssr0_i[4] = {0, sizeof(double), 0, stride * n * sizeof(double)}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1], ssr0_i[2], ssr0_i[3]); snrt_ssr_repeat(SNRT_SSR_DM0, unroll); const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; - const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), unroll * n * sizeof(double), 0}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll * n * sizeof(double), 0}; snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], - ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], - ssr1_i[3]); + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); setup_ssr = 0; } @@ -141,7 +139,6 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, for (uint32_t i = offset; i < m; i += stride) { for (uint32_t j = 0; j < m; j += unroll) { - double acc[unroll]; acc[0] = 0; acc[1] = 0; @@ -178,8 +175,8 @@ void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, void syrk_job(syrk_args_t *args) { uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes; - uint64_t local_a0_addr, local_at0_addr, local_c0_addr, - local_a1_addr, local_at1_addr, local_c1_addr; + 
uint64_t local_a0_addr, local_at0_addr, local_c0_addr, local_a1_addr, + local_at1_addr, local_c1_addr; double *local_a[2]; double *local_at[2]; double *local_c[2]; @@ -227,7 +224,6 @@ void syrk_job(syrk_args_t *args) { // Iterate over all tiles for (i = 0; i < iterations; i++) { - if (snrt_is_dm_core()) { // DMA out // (out before in to avoid overwriting data) @@ -241,15 +237,8 @@ void syrk_job(syrk_args_t *args) { i_col = i_dma_out % args->m_tiles; // Copy job outputs from TCDM - snrt_dma_store_2d_tile( - args->c, - local_c[buff_idx], - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_store_2d_tile(args->c, local_c[buff_idx], i_row, i_col, + m_frac, m_frac, args->m, sizeof(double)); snrt_dma_wait_all(); snrt_mcycle(); @@ -266,28 +255,14 @@ void syrk_job(syrk_args_t *args) { i_col = i_dma_in % args->m_tiles; // Copy job operands in TCDM - snrt_dma_load_1d_tile( - local_a[buff_idx], - args->a, - i_row, - a_tile_size, - sizeof(double)); - snrt_dma_load_1d_tile( - local_at[buff_idx], - args->a, - i_col, - a_tile_size, - sizeof(double)); + snrt_dma_load_1d_tile(local_a[buff_idx], args->a, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->a, i_col, + a_tile_size, sizeof(double)); if (args->funcptr == syrk_opt || args->beta != 0) { - snrt_dma_load_2d_tile( - local_c[buff_idx], - args->c, - i_row, - i_col, - m_frac, - m_frac, - args->m, - sizeof(double)); + snrt_dma_load_2d_tile(local_c[buff_idx], args->c, i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); } snrt_dma_wait_all(); From a8cafa96a6b4c8be52b441be48142711b6967cb4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 20 Aug 2024 15:31:27 +0200 Subject: [PATCH 14/19] sw: Add doitgen kernel --- sw/apps/doitgen/.gitignore | 1 + sw/apps/doitgen/data/params.json | 12 + sw/apps/doitgen/scripts/datagen.py | 90 ++++++ sw/apps/doitgen/scripts/verify.py | 48 +++ sw/apps/doitgen/src/args.h | 22 ++ sw/apps/doitgen/src/doitgen.h | 303 
+++++++++++++++++++ sw/apps/doitgen/src/main.c | 17 ++ target/snitch_cluster/sw.mk | 1 + target/snitch_cluster/sw/apps/doitgen/app.mk | 14 + target/snitch_cluster/sw/run.yaml | 2 + 10 files changed, 510 insertions(+) create mode 100644 sw/apps/doitgen/.gitignore create mode 100644 sw/apps/doitgen/data/params.json create mode 100755 sw/apps/doitgen/scripts/datagen.py create mode 100755 sw/apps/doitgen/scripts/verify.py create mode 100644 sw/apps/doitgen/src/args.h create mode 100644 sw/apps/doitgen/src/doitgen.h create mode 100644 sw/apps/doitgen/src/main.c create mode 100644 target/snitch_cluster/sw/apps/doitgen/app.mk diff --git a/sw/apps/doitgen/.gitignore b/sw/apps/doitgen/.gitignore new file mode 100644 index 000000000..8485f615e --- /dev/null +++ b/sw/apps/doitgen/.gitignore @@ -0,0 +1 @@ +data/data.h \ No newline at end of file diff --git a/sw/apps/doitgen/data/params.json b/sw/apps/doitgen/data/params.json new file mode 100644 index 000000000..4417f0c35 --- /dev/null +++ b/sw/apps/doitgen/data/params.json @@ -0,0 +1,12 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "r": 16, + "q": 16, + "s": 32, + "r_tiles": 2, + "q_tiles": 2, + "funcptr": "doitgen_baseline" +} diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py new file mode 100755 index 000000000..d0dddf6f5 --- /dev/null +++ b/sw/apps/doitgen/scripts/datagen.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np + +from snitch.util.sim import data_utils +from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen + +np.random.seed(42) + +DOUBLE_BUFFER = True + + +class DoitgenDataGen(DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"] + + def golden_model(self, A, x): + R, Q, S = A.shape + P, _ = x.shape + Aout = np.ndarray((R, Q, P)) + for r in range(R): + for q in range(Q): + for p in range(P): + Aout[r, q, p] = 0 + for s in range(S): + Aout[r, q, p] += A[r, q, s] * x[p, s] + return Aout + + def validate(self, **kwargs): + n_cores = 8 + assert (kwargs['r'] % kwargs['r_tiles']) == 0, "r must be an integer multiple of r_tiles" + assert (kwargs['q'] % kwargs['q_tiles']) == 0, "q must be an integer multiple of q_tiles" + if kwargs['funcptr'] != 'doitgen_naive': + assert (kwargs['s'] % 4) == 0, "s must be an integer multiple of unrolling factor" + r_per_tile = kwargs['r'] / kwargs['r_tiles'] + q_per_tile = kwargs['q'] / kwargs['q_tiles'] + assert (r_per_tile % n_cores) == 0, "r_per_tile must be an integer multiple of n_cores" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = r_per_tile * q_per_tile * kwargs['s'] * 8 + x_size = kwargs['s'] * kwargs['s'] * 8 + total_size = 2 * a_tile_size + x_size + if DOUBLE_BUFFER: + total_size *= 2 + data_utils.validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + A = np.random.randint(-100, 100, size=(kwargs['r'], kwargs['q'], kwargs['s'])) + x = np.random.randint(-100, 100, size=(kwargs['s'], kwargs['s'])) + + _ = self.golden_model(A, x) + + A = A.flatten() + x = x.flatten() + + A_uid = 'A' + x_uid = 'x' + + cfg = { + 'r': kwargs['r'], + 'q': 
kwargs['q'], + 's': kwargs['s'], + 'A': A_uid, + 'x': x_uid, + 'r_tiles': kwargs['r_tiles'], + 'q_tiles': kwargs['q_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [format_array_definition('double', A_uid, A)] + header += [format_array_definition('double', x_uid, x)] + header += [format_struct_definition('doitgen_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + DoitgenDataGen().main() diff --git a/sw/apps/doitgen/scripts/verify.py b/sw/apps/doitgen/scripts/verify.py new file mode 100755 index 000000000..8f72b0415 --- /dev/null +++ b/sw/apps/doitgen/scripts/verify.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import numpy as np +import sys +from datagen import DoitgenDataGen + +from snitch.util.sim.verif_utils import Verifier + + +class DoitgenVerifier(Verifier): + + OUTPUT_UIDS = ['A'] + + def __init__(self): + super().__init__() + self.func_args = { + 'r': 'I', + 'q': 'I', + 's': 'I', + 'A': 'I', + 'x': 'I', + 'r_tiles': 'I', + 'q_tiles': 'I', + 'funcptr': 'I' + } + self.func_args = self.get_input_from_symbol('args', self.func_args) + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + + def get_expected_results(self): + A = self.get_input_from_symbol('A', 'double') + A = np.reshape(A, (self.func_args['r'], self.func_args['q'], self.func_args['s'])) + x = self.get_input_from_symbol('x', 'double') + x = np.reshape(x, (self.func_args['s'], self.func_args['s'])) + return DoitgenDataGen().golden_model(A, x).flatten() + + def check_results(self, *args): + return super().check_results(*args, rtol=1e-10) + + +if __name__ == "__main__": + sys.exit(DoitgenVerifier().main()) diff --git a/sw/apps/doitgen/src/args.h b/sw/apps/doitgen/src/args.h new file mode 100644 index 
000000000..5d3f56ce4 --- /dev/null +++ b/sw/apps/doitgen/src/args.h @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include <stdint.h> + +typedef void (*doitgen_fp_t)(uint32_t r, uint32_t q, uint32_t s, double *A, + double *x, double *Aout); + +typedef struct { + uint32_t r; + uint32_t q; + uint32_t s; + double *A; + double *x; + uint32_t r_tiles; + uint32_t q_tiles; + doitgen_fp_t funcptr; +} doitgen_args_t; diff --git a/sw/apps/doitgen/src/doitgen.h b/sw/apps/doitgen/src/doitgen.h new file mode 100644 index 000000000..2f7bc6128 --- /dev/null +++ b/sw/apps/doitgen/src/doitgen.h @@ -0,0 +1,303 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +#define DOUBLE_BUFFER 1 + +__thread int setup_ssr = 1; + +void doitgen_naive(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < r; i += stride) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k++) { + Aout[i * q * s + j * s + k] = 0.0; + for (uint32_t l = 0; l < s; l++) { + Aout[i * q * s + j * s + k] += + A[i * q * s + j * s + l] * x[k * s + l]; + } + } + } + } + + snrt_fpu_fence(); +} + +void doitgen_baseline(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factors + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll1 = 4; + const uint32_t unroll0 = 4; + + for (uint32_t i = 
offset; i < r; i += stride) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k += unroll1) { + double acc[4]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + for (uint32_t l = 0; l < s; l += unroll0) { + asm volatile( + "fmadd.d %[acc0], %[a0], %[x0], %[acc0] \n" + "fmadd.d %[acc1], %[a0], %[x1], %[acc1] \n" + "fmadd.d %[acc2], %[a0], %[x2], %[acc2] \n" + "fmadd.d %[acc3], %[a0], %[x3], %[acc3] \n" + "fmadd.d %[acc0], %[a1], %[x4], %[acc0] \n" + "fmadd.d %[acc1], %[a1], %[x5], %[acc1] \n" + "fmadd.d %[acc2], %[a1], %[x6], %[acc2] \n" + "fmadd.d %[acc3], %[a1], %[x7], %[acc3] \n" + "fmadd.d %[acc0], %[a2], %[x8], %[acc0] \n" + "fmadd.d %[acc1], %[a2], %[x9], %[acc1] \n" + "fmadd.d %[acc2], %[a2], %[x10], %[acc2] \n" + "fmadd.d %[acc3], %[a2], %[x11], %[acc3] \n" + "fmadd.d %[acc0], %[a3], %[x12], %[acc0] \n" + "fmadd.d %[acc1], %[a3], %[x13], %[acc1] \n" + "fmadd.d %[acc2], %[a3], %[x14], %[acc2] \n" + "fmadd.d %[acc3], %[a3], %[x15], %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ a0 ] "f"(A[i * q * s + j * s + l + 0]), + [ a1 ] "f"(A[i * q * s + j * s + l + 1]), + [ a2 ] "f"(A[i * q * s + j * s + l + 2]), + [ a3 ] "f"(A[i * q * s + j * s + l + 3]), + [ x0 ] "f"(x[(k + 0) * s + l + 0]), + [ x1 ] "f"(x[(k + 1) * s + l + 0]), + [ x2 ] "f"(x[(k + 2) * s + l + 0]), + [ x3 ] "f"(x[(k + 3) * s + l + 0]), + [ x4 ] "f"(x[(k + 0) * s + l + 1]), + [ x5 ] "f"(x[(k + 1) * s + l + 1]), + [ x6 ] "f"(x[(k + 2) * s + l + 1]), + [ x7 ] "f"(x[(k + 3) * s + l + 1]), + [ x8 ] "f"(x[(k + 0) * s + l + 2]), + [ x9 ] "f"(x[(k + 1) * s + l + 2]), + [ x10 ] "f"(x[(k + 2) * s + l + 2]), + [ x11 ] "f"(x[(k + 3) * s + l + 2]), + [ x12 ] "f"(x[(k + 0) * s + l + 3]), + [ x13 ] "f"(x[(k + 1) * s + l + 3]), + [ x14 ] "f"(x[(k + 2) * s + l + 3]), + [ x15 ] "f"(x[(k + 3) * s + l + 3]) + :); + } + + Aout[i * q * s + j * s + k + 0] = acc[0]; + Aout[i * q * s + j * s + k + 1] = acc[1]; + Aout[i * q * s + 
j * s + k + 2] = acc[2]; + Aout[i * q * s + j * s + k + 3] = acc[3]; + } + } + } + + snrt_fpu_fence(); +} + +void doitgen_opt(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t bound = r / snrt_cluster_compute_core_num(); + uint32_t offset = bound * snrt_cluster_core_idx(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and x + // for (i = offset; i < bound; i++) + // for (j = 0; j < q; j++) + // for (k1 = 0; k1 < s; k1 += unroll) + // for (l = 0; l < s; l++) + // for (k0 = 0; k0 < unroll; k0++) + // k = k1 + k0 + // ft0.push(A[i * q * s + j * s + l]) + // ft1.push(x[k * s + l]) + const uint32_t ssr0_b[4] = {unroll, s, s / unroll, q * bound}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, s * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, s, s / unroll, q * bound}; + const uint32_t ssr1_i[4] = {s * sizeof(double), sizeof(double), + unroll * s * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A + offset * q * s); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, x); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < (offset + bound); i++) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k += unroll) { + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], 
ft0, ft1, %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ n_frep ] "r"(s - 1), [ unroll ] "i"(unroll) + : "ft0", "ft1", "ft2"); + + Aout[i * q * s + j * s + k + 0] = acc[0]; + Aout[i * q * s + j * s + k + 1] = acc[1]; + Aout[i * q * s + j * s + k + 2] = acc[2]; + Aout[i * q * s + j * s + k + 3] = acc[3]; + } + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void doitgen_job(doitgen_args_t *args) { + uint32_t r_frac, q_frac, a_tile_size, a_tile_bytes, x_size, x_bytes; + uint64_t local_a0_addr, local_aout0_addr, local_x0_addr, local_a1_addr, + local_aout1_addr; + double *local_a[2]; + double *local_aout[2]; + double *local_x; + uint32_t iterations, sb_iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_r, i_q, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + doitgen_args_t *local_args = (doitgen_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(doitgen_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + r_frac = args->r / args->r_tiles; + q_frac = args->q / args->q_tiles; + a_tile_size = r_frac * q_frac * args->s; + x_size = args->s * args->s; + a_tile_bytes = a_tile_size * sizeof(double); + x_bytes = x_size * sizeof(double); + + // Allocate space for job operands in TCDM + local_x0_addr = (uint64_t)args + sizeof(doitgen_args_t); + local_a0_addr = local_x0_addr + x_bytes; + local_aout0_addr = local_a0_addr + a_tile_bytes; + local_x = (double *)local_x0_addr; + local_a[0] = (double *)local_a0_addr; + local_aout[0] = (double *)local_aout0_addr; + if (DOUBLE_BUFFER) { + local_a1_addr = local_aout0_addr + a_tile_bytes; + local_aout1_addr = local_a1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_aout[1] = (double *)local_aout1_addr; + } + + // Calculate number of 
iterations + sb_iterations = args->r_tiles * args->q_tiles; + if (DOUBLE_BUFFER) + iterations = sb_iterations + 2; + else + iterations = sb_iterations; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < sb_iterations)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + i_r = i_dma_in / args->q_tiles; + i_q = i_dma_in % args->q_tiles; + + // Copy job operands in TCDM + snrt_dma_load_2d_tile(local_a[buff_idx], args->A, i_r, i_q, + r_frac, q_frac * args->s, + args->q * args->s, sizeof(double)); + if (i_dma_in == 0) snrt_dma_start_1d(local_x, args->x, x_bytes); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0; + i_r = i_dma_out / args->q_tiles; + i_q = i_dma_out % args->q_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile(args->A, local_aout[buff_idx], i_r, i_q, + r_frac, q_frac * args->s, + args->q * args->s, sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? 
i_compute % 2 : 0; + + // Perform tile computation + doitgen_fp_t fp = args->funcptr; + fp(r_frac, q_frac, args->s, local_a[buff_idx], local_x, + local_aout[buff_idx]); + + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + } + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/doitgen/src/main.c b/sw/apps/doitgen/src/main.c new file mode 100644 index 000000000..64c9571f8 --- /dev/null +++ b/sw/apps/doitgen/src/main.c @@ -0,0 +1,17 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "doitgen.h" + +#include "data.h" + +int main() { + doitgen_job(&args); + + return 0; +} diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index 674ea2cad..e4456fdfc 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -67,6 +67,7 @@ APPS += sw/apps/montecarlo/pi_estimation APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance +APPS += sw/apps/doitgen # Include Makefile from each app subdirectory $(foreach app,$(APPS), \ diff --git a/target/snitch_cluster/sw/apps/doitgen/app.mk b/target/snitch_cluster/sw/apps/doitgen/app.mk new file mode 100644 index 000000000..ebef550d3 --- /dev/null +++ b/target/snitch_cluster/sw/apps/doitgen/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := doitgen +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build +SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index ab302f7c3..d9e2f8c2f 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -99,3 +99,5 @@ runs: # cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/covariance/build/covariance.elf cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/doitgen/build/doitgen.elf + cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"] From 77f8792cc5fcff8198affbd57f45c3cd97b31195 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 22 Aug 2024 11:32:58 +0200 Subject: [PATCH 15/19] sw: Remove spurious files after #171 --- sw/apps/atax/.gitignore | 1 - sw/apps/correlation/.gitignore | 1 - sw/apps/covariance/.gitignore | 1 - sw/apps/doitgen/.gitignore | 1 - sw/blas/.gitignore | 1 - sw/blas/dot/Makefile | 31 ------------------------------- sw/blas/syrk/.gitignore | 1 - sw/dnn/.gitignore | 1 - 8 files changed, 38 deletions(-) delete mode 100644 sw/apps/atax/.gitignore delete mode 100644 sw/apps/correlation/.gitignore delete mode 100644 sw/apps/covariance/.gitignore delete mode 100644 sw/apps/doitgen/.gitignore delete mode 100644 sw/blas/.gitignore delete mode 100644 sw/blas/dot/Makefile delete mode 100644 sw/blas/syrk/.gitignore delete mode 100644 sw/dnn/.gitignore diff --git a/sw/apps/atax/.gitignore b/sw/apps/atax/.gitignore deleted file mode 100644 index 8485f615e..000000000 --- a/sw/apps/atax/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/data.h \ No newline at end of file diff --git a/sw/apps/correlation/.gitignore 
b/sw/apps/correlation/.gitignore deleted file mode 100644 index 8485f615e..000000000 --- a/sw/apps/correlation/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/data.h \ No newline at end of file diff --git a/sw/apps/covariance/.gitignore b/sw/apps/covariance/.gitignore deleted file mode 100644 index 8485f615e..000000000 --- a/sw/apps/covariance/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/data.h \ No newline at end of file diff --git a/sw/apps/doitgen/.gitignore b/sw/apps/doitgen/.gitignore deleted file mode 100644 index 8485f615e..000000000 --- a/sw/apps/doitgen/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/data.h \ No newline at end of file diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore deleted file mode 100644 index 2ff975f29..000000000 --- a/sw/blas/.gitignore +++ /dev/null @@ -1 +0,0 @@ -**/data/data.h \ No newline at end of file diff --git a/sw/blas/dot/Makefile b/sw/blas/dot/Makefile deleted file mode 100644 index 077b84e5a..000000000 --- a/sw/blas/dot/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -# Usage of absolute paths is required to externally include this Makefile -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -DATA_DIR := $(realpath $(MK_DIR)/data) -SRC_DIR := $(realpath $(MK_DIR)/src) - -DATA_CFG ?= $(DATA_DIR)/params.json -SECTION ?= - -APP ?= dot -SRCS ?= $(realpath $(SRC_DIR)/main.c) -INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR) - -DATAGEN_PY = $(MK_DIR)/scripts/datagen.py -DATA_H ?= $(DATA_DIR)/data.h - -$(dir $(DATA_H)): - mkdir -p $@ - -$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H)) - $< -c $(DATA_CFG) --section="$(SECTION)" $@ - -.PHONY: clean-data clean - -clean-data: - rm -f $(DATA_H) - -clean: clean-data diff --git a/sw/blas/syrk/.gitignore b/sw/blas/syrk/.gitignore deleted file mode 100644 index 8485f615e..000000000 --- a/sw/blas/syrk/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/data.h \ No newline at end of file diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore deleted file mode 100644 index aed262ca8..000000000 --- a/sw/dnn/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*/data/data.h From bdfa4bedcc82afc96f0e56f72a14dd5f44a58979 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 23 Aug 2024 13:01:03 +0200 Subject: [PATCH 16/19] sw: Uniformize random data generation --- sw/apps/atax/scripts/datagen.py | 5 ++-- sw/apps/correlation/scripts/datagen.py | 3 +- sw/apps/covariance/scripts/datagen.py | 6 ++-- sw/apps/doitgen/scripts/datagen.py | 8 ++--- sw/blas/axpy/scripts/datagen.py | 12 ++++---- sw/blas/dot/scripts/datagen.py | 7 ++--- sw/blas/gemm/scripts/datagen.py | 16 +++++----- sw/blas/syrk/scripts/datagen.py | 12 ++++---- util/sim/data_utils.py | 41 ++++++++++++++++++++++++++ 9 files changed, 74 insertions(+), 36 deletions(-) diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py index 51317c70e..c73ae70a2 100755 --- a/sw/apps/atax/scripts/datagen.py +++ b/sw/apps/atax/scripts/datagen.py @@ -8,6 +8,7 @@ import numpy as np +import snitch.util.sim.data_utils 
as du from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ format_array_declaration, format_ifdef_wrapper, DataGen @@ -26,8 +27,8 @@ def emit_header(self, **kwargs): header = [super().emit_header()] M, N = kwargs['M'], kwargs['N'] - A = np.random.randint(-200, 100, size=(M, N))/100 - x = np.random.randint(-200, 100, size=(N, 1))/100 + A = du.generate_random_array((M, N)) + x = du.generate_random_array((N, 1)) y = self.golden_model(A, x) assert (M % 8) == 0, "M must be an integer multiple of the number of cores" diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py index b2047d5eb..7880c1693 100755 --- a/sw/apps/correlation/scripts/datagen.py +++ b/sw/apps/correlation/scripts/datagen.py @@ -8,6 +8,7 @@ import numpy as np +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ format_array_declaration, format_ifdef_wrapper, DataGen @@ -26,7 +27,7 @@ def emit_header(self, **kwargs): header = [super().emit_header()] M, N = kwargs['M'], kwargs['N'] - data = np.random.randint(-200, 100, size=(N, M))/100 + data = du.generate_random_array((N, M)) corr = self.golden_model(data) data = data.flatten() diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py index c3b7cd8b3..07bb92d0a 100755 --- a/sw/apps/covariance/scripts/datagen.py +++ b/sw/apps/covariance/scripts/datagen.py @@ -8,7 +8,7 @@ import numpy as np -from snitch.util.sim import data_utils +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_array_definition, \ format_array_declaration, format_struct_definition, DataGen @@ -42,14 +42,14 @@ def validate(self, **kwargs): total_size = 2 * a_tile_size + b_tile_size if DOUBLE_BUFFER: total_size *= 2 - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) def emit_header(self, **kwargs): header = 
[super().emit_header()] self.validate(**kwargs) - data = np.random.randint(-200, 100, size=(kwargs['n'], kwargs['m'])) + data = du.generate_random_array((kwargs['n'], kwargs['m'])) cov = self.golden_model(data) data = data.transpose().flatten() diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py index d0dddf6f5..5f14ec86d 100755 --- a/sw/apps/doitgen/scripts/datagen.py +++ b/sw/apps/doitgen/scripts/datagen.py @@ -7,7 +7,7 @@ import numpy as np -from snitch.util.sim import data_utils +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen np.random.seed(42) @@ -49,15 +49,15 @@ def validate(self, **kwargs): total_size = 2 * a_tile_size + x_size if DOUBLE_BUFFER: total_size *= 2 - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) def emit_header(self, **kwargs): header = [super().emit_header()] self.validate(**kwargs) - A = np.random.randint(-100, 100, size=(kwargs['r'], kwargs['q'], kwargs['s'])) - x = np.random.randint(-100, 100, size=(kwargs['s'], kwargs['s'])) + A = du.generate_random_array((kwargs['r'], kwargs['q'], kwargs['s'])) + x = du.generate_random_array((kwargs['s'], kwargs['s'])) _ = self.golden_model(A, x) diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py index cf6795667..ec00a4c88 100755 --- a/sw/blas/axpy/scripts/datagen.py +++ b/sw/blas/axpy/scripts/datagen.py @@ -8,15 +8,13 @@ import numpy as np import sys -from snitch.util.sim import data_utils +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ format_array_declaration, format_ifdef_wrapper, format_struct_definition, DataGen class AxpyDataGen(DataGen): - MIN = -1000 - MAX = +1000 # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 @@ -36,16 +34,16 @@ def validate_config(self, **kwargs): # Note: doesn't account for gaps created by data alignment vec_size = n_per_tile * 8 total_size = 2 * 3 * vec_size - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) def emit_header(self, **kwargs): header = [super().emit_header()] self.validate_config(**kwargs) - a = np.random.uniform(self.MIN, self.MAX, 1)[0] - x = np.random.uniform(self.MIN, self.MAX, kwargs['n']) - y = np.random.uniform(self.MIN, self.MAX, kwargs['n']) + a = du.generate_random_array(1)[0] + x = du.generate_random_array(kwargs['n']) + y = du.generate_random_array(kwargs['n']) g = self.golden_model(a, x, y) x_uid = 'x' diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py index 01560c51f..d11b53ff8 100755 --- a/sw/blas/dot/scripts/datagen.py +++ b/sw/blas/dot/scripts/datagen.py @@ -6,14 +6,13 @@ import numpy as np import sys +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ format_scalar_declaration, format_ifdef_wrapper, DataGen class DotDataGen(DataGen): - MIN = -1000 - MAX = +1000 # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 @@ -25,8 +24,8 @@ def emit_header(self, **kwargs): header = [super().emit_header()] n = kwargs['n'] - x = np.random.uniform(self.MIN, self.MAX, n) - y = np.random.uniform(self.MIN, self.MAX, n) + x = du.generate_random_array(n) + y = du.generate_random_array(n) g = self.golden_model(x, y) assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \ diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py index da7f8ba57..636175604 100755 --- a/sw/blas/gemm/scripts/datagen.py +++ b/sw/blas/gemm/scripts/datagen.py @@ -10,10 +10,9 @@ import numpy as np import re -import pyflexfloat as ff import sys -from snitch.util.sim import data_utils +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import DataGen, format_array_declaration, \ format_struct_definition, format_array_definition, format_ifdef_wrapper @@ -56,14 +55,14 @@ def validate_config(self, gemm_fp, parallelize_m, # Calculate total TCDM occupation # Note: doesn't account for double buffering - prec = data_utils.size_from_precision_t(dtype) + prec = du.size_from_precision_t(dtype) a_size = frac_m * frac_k * prec b_size = frac_k * frac_n * prec c_size = frac_m * frac_n * prec total_size = a_size total_size += b_size total_size += c_size - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size' assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size' @@ -99,12 +98,11 @@ def emit_header(self, **kwargs): prec, _ = self.infer_implementation(kwargs['gemm_fp']) - ff_desc = data_utils.ff_desc_from_precision_t(prec) - ctype = data_utils.ctype_from_precision_t(prec) + ctype = du.ctype_from_precision_t(prec) - a = ff.array(np.random.rand(M, K), ff_desc) - b = ff.array(np.random.rand(K, N), ff_desc) - c = ff.array(np.random.rand(M, N), 
ff_desc) + a = du.generate_random_array((M, K), prec) + b = du.generate_random_array((K, N), prec) + c = du.generate_random_array((M, N), prec) result = self.exact_golden_model(1, a, b, kwargs['beta'], c) # Store matrices in transposed form if requested diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py index 9b4959fca..ad15222f3 100755 --- a/sw/blas/syrk/scripts/datagen.py +++ b/sw/blas/syrk/scripts/datagen.py @@ -7,7 +7,7 @@ import numpy as np -from snitch.util.sim import data_utils +import snitch.util.sim.data_utils as du from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen @@ -37,7 +37,7 @@ def validate(self, **kwargs): total_size = 2 * a_tile_size + c_tile_size if DOUBLE_BUFFER: total_size *= 2 - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) def emit_header(self, **kwargs): header = [super().emit_header()] @@ -47,14 +47,14 @@ def emit_header(self, **kwargs): if 'alpha' in kwargs: alpha = kwargs['alpha'] else: - alpha = np.random.randint(-200, 100)/100 + alpha = du.generate_random_array(1)[0] if 'beta' in kwargs: beta = kwargs['beta'] else: - beta = np.random.randint(-200, 100)/100 + beta = du.generate_random_array(1)[0] - A = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['n']))/100 - C_in = np.random.randint(-200, 100, size=(kwargs['m'], kwargs['m']))/100 + A = du.generate_random_array((kwargs['m'], kwargs['n'])) + C_in = du.generate_random_array((kwargs['m'], kwargs['m'])) A = A.flatten() C_in = C_in.flatten() diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index e6f48acce..3b732c5cc 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -83,6 +83,24 @@ def torch_type_from_precision_t(prec): return precision_t_to_torch_type_map[_integer_precision_t(prec)] +def numpy_type_from_precision_t(prec): + """Convert `precision_t` type to NumPy type. + + Args: + prec: A value of type `precision_t`.
Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8). + """ + # Types which have a direct correspondence in Numpy + precision_t_to_numpy_type_map = { + 8: np.float64, + 4: np.float32, + 2: np.float16 + } + prec = _integer_precision_t(prec) + assert prec != 1, "No direct correspondence between FP8 and Numpy" + return precision_t_to_numpy_type_map[prec] + + # Returns the C type representing a floating-point value of the specified precision def ctype_from_precision_t(prec): """Convert `precision_t` type to a C type string. @@ -100,6 +118,29 @@ def ctype_from_precision_t(prec): return precision_t_to_ctype_map[_integer_precision_t(prec)] +def generate_random_array(size, prec='FP64'): + """Consistent random array generation for Snitch experiments. + + Samples values between -1 and 1 from a uniform distribution and + of the exact specified type, e.g. actual 64-bit doubles. + + This function ensures that e.g. power measurements are not skewed + by using integer values in the FPU. + + Args: + size: Int or tuple of array dimensions. + prec: A value of type `precision_t`. Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8). + """ + # Generate in 64b precision and then cast down + rand = np.random.default_rng().random(size=size, dtype=np.float64) * 2 - 1 + # Generate FlexFloat array for 8b floats, cast from a 16b Numpy array + if _integer_precision_t(prec) == 1: + return ff.array(rand.astype(np.float16), ff_desc_from_precision_t(prec)) + else: + return rand.astype(numpy_type_from_precision_t(prec)) + + def flatten(array): """Flatten various array types with a homogeneous API. 
From 8ce7b9119c73765322bae9ddd6c1aa90226f3850 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 23 Aug 2024 13:07:01 +0200 Subject: [PATCH 17/19] sw: Uniformize `data_utils` import --- sw/apps/atax/scripts/datagen.py | 18 ++++++++--------- sw/apps/correlation/scripts/datagen.py | 19 +++++++++-------- sw/apps/covariance/scripts/datagen.py | 10 ++++----- sw/apps/doitgen/scripts/datagen.py | 9 ++++----- sw/blas/axpy/scripts/datagen.py | 19 ++++++++--------- sw/blas/dot/scripts/datagen.py | 22 +++++++++----------- sw/blas/gemm/scripts/datagen.py | 28 ++++++++++++-------------- sw/blas/syrk/scripts/datagen.py | 9 ++++----- 8 files changed, 60 insertions(+), 74 deletions(-) diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py index c73ae70a2..0008bea26 100755 --- a/sw/apps/atax/scripts/datagen.py +++ b/sw/apps/atax/scripts/datagen.py @@ -9,8 +9,6 @@ import numpy as np import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_array_declaration, format_ifdef_wrapper, DataGen # AXI splits bursts crossing 4KB address boundaries. 
To minimize @@ -18,7 +16,7 @@ BURST_ALIGNMENT = 4096 -class AtaxDataGen(DataGen): +class AtaxDataGen(du.DataGen): def golden_model(self, A, x): return np.matmul(A.transpose(), np.matmul(A, x)) @@ -38,13 +36,13 @@ def emit_header(self, **kwargs): x = x.flatten() y = y.flatten() - header += [format_scalar_definition('uint32_t', 'M', M)] - header += [format_scalar_definition('uint32_t', 'N', N)] - header += [format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)] - header += [format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)] - header += [format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)] - result_def = format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_scalar_definition('uint32_t', 'M', M)] + header += [du.format_scalar_definition('uint32_t', 'N', N)] + header += [du.format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)] + header += [du.format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)] + header += [du.format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)] + result_def = du.format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py index 7880c1693..d60f527d1 100755 --- a/sw/apps/correlation/scripts/datagen.py +++ b/sw/apps/correlation/scripts/datagen.py @@ -9,8 +9,6 @@ import numpy as np import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_array_declaration, format_ifdef_wrapper, DataGen # AXI splits bursts crossing 4KB address boundaries. 
To minimize @@ -18,7 +16,7 @@ BURST_ALIGNMENT = 4096 -class CorrelationDataGen(DataGen): +class CorrelationDataGen(du.DataGen): def golden_model(self, data): return np.corrcoef(data, rowvar=False) @@ -33,13 +31,14 @@ def emit_header(self, **kwargs): data = data.flatten() corr = corr.flatten() - header += [format_scalar_definition('uint32_t', 'M', M)] - header += [format_scalar_definition('uint32_t', 'N', N)] - header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)] - header += [format_array_declaration('double', 'corr', corr.shape, - alignment=BURST_ALIGNMENT)] - result_def = format_array_definition('double', 'golden', corr, alignment=BURST_ALIGNMENT) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_scalar_definition('uint32_t', 'M', M)] + header += [du.format_scalar_definition('uint32_t', 'N', N)] + header += [du.format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)] + header += [du.format_array_declaration('double', 'corr', corr.shape, + alignment=BURST_ALIGNMENT)] + result_def = du.format_array_definition('double', 'golden', corr, + alignment=BURST_ALIGNMENT) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py index 07bb92d0a..7beb2c671 100755 --- a/sw/apps/covariance/scripts/datagen.py +++ b/sw/apps/covariance/scripts/datagen.py @@ -9,15 +9,13 @@ import numpy as np import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_array_definition, \ - format_array_declaration, format_struct_definition, DataGen np.random.seed(42) DOUBLE_BUFFER = True -class CovarianceDataGen(DataGen): +class CovarianceDataGen(du.DataGen): # Function pointers to alternative implementations FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"] @@ -69,9 +67,9 @@ def emit_header(self, **kwargs): 'funcptr': kwargs['funcptr'] } 
- header += [format_array_definition('double', data_uid, data)] - header += [format_array_declaration('double', cov_uid, cov.shape)] - header += [format_struct_definition('covariance_args_t', 'args', cfg)] + header += [du.format_array_definition('double', data_uid, data)] + header += [du.format_array_declaration('double', cov_uid, cov.shape)] + header += [du.format_struct_definition('covariance_args_t', 'args', cfg)] header = '\n\n'.join(header) return header diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py index 5f14ec86d..d1a9c3b46 100755 --- a/sw/apps/doitgen/scripts/datagen.py +++ b/sw/apps/doitgen/scripts/datagen.py @@ -8,14 +8,13 @@ import numpy as np import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen np.random.seed(42) DOUBLE_BUFFER = True -class DoitgenDataGen(DataGen): +class DoitgenDataGen(du.DataGen): # Function pointers to alternative implementations FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"] @@ -78,9 +77,9 @@ def emit_header(self, **kwargs): 'funcptr': kwargs['funcptr'] } - header += [format_array_definition('double', A_uid, A)] - header += [format_array_definition('double', x_uid, x)] - header += [format_struct_definition('doitgen_args_t', 'args', cfg)] + header += [du.format_array_definition('double', A_uid, A)] + header += [du.format_array_definition('double', x_uid, x)] + header += [du.format_struct_definition('doitgen_args_t', 'args', cfg)] header = '\n\n'.join(header) return header diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py index ec00a4c88..38634dd5e 100755 --- a/sw/blas/axpy/scripts/datagen.py +++ b/sw/blas/axpy/scripts/datagen.py @@ -5,15 +5,12 @@ # # Author: Luca Colagrande -import numpy as np import sys import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_array_declaration, 
format_ifdef_wrapper, format_struct_definition, DataGen -class AxpyDataGen(DataGen): +class AxpyDataGen(du.DataGen): # AXI splits bursts crossing 4KB address boundaries. To minimize # the occurrence of these splits the data should be aligned to 4KB @@ -60,16 +57,16 @@ def emit_header(self, **kwargs): 'funcptr': kwargs['funcptr'] } - header += [format_scalar_definition('const double', 'a', a)] - header += [format_array_definition('double', x_uid, x, + header += [du.format_scalar_definition('const double', 'a', a)] + header += [du.format_array_definition('double', x_uid, x, alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] - header += [format_array_definition('double', y_uid, y, + header += [du.format_array_definition('double', y_uid, y, alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] - header += [format_array_declaration('double', z_uid, x.shape, + header += [du.format_array_declaration('double', z_uid, x.shape, alignment=self.BURST_ALIGNMENT, section=kwargs['section'])] - header += [format_struct_definition('axpy_args_t', 'args', cfg)] - result_def = format_array_definition('double', 'g', g) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_struct_definition('axpy_args_t', 'args', cfg)] + result_def = du.format_array_definition('double', 'g', g) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py index d11b53ff8..8a8631a6a 100755 --- a/sw/blas/dot/scripts/datagen.py +++ b/sw/blas/dot/scripts/datagen.py @@ -7,11 +7,9 @@ import sys import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_scalar_declaration, format_ifdef_wrapper, DataGen -class DotDataGen(DataGen): +class DotDataGen(du.DataGen): # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB @@ -31,15 +29,15 @@ def emit_header(self, **kwargs): assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \ "the unrolling factor" - header += [format_scalar_definition('const uint32_t', 'n', n)] - header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - result_def = format_scalar_definition('double', 'g', g) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_scalar_definition('const uint32_t', 'n', n)] + header += [du.format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + result_def = du.format_scalar_definition('double', 'g', g) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py index 636175604..2eb6e2f4d 100755 --- a/sw/blas/gemm/scripts/datagen.py +++ b/sw/blas/gemm/scripts/datagen.py @@ -13,14 +13,12 @@ import sys import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import DataGen, format_array_declaration, \ - format_struct_definition, format_array_definition, format_ifdef_wrapper np.random.seed(42) -class GemmDataGen(DataGen): +class GemmDataGen(du.DataGen): # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB @@ -125,18 +123,18 @@ def emit_header(self, **kwargs): b = b.flatten() c = c.flatten() - header += [format_array_declaration(ctype, a_uid, a.shape)] - header += [format_array_declaration(ctype, b_uid, b.shape)] - header += [format_array_declaration(ctype, c_uid, c.shape)] - header += [format_struct_definition('gemm_args_t', 'args', cfg)] - header += [format_array_definition(ctype, a_uid, a, - section=kwargs['section'])] - header += [format_array_definition(ctype, b_uid, b, - section=kwargs['section'])] - header += [format_array_definition(ctype, c_uid, c, - section=kwargs['section'])] - result_def = format_array_definition(ctype, 'result', result.flatten()) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_array_declaration(ctype, a_uid, a.shape)] + header += [du.format_array_declaration(ctype, b_uid, b.shape)] + header += [du.format_array_declaration(ctype, c_uid, c.shape)] + header += [du.format_struct_definition('gemm_args_t', 'args', cfg)] + header += [du.format_array_definition(ctype, a_uid, a, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, b_uid, b, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, c_uid, c, + section=kwargs['section'])] + result_def = du.format_array_definition(ctype, 'result', result.flatten()) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py index ad15222f3..3fb86644f 100755 --- a/sw/blas/syrk/scripts/datagen.py +++ b/sw/blas/syrk/scripts/datagen.py @@ -8,13 +8,12 @@ import numpy as np import snitch.util.sim.data_utils as du -from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen DOUBLE_BUFFER = True -class SyrkDataGen(DataGen): +class SyrkDataGen(du.DataGen): # Function pointers to alternative 
implementations FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"] @@ -73,9 +72,9 @@ def emit_header(self, **kwargs): 'funcptr': kwargs['funcptr'] } - header += [format_array_definition('double', A_uid, A)] - header += [format_array_definition('double', C_uid, C_in)] - header += [format_struct_definition('syrk_args_t', 'args', cfg)] + header += [du.format_array_definition('double', A_uid, A)] + header += [du.format_array_definition('double', C_uid, C_in)] + header += [du.format_struct_definition('syrk_args_t', 'args', cfg)] header = '\n\n'.join(header) return header From ad76f7807474ce3713c81e40ca20be2718790f47 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 23 Aug 2024 14:58:31 +0200 Subject: [PATCH 18/19] gemm: Lower error thresholds as cancellations can now take place --- sw/blas/gemm/scripts/verify.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sw/blas/gemm/scripts/verify.py b/sw/blas/gemm/scripts/verify.py index 40840b327..353ea1328 100755 --- a/sw/blas/gemm/scripts/verify.py +++ b/sw/blas/gemm/scripts/verify.py @@ -18,9 +18,9 @@ class GemmVerifier(Verifier): OUTPUT_UIDS = ['c'] ERR_THRESHOLD = { 1: 1e-4, - 2: 1e-2, - 4: 1e-6, - 8: 1e-6 + 2: 8e-2, + 4: 1e-3, + 8: 1e-3 } def __init__(self): From 10c545fd3c86c48ce2c47d0bbdde6b82b864d68d Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 28 Aug 2024 17:08:48 +0200 Subject: [PATCH 19/19] ci: Fix Dockerfile --- util/container/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/container/Dockerfile b/util/container/Dockerfile index 9cdc7d9aa..bfef21266 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -94,6 +94,7 @@ RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04 # Install Doxygen RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz +RUN mv doxygen-${DOXYGEN_VERSION} doxygen # 2. 
Stage FROM ubuntu:22.04 AS snitch_cluster @@ -154,7 +155,7 @@ COPY --from=builder /tools/spike-dasm bin/ COPY --from=builder /root/.cargo/bin/banshee bin/ COPY --from=builder /opt/python /opt/python COPY --from=builder /tools/verilator /tools/verilator/ -COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/ +COPY --from=builder /tools/doxygen/bin/doxygen bin/ # Create and activate virtual environment ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster"