diff --git a/sw/apps/doitgen/.gitignore b/sw/apps/doitgen/.gitignore new file mode 100644 index 0000000000..8485f615ee --- /dev/null +++ b/sw/apps/doitgen/.gitignore @@ -0,0 +1 @@ +data/data.h \ No newline at end of file diff --git a/sw/apps/doitgen/data/params.json b/sw/apps/doitgen/data/params.json new file mode 100644 index 0000000000..ef123f87d1 --- /dev/null +++ b/sw/apps/doitgen/data/params.json @@ -0,0 +1,12 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "r": 32, + "q": 32, + "s": 8, + "r_tiles": 2, + "q_tiles": 2, + "funcptr": "doitgen_baseline" +} diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py new file mode 100755 index 0000000000..d0dddf6f5a --- /dev/null +++ b/sw/apps/doitgen/scripts/datagen.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np + +from snitch.util.sim import data_utils +from snitch.util.sim.data_utils import format_array_definition, format_struct_definition, DataGen + +np.random.seed(42) + +DOUBLE_BUFFER = True + + +class DoitgenDataGen(DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"] + + def golden_model(self, A, x): + R, Q, S = A.shape + P, _ = x.shape + Aout = np.ndarray((R, Q, P)) + for r in range(R): + for q in range(Q): + for p in range(P): + Aout[r, q, p] = 0 + for s in range(S): + Aout[r, q, p] += A[r, q, s] * x[p, s] + return Aout + + def validate(self, **kwargs): + n_cores = 8 + assert (kwargs['r'] % kwargs['r_tiles']) == 0, "r must be an integer multiple of r_tiles" + assert (kwargs['q'] % kwargs['q_tiles']) == 0, "q must be an integer multiple of q_tiles" + if kwargs['funcptr'] != 'doitgen_naive': + assert (kwargs['s'] % 4) == 0, "s must be an integer multiple of unrolling factor" + r_per_tile = kwargs['r'] / kwargs['r_tiles'] + q_per_tile = kwargs['q'] / kwargs['q_tiles'] + assert (r_per_tile % n_cores) == 0, "r_per_tile must be an integer multiple of n_cores" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = r_per_tile * q_per_tile * kwargs['s'] * 8 + x_size = kwargs['s'] * kwargs['s'] * 8 + total_size = 2 * a_tile_size + x_size + if DOUBLE_BUFFER: + total_size *= 2 + data_utils.validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + A = np.random.randint(-100, 100, size=(kwargs['r'], kwargs['q'], kwargs['s'])) + x = np.random.randint(-100, 100, size=(kwargs['s'], kwargs['s'])) + + _ = self.golden_model(A, x) + + A = A.flatten() + x = x.flatten() + + A_uid = 'A' + x_uid = 'x' + + cfg = { + 'r': kwargs['r'], + 'q': kwargs['q'], + 's': kwargs['s'], + 'A': A_uid, + 'x': x_uid, + 'r_tiles': kwargs['r_tiles'], + 'q_tiles': kwargs['q_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [format_array_definition('double', A_uid, A)] + header += [format_array_definition('double', x_uid, x)] + header += [format_struct_definition('doitgen_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + DoitgenDataGen().main() diff --git a/sw/apps/doitgen/scripts/verify.py b/sw/apps/doitgen/scripts/verify.py new file mode 100755 index 0000000000..8f72b0415f --- /dev/null +++ b/sw/apps/doitgen/scripts/verify.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import numpy as np +import sys +from datagen import DoitgenDataGen + +from snitch.util.sim.verif_utils import Verifier + + +class DoitgenVerifier(Verifier): + + OUTPUT_UIDS = ['A'] + + def __init__(self): + super().__init__() + self.func_args = { + 'r': 'I', + 'q': 'I', + 's': 'I', + 'A': 'I', + 'x': 'I', + 'r_tiles': 'I', + 'q_tiles': 'I', + 'funcptr': 'I' + } + self.func_args = self.get_input_from_symbol('args', self.func_args) + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + + def get_expected_results(self): + A = self.get_input_from_symbol('A', 'double') + A = np.reshape(A, (self.func_args['r'], self.func_args['q'], self.func_args['s'])) + x = self.get_input_from_symbol('x', 'double') + x = np.reshape(x, (self.func_args['s'], self.func_args['s'])) + return DoitgenDataGen().golden_model(A, x).flatten() + + def check_results(self, *args): + return super().check_results(*args, rtol=1e-10) + + +if __name__ == "__main__": + sys.exit(DoitgenVerifier().main()) diff --git a/sw/apps/doitgen/src/args.h b/sw/apps/doitgen/src/args.h new file mode 100644 index 0000000000..5d3f56ce45 --- /dev/null +++ b/sw/apps/doitgen/src/args.h @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include + +typedef void (*doitgen_fp_t)(uint32_t r, uint32_t q, uint32_t s, double *A, + double *x, double *Aout); + +typedef struct { + uint32_t r; + uint32_t q; + uint32_t s; + double *A; + double *x; + uint32_t r_tiles; + uint32_t q_tiles; + doitgen_fp_t funcptr; +} doitgen_args_t; diff --git a/sw/apps/doitgen/src/doitgen.h b/sw/apps/doitgen/src/doitgen.h new file mode 100644 index 0000000000..2f7bc61288 --- /dev/null +++ b/sw/apps/doitgen/src/doitgen.h @@ -0,0 +1,303 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +#define DOUBLE_BUFFER 1 + +__thread int setup_ssr = 1; + +void doitgen_naive(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < r; i += stride) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k++) { + Aout[i * q * s + j * s + k] = 0.0; + for (uint32_t l = 0; l < s; l++) { + Aout[i * q * s + j * s + k] += + A[i * q * s + j * s + l] * x[k * s + l]; + } + } + } + } + + snrt_fpu_fence(); +} + +void doitgen_baseline(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factors + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll1 = 4; + const uint32_t unroll0 = 4; + + for (uint32_t i = offset; i < r; i += stride) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k += unroll1) { + double acc[4]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + for (uint32_t l = 0; l < s; l += unroll0) { + asm volatile( + "fmadd.d %[acc0], %[a0], %[x0], %[acc0] \n" + "fmadd.d %[acc1], %[a0], %[x1], %[acc1] \n" + "fmadd.d %[acc2], %[a0], %[x2], %[acc2] \n" + "fmadd.d %[acc3], %[a0], %[x3], %[acc3] \n" + "fmadd.d %[acc0], %[a1], %[x4], %[acc0] \n" + "fmadd.d %[acc1], %[a1], %[x5], %[acc1] \n" + "fmadd.d %[acc2], %[a1], %[x6], %[acc2] \n" + "fmadd.d %[acc3], %[a1], %[x7], %[acc3] \n" + "fmadd.d %[acc0], %[a2], %[x8], %[acc0] \n" + "fmadd.d %[acc1], %[a2], %[x9], %[acc1] \n" + "fmadd.d %[acc2], %[a2], %[x10], %[acc2] \n" + "fmadd.d %[acc3], %[a2], %[x11], %[acc3] \n" + "fmadd.d %[acc0], %[a3], %[x12], %[acc0] \n" + "fmadd.d %[acc1], %[a3], %[x13], %[acc1] \n" + "fmadd.d %[acc2], %[a3], %[x14], %[acc2] \n" + "fmadd.d %[acc3], %[a3], %[x15], %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ a0 ] "f"(A[i * q * s + j * s + l + 0]), + [ a1 ] "f"(A[i * q * s + j * s + l + 1]), + [ a2 ] "f"(A[i * q * s + j * s + l + 2]), + [ a3 ] "f"(A[i * q * s + j * s + l + 3]), + [ x0 ] "f"(x[(k + 0) * s + l + 0]), + [ x1 ] "f"(x[(k + 1) * s + l + 0]), + [ x2 ] "f"(x[(k + 2) * s + l + 0]), + [ x3 ] "f"(x[(k + 3) * s + l + 0]), + [ x4 ] "f"(x[(k + 0) * s + l + 1]), + [ x5 ] "f"(x[(k + 1) * s + l + 1]), + [ x6 ] "f"(x[(k + 2) * s + l + 1]), + [ x7 ] "f"(x[(k + 3) * s + l + 1]), + [ x8 ] "f"(x[(k + 0) * s + l + 2]), + [ x9 ] "f"(x[(k + 1) * s + l + 2]), + [ x10 ] "f"(x[(k + 2) * s + l + 2]), + [ x11 ] "f"(x[(k + 3) * s + l + 2]), + [ x12 ] "f"(x[(k + 0) * s + l + 3]), + [ x13 ] "f"(x[(k + 1) * s + l + 3]), + [ x14 ] "f"(x[(k + 2) * s + l + 3]), + [ x15 ] "f"(x[(k + 3) * s + l + 3]) + :); + } + + Aout[i * q * s + j * s + k + 0] = acc[0]; + Aout[i * q * s + j * s + k + 1] = acc[1]; + Aout[i * q * s + j * s + k + 2] = acc[2]; + Aout[i * q * s + j * s + k + 3] = acc[3]; + } + } + } + + snrt_fpu_fence(); +} + +void doitgen_opt(uint32_t r, uint32_t q, uint32_t s, double *A, double *x, + double *Aout) { + uint32_t bound = r / snrt_cluster_compute_core_num(); + uint32_t offset = bound * snrt_cluster_core_idx(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and x + // for (i = offset; i < bound; i++) + // for (j = 0; j < q; j++) + // for (k1 = 0; k1 < s; k1 += unroll) + // for (l = 0; l < s; l++) + // for (k0 = 0; k0 < unroll; k0++) + // k = k1 + k0 + // ft0.push(A[i * q * s + j * s + l]) + // ft1.push(x[k * s + l]) + const uint32_t ssr0_b[4] = {unroll, s, s / unroll, q * bound}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, s * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, s, s / unroll, q * bound}; + const uint32_t ssr1_i[4] = {s * sizeof(double), sizeof(double), + unroll * s * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A + offset * q * s); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, x); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < (offset + bound); i++) { + for (uint32_t j = 0; j < q; j++) { + for (uint32_t k = 0; k < s; k += unroll) { + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ n_frep ] "r"(s - 1), [ unroll ] "i"(unroll) + : "ft0", "ft1", "ft2"); + + Aout[i * q * s + j * s + k + 0] = acc[0]; + Aout[i * q * s + j * s + k + 1] = acc[1]; + Aout[i * q * s + j * s + k + 2] = acc[2]; + Aout[i * q * s + j * s + k + 3] = acc[3]; + } + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void doitgen_job(doitgen_args_t *args) { + uint32_t r_frac, q_frac, a_tile_size, a_tile_bytes, x_size, x_bytes; + uint64_t local_a0_addr, local_aout0_addr, local_x0_addr, local_a1_addr, + local_aout1_addr; + double *local_a[2]; + double *local_aout[2]; + double *local_x; + uint32_t iterations, sb_iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_r, i_q, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + doitgen_args_t *local_args = (doitgen_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(doitgen_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + r_frac = args->r / args->r_tiles; + q_frac = args->q / args->q_tiles; + a_tile_size = r_frac * q_frac * args->s; + x_size = args->s * args->s; + a_tile_bytes = a_tile_size * sizeof(double); + x_bytes = x_size * sizeof(double); + + // Allocate space for job operands in TCDM + local_x0_addr = (uint64_t)args + sizeof(doitgen_args_t); + local_a0_addr = local_x0_addr + x_bytes; + local_aout0_addr = local_a0_addr + a_tile_bytes; + local_x = (double *)local_x0_addr; + local_a[0] = (double *)local_a0_addr; + local_aout[0] = (double *)local_aout0_addr; + if (DOUBLE_BUFFER) { + local_a1_addr = local_aout0_addr + a_tile_bytes; + local_aout1_addr = local_a1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_aout[1] = (double *)local_aout1_addr; + } + + // Calculate number of iterations + sb_iterations = args->r_tiles * args->q_tiles; + if (DOUBLE_BUFFER) + iterations = sb_iterations + 2; + else + iterations = sb_iterations; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + if (snrt_is_dm_core()) { + // DMA in + if (!DOUBLE_BUFFER || (i < sb_iterations)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0; + i_r = i_dma_in / args->q_tiles; + i_q = i_dma_in % args->q_tiles; + + // Copy job operands in TCDM + snrt_dma_load_2d_tile(local_a[buff_idx], args->A, i_r, i_q, + r_frac, q_frac * args->s, + args->q * args->s, sizeof(double)); + if (i_dma_in == 0) snrt_dma_start_1d(local_x, args->x, x_bytes); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // Additional barriers required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + // DMA out + if (!DOUBLE_BUFFER || (i > 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = DOUBLE_BUFFER ? i - 2 : i; + buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0; + i_r = i_dma_out / args->q_tiles; + i_q = i_dma_out % args->q_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile(args->A, local_aout[buff_idx], i_r, i_q, + r_frac, q_frac * args->s, + args->q * args->s, sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + + if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = DOUBLE_BUFFER ? i - 1 : i; + buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0; + + // Perform tile computation + doitgen_fp_t fp = args->funcptr; + fp(r_frac, q_frac, args->s, local_a[buff_idx], local_x, + local_aout[buff_idx]); + + snrt_mcycle(); + } + + // Additional barrier required when not double buffering + if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier(); + } + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/doitgen/src/main.c b/sw/apps/doitgen/src/main.c new file mode 100644 index 0000000000..64c9571f8c --- /dev/null +++ b/sw/apps/doitgen/src/main.c @@ -0,0 +1,17 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "doitgen.h" + +#include "data.h" + +int main() { + doitgen_job(&args); + + return 0; +} diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index 674ea2cadb..e4456fdfc3 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -67,6 +67,7 @@ APPS += sw/apps/montecarlo/pi_estimation APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance +APPS += sw/apps/doitgen # Include Makefile from each app subdirectory $(foreach app,$(APPS), \ diff --git a/target/snitch_cluster/sw/apps/doitgen/app.mk b/target/snitch_cluster/sw/apps/doitgen/app.mk new file mode 100644 index 0000000000..ebef550d34 --- /dev/null +++ b/target/snitch_cluster/sw/apps/doitgen/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := doitgen +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build +SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index ab302f7c38..d9e2f8c2fe 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -99,3 +99,5 @@ runs: # cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/covariance/build/covariance.elf cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/doitgen/build/doitgen.elf + cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"]