diff --git a/sw/apps/atax/.gitignore b/sw/apps/atax/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/atax/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/atax/scripts/datagen.py b/sw/apps/atax/scripts/datagen.py
index 51317c70e..0008bea26 100755
--- a/sw/apps/atax/scripts/datagen.py
+++ b/sw/apps/atax/scripts/datagen.py
@@ -8,8 +8,7 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -17,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class AtaxDataGen(DataGen):
+class AtaxDataGen(du.DataGen):
 
     def golden_model(self, A, x):
         return np.matmul(A.transpose(), np.matmul(A, x))
@@ -26,8 +25,8 @@
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        A = np.random.randint(-200, 100, size=(M, N))/100
-        x = np.random.randint(-200, 100, size=(N, 1))/100
+        A = du.generate_random_array((M, N))
+        x = du.generate_random_array((N, 1))
         y = self.golden_model(A, x)
 
         assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
@@ -37,13 +36,13 @@ def emit_header(self, **kwargs):
         x = x.flatten()
         y = y.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
-        header += [format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'A', A, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'y', y.shape, alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', y, alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/common.mk b/sw/apps/common.mk
index 89f5da9f6..6bdc85984 100644
--- a/sw/apps/common.mk
+++ b/sw/apps/common.mk
@@ -13,7 +13,7 @@
 DATA_H := $($(APP)_BUILD_DIR)/data.h
 DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py
 
 $(APP)_HEADERS := $(DATA_H)
-$(APP)_INCDIRS := $(dir $(DATA_H)) $(SRC_DIR)
+$(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR)
 
 $(dir $(DATA_H)):
	mkdir -p $@
diff --git a/sw/apps/correlation/.gitignore b/sw/apps/correlation/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/correlation/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/correlation/scripts/datagen.py b/sw/apps/correlation/scripts/datagen.py
index b2047d5eb..d60f527d1 100755
--- a/sw/apps/correlation/scripts/datagen.py
+++ b/sw/apps/correlation/scripts/datagen.py
@@ -8,8 +8,7 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
 # AXI splits bursts crossing 4KB address boundaries. To minimize
@@ -17,7 +16,7 @@
 BURST_ALIGNMENT = 4096
 
 
-class CorrelationDataGen(DataGen):
+class CorrelationDataGen(du.DataGen):
 
     def golden_model(self, data):
         return np.corrcoef(data, rowvar=False)
@@ -26,19 +25,20 @@
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
         M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))/100
+        data = du.generate_random_array((N, M))
         corr = self.golden_model(data)
 
         data = data.flatten()
         corr = corr.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'corr', corr.shape,
-                                            alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', corr, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        header += [du.format_scalar_definition('uint32_t', 'M', M)]
+        header += [du.format_scalar_definition('uint32_t', 'N', N)]
+        header += [du.format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
+        header += [du.format_array_declaration('double', 'corr', corr.shape,
+                                               alignment=BURST_ALIGNMENT)]
+        result_def = du.format_array_definition('double', 'golden', corr,
+                                                alignment=BURST_ALIGNMENT)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/.gitignore b/sw/apps/covariance/.gitignore
deleted file mode 100644
index 8485f615e..000000000
--- a/sw/apps/covariance/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data/data.h
\ No newline at end of file
diff --git a/sw/apps/covariance/data/params.json b/sw/apps/covariance/data/params.json
index 9e89d9f85..5ae088d97 100644
--- a/sw/apps/covariance/data/params.json
+++ b/sw/apps/covariance/data/params.json
@@ -3,6 +3,8 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    M: 16,
-    N: 8
+    "m": 32,
+    "n": 2,
+    "m_tiles": 2,
+    "funcptr": "covariance_opt"
 }
diff --git a/sw/apps/covariance/scripts/datagen.py b/sw/apps/covariance/scripts/datagen.py
index 44e20d55e..7beb2c671 100755
--- a/sw/apps/covariance/scripts/datagen.py
+++ b/sw/apps/covariance/scripts/datagen.py
@@ -8,38 +8,68 @@
 import numpy as np
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
+np.random.seed(42)
 
-# AXI splits bursts crossing 4KB address boundaries. To minimize
-# the occurrence of these splits the data should be aligned to 4KB
-BURST_ALIGNMENT = 4096
+DOUBLE_BUFFER = True
 
 
-class CovarianceDataGen(DataGen):
+class CovarianceDataGen(du.DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["covariance_naive", "covariance_baseline", "covariance_opt"]
 
     def golden_model(self, data):
         return np.cov(data, rowvar=False)
 
+    def validate(self, **kwargs):
+        n_cores = 8
+        assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles"
+        m_per_tile = kwargs['m'] / kwargs['m_tiles']
+        assert (m_per_tile % n_cores) == 0, \
+            "m_per_tile must be an integer multiple of the number of cores"
+        assert (m_per_tile % 4) == 0, "m_per_tile must be an integer multiple of unroll1 = 4"
+        m_per_core = m_per_tile / n_cores
+        assert (m_per_core % 2) == 0, "m_per_core must be an integer multiple of unroll0 = 2"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        a_tile_size = m_per_tile * kwargs['n'] * 8
+        b_tile_size = m_per_tile * m_per_tile * 8
+        total_size = 2 * a_tile_size + b_tile_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        du.validate_tcdm_footprint(total_size)
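+        # For example (illustrative numbers, not part of the original
+        # script): with the default params.json (m=32, n=2, m_tiles=2),
+        # each A tile is 16*2*8 = 256 B and the C tile 16*16*8 = 2048 B,
+        # so the double-buffered footprint is 2*(2*256 + 2048) = 5120 B.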
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        M, N = kwargs['M'], kwargs['N']
-        data = np.random.randint(-200, 100, size=(N, M))
-        cov = self.golden_model(data)
+        self.validate(**kwargs)
 
-        assert (M % 8) == 0, "M must be an integer multiple of the number of cores"
+        data = du.generate_random_array((kwargs['n'], kwargs['m']))
+        cov = self.golden_model(data)
 
-        data = data.flatten()
+        data = data.transpose().flatten()
         cov = cov.flatten()
 
-        header += [format_scalar_definition('uint32_t', 'M', M)]
-        header += [format_scalar_definition('uint32_t', 'N', N)]
-        header += [format_array_definition('double', 'data', data, alignment=BURST_ALIGNMENT)]
-        header += [format_array_declaration('double', 'cov', cov.shape, alignment=BURST_ALIGNMENT)]
-        result_def = format_array_definition('double', 'golden', cov, alignment=BURST_ALIGNMENT)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        data_uid = 'data'
+        cov_uid = 'cov'
+
+        cfg = {
+            'm': kwargs['m'],
+            'n': kwargs['n'],
+            'inv_n': 1 / kwargs['n'],
+            'inv_n_m1': 1 / (kwargs['n'] - 1),
+            'data': data_uid,
+            'cov': cov_uid,
+            'm_tiles': kwargs['m_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_array_definition('double', data_uid, data)]
+        header += [du.format_array_declaration('double', cov_uid, cov.shape)]
+        header += [du.format_struct_definition('covariance_args_t', 'args', cfg)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/apps/covariance/scripts/verify.py b/sw/apps/covariance/scripts/verify.py
index 4c5b0cdd1..a390d83d1 100755
--- a/sw/apps/covariance/scripts/verify.py
+++ b/sw/apps/covariance/scripts/verify.py
@@ -16,14 +16,26 @@
 class CovarianceVerifier(Verifier):
 
     OUTPUT_UIDS = ['cov']
 
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'm': 'I',
+            'n': 'I',
+            'inv_n': 'd',
+            'inv_n_m1': 'd',
+            'data': 'I',
+            'cov': 'I',
+            'm_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
     def get_actual_results(self):
-        return self.get_output_from_symbol('cov', 'double')
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
 
     def get_expected_results(self):
-        M = self.get_input_from_symbol('M', 'uint32_t')[0]
-        N = self.get_input_from_symbol('N', 'uint32_t')[0]
         data = self.get_input_from_symbol('data', 'double')
-        data = np.reshape(data, (N, M))
+        data = np.reshape(data, (self.func_args['m'], self.func_args['n'])).transpose()
         return CovarianceDataGen().golden_model(data).flatten()
 
     def check_results(self, *args):
diff --git a/sw/apps/covariance/src/args.h b/sw/apps/covariance/src/args.h
new file mode 100644
index 000000000..cd15bc852
--- /dev/null
+++ b/sw/apps/covariance/src/args.h
@@ -0,0 +1,23 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*covariance_fp_t)(uint32_t m, uint32_t n, double inv_n,
+                                double inv_n_m1, double *data, double *datat,
+                                double *cov);
+
+typedef struct {
+    uint32_t m;
+    uint32_t n;
+    double inv_n;
+    double inv_n_m1;
+    double *data;
+    double *cov;
+    uint32_t m_tiles;
+    covariance_fp_t funcptr;
+} covariance_args_t;
diff --git a/sw/apps/covariance/src/covariance.h b/sw/apps/covariance/src/covariance.h
index fec79d195..cdeb427bf 100644
--- a/sw/apps/covariance/src/covariance.h
+++ b/sw/apps/covariance/src/covariance.h
@@ -5,50 +5,363 @@
 // Author: Jose Pedro Castro Fonseca
 //         Luca Colagrande
 
-#include <math.h>
+#include "args.h"
+#include "blas.h"
 #include "snrt.h"
 
-void kernel_covariance(uint32_t N, uint32_t M, double *data, double *cov) {
-    int i1, i, j, k;
-    int core_range, core_offset;
-
-    // Compute deviations
-    if (snrt_is_compute_core()) {
-        // Distribute different attributes to the different cores
-        core_range = M / snrt_cluster_compute_core_num();
-        core_offset = snrt_cluster_core_idx() * core_range;
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-
-            // Calculate mean vector
-            double mean = 0.0;
-            for (k = 0; k < N; k++) {
-                mean += data[k * M + i];
-            }
-            mean = mean / N;
+#define DOUBLE_BUFFER 1
 
-            // Standardize data to zero mean
-            for (k = 0; k < N; k++) {
-                data[k * M + i] -= mean;
-            }
+void covariance_naive(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                      double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride) {
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
         }
     }
+
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    syrk_naive(m, n, inv_n_m1, data, datat, 0, cov);
+}
+
+void covariance_baseline(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                         double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride) {
+        // Calculate row mean
+        double data_mean = 0.0;
+        double datat_mean = 0.0;
+        for (uint32_t j = 0; j < n; j++) {
+            data_mean += data[i * n + j];
+            datat_mean += datat[i * n + j];
+        }
+        data_mean = data_mean * inv_n;
+        datat_mean = datat_mean * inv_n;
+
+        // Center row around zero
+        for (uint32_t j = 0; j < n; j++) {
+            data[i * n + j] -= data_mean;
+            datat[i * n + j] -= datat_mean;
+        }
+    }
+
+    snrt_fpu_fence();
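+    // Note: the FPU fence makes sure the centered values have actually
+    // been written back to TCDM before the barrier releases the other
+    // cores to read them in the following SYRK step.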
+    snrt_cluster_hw_barrier();
+
+    // Compute covariance matrix
+    syrk_baseline(m, n, inv_n_m1, data, datat, 0, cov);
+}
+
+void covariance_opt(uint32_t m, uint32_t n, double inv_n, double inv_n_m1,
+                    double *data, double *datat, double *cov) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll0 = 2;
+
+    // Configure ft0 and ft1 to load data and datat elements
+    // for (k = 0; k < 2; k++)
+    //     for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //         for (j = 0; j < n; j++)
+    //             for (i0 = 0; i0 < unroll0; i0++)
+    //                 i = i1 + i0 * stride
+    //                 ft0.push(data[i * n + j])
+    //                 ft1.push(datat[i * n + j])
+    const uint32_t ssr01_b[4] = {unroll0, n, 2, m / (stride * unroll0)};
+    const uint32_t ssr01_i[4] = {sizeof(double) * n * stride, sizeof(double), 0,
+                                 sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM0, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr01_b[0], ssr01_b[1], ssr01_b[2],
+                     ssr01_b[3], ssr01_i[0], ssr01_i[1], ssr01_i[2],
+                     ssr01_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, 1);
+    // Configure ft2 to store data and datat elements
+    // for (i1 = offset; i1 < m; i1 += stride * unroll0)
+    //     for (j = 0; j < n; j++)
+    //         for (i0 = 0; i0 < unroll0; i0++)
+    //             i = i1 + i0 * stride
+    //             data[i * n + j] = ft2.pop()
+    //             datat[i * n + j] = ft2.pop()
+    const uint32_t ssr2_b[4] = {2, unroll0, n, m / (stride * unroll0)};
+    const uint32_t ssr2_i[4] = {(uint32_t)datat - (uint32_t)data,
+                                sizeof(double) * n * stride, sizeof(double),
+                                sizeof(double) * n * stride * unroll0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM2, ssr2_b[0], ssr2_b[1], ssr2_b[2], ssr2_b[3],
+                     ssr2_i[0], ssr2_i[1], ssr2_i[2], ssr2_i[3]);
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat + offset * n);
+    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_4D, data + offset * n);
+    snrt_ssr_enable();
+
+    // Center data
+    for (uint32_t i = offset; i < m; i += stride * unroll0) {
+        // Calculate row means
+        double m[2 * unroll0];
+        m[0] = 0.0;  // mean(data[i])
+        m[1] = 0.0;  // mean(datat[i])
+        m[2] = 0.0;  // mean(data[i + stride])
+        m[3] = 0.0;  // mean(datat[i + stride])
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fadd.d %[m0], ft0, %[m0] \n"
+            "fadd.d %[m1], ft1, %[m1] \n"
+            "fadd.d %[m2], ft0, %[m2] \n"
+            "fadd.d %[m3], ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
+        m[0] *= inv_n;
+        m[1] *= inv_n;
+        m[2] *= inv_n;
+        m[3] *= inv_n;
+        snrt_fpu_fence();
+
+        // Center row around zero
+        asm volatile(
+            "frep.o %[n_frep], %[n_insn], 0, 0 \n"
+            "fsub.d ft2, ft0, %[m0] \n"
+            "fsub.d ft2, ft1, %[m1] \n"
+            "fsub.d ft2, ft0, %[m2] \n"
+            "fsub.d ft2, ft1, %[m3] \n"
+            : [ m0 ] "+f"(m[0]), [ m1 ] "+f"(m[1]), [ m2 ] "+f"(m[2]),
+              [ m3 ] "+f"(m[3])
+            : [ n_frep ] "r"(n - 1), [ n_insn ] "i"(2 * unroll0)
+            : "ft0", "ft1", "ft2");
+    }
+
+    snrt_ssr_disable();
+
+    snrt_fpu_fence();
+    snrt_cluster_hw_barrier();
+
+    // The following is taken from the AtA kernel, where alpha is set to
+    // the factor 1/(n - 1).
+    // Here data stands for A and datat for At.
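+    // In effect, each core computes cov[i][j] = inv_n_m1 * dot(data[i],
+    // datat[j]) for its subset of rows i: ft0 replays each element of
+    // data[i] four times while ft1 streams datat[j..j+3], so every FREP
+    // iteration advances four accumulators at once (a sketch of the
+    // intent; the exact traversal order is set by the SSR config below).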
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll1 = 4;
+
+    // Configure ft0 and ft1 to load A and At
+    // for (i = offset; i < m; i += stride)
+    //     for (j1 = 0; j1 < m; j1 += unroll1)
+    //         for (k = 0; k < n; k++)
+    //             for (j0 = 0; j0 < unroll1; j0++)
+    //                 j = j1 + j0
+    //                 ft0.push(a[i * n + k])
+    //                 ft1.push(at[j * n + k])
+    const uint32_t ssr0_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr0_i[4] = {0, sizeof(double), 0,
+                                stride * n * sizeof(double)};
+    snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], ssr0_i[1],
+                     ssr0_i[2], ssr0_i[3]);
+    snrt_ssr_repeat(SNRT_SSR_DM0, unroll1);
+    const uint32_t ssr1_b[4] = {unroll1, n, m / unroll1, m / stride};
+    const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double),
+                                unroll1 * n * sizeof(double), 0};
+    snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], ssr1_b[3],
+                     ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, data + offset * n);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, datat);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < m; i += stride) {
+        for (uint32_t j = 0; j < m; j += unroll1) {
+            double acc[unroll1];
+            acc[0] = 0;
+            acc[1] = 0;
+            acc[2] = 0;
+            acc[3] = 0;
+
+            asm volatile(
+                "frep.o %[n_frep], %[unroll1], 0, 0 \n"
+                "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                "fmul.d %[b0], %[acc0], %[alpha] \n"
+                "fmul.d %[b1], %[acc1], %[alpha] \n"
+                "fmul.d %[b2], %[acc2], %[alpha] \n"
+                "fmul.d %[b3], %[acc3], %[alpha] \n"
+                : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                  [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]),
+                  [ b0 ] "=f"(cov[i * m + j + 0]),
+                  [ b1 ] "=f"(cov[i * m + j + 1]),
+                  [ b2 ] "=f"(cov[i * m + j + 2]),
+                  [ b3 ] "=f"(cov[i * m + j + 3])
+                : [ n_frep ] "r"(n - 1), [ unroll1 ] "i"(unroll1),
+                  [ alpha ] "f"(inv_n_m1)
+                : "ft0", "ft1", "ft2");
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void covariance_job(covariance_args_t *args) {
+    uint32_t m_frac, a_tile_size, a_tile_bytes, b_tile_size, b_tile_bytes;
+    uint64_t local_a0_addr, local_at0_addr, local_b0_addr, local_a1_addr,
+        local_at1_addr, local_b1_addr;
+    double *local_a[2];
+    double *local_at[2];
+    double *local_b[2];
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    covariance_args_t *local_args = (covariance_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(covariance_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    m_frac = args->m / args->m_tiles;
+    a_tile_size = args->n * m_frac;
+    b_tile_size = m_frac * m_frac;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    b_tile_bytes = b_tile_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    local_a0_addr = (uint64_t)args + sizeof(covariance_args_t);
+    local_at0_addr = local_a0_addr + a_tile_bytes;
+    local_b0_addr = local_at0_addr + a_tile_bytes;
+    local_a[0] = (double *)local_a0_addr;
+    local_at[0] = (double *)local_at0_addr;
+    local_b[0] = (double *)local_b0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_b0_addr + b_tile_bytes;
+        local_at1_addr = local_a1_addr + a_tile_bytes;
+        local_b1_addr = local_at1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_at[1] = (double *)local_at1_addr;
+        local_b[1] = (double *)local_b1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->m_tiles * args->m_tiles;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
+
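+    // With double buffering the loop below is a three-stage software
+    // pipeline: in iteration i the DM core fetches tile i and writes back
+    // tile i - 2 while the compute cores work on tile i - 1, hence the
+    // two extra fill/drain iterations.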
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_row = i_dma_in / args->m_tiles;
+                i_col = i_dma_in % args->m_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_1d_tile(local_a[buff_idx], args->data, i_row,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_load_1d_tile(local_at[buff_idx], args->data, i_col,
+                                      a_tile_size, sizeof(double));
+                snrt_dma_wait_all();
 
-    // Compute covariance
-    if (snrt_is_compute_core()) {
-        for (i1 = 0; i1 < core_range; i1++) {
-            i = core_offset + i1;
-            for (j = 0; j <= i; j++) {
-                double tmp = 0.0;
-                for (k = 0; k < N; k++) {
-                    tmp += data[k * M + i] * data[k * M + j];
-                }
-                cov[i * M + j] = tmp / (N - 1);
-                cov[j * M + i] = cov[i * M + j];
+                snrt_mcycle();
             }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // Additional barrier required to synchronize the compute cores
+            // among them after the data centering phase
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1)))
+                snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_row = i_dma_out / args->m_tiles;
+                i_col = i_dma_out % args->m_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(args->cov, local_b[buff_idx], i_row,
+                                       i_col, m_frac, m_frac, args->m,
+                                       sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                covariance_fp_t fp = args->funcptr;
+                fp(m_frac, args->n, args->inv_n, args->inv_n_m1,
+                   local_a[buff_idx], local_at[buff_idx], local_b[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
         }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
     }
 }
diff --git a/sw/apps/covariance/src/main.c b/sw/apps/covariance/src/main.c
index 26b151393..112ead333 100644
--- a/sw/apps/covariance/src/main.c
+++ b/sw/apps/covariance/src/main.c
@@ -1,56 +1,16 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2024 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Author: Jose Pedro Castro Fonseca
-//         Luca Colagrande
+// Author: Luca Colagrande
+
+#include "snrt.h"
 
 #include "covariance.h"
 #include "data.h"
 
-#define MAX_ERROR 1e-10
-
 int main() {
-    uint32_t nerr = 0;
-    double *local_mean;
-    double *local_cov;
-    double *local_data;
-    double diff;
-
-    local_data = snrt_l1_next();
-    local_cov = local_data + N * M;
-
-    // Initialize input matrix
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(local_data, data, sizeof(double) * N * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-    // Perform Computations
-    kernel_covariance(N, M, local_data, local_cov);
-    snrt_cluster_hw_barrier();
-
-    // Writeback outputs
-    if (snrt_is_dm_core()) {
-        snrt_dma_start_1d(cov, local_cov, sizeof(double) * M * M);
-        snrt_dma_wait_all();
-    }
-    snrt_cluster_hw_barrier();
-
-#ifdef BIST
-    // Check computation is correct
-    if (snrt_cluster_core_idx() == 0) {
-        for (int i = 0; i < M; i++) {
-            for (int j = 0; j < M; j++) {
-                diff = fabs(golden[i * M + j] - local_cov[i * M + j]);
-                if (diff > MAX_ERROR) {
-                    nerr++;
-                }
-            }
-        }
-    }
-#endif
+    covariance_job(&args);
 
-    return nerr;
+    return 0;
 }
diff --git a/sw/apps/doitgen/data/params.json b/sw/apps/doitgen/data/params.json
new file mode 100644
index 000000000..4417f0c35
--- /dev/null
+++ b/sw/apps/doitgen/data/params.json
@@ -0,0 +1,12 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    "r": 16,
+    "q": 16,
+    "s": 32,
+    "r_tiles": 2,
+    "q_tiles": 2,
+    "funcptr": "doitgen_baseline"
+}
diff --git a/sw/apps/doitgen/scripts/datagen.py b/sw/apps/doitgen/scripts/datagen.py
new file mode 100755
index 000000000..d1a9c3b46
--- /dev/null
+++ b/sw/apps/doitgen/scripts/datagen.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Author: Luca Colagrande
+
+import numpy as np
+
+import snitch.util.sim.data_utils as du
+
+np.random.seed(42)
+
+DOUBLE_BUFFER = True
+
+
+class DoitgenDataGen(du.DataGen):
+
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["doitgen_naive", "doitgen_baseline", "doitgen_opt"]
+
+    def golden_model(self, A, x):
+        R, Q, S = A.shape
+        P, _ = x.shape
+        Aout = np.ndarray((R, Q, P))
+        for r in range(R):
+            for q in range(Q):
+                for p in range(P):
+                    Aout[r, q, p] = 0
+                    for s in range(S):
+                        Aout[r, q, p] += A[r, q, s] * x[p, s]
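+        # Equivalent one-liner (sketch): np.einsum('rqs,ps->rqp', A, x),
+        # i.e. a batch of R*Q dot products between A[r, q, :] and rows of x.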
+        return Aout
+
+    def validate(self, **kwargs):
+        n_cores = 8
+        assert (kwargs['r'] % kwargs['r_tiles']) == 0, "r must be an integer multiple of r_tiles"
+        assert (kwargs['q'] % kwargs['q_tiles']) == 0, "q must be an integer multiple of q_tiles"
+        if kwargs['funcptr'] != 'doitgen_naive':
+            assert (kwargs['s'] % 4) == 0, "s must be an integer multiple of unrolling factor"
+        r_per_tile = kwargs['r'] / kwargs['r_tiles']
+        q_per_tile = kwargs['q'] / kwargs['q_tiles']
+        assert (r_per_tile % n_cores) == 0, "r_per_tile must be an integer multiple of n_cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        a_tile_size = r_per_tile * q_per_tile * kwargs['s'] * 8
+        x_size = kwargs['s'] * kwargs['s'] * 8
+        total_size = 2 * a_tile_size + x_size
+        if DOUBLE_BUFFER:
+            total_size *= 2
+        du.validate_tcdm_footprint(total_size)
+
+    def emit_header(self, **kwargs):
+        header = [super().emit_header()]
+
+        self.validate(**kwargs)
+
+        A = du.generate_random_array((kwargs['r'], kwargs['q'], kwargs['s']))
+        x = du.generate_random_array((kwargs['s'], kwargs['s']))
+
+        _ = self.golden_model(A, x)
+
+        A = A.flatten()
+        x = x.flatten()
+
+        A_uid = 'A'
+        x_uid = 'x'
+
+        cfg = {
+            'r': kwargs['r'],
+            'q': kwargs['q'],
+            's': kwargs['s'],
+            'A': A_uid,
+            'x': x_uid,
+            'r_tiles': kwargs['r_tiles'],
+            'q_tiles': kwargs['q_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_array_definition('double', A_uid, A)]
+        header += [du.format_array_definition('double', x_uid, x)]
+        header += [du.format_struct_definition('doitgen_args_t', 'args', cfg)]
+        header = '\n\n'.join(header)
+
+        return header
+
+
+if __name__ == '__main__':
+    DoitgenDataGen().main()
diff --git a/sw/apps/doitgen/scripts/verify.py b/sw/apps/doitgen/scripts/verify.py
new file mode 100755
index 000000000..8f72b0415
--- /dev/null
+++ b/sw/apps/doitgen/scripts/verify.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import numpy as np
+import sys
+from datagen import DoitgenDataGen
+
+from snitch.util.sim.verif_utils import Verifier
+
+
+class DoitgenVerifier(Verifier):
+
+    OUTPUT_UIDS = ['A']
+
+    def __init__(self):
+        super().__init__()
+        self.func_args = {
+            'r': 'I',
+            'q': 'I',
+            's': 'I',
+            'A': 'I',
+            'x': 'I',
+            'r_tiles': 'I',
+            'q_tiles': 'I',
+            'funcptr': 'I'
+        }
+        self.func_args = self.get_input_from_symbol('args', self.func_args)
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        A = self.get_input_from_symbol('A', 'double')
+        A = np.reshape(A, (self.func_args['r'], self.func_args['q'], self.func_args['s']))
+        x = self.get_input_from_symbol('x', 'double')
+        x = np.reshape(x, (self.func_args['s'], self.func_args['s']))
+        return DoitgenDataGen().golden_model(A, x).flatten()
+
+    def check_results(self, *args):
+        return super().check_results(*args, rtol=1e-10)
+
+
+if __name__ == "__main__":
+    sys.exit(DoitgenVerifier().main())
diff --git a/sw/apps/doitgen/src/args.h b/sw/apps/doitgen/src/args.h
new file mode 100644
index 000000000..5d3f56ce4
--- /dev/null
+++ b/sw/apps/doitgen/src/args.h
@@ -0,0 +1,22 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*doitgen_fp_t)(uint32_t r, uint32_t q, uint32_t s, double *A,
+                             double *x, double *Aout);
+
+typedef struct {
+    uint32_t r;
+    uint32_t q;
+    uint32_t s;
+    double *A;
+    double *x;
+    uint32_t r_tiles;
+    uint32_t q_tiles;
+    doitgen_fp_t funcptr;
+} doitgen_args_t;
diff --git a/sw/apps/doitgen/src/doitgen.h b/sw/apps/doitgen/src/doitgen.h
new file mode 100644
index 000000000..2f7bc6128
--- /dev/null
+++ b/sw/apps/doitgen/src/doitgen.h
@@ -0,0 +1,303 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include "args.h"
+#include "snrt.h"
+
+#define DOUBLE_BUFFER 1
+
+__thread int setup_ssr = 1;
+
+void doitgen_naive(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                   double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k++) {
+                Aout[i * q * s + j * s + k] = 0.0;
+                for (uint32_t l = 0; l < s; l++) {
+                    Aout[i * q * s + j * s + k] +=
+                        A[i * q * s + j * s + l] * x[k * s + l];
+                }
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_baseline(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                      double *Aout) {
+    uint32_t offset = snrt_cluster_core_idx();
+    uint32_t stride = snrt_cluster_compute_core_num();
+
+    // Unrolling factors
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll1 = 4;
+    const uint32_t unroll0 = 4;
+
+    for (uint32_t i = offset; i < r; i += stride) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll1) {
+                double acc[4];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                for (uint32_t l = 0; l < s; l += unroll0) {
+                    asm volatile(
+                        "fmadd.d %[acc0], %[a0], %[x0], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a0], %[x1], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a0], %[x2], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a0], %[x3], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a1], %[x4], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a1], %[x5], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a1], %[x6], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a1], %[x7], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a2], %[x8], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a2], %[x9], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a2], %[x10], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a2], %[x11], %[acc3] \n"
+                        "fmadd.d %[acc0], %[a3], %[x12], %[acc0] \n"
+                        "fmadd.d %[acc1], %[a3], %[x13], %[acc1] \n"
+                        "fmadd.d %[acc2], %[a3], %[x14], %[acc2] \n"
+                        "fmadd.d %[acc3], %[a3], %[x15], %[acc3] \n"
+                        : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                          [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                        : [ a0 ] "f"(A[i * q * s + j * s + l + 0]),
+                          [ a1 ] "f"(A[i * q * s + j * s + l + 1]),
+                          [ a2 ] "f"(A[i * q * s + j * s + l + 2]),
+                          [ a3 ] "f"(A[i * q * s + j * s + l + 3]),
+                          [ x0 ] "f"(x[(k + 0) * s + l + 0]),
+                          [ x1 ] "f"(x[(k + 1) * s + l + 0]),
+                          [ x2 ] "f"(x[(k + 2) * s + l + 0]),
+                          [ x3 ] "f"(x[(k + 3) * s + l + 0]),
+                          [ x4 ] "f"(x[(k + 0) * s + l + 1]),
+                          [ x5 ] "f"(x[(k + 1) * s + l + 1]),
+                          [ x6 ] "f"(x[(k + 2) * s + l + 1]),
+                          [ x7 ] "f"(x[(k + 3) * s + l + 1]),
+                          [ x8 ] "f"(x[(k + 0) * s + l + 2]),
+                          [ x9 ] "f"(x[(k + 1) * s + l + 2]),
+                          [ x10 ] "f"(x[(k + 2) * s + l + 2]),
+                          [ x11 ] "f"(x[(k + 3) * s + l + 2]),
+                          [ x12 ] "f"(x[(k + 0) * s + l + 3]),
+                          [ x13 ] "f"(x[(k + 1) * s + l + 3]),
+                          [ x14 ] "f"(x[(k + 2) * s + l + 3]),
+                          [ x15 ] "f"(x[(k + 3) * s + l + 3])
+                        :);
+                }
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_fpu_fence();
+}
+
+void doitgen_opt(uint32_t r, uint32_t q, uint32_t s, double *A, double *x,
+                 double *Aout) {
+    uint32_t bound = r / snrt_cluster_compute_core_num();
+    uint32_t offset = bound * snrt_cluster_core_idx();
+
+    // Unrolling factor of innermost loop
+    // Note: changes must be reflected in the inline assembly code
+    // and datagen script
+    const uint32_t unroll = 4;
+
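+    // Note: the loop bounds and strides below depend only on the tile
+    // shape, which is identical for every tile, so each core configures
+    // the SSR loop nest once and caches that in the thread-local
+    // setup_ssr flag; only the start addresses change per invocation.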
+    if (setup_ssr) {
+        // Configure ft0 and ft1 to load A and x
+        // for (i = offset; i < bound; i++)
+        //     for (j = 0; j < q; j++)
+        //         for (k1 = 0; k1 < s; k1 += unroll)
+        //             for (l = 0; l < s; l++)
+        //                 for (k0 = 0; k0 < unroll; k0++)
+        //                     k = k1 + k0
+        //                     ft0.push(A[i * q * s + j * s + l])
+        //                     ft1.push(x[k * s + l])
+        const uint32_t ssr0_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr0_i[4] = {0, sizeof(double), 0, s * sizeof(double)};
+        snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3],
+                         ssr0_i[1], ssr0_i[2], ssr0_i[3]);
+        snrt_ssr_repeat(SNRT_SSR_DM0, unroll);
+        const uint32_t ssr1_b[4] = {unroll, s, s / unroll, q * bound};
+        const uint32_t ssr1_i[4] = {s * sizeof(double), sizeof(double),
+                                    unroll * s * sizeof(double), 0};
+        snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2],
+                         ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]);
+        setup_ssr = 0;
+    }
+
+    // SSR start addresses need to be configured each time
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A + offset * q * s);
+    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, x);
+    snrt_ssr_enable();
+
+    for (uint32_t i = offset; i < (offset + bound); i++) {
+        for (uint32_t j = 0; j < q; j++) {
+            for (uint32_t k = 0; k < s; k += unroll) {
+                double acc[unroll];
+                acc[0] = 0;
+                acc[1] = 0;
+                acc[2] = 0;
+                acc[3] = 0;
+
+                asm volatile(
+                    "frep.o %[n_frep], %[unroll], 0, 0 \n"
+                    "fmadd.d %[acc0], ft0, ft1, %[acc0] \n"
+                    "fmadd.d %[acc1], ft0, ft1, %[acc1] \n"
+                    "fmadd.d %[acc2], ft0, ft1, %[acc2] \n"
+                    "fmadd.d %[acc3], ft0, ft1, %[acc3] \n"
+                    : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]),
+                      [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3])
+                    : [ n_frep ] "r"(s - 1), [ unroll ] "i"(unroll)
+                    : "ft0", "ft1", "ft2");
+
+                Aout[i * q * s + j * s + k + 0] = acc[0];
+                Aout[i * q * s + j * s + k + 1] = acc[1];
+                Aout[i * q * s + j * s + k + 2] = acc[2];
+                Aout[i * q * s + j * s + k + 3] = acc[3];
+            }
+        }
+    }
+
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+}
+
+void doitgen_job(doitgen_args_t *args) {
+    uint32_t r_frac, q_frac, a_tile_size, a_tile_bytes, x_size, x_bytes;
+    uint64_t local_a0_addr, local_aout0_addr, local_x0_addr, local_a1_addr,
+        local_aout1_addr;
+    double *local_a[2];
+    double *local_aout[2];
+    double *local_x;
+    uint32_t iterations, sb_iterations;
+    uint32_t i, i_dma_in, i_compute, i_dma_out, i_r, i_q, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    doitgen_args_t *local_args = (doitgen_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(doitgen_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
+#endif
+
+    // Calculate size of each tile
+    r_frac = args->r / args->r_tiles;
+    q_frac = args->q / args->q_tiles;
+    a_tile_size = r_frac * q_frac * args->s;
+    x_size = args->s * args->s;
+    a_tile_bytes = a_tile_size * sizeof(double);
+    x_bytes = x_size * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    local_x0_addr = (uint64_t)args + sizeof(doitgen_args_t);
+    local_a0_addr = local_x0_addr + x_bytes;
+    local_aout0_addr = local_a0_addr + a_tile_bytes;
+    local_x = (double *)local_x0_addr;
+    local_a[0] = (double *)local_a0_addr;
+    local_aout[0] = (double *)local_aout0_addr;
+    if (DOUBLE_BUFFER) {
+        local_a1_addr = local_aout0_addr + a_tile_bytes;
+        local_aout1_addr = local_a1_addr + a_tile_bytes;
+        local_a[1] = (double *)local_a1_addr;
+        local_aout[1] = (double *)local_aout1_addr;
+    }
+
+    // Calculate number of iterations
+    sb_iterations = args->r_tiles * args->q_tiles;
+    if (DOUBLE_BUFFER)
+        iterations = sb_iterations + 2;
+    else
+        iterations = sb_iterations;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < sb_iterations)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+                i_r = i_dma_in / args->q_tiles;
+                i_q = i_dma_in % args->q_tiles;
+
+                // Copy job operands in TCDM
+                snrt_dma_load_2d_tile(local_a[buff_idx], args->A, i_r, i_q,
+                                      r_frac, q_frac * args->s,
+                                      args->q * args->s, sizeof(double));
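+                // x is shared by all tiles and resides in TCDM for the
+                // whole job, so it only needs to be copied in once, on
+                // the first DMA-in iteration.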
+                if (i_dma_in == 0) snrt_dma_start_1d(local_x, args->x, x_bytes);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+                i_r = i_dma_out / args->q_tiles;
+                i_q = i_dma_out % args->q_tiles;
+
+                // Copy job outputs from TCDM
+                snrt_dma_store_2d_tile(args->A, local_aout[buff_idx], i_r, i_q,
+                                       r_frac, q_frac * args->s,
+                                       args->q * args->s, sizeof(double));
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (sb_iterations + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                doitgen_fp_t fp = args->funcptr;
+                fp(r_frac, q_frac, args->s, local_a[buff_idx], local_x,
+                   local_aout[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
+}
diff --git a/sw/apps/doitgen/src/main.c b/sw/apps/doitgen/src/main.c
new file mode 100644
index 000000000..64c9571f8
--- /dev/null
+++ b/sw/apps/doitgen/src/main.c
@@ -0,0 +1,17 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+#include "snrt.h"
+
+#include "doitgen.h"
+
+#include "data.h"
+
+int main() {
+    doitgen_job(&args);
+
+    return 0;
+}
diff --git a/sw/blas/.gitignore b/sw/blas/.gitignore
deleted file mode 100644
index 2ff975f29..000000000
--- a/sw/blas/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-**/data/data.h
\ No newline at end of file
diff --git a/sw/blas/axpy/data/params.json b/sw/blas/axpy/data/params.json
index 2f8f5871c..a4fa15275 100644
--- a/sw/blas/axpy/data/params.json
+++ b/sw/blas/axpy/data/params.json
@@ -3,5 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 {
-    n: 384
+    "n_tiles": 3,
+    "n": 384,
+    "funcptr": "axpy_opt"
 }
diff --git a/sw/blas/axpy/scripts/datagen.py b/sw/blas/axpy/scripts/datagen.py
index 117495391..38634dd5e 100755
--- a/sw/blas/axpy/scripts/datagen.py
+++ b/sw/blas/axpy/scripts/datagen.py
@@ -5,45 +5,68 @@
 #
 # Author: Luca Colagrande
 
-import numpy as np
 import sys
 
-from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \
-    format_array_declaration, format_ifdef_wrapper, DataGen
+import snitch.util.sim.data_utils as du
 
 
-class AxpyDataGen(DataGen):
+class AxpyDataGen(du.DataGen):
 
-    MIN = -1000
-    MAX = +1000
     # AXI splits bursts crossing 4KB address boundaries. To minimize
    # the occurrence of these splits the data should be aligned to 4KB
     BURST_ALIGNMENT = 4096
+    # Function pointers to alternative implementations
+    FUNCPTRS = ["axpy_naive", "axpy_fma", "axpy_opt"]
 
     def golden_model(self, a, x, y):
         return a*x + y
 
+    def validate_config(self, **kwargs):
+        assert kwargs['n'] % kwargs['n_tiles'] == 0, "n must be an integer multiple of n_tiles"
+        n_per_tile = kwargs['n'] // kwargs['n_tiles']
+        assert (n_per_tile % 8) == 0, "n must be an integer multiple of the number of cores"
+        assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}"
+
+        # Calculate total TCDM occupation
+        # Note: doesn't account for gaps created by data alignment
+        vec_size = n_per_tile * 8
+        total_size = 2 * 3 * vec_size
+        du.validate_tcdm_footprint(total_size)
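+        # e.g. (illustrative): with the default n=384 and n_tiles=3,
+        # n_per_tile=128, so the double-buffered x/y/z working set is
+        # 2*3*128*8 = 6144 B.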
+
     def emit_header(self, **kwargs):
         header = [super().emit_header()]
 
-        n = kwargs['n']
-        a = np.random.uniform(self.MIN, self.MAX, 1)
-        x = np.random.uniform(self.MIN, self.MAX, n)
-        y = np.random.uniform(self.MIN, self.MAX, n)
+        self.validate_config(**kwargs)
+
+        a = du.generate_random_array(1)[0]
+        x = du.generate_random_array(kwargs['n'])
+        y = du.generate_random_array(kwargs['n'])
         g = self.golden_model(a, x, y)
 
-        assert (n % 8) == 0, "n must be an integer multiple of the number of cores"
-
-        header += [format_scalar_definition('const uint32_t', 'n', n)]
-        header += [format_scalar_definition('const double', 'a', a[0])]
-        header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT,
-                                           section=kwargs['section'])]
-        header += [format_array_declaration('double', 'z', [n], alignment=self.BURST_ALIGNMENT,
-                                            section=kwargs['section'])]
-        result_def = format_array_definition('double', 'g', g)
-        header += [format_ifdef_wrapper('BIST', result_def)]
+        x_uid = 'x'
+        y_uid = 'y'
+        z_uid = 'z'
+
+        cfg = {
+            'n': kwargs['n'],
+            'a': a,
+            'x': x_uid,
+            'y': y_uid,
+            'z': z_uid,
+            'n_tiles': kwargs['n_tiles'],
+            'funcptr': kwargs['funcptr']
+        }
+
+        header += [du.format_scalar_definition('const double', 'a', a)]
+        header += [du.format_array_definition('double', x_uid, x,
+                                              alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_array_definition('double', y_uid, y,
+                                              alignment=self.BURST_ALIGNMENT,
+                                              section=kwargs['section'])]
+        header += [du.format_array_declaration('double', z_uid, x.shape,
+                                               alignment=self.BURST_ALIGNMENT,
+                                               section=kwargs['section'])]
+        header += [du.format_struct_definition('axpy_args_t', 'args', cfg)]
+        result_def = du.format_array_definition('double', 'g', g)
+        header += [du.format_ifdef_wrapper('BIST', result_def)]
         header = '\n\n'.join(header)
 
         return header
diff --git a/sw/blas/axpy/src/args.h b/sw/blas/axpy/src/args.h
new file mode 100644
index 000000000..c5d542852
--- /dev/null
+++ b/sw/blas/axpy/src/args.h
@@ -0,0 +1,19 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <stdint.h>
+
+typedef void (*axpy_fp_t)(uint32_t n, double a, double* x, double* y,
+                          double* z);
+
+typedef struct {
+    uint32_t n;
+    double a;
+    double* x;
+    double* y;
+    double* z;
+    uint32_t n_tiles;
+    axpy_fp_t funcptr;
+} axpy_args_t;
diff --git a/sw/blas/axpy/src/axpy.h b/sw/blas/axpy/src/axpy.h
index e8f5ae6c0..8ded48167 100644
--- a/sw/blas/axpy/src/axpy.h
+++ b/sw/blas/axpy/src/axpy.h
@@ -2,28 +2,49 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+#include "args.h"
 #include "snrt.h"
 
-inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
+#define DOUBLE_BUFFER 1
+
+#define BANK_ALIGNMENT 8
+#define TCDM_ALIGNMENT (32 * BANK_ALIGNMENT)
+#define ALIGN_UP_TCDM(addr) ALIGN_UP(addr, TCDM_ALIGNMENT)
+
+static inline void axpy_naive(uint32_t n, double a, double *x, double *y,
+                              double *z) {
     int core_idx = snrt_cluster_core_idx();
     int frac = n / snrt_cluster_compute_core_num();
-    int offset = core_idx * frac;
+    int offset = core_idx;
+
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        z[i] = a * x[i] + y[i];
+    }
+    snrt_fpu_fence();
+}
 
-#ifndef XSSR
+static inline void axpy_fma(uint32_t n, double a, double *x, double *y,
+                            double *z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    for (int i = 0; i < frac; i++) {
-        z[offset] = a * x[offset] + y[offset];
-        offset++;
+    for (int i = offset; i < n; i += snrt_cluster_compute_core_num()) {
+        asm volatile("fmadd.d %[z], %[a], %[x], %[y] \n"
+                     : [ z ] "=f"(z[i])
+                     : [ a ] "f"(a), [ x ] "f"(x[i]), [ y ] "f"(y[i]));
     }
     snrt_fpu_fence();
+}
 
-#else
+static inline void axpy_opt(uint32_t n, double a, double *x, double *y,
+                            double *z) {
+    int core_idx = snrt_cluster_core_idx();
+    int frac = n / snrt_cluster_compute_core_num();
+    int offset = core_idx;
 
-    // TODO(colluca): revert once Banshee supports SNRT_SSR_DM_ALL
-    // snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM0, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM1, frac, sizeof(double));
-    snrt_ssr_loop_1d(SNRT_SSR_DM2, frac, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, frac,
+                     snrt_cluster_compute_core_num() * sizeof(double));
 
     snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, x + offset);
     snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, y + offset);
@@ -40,6 +61,131 @@ inline void axpy(uint32_t n, double a, double* x, double* y, double* z) {
 
     snrt_fpu_fence();
     snrt_ssr_disable();
+}
+
+static inline void axpy_job(axpy_args_t *args) {
+    uint32_t frac, offset, size;
+    uint64_t local_x0_addr, local_y0_addr, local_z0_addr, local_x1_addr,
+        local_y1_addr, local_z1_addr;
+    double *local_x[2];
+    double *local_y[2];
+    double *local_z[2];
+    double *remote_x, *remote_y, *remote_z;
+    uint32_t iterations, i, i_dma_in, i_compute, i_dma_out, buff_idx;
+
+#ifndef JOB_ARGS_PRELOADED
+    // Allocate space for job arguments in TCDM
+    axpy_args_t *local_args = (axpy_args_t *)snrt_l1_next();
+
+    // Copy job arguments to TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(local_args, args, sizeof(axpy_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+    args = local_args;
 #endif
+
+    // Calculate size of each tile
+    frac = args->n / args->n_tiles;
+    size = frac * sizeof(double);
+
+    // Allocate space for job operands in TCDM
+    // Align X with the 1st bank in TCDM, Y with the 8th and Z with the 16th.
+    local_x0_addr = ALIGN_UP_TCDM((uint64_t)args + sizeof(axpy_args_t));
+    local_y0_addr = ALIGN_UP_TCDM(local_x0_addr + size) + 8 * BANK_ALIGNMENT;
+    local_z0_addr = ALIGN_UP_TCDM(local_y0_addr + size) + 16 * BANK_ALIGNMENT;
+    local_x[0] = (double *)local_x0_addr;
+    local_y[0] = (double *)local_y0_addr;
+    local_z[0] = (double *)local_z0_addr;
+    if (DOUBLE_BUFFER) {
+        local_x1_addr = ALIGN_UP_TCDM(local_z0_addr + size);
+        local_y1_addr =
+            ALIGN_UP_TCDM(local_x1_addr + size) + 8 * BANK_ALIGNMENT;
+        local_z1_addr =
+            ALIGN_UP_TCDM(local_y1_addr + size) + 16 * BANK_ALIGNMENT;
+        local_x[1] = (double *)local_x1_addr;
+        local_y[1] = (double *)local_y1_addr;
+        local_z[1] = (double *)local_z1_addr;
+    }
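+    // Rationale (sketch): with the offsets above, a core's x, y and z
+    // accesses in any given cycle fall in banks that differ by 8 and 16
+    // (mod 32), so the two SSR reads and the SSR write never collide on
+    // the same TCDM bank.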
+
+    // Calculate number of iterations
+    iterations = args->n_tiles;
+    if (DOUBLE_BUFFER) iterations += 2;
+
+    // Iterate over all tiles
+    for (i = 0; i < iterations; i++) {
+        if (snrt_is_dm_core()) {
+            // DMA in
+            if (!DOUBLE_BUFFER || (i < args->n_tiles)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_in = i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_in % 2 : 0;
+
+                // Calculate size and pointers to current tile
+                offset = i_dma_in * frac;
+                remote_x = args->x + offset;
+                remote_y = args->y + offset;
+
+                // Copy job operands in TCDM
+                snrt_dma_start_1d(local_x[buff_idx], remote_x, size);
+                snrt_dma_start_1d(local_y[buff_idx], remote_y, size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+
+            // Additional barriers required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            // DMA out
+            if (!DOUBLE_BUFFER || (i > 1)) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_dma_out = DOUBLE_BUFFER ? i - 2 : i;
+                buff_idx = DOUBLE_BUFFER ? i_dma_out % 2 : 0;
+
+                // Calculate pointers to current tile
+                offset = i_dma_out * frac;
+                remote_z = args->z + offset;
+
+                // Copy job outputs from TCDM
+                snrt_dma_start_1d(remote_z, local_z[buff_idx], size);
+                snrt_dma_wait_all();
+
+                snrt_mcycle();
+            }
+        }
+
+        // Compute
+        if (snrt_is_compute_core()) {
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+
+            if (!DOUBLE_BUFFER || (i > 0 && i < (args->n_tiles + 1))) {
+                snrt_mcycle();
+
+                // Compute tile and buffer indices
+                i_compute = DOUBLE_BUFFER ? i - 1 : i;
+                buff_idx = DOUBLE_BUFFER ? i_compute % 2 : 0;
+
+                // Perform tile computation
+                axpy_fp_t fp = args->funcptr;
+                fp(frac, args->a, local_x[buff_idx], local_y[buff_idx],
+                   local_z[buff_idx]);
+
+                snrt_mcycle();
+            }
+
+            // Additional barrier required when not double buffering
+            if (!DOUBLE_BUFFER) snrt_cluster_hw_barrier();
+        }
+
+        // Synchronize cores after every iteration
+        snrt_cluster_hw_barrier();
+    }
 }
diff --git a/sw/blas/axpy/src/main.c b/sw/blas/axpy/src/main.c
index 22f3dd129..e0389d25d 100644
--- a/sw/blas/axpy/src/main.c
+++ b/sw/blas/axpy/src/main.c
@@ -4,64 +4,24 @@
 
 #include "snrt.h"
 
-#define XSSR
 #include "axpy.h"
 #include "data.h"
 
 int main() {
-    double *local_x, *local_y, *local_z;
-    double *remote_x, *remote_y, *remote_z;
-
-    // Calculate size and pointers for each cluster
-    uint32_t frac = n / snrt_cluster_num();
-    uint32_t offset = frac * snrt_cluster_idx();
-    remote_x = x + offset;
-    remote_y = y + offset;
-    remote_z = z + offset;
-
-    // Allocate space in TCDM
-    local_x = (double *)snrt_l1_next();
-    local_y = local_x + frac;
-    local_z = local_y + frac;
-
-    // Copy data in TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(local_x, remote_x, size);
-        snrt_dma_start_1d(local_y, remote_y, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Compute
-    if (!snrt_is_dm_core()) {
-        uint32_t start_cycle = snrt_mcycle();
-        axpy(frac, a, local_x, local_y, local_z);
-        uint32_t end_cycle = snrt_mcycle();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    // Copy data out of TCDM
-    if (snrt_is_dm_core()) {
-        size_t size = frac * sizeof(double);
-        snrt_dma_start_1d(remote_z, local_z, size);
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
+    axpy_job(&args);
 
     // TODO: currently only works for single cluster otherwise need to
     // synchronize all cores here
 #ifdef BIST
+    uint32_t n = args.n;
+    double* z = args.z;
     uint32_t nerr = n;
 
     // Check computation is correct
     if (snrt_global_core_idx() == 0) {
         for (int i = 0; i < n; i++) {
-            if (local_z[i] == g[i]) nerr--;
-            printf("%d %d\n", local_z[i], g[i]);
+            if (z[i] == g[i]) nerr--;
+            printf("%d %d\n", z[i], g[i]);
         }
     }
diff --git a/sw/blas/blas.h b/sw/blas/blas.h
index 33c29e175..69005ccb7 100644
--- a/sw/blas/blas.h
+++ b/sw/blas/blas.h
@@ -4,6 +4,20 @@
 
 #pragma once
 
+// Floating-point multiplications by zero cannot be optimized as in some
+// edge cases they do not yield zero:
+// - 0f * NaN = NaN
+// - 0f * INFINITY = NaN
+// Thus in order to optimize it, we need to test for zero. You can use this
+// function for free when `multiplier` is a constant.
+static inline double multiply_opt(double multiplicand, double multiplier) {
+    if (multiplier)
+        return multiplicand * multiplier;
+    else
+        return 0;
+}
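+// Illustrative use (not from this patch): an epilogue such as
+//     c[i] = multiply_opt(c[i], beta) + alpha * ab;
+// stays exact when beta == 0 and c[i] holds NaN or infinity, yet costs
+// nothing when beta is a compile-time constant, as the branch folds away.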
-# SPDX-License-Identifier: Apache-2.0 - -# Usage of absolute paths is required to externally include this Makefile -MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -DATA_DIR := $(realpath $(MK_DIR)/data) -SRC_DIR := $(realpath $(MK_DIR)/src) - -DATA_CFG ?= $(DATA_DIR)/params.json -SECTION ?= - -APP ?= dot -SRCS ?= $(realpath $(SRC_DIR)/main.c) -INCDIRS ?= $(dir $(DATA_H)) $(SRC_DIR) - -DATAGEN_PY = $(MK_DIR)/scripts/datagen.py -DATA_H ?= $(DATA_DIR)/data.h - -$(dir $(DATA_H)): - mkdir -p $@ - -$(DATA_H): $(DATAGEN_PY) $(DATA_CFG) | $(dir $(DATA_H)) - $< -c $(DATA_CFG) --section="$(SECTION)" $@ - -.PHONY: clean-data clean - -clean-data: - rm -f $(DATA_H) - -clean: clean-data diff --git a/sw/blas/dot/scripts/datagen.py b/sw/blas/dot/scripts/datagen.py index 01560c51f..8a8631a6a 100755 --- a/sw/blas/dot/scripts/datagen.py +++ b/sw/blas/dot/scripts/datagen.py @@ -6,14 +6,11 @@ import numpy as np import sys -from snitch.util.sim.data_utils import format_scalar_definition, format_array_definition, \ - format_scalar_declaration, format_ifdef_wrapper, DataGen +import snitch.util.sim.data_utils as du -class DotDataGen(DataGen): +class DotDataGen(du.DataGen): - MIN = -1000 - MAX = +1000 # AXI splits bursts crossing 4KB address boundaries. To minimize # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 @@ -25,22 +22,22 @@ def emit_header(self, **kwargs): header = [super().emit_header()] n = kwargs['n'] - x = np.random.uniform(self.MIN, self.MAX, n) - y = np.random.uniform(self.MIN, self.MAX, n) + x = du.generate_random_array(n) + y = du.generate_random_array(n) g = self.golden_model(x, y) assert (n % (8 * 4)) == 0, "n must be an integer multiple of the number of cores times " \ "the unrolling factor" - header += [format_scalar_definition('const uint32_t', 'n', n)] - header += [format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - header += [format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, - section=kwargs['section'])] - result_def = format_scalar_definition('double', 'g', g) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_scalar_definition('const uint32_t', 'n', n)] + header += [du.format_array_definition('double', 'x', x, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_array_definition('double', 'y', y, alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + header += [du.format_scalar_declaration('double', 'result', alignment=self.BURST_ALIGNMENT, + section=kwargs['section'])] + result_def = du.format_scalar_definition('double', 'g', g) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/gemm/scripts/datagen.py b/sw/blas/gemm/scripts/datagen.py index da7f8ba57..2eb6e2f4d 100755 --- a/sw/blas/gemm/scripts/datagen.py +++ b/sw/blas/gemm/scripts/datagen.py @@ -10,18 +10,15 @@ import numpy as np import re -import pyflexfloat as ff import sys -from snitch.util.sim import data_utils -from snitch.util.sim.data_utils import DataGen, format_array_declaration, \ - format_struct_definition, format_array_definition, format_ifdef_wrapper +import snitch.util.sim.data_utils as du np.random.seed(42) -class GemmDataGen(DataGen): +class GemmDataGen(du.DataGen): # AXI splits bursts crossing 4KB address boundaries. 
To minimize # the occurrence of these splits the data should be aligned to 4KB @@ -56,14 +53,14 @@ def validate_config(self, gemm_fp, parallelize_m, # Calculate total TCDM occupation # Note: doesn't account for double buffering - prec = data_utils.size_from_precision_t(dtype) + prec = du.size_from_precision_t(dtype) a_size = frac_m * frac_k * prec b_size = frac_k * frac_n * prec c_size = frac_m * frac_n * prec total_size = a_size total_size += b_size total_size += c_size - data_utils.validate_tcdm_footprint(total_size) + du.validate_tcdm_footprint(total_size) assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size' assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size' @@ -99,12 +96,11 @@ def emit_header(self, **kwargs): prec, _ = self.infer_implementation(kwargs['gemm_fp']) - ff_desc = data_utils.ff_desc_from_precision_t(prec) - ctype = data_utils.ctype_from_precision_t(prec) + ctype = du.ctype_from_precision_t(prec) - a = ff.array(np.random.rand(M, K), ff_desc) - b = ff.array(np.random.rand(K, N), ff_desc) - c = ff.array(np.random.rand(M, N), ff_desc) + a = du.generate_random_array((M, K), prec) + b = du.generate_random_array((K, N), prec) + c = du.generate_random_array((M, N), prec) result = self.exact_golden_model(1, a, b, kwargs['beta'], c) # Store matrices in transposed form if requested @@ -127,18 +123,18 @@ def emit_header(self, **kwargs): b = b.flatten() c = c.flatten() - header += [format_array_declaration(ctype, a_uid, a.shape)] - header += [format_array_declaration(ctype, b_uid, b.shape)] - header += [format_array_declaration(ctype, c_uid, c.shape)] - header += [format_struct_definition('gemm_args_t', 'args', cfg)] - header += [format_array_definition(ctype, a_uid, a, - section=kwargs['section'])] - header += [format_array_definition(ctype, b_uid, b, - section=kwargs['section'])] - header += [format_array_definition(ctype, c_uid, c, - section=kwargs['section'])] - result_def = format_array_definition(ctype, 'result', result.flatten()) - header += [format_ifdef_wrapper('BIST', result_def)] + header += [du.format_array_declaration(ctype, a_uid, a.shape)] + header += [du.format_array_declaration(ctype, b_uid, b.shape)] + header += [du.format_array_declaration(ctype, c_uid, c.shape)] + header += [du.format_struct_definition('gemm_args_t', 'args', cfg)] + header += [du.format_array_definition(ctype, a_uid, a, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, b_uid, b, + section=kwargs['section'])] + header += [du.format_array_definition(ctype, c_uid, c, + section=kwargs['section'])] + result_def = du.format_array_definition(ctype, 'result', result.flatten()) + header += [du.format_ifdef_wrapper('BIST', result_def)] header = '\n\n'.join(header) return header diff --git a/sw/blas/gemm/scripts/verify.py b/sw/blas/gemm/scripts/verify.py index 40840b327..353ea1328 100755 --- a/sw/blas/gemm/scripts/verify.py +++ b/sw/blas/gemm/scripts/verify.py @@ -18,9 +18,9 @@ class GemmVerifier(Verifier): OUTPUT_UIDS = ['c'] ERR_THRESHOLD = { 1: 1e-4, - 2: 1e-2, - 4: 1e-6, - 8: 1e-6 + 2: 8e-2, + 4: 1e-3, + 8: 1e-3 } def __init__(self): diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index a480379a9..1a73aedf8 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -13,19 +13,6 @@ #pragma once -// Floating-point multiplications by zero cannot be optimized as in some -// edge cases they do not yield zero: -// - 0f * NaN = NaN -// - 0f * INFINITY == NaN -// Thus in order to optimize it, we need to test for zero. 
You can use this -// function for free when `multiplier` is a constant. -static inline double multiply_opt(double multiplicand, double multiplier) { - if (multiplier) - return multiplicand * multiplier; - else - return 0; -} - #include "gemm_fp16.h" #include "gemm_fp32.h" #include "gemm_fp64.h" diff --git a/sw/blas/gemm/src/main.c b/sw/blas/gemm/src/main.c index 17f3936b0..9760000c6 100644 --- a/sw/blas/gemm/src/main.c +++ b/sw/blas/gemm/src/main.c @@ -9,7 +9,7 @@ #include #include -#include "gemm.h" +#include "blas.h" #include "data.h" #include "snrt.h" diff --git a/sw/blas/syrk/data/params.json b/sw/blas/syrk/data/params.json new file mode 100644 index 000000000..492d8e0cc --- /dev/null +++ b/sw/blas/syrk/data/params.json @@ -0,0 +1,12 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "m": 8, + "n": 2, + "alpha": 1.5, + "beta": 3.2, + "m_tiles": 1, + "funcptr": "syrk_opt" +} diff --git a/sw/blas/syrk/scripts/datagen.py b/sw/blas/syrk/scripts/datagen.py new file mode 100755 index 000000000..3fb86644f --- /dev/null +++ b/sw/blas/syrk/scripts/datagen.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np + +import snitch.util.sim.data_utils as du + + +DOUBLE_BUFFER = True + + +class SyrkDataGen(du.DataGen): + + # Function pointers to alternative implementations + FUNCPTRS = ["syrk_naive", "syrk_baseline", "syrk_opt"] + + def golden_model(self, alpha, A, beta, C): + return alpha * np.matmul(A, A.transpose()) + beta * C + + def validate(self, **kwargs): + n_cores = 8 + assert (kwargs['m'] % kwargs['m_tiles']) == 0, "m must be an integer multiple of m_tiles" + m_frac = kwargs['m'] / kwargs['m_tiles'] + assert (m_frac % n_cores) == 0, "m_frac must be an integer multiple of the number of cores" + if kwargs['funcptr'] != "syrk_naive": + assert (m_frac % 4) == 0, "m_frac must be an integer multiple of the unroll factor 4" + assert kwargs['funcptr'] in self.FUNCPTRS, f"Function pointer must be among {self.FUNCPTRS}" + + # Calculate total TCDM occupation + a_tile_size = m_frac * kwargs['n'] * 8 + c_tile_size = m_frac * m_frac * 8 + total_size = 2 * a_tile_size + c_tile_size + if DOUBLE_BUFFER: + total_size *= 2 + du.validate_tcdm_footprint(total_size) + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + self.validate(**kwargs) + + if 'alpha' in kwargs: + alpha = kwargs['alpha'] + else: + alpha = du.generate_random_array(1)[0] + if 'beta' in kwargs: + beta = kwargs['beta'] + else: + beta = du.generate_random_array(1)[0] + + A = du.generate_random_array((kwargs['m'], kwargs['n'])) + C_in = du.generate_random_array((kwargs['m'], kwargs['m'])) + + A = A.flatten() + C_in = C_in.flatten() + + A_uid = 'A' + C_uid = 'C' + + cfg = { + 'm': kwargs['m'], + 'n': kwargs['n'], + 'alpha': alpha, + 'beta': beta, + 'a': A_uid, + 'c': C_uid, + 'm_tiles': kwargs['m_tiles'], + 'funcptr': kwargs['funcptr'] + } + + header += [du.format_array_definition('double', A_uid, A)] + header += [du.format_array_definition('double', C_uid, C_in)] + header += [du.format_struct_definition('syrk_args_t', 'args', cfg)] + header = '\n\n'.join(header) + + return header + + +if __name__ == '__main__': + SyrkDataGen().main() diff --git a/sw/blas/syrk/scripts/verify.py 
b/sw/blas/syrk/scripts/verify.py new file mode 100755 index 000000000..0624156cb --- /dev/null +++ b/sw/blas/syrk/scripts/verify.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Luca Colagrande + +import numpy as np +import sys +from datagen import SyrkDataGen + +from snitch.util.sim.verif_utils import Verifier + + +class SyrkVerifier(Verifier): + +    OUTPUT_UIDS = ['C'] + +    def __init__(self): +        super().__init__() +        self.func_args = { +            'm': 'I', +            'n': 'I', +            'alpha': 'd', +            'beta': 'd', +            'A': 'I', +            'C': 'I', +            'm_tiles': 'I', +            'funcptr': 'I' +        } +        self.func_args = self.get_input_from_symbol('args', self.func_args) + +    def get_actual_results(self): +        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + +    def get_expected_results(self): +        A = self.get_input_from_symbol('A', 'double') +        C = self.get_input_from_symbol('C', 'double') +        A = np.reshape(A, (self.func_args['m'], self.func_args['n'])) +        C = np.reshape(C, (self.func_args['m'], self.func_args['m'])) +        return SyrkDataGen().golden_model( +            self.func_args['alpha'], A, +            self.func_args['beta'], C +        ).flatten() + +    def check_results(self, *args): +        return super().check_results(*args, rtol=1e-10) + + +if __name__ == "__main__": +    sys.exit(SyrkVerifier().main()) diff --git a/sw/blas/syrk/src/args.h b/sw/blas/syrk/src/args.h new file mode 100644 index 000000000..24342d3e3 --- /dev/null +++ b/sw/blas/syrk/src/args.h @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#pragma once +#include <stdint.h> + +typedef void (*syrk_fp_t)(uint32_t m, uint32_t n, double alpha, double *a, +                          double *at, double beta, double *c); + +typedef struct { +    uint32_t m; +    uint32_t n; +    double alpha; +    double beta; +    double *a; +    double *c; +    uint32_t m_tiles; +    syrk_fp_t funcptr; +} syrk_args_t; diff --git a/sw/blas/syrk/src/main.c b/sw/blas/syrk/src/main.c new file mode 100644 index 000000000..f8c09ae4f --- /dev/null +++ b/sw/blas/syrk/src/main.c @@ -0,0 +1,16 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#include "blas.h" +#include "data.h" + +int main() { +    syrk_job(&args); + +    return 0; +} diff --git a/sw/blas/syrk/src/syrk.h b/sw/blas/syrk/src/syrk.h new file mode 100644 index 000000000..718ad7fe9 --- /dev/null +++ b/sw/blas/syrk/src/syrk.h @@ -0,0 +1,294 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "args.h" +#include "snrt.h" + +__thread int setup_ssr = 1; + +void syrk_naive(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j++) { + double acc = 0; + for (uint32_t k = 0; k < n; k++) { + acc += a[i * n + k] * at[j * n + k]; + } + c[i * m + j] = multiply_opt(c[i * m + j], beta); + c[i * m + j] += alpha * acc; + } + } +} + +void syrk_baseline(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factors + // Note: changes must be reflected in the inline assembly code + // and datagen script + const uint32_t unroll1 = 4; + const uint32_t unroll0 = 4; + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll1) { + double acc[4]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + for (uint32_t k = 0; k < n; k += unroll0) { + asm volatile( + "fmadd.d %[acc0], %[a0], %[at0], %[acc0] \n" + "fmadd.d %[acc1], %[a0], %[at1], %[acc1] \n" + "fmadd.d %[acc2], %[a0], %[at2], %[acc2] \n" + "fmadd.d %[acc3], %[a0], %[at3], %[acc3] \n" + "fmadd.d %[acc0], %[a1], %[at4], %[acc0] \n" + "fmadd.d %[acc1], %[a1], %[at5], %[acc1] \n" + "fmadd.d %[acc2], %[a1], %[at6], %[acc2] \n" + "fmadd.d %[acc3], %[a1], %[at7], %[acc3] \n" + "fmadd.d %[acc0], %[a2], %[at8], %[acc0] \n" + "fmadd.d %[acc1], %[a2], %[at9], %[acc1] \n" + "fmadd.d %[acc2], %[a2], %[at10], %[acc2] \n" + "fmadd.d %[acc3], %[a2], %[at11], %[acc3] \n" + "fmadd.d %[acc0], %[a3], %[at12], %[acc0] \n" + "fmadd.d %[acc1], %[a3], %[at13], %[acc1] \n" + "fmadd.d %[acc2], %[a3], %[at14], %[acc2] \n" + "fmadd.d %[acc3], %[a3], %[at15], %[acc3] \n" + : [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : + [ a0 ] "f"(a[i * n + k + 0]), [ a1 ] "f"(a[i * n + k + 1]), + [ a2 ] "f"(a[i * n + k + 2]), [ a3 ] "f"(a[i * n + k + 3]), + [ at0 ] "f"(at[(j + 0) * n + k]), + [ at1 ] "f"(at[(j + 1) * n + k]), + [ at2 ] "f"(at[(j + 2) * n + k]), + [ at3 ] "f"(at[(j + 3) * n + k]), + [ at4 ] "f"(at[(j + 0) * n + k + 1]), + [ at5 ] "f"(at[(j + 1) * n + k + 1]), + [ at6 ] "f"(at[(j + 2) * n + k + 1]), + [ at7 ] "f"(at[(j + 3) * n + k + 1]), + [ at8 ] "f"(at[(j + 0) * n + k + 2]), + [ at9 ] "f"(at[(j + 1) * n + k + 2]), + [ at10 ] "f"(at[(j + 2) * n + k + 2]), + [ at11 ] "f"(at[(j + 3) * n + k + 2]), + [ at12 ] "f"(at[(j + 0) * n + k + 3]), + [ at13 ] "f"(at[(j + 1) * n + k + 3]), + [ at14 ] "f"(at[(j + 2) * n + k + 3]), + [ at15 ] "f"(at[(j + 3) * n + k + 3]) + :); + } + + c[i * m + j + 0] = multiply_opt(c[i * m + j + 0], beta); + c[i * m + j + 1] = multiply_opt(c[i * m + j + 1], beta); + c[i * m + j + 2] = multiply_opt(c[i * m + j + 2], beta); + c[i * m + j + 3] = multiply_opt(c[i * m + j + 3], beta); + c[i * m + j + 0] += alpha * acc[0]; + c[i * m + j + 1] += alpha * acc[1]; + c[i * m + j + 2] += alpha * acc[2]; + c[i * m + j + 3] += alpha * acc[3]; + } + } +} + +void syrk_opt(uint32_t m, uint32_t n, double alpha, double *a, double *at, + double beta, double *c) { + uint32_t offset = snrt_cluster_core_idx(); + uint32_t stride = snrt_cluster_compute_core_num(); + + // Unrolling factor of innermost loop + // Note: changes must be reflected 
in the inline assembly code + // and datagen script + const uint32_t unroll = 4; + + if (setup_ssr) { + // Configure ft0 and ft1 to load A and At + // for (i = offset; i < m; i += stride) + // for (j1 = 0; j1 < m; j1 += unroll) + // for (k = 0; k < n; k++) + // for (j0 = 0; j0 < unroll; j0++) + // j = j1 + j0 + // ft0.push(a[i * n + k]) + // ft1.push(at[j * n + k]) + const uint32_t ssr0_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr0_i[4] = {0, sizeof(double), 0, + stride * n * sizeof(double)}; + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + const uint32_t ssr1_b[4] = {unroll, n, m / unroll, m / stride}; + const uint32_t ssr1_i[4] = {n * sizeof(double), sizeof(double), + unroll * n * sizeof(double), 0}; + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + setup_ssr = 0; + } + + // SSR start addresses need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, a + offset * n); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, at); + snrt_ssr_enable(); + + for (uint32_t i = offset; i < m; i += stride) { + for (uint32_t j = 0; j < m; j += unroll) { + double acc[unroll]; + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + acc[3] = 0; + + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[acc0], ft0, ft1, %[acc0] \n" + "fmadd.d %[acc1], ft0, ft1, %[acc1] \n" + "fmadd.d %[acc2], ft0, ft1, %[acc2] \n" + "fmadd.d %[acc3], ft0, ft1, %[acc3] \n" + "fmul.d %[acc0], %[acc0], %[alpha] \n" + "fmul.d %[acc1], %[acc1], %[alpha] \n" + "fmul.d %[acc2], %[acc2], %[alpha] \n" + "fmul.d %[acc3], %[acc3], %[alpha] \n" + "fmadd.d %[c0], %[c0], %[beta], %[acc0] \n" + "fmadd.d %[c1], %[c1], %[beta], %[acc1] \n" + "fmadd.d %[c2], %[c2], %[beta], %[acc2] \n" + "fmadd.d %[c3], %[c3], %[beta], %[acc3] \n" + : [ c0 ] "+f"(c[i * m + j + 0]), [ c1 ] "+f"(c[i * m + j + 1]), + [ c2 ] "+f"(c[i * m + j + 2]), [ c3 ] "+f"(c[i * m + j + 3]), + [ acc0 ] "+f"(acc[0]), [ acc1 ] "+f"(acc[1]), + [ acc2 ] "+f"(acc[2]), [ acc3 ] "+f"(acc[3]) + : [ n_frep ] "r"(n - 1), [ unroll ] "i"(unroll), + [ alpha ] "f"(alpha), [ beta ] "f"(beta) + : "ft0", "ft1", "ft2"); + } + } + + snrt_ssr_disable(); + snrt_fpu_fence(); +} + +void syrk_job(syrk_args_t *args) { + uint32_t m_frac, a_tile_size, a_tile_bytes, c_tile_size, c_tile_bytes; + uint64_t local_a0_addr, local_at0_addr, local_c0_addr, local_a1_addr, + local_at1_addr, local_c1_addr; + double *local_a[2]; + double *local_at[2]; + double *local_c[2]; + uint32_t n_tiles, iterations; + uint32_t i, i_dma_in, i_compute, i_dma_out, i_row, i_col, buff_idx; + +#ifndef JOB_ARGS_PRELOADED + // Allocate space for job arguments in TCDM + syrk_args_t *local_args = (syrk_args_t *)snrt_l1_next(); + + // Copy job arguments to TCDM + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(syrk_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + args = local_args; +#endif + + // Calculate size of each tile + m_frac = args->m / args->m_tiles; + a_tile_size = args->n * m_frac; + c_tile_size = m_frac * m_frac; + a_tile_bytes = a_tile_size * sizeof(double); + c_tile_bytes = c_tile_size * sizeof(double); + + // Allocate space for job operands in TCDM
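+ // Each of the two buffers holds an A tile, an At tile and a C tile, + // laid out back to back right after the job arguments. The tile loop + // below runs a three-stage software pipeline (DMA in at iteration i, + // compute at i - 1, DMA out at i - 2), hence the n_tiles + 2 iterations.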
+ local_a0_addr = (uint64_t)args + sizeof(syrk_args_t); + local_at0_addr = local_a0_addr + a_tile_bytes; + local_c0_addr = local_at0_addr + a_tile_bytes; + local_a[0] = (double *)local_a0_addr; + local_at[0] = (double *)local_at0_addr; + local_c[0] = (double *)local_c0_addr; + local_a1_addr = local_c0_addr + c_tile_bytes; + local_at1_addr = local_a1_addr + a_tile_bytes; + local_c1_addr = local_at1_addr + a_tile_bytes; + local_a[1] = (double *)local_a1_addr; + local_at[1] = (double *)local_at1_addr; + local_c[1] = (double *)local_c1_addr; + + // Calculate number of iterations + n_tiles = args->m_tiles * args->m_tiles; + iterations = n_tiles + 2; + + // Iterate over all tiles + for (i = 0; i < iterations; i++) { + if (snrt_is_dm_core()) { + // DMA out + // (out before in to avoid overwriting data) + if (i > 1) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_out = i - 2; + buff_idx = i_dma_out % 2; + i_row = i_dma_out / args->m_tiles; + i_col = i_dma_out % args->m_tiles; + + // Copy job outputs from TCDM + snrt_dma_store_2d_tile(args->c, local_c[buff_idx], i_row, i_col, + m_frac, m_frac, args->m, sizeof(double)); + snrt_dma_wait_all(); + + snrt_mcycle(); + } + + // DMA in + if (i < n_tiles) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_dma_in = i; + buff_idx = i_dma_in % 2; + i_row = i_dma_in / args->m_tiles; + i_col = i_dma_in % args->m_tiles; + + // Copy job operands in TCDM + snrt_dma_load_1d_tile(local_a[buff_idx], args->a, i_row, + a_tile_size, sizeof(double)); + snrt_dma_load_1d_tile(local_at[buff_idx], args->a, i_col, + a_tile_size, sizeof(double)); + if (args->funcptr == syrk_opt || args->beta != 0) { + snrt_dma_load_2d_tile(local_c[buff_idx], args->c, i_row, + i_col, m_frac, m_frac, args->m, + sizeof(double)); + } + snrt_dma_wait_all(); + + snrt_mcycle(); + } + } + + // Compute + if (snrt_is_compute_core()) { + if (i > 0 && i < (n_tiles + 1)) { + snrt_mcycle(); + + // Compute tile and buffer indices + i_compute = i - 1; + buff_idx = i_compute % 2; + + // Perform tile computation + syrk_fp_t fp = args->funcptr; + fp(m_frac, args->n, args->alpha, local_a[buff_idx], + local_at[buff_idx], args->beta, local_c[buff_idx]); + + snrt_mcycle(); + } + } + + // Synchronize cores after every iteration + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/dnn/.gitignore b/sw/dnn/.gitignore deleted file mode 100644 index aed262ca8..000000000 --- a/sw/dnn/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*/data/data.h diff --git a/target/common/common.mk b/target/common/common.mk index 70afd80c2..995e80ba0 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -203,6 +203,7 @@ SNITCH_DASM_TRACES = $(shell (ls $(LOGS_DIR)/trace_hart_*.dasm 2>/dev/null) SNITCH_TXT_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.txt/g')) SNITCH_ANNOTATED_TRACES = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/\.dasm/\.s/g')) SNITCH_PERF_DUMPS = $(shell (echo $(SNITCH_DASM_TRACES) | sed 's/trace_hart/hart/g' | sed 's/.dasm/_perf.json/g')) +DMA_PERF_DUMPS = $(LOGS_DIR)/dma_*_perf.json TXT_TRACES += $(SNITCH_TXT_TRACES) ANNOTATED_TRACES += $(SNITCH_ANNOTATED_TRACES) @@ -219,7 +220,7 @@ annotate: $(ANNOTATED_TRACES) perf: $(JOINT_PERF_DUMP) visual-trace: $(VISUAL_TRACE) clean-traces: - rm -f $(TXT_TRACES) + rm -f $(TXT_TRACES) $(SNITCH_PERF_DUMPS) $(DMA_PERF_DUMPS) clean-annotate: rm -f $(ANNOTATED_TRACES) clean-perf: diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index ca8246124..e4456fdfc 100644 --- a/target/snitch_cluster/sw.mk +++ 
b/target/snitch_cluster/sw.mk @@ -51,6 +51,7 @@ APPS = sw/apps/nop APPS += sw/apps/blas/axpy APPS += sw/apps/blas/gemm APPS += sw/apps/blas/dot +APPS += sw/apps/blas/syrk APPS += sw/apps/dnn/batchnorm APPS += sw/apps/dnn/conv2d APPS += sw/apps/dnn/fusedconv @@ -66,6 +67,7 @@ APPS += sw/apps/montecarlo/pi_estimation APPS += sw/apps/atax APPS += sw/apps/correlation APPS += sw/apps/covariance +APPS += sw/apps/doitgen # Include Makefile from each app subdirectory $(foreach app,$(APPS), \ diff --git a/target/snitch_cluster/sw/apps/blas/gemm/app.mk b/target/snitch_cluster/sw/apps/blas/gemm/app.mk index 5d2b54068..f50f6d21c 100644 --- a/target/snitch_cluster/sw/apps/blas/gemm/app.mk +++ b/target/snitch_cluster/sw/apps/blas/gemm/app.mk @@ -8,6 +8,7 @@ APP := gemm $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build SRC_DIR := $(ROOT)/sw/blas/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/blas/syrk/app.mk b/target/snitch_cluster/sw/apps/blas/syrk/app.mk new file mode 100644 index 000000000..c0fd05044 --- /dev/null +++ b/target/snitch_cluster/sw/apps/blas/syrk/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := syrk +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/blas/$(APP)/build +SRC_DIR := $(ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/covariance/app.mk b/target/snitch_cluster/sw/apps/covariance/app.mk index c177a9d61..e985e671e 100644 --- a/target/snitch_cluster/sw/apps/covariance/app.mk +++ b/target/snitch_cluster/sw/apps/covariance/app.mk @@ -8,6 +8,7 @@ APP := covariance $(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build SRC_DIR := $(ROOT)/sw/apps/$(APP)/src SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ include $(ROOT)/sw/apps/common.mk include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/doitgen/app.mk b/target/snitch_cluster/sw/apps/doitgen/app.mk new file mode 100644 index 000000000..ebef550d3 --- /dev/null +++ b/target/snitch_cluster/sw/apps/doitgen/app.mk @@ -0,0 +1,14 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := doitgen +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build +SRC_DIR := $(ROOT)/sw/apps/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(ROOT)/sw/blas/ + +include $(ROOT)/sw/apps/common.mk +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/fdiv.yaml b/target/snitch_cluster/sw/fdiv.yaml index a8b5f3930..d6b7aea3b 100644 --- a/target/snitch_cluster/sw/fdiv.yaml +++ b/target/snitch_cluster/sw/fdiv.yaml @@ -13,5 +13,3 @@ runs: cmd: [../../../sw/dnn/flashattention_2/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/correlation/build/correlation.elf cmd: [../../../sw/apps/correlation/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/covariance/build/covariance.elf - cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 7a5a55a4c..d9e2f8c2f 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -80,6 +80,8 @@ runs: cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/blas/dot/build/dot.elf cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/blas/syrk/build/syrk.elf + cmd: [../../../sw/blas/syrk/scripts/verify.py, "${sim_bin}", "${elf}"] - elf: apps/dnn/batchnorm/build/batchnorm.elf - elf: apps/dnn/maxpool/build/maxpool.elf # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results @@ -95,3 +97,7 @@ runs: - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf # - elf: apps/atax/build/atax.elf # cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/covariance/build/covariance.elf + cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] + - elf: apps/doitgen/build/doitgen.elf + cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/util/container/Dockerfile b/util/container/Dockerfile index 9cdc7d9aa..bfef21266 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -94,6 +94,7 @@ RUN tar xzf snitch-spike-dasm-${SPIKE_DASM_VERSION}-x86_64-linux-gnu-ubuntu18.04 # Install Doxygen RUN wget https://www.doxygen.nl/files/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz RUN tar xzf doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz +RUN mv doxygen-${DOXYGEN_VERSION} doxygen # 2. Stage FROM ubuntu:22.04 AS snitch_cluster @@ -154,7 +155,7 @@ COPY --from=builder /tools/spike-dasm bin/ COPY --from=builder /root/.cargo/bin/banshee bin/ COPY --from=builder /opt/python /opt/python COPY --from=builder /tools/verilator /tools/verilator/ -COPY --from=builder /tools/doxygen-${DOXYGEN_VERSION}/bin/doxygen bin/ +COPY --from=builder /tools/doxygen/bin/doxygen bin/ # Create and activate virtual environment ENV VIRTUAL_ENV "/root/.venvs/snitch_cluster" diff --git a/util/sim/data_utils.py b/util/sim/data_utils.py index e6f48acce..3b732c5cc 100644 --- a/util/sim/data_utils.py +++ b/util/sim/data_utils.py @@ -83,6 +83,24 @@ def torch_type_from_precision_t(prec): return precision_t_to_torch_type_map[_integer_precision_t(prec)] +def numpy_type_from_precision_t(prec): + """Convert `precision_t` type to Numpy type. + + Args: + prec: A value of type `precision_t`. Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8).
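+ + For example, `numpy_type_from_precision_t('FP32')` and + `numpy_type_from_precision_t(4)` both return `np.float32`.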
+ """ + # Types which have a direct correspondence in Numpy + precision_t_to_numpy_type_map = { + 8: np.float64, + 4: np.float32, + 2: np.float16 + } + prec = _integer_precision_t(prec) + assert prec != 1, "No direct correspondence between FP8 and Numpy" + return precision_t_to_numpy_type_map[prec] + + # Returns the C type representing a floating-point value of the specified precision def ctype_from_precision_t(prec): """Convert `precision_t` type to a C type string. @@ -100,6 +118,29 @@ def ctype_from_precision_t(prec): return precision_t_to_ctype_map[_integer_precision_t(prec)] +def generate_random_array(size, prec='FP64'): + """Consistent random array generation for Snitch experiments. + + Samples values between -1 and 1 from a uniform distribution and + of the exact specified type, e.g. actual 64-bit doubles. + + This function ensures that e.g. power measurements are not skewed + by using integer values in the FPU. + + Args: + size: Tuple of array dimensions. + prec: A value of type `precision_t`. Accepts both enum strings + (e.g. "FP64") and integer enumeration values (e.g. 8). + """ + # Generate in 64b precision and then cast down + rand = np.random.default_rng().random(size=size, dtype=np.float64) * 2 - 1 + # Generate FlexFloat array for 8b floats, casted from 16b Numpy array + if _integer_precision_t(prec) == 1: + return ff.array(rand.astype(np.float16), ff_desc_from_precision_t(prec)) + else: + return rand.astype(numpy_type_from_precision_t(prec)) + + def flatten(array): """Flatten various array types with a homogeneous API. diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index db094ad7e..0fab642e0 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -1145,7 +1145,6 @@ def main(): message += 'line {lineno}.' print(traceback.format_exc(), file=sys.stderr) print(message, file=sys.stderr) - return 1 else: break # Nothing more in pipe, EOF perf_metrics[-1]['tend'] = time_info[0] / 1000