diff --git a/sw/dnn/softmax/data/datagen.py b/sw/dnn/softmax/data/datagen.py
new file mode 100755
index 0000000000..6c645f5fae
--- /dev/null
+++ b/sw/dnn/softmax/data/datagen.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Tim Fischer
+# Viviane Potocnik
+# Luca Colagrande
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+    format_struct_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+PRECISION_T = {
+    '64': 'FP64',
+    '32': 'FP32',
+    '16': 'FP16',
+    '8': 'FP8'
+}
+
+
+def golden_model(ifmap, axis):
+    softmax = torch.nn.Softmax(dim=axis)
+    return softmax(ifmap)
+
+
+def emit_header(**kwargs):
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    input_samples = kwargs['input_dim']['input_samples']
+    reduce_dim = kwargs['reduce_dim']
+    prec = str(kwargs['prec'])
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, input_samples, requires_grad=False, dtype=torch_type)
+
+    ofmap = golden_model(ifmap, reduce_dim)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'reduce_dim': reduce_dim,
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid,
+        'dtype': PRECISION_T[prec]
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_struct_definition('softmax_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                 alignment=BURST_ALIGNMENT)]
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for softmax kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file for the kernel'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+    param['section'] = args.section
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(**param))
+
+
+if __name__ == '__main__':
+    main()
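Note: golden_model above just wraps torch.nn.Softmax, so it can be exercised standalone when sanity-checking generated data. A minimal sketch; the batch_size=3 and seq_len=16 shape follows params.hjson below, while the sample count and the reduction axis are illustrative values, since those fields are truncated here:

    import torch
    from datagen import golden_model

    x = torch.randn(3, 16, 256)   # 256 input samples is a made-up value
    y = golden_model(x, -1)       # reduce along the innermost axis
    # softmax output sums to one along the reduced axis
    assert torch.allclose(y.sum(dim=-1), torch.ones(3, 16))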
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson b/sw/dnn/softmax/data/params.hjson
similarity index 81%
rename from target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson
rename to sw/dnn/softmax/data/params.hjson
index a2724a0924..3ee58efb96 100644
--- a/target/snitch_cluster/sw/apps/dnn/softmax/src/params.hjson
+++ b/sw/dnn/softmax/data/params.hjson
@@ -2,10 +2,7 @@
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single SoftMax layer
-
 {
-    kernel: "SoftMax"
     input_dim: {
         batch_size: 3,
         seq_len: 16,
diff --git a/sw/dnn/softmax/src/main.c b/sw/dnn/softmax/src/main.c
new file mode 100644
index 0000000000..7178c2b195
--- /dev/null
+++ b/sw/dnn/softmax/src/main.c
@@ -0,0 +1,14 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    softmax_layer(layer);
+    return 0;
+}
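Note: main.c deliberately contains no setup code. The generated data.h defines the layer struct together with the ifmap/ofmap buffers it references, so the kernel call needs no pointer patching. A rough restatement of how datagen.py's main() produces that header (the config path assumes the new layout):

    import hjson
    from datagen import emit_header

    # the hjson parameters become keyword arguments to emit_header()
    with open('sw/dnn/softmax/data/params.hjson') as f:
        param = hjson.loads(f.read())
    param['section'] = None  # optional linker section, as in main()
    print(emit_header(**param))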
diff --git a/sw/dnn/softmax/softmax.h b/sw/dnn/softmax/src/softmax.h
similarity index 64%
rename from sw/dnn/softmax/softmax.h
rename to sw/dnn/softmax/src/softmax.h
index f58d48fe63..a677766239 100644
--- a/sw/dnn/softmax/softmax.h
+++ b/sw/dnn/softmax/src/softmax.h
@@ -6,38 +6,31 @@
 
 #include "math.h"
 #include "snrt.h"
-// #include "printf.h"
-#include "utils.h"
 
 /**
  * @struct softmax_layer_struct
  * @brief This structure contains all parameters necessary
  * for computing the Softmax activation function
- * @var softmax_layer_struct::BATCH_SIZE
+ * @var softmax_layer_struct::batch_size
  * Size of each input sample
- * @var softmax_layer_struct::SEQ_LEN
+ * @var softmax_layer_struct::seq_len
 * Size of each output sample
- * @var softmax_layer_struct::INPUT_SAMPLES
+ * @var softmax_layer_struct::input_samples
 * Number of input samples
- * @var softmax_layer_struct::REDUCE_DIM
+ * @var softmax_layer_struct::reduce_dim
 * Along which dimension to reduce
 * @var softmax_layer_struct::ifmap
 * Pointer to input feature map
 * @var softmax_layer_struct::ofmap
 * Pointer to output feature map
- * @var softmax_layer_struct::result
- * Pointer to the golden model output
 */
 typedef struct softmax_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t INPUT_SAMPLES;
-    uint32_t REDUCE_DIM;
-
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t input_samples;
+    int32_t reduce_dim;
     float *ifmap;
     float *ofmap;
-    float *result;
-
     precision_t dtype;
 } softmax_layer_t;
 
@@ -50,9 +43,6 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI,
     float max_core = 0.0;  // max value of the current core
     float sum = 0.0;       // sum of the exp values of the current core
 
-    // uint32_t compute_id = snrt_global_core_idx();
-    // uint32_t num_cores = snrt_cluster_compute_core_num();
-
     for (int32_t b = 0; b < batch_size; b++) {
         for (int32_t s = 0; s < seq_len; s++) {
             max_core = -INFINITY;
@@ -67,23 +57,13 @@
             // compute the shifted value of the current row
             for (int32_t i = 0; i < input_samples; i++) {
                 output[b * batch_offset + s * ldI + i] =
-                    // FIXME: Below code is erroring due to the standard math
-                    // lib conflict
-                    // TODO: Try out with musl lib
-                    // expf(input[b * batch_offset + s * ldI + i] - max_core);
-                    // FIXME: actually there should be an exponentiation
-                    input[b * batch_offset + s * ldI + i] - max_core;
+                    expf(input[b * batch_offset + s * ldI + i] - max_core);
                 sum += output[b * batch_offset + s * ldI + i];
             }
 
             // compute the softmax value of the current row
             for (int32_t i = 0; i < input_samples; i++) {
-                // INFO: DIVSQRT unit MUST be activated in the cluster
-                // configuration
                 output[b * batch_offset + s * ldI + i] /= sum;
-                // printf("output[%d] = %f\n", compute_id * input_samples + b *
-                //        batch_offset + s * ldI + i,
-                //        output[b * batch_offset + s * ldI + i]);
             }
         }
     }
@@ -97,14 +77,14 @@
 * @param l softmax_layer struct that holds addresses and parameters
 *
 */
-static inline void softmax_layer(softmax_layer_t *const l) {
+static inline void softmax_layer(softmax_layer_t const l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();
 
     uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float);
+        l.batch_size * l.seq_len * l.input_samples * sizeof(float);
     uint32_t ofmap_size = ifmap_size;
 
     void *ptr = (float *)snrt_l1_next();
@@ -116,9 +96,9 @@
     // DMA transfer the ifmap into the cluster TCDM
     if (snrt_is_dm_core()) {
         snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
-            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
-            l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float));
+            ifmap, l.ifmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));
 
         snrt_dma_wait_all();
     }
@@ -127,21 +107,31 @@
 
     if (snrt_is_compute_core()) {
         // determine the row offset for each core
-        int32_t row_offset = compute_id * l->INPUT_SAMPLES;
+        int32_t row_offset = compute_id * l.input_samples;
 
         // determine the row stride of each matrix
-        int32_t ldI = compute_num * l->INPUT_SAMPLES;
+        int32_t ldI = compute_num * l.input_samples;
 
         // determine the batch offset for each core
-        int32_t batch_offset = l->SEQ_LEN * l->INPUT_SAMPLES;
+        int32_t batch_offset = l.seq_len * l.input_samples;
 
         // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
         softmax_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, batch_offset,
-                     l->BATCH_SIZE, l->SEQ_LEN / 8, l->INPUT_SAMPLES);
+                     l.batch_size, l.seq_len / 8, l.input_samples);
 
     } else {
         snrt_cluster_hw_barrier();
     }
 
+    // DMA transfer the ofmap to DRAM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_ofmap = snrt_dma_start_2d(
+            l.ofmap, ofmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
     snrt_global_barrier();
 }
\ No newline at end of file
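Note: the functional fix in this file is the restored expf() call, so softmax_fp32 now computes the standard numerically stable softmax: find the row maximum, exponentiate the shifted values, normalize by their sum. For reference, the same math in NumPy; a sketch of the algorithm, not part of the patch:

    import numpy as np

    def softmax_rowwise(x):
        # subtracting the row max keeps exp() in range without changing
        # the result, the same role max_core plays in softmax_fp32
        shifted = np.exp(x - x.max(axis=-1, keepdims=True))
        return shifted / shifted.sum(axis=-1, keepdims=True)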
diff --git a/sw/dnn/softmax/verify.py b/sw/dnn/softmax/verify.py
new file mode 100755
index 0000000000..312bd6bb84
--- /dev/null
+++ b/sw/dnn/softmax/verify.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+ERR_THRESHOLD = 0.003
+
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['ofmap'])
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+
+    layer_struct = {
+        'batch_size': 'I',
+        'seq_len': 'I',
+        'input_samples': 'I',
+        'reduce_dim': 'i',
+        'ifmap_ptr': 'I',
+        'ofmap_ptr': 'I',
+        'dtype': 'I'
+    }
+    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
+    batch_size = layer['batch_size']
+    seq_len = layer['seq_len']
+    input_samples = layer['input_samples']
+    reduce_dim = layer['reduce_dim']
+    prec = PRECISION_T[layer['dtype']]
+
+    ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec])
+    ifmap = ifmap.reshape(batch_size, seq_len, input_samples)
+    ifmap = torch.from_numpy(ifmap)
+
+    # Verify results
+    ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec])
+    ofmap_golden = golden_model(ifmap, reduce_dim).detach().numpy().flatten()
+
+    absolute_err = np.absolute(ofmap_golden - ofmap_actual)
+    fail = np.any(absolute_err > ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([ofmap_golden, ofmap_actual, absolute_err],
+                                         Path.cwd() / 'softmax_results.csv')
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
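Note: the layer_struct format characters mirror softmax_layer_t field for field, 'I' for the uint32_t fields and the 32-bit pointers, 'i' for the int32_t reduce_dim. A rough equivalent of what the bytes_to_struct call decodes, assuming a packed little-endian layout (an illustration, not the helper's actual implementation):

    import struct

    def decode_layer(raw):
        # seven 4-byte fields, in declaration order of softmax_layer_t
        names = ['batch_size', 'seq_len', 'input_samples', 'reduce_dim',
                 'ifmap_ptr', 'ofmap_ptr', 'dtype']
        return dict(zip(names, struct.unpack('<IIIiIII', raw[:28])))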
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index d1d190a968..a38cc9c76e 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -202,5 +202,5 @@ typedef struct network_single_cluster_t_ {
 #include "../layernorm/src/layernorm.h"
 #include "../linear/src/linear.h"
 #include "../maxpool/src/maxpool.h"
-// #include "softmax.h"
+#include "../softmax/src/softmax.h"
 // #include "utils.h"
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index e5d8c8be5e..1a8ce51ae2 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -17,7 +17,7 @@ SUBDIRS += dnn/gemm
 SUBDIRS += dnn/layernorm
 SUBDIRS += dnn/linear
 SUBDIRS += dnn/maxpool
-# SUBDIRS += dnn/softmax
+SUBDIRS += dnn/softmax
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/Makefile b/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
index 8f2209c298..d4f685c7d0 100644
--- a/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
+++ b/target/snitch_cluster/sw/apps/dnn/softmax/Makefile
@@ -2,11 +2,11 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 #
-# Gianna Paulin
+# Luca Colagrande
 
-APP = softmax
+APP ?= softmax
 
-include ../Makefile
+include ../../../../../../sw/dnn/common.mk
 include ../../common.mk
 
 $(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c b/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c
deleted file mode 100644
index d648b5548f..0000000000
--- a/target/snitch_cluster/sw/apps/dnn/softmax/src/softmax.c
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// SW testbench for profiling linear kernels in different
-// floating point precisions (fp64, fp32, fp16), as well as
-// different memory layouts for matrices (transposed/not-transposed)
-// Correctness of results are checked automatically
-
-#include "dnn.h"
-#include "snrt.h"
-
-#include "data.h"
-
-int main() {
-    softmax_l.ifmap = (float*)softmax_ifmap_dram;
-    // softmax_l.result = (float*)softmax_ofmap_dram;
-
-    // checksum = (float*)softmax_checksum;
-
-    softmax_layer(&softmax_l);
-
-    snrt_global_barrier();
-
-    // uint32_t error = check_softmax_layer(&linear_l, (float*)linear_checksum);
-
-    return 0;
-}
\ No newline at end of file
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 0c712fa552..85afc9ed59 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -78,9 +78,10 @@ runs:
   - elf: apps/dnn/maxpool/build/maxpool.elf
   - elf: apps/dnn/gemm/build/gemm.elf
   - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV without FDIV unit
+    cmd: ../../sw/dnn/layernorm/verify.py {sim_bin} {elf}
   # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls
   # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32
   # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly
-  # - elf: apps/dnn/softmax/build/softmax.elf
-  #   throws illegal instruction on FDIV in simulation
+  - elf: apps/dnn/softmax/build/softmax.elf # Illegal FDIV without FDIV unit
+    cmd: ../../sw/dnn/softmax/verify.py {sim_bin} {elf}
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf