From a17d2cbaae8604021ef788f25ba7cf24157abc6a Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Sun, 12 Nov 2023 20:55:21 +0100
Subject: [PATCH] dnn: Add FusedConcatLinear layer

---
 sw/dnn/fused_concat_linear/data/datagen.py   | 113 ++++++++++++++++++
 sw/dnn/fused_concat_linear/data/params.hjson |  10 ++
 .../src/fused_concat_linear.h                |  53 ++++++++
 sw/dnn/fused_concat_linear/src/main.c        |  14 +++
 sw/dnn/fused_concat_linear/verify.py         |  90 ++++++++++++++
 sw/dnn/src/dnn.h                             |   1 +
 sw/snRuntime/src/team.h                      |   4 +
 target/snitch_cluster/sw/apps/Makefile       |   1 +
 .../sw/apps/dnn/fused_concat_linear/Makefile |  12 ++
 target/snitch_cluster/sw/run.yaml            |   2 +
 10 files changed, 300 insertions(+)
 create mode 100755 sw/dnn/fused_concat_linear/data/datagen.py
 create mode 100644 sw/dnn/fused_concat_linear/data/params.hjson
 create mode 100644 sw/dnn/fused_concat_linear/src/fused_concat_linear.h
 create mode 100644 sw/dnn/fused_concat_linear/src/main.c
 create mode 100755 sw/dnn/fused_concat_linear/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile

diff --git a/sw/dnn/fused_concat_linear/data/datagen.py b/sw/dnn/fused_concat_linear/data/datagen.py
new file mode 100755
index 0000000000..8d139feb96
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/data/datagen.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import argparse
+import numpy as np
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+import data_utils  # noqa: E402
+from data_utils import emit_license, \
+    format_struct_definition, format_array_definition, \
+    format_array_declaration, format_ifdef_wrapper  # noqa: E402
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits, the data should be aligned to 4KB.
+BURST_ALIGNMENT = 4096
+
+PRECISION = {
+    'FP64': '64',
+    'FP32': '32',
+    'FP16': '16',
+    'FP8': '8'
+}
+
+
+def golden_model(inputs, weights):
+    innermost_dim = len(inputs[0].shape) - 1
+    concat_output = torch.cat(inputs, dim=innermost_dim)
+    linear_output = torch.matmul(concat_output, weights)
+    return linear_output, concat_output
+
+
+def emit_header(section, params):
+    num_inputs = params['num_inputs']
+    input_shape = params['input_shape']
+    output_shape = params['output_shape']
+    prec = PRECISION[params['dtype']]
+
+    assert input_shape[0] == output_shape[0], 'Inconsistent input and output shapes'
+
+    torch_type = data_utils.floating_point_torch_type(prec)
+
+    inputs = [torch.rand(*input_shape, requires_grad=False, dtype=torch_type) for _ in range(num_inputs)]
+    weights = torch.rand([input_shape[1]*num_inputs, output_shape[1]], requires_grad=False, dtype=torch_type)
+    linear_output, concat_output = golden_model(inputs, weights)
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    layer_cfg = {
+        **params,
+        'inputs': 'inputs',
+        'weights': 'weights',
+        'concat_output': 'concat_output',
+        'linear_output': 'linear_output'
+    }
+
+    data_str = [emit_license()]
+    data_str += [format_array_declaration(ctype, f'input_{i}', input_shape) for i in range(num_inputs)]
+    data_str += [format_array_declaration('void*', 'inputs', [num_inputs])]
+    data_str += [format_array_declaration(ctype, 'concat_output', concat_output.shape)]
+    data_str += [format_array_declaration(ctype, 'linear_output', linear_output.shape)]
+    data_str += [format_array_declaration(ctype, 'weights', weights.shape)]
+    data_str += [format_struct_definition('fused_concat_linear_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, f'input_{i}', t) for i, t in enumerate(inputs)]
+    data_str += [format_array_definition('void*', 'inputs', np.array([f'input_{i}' for i in range(num_inputs)]))]
+    data_str += [format_array_definition(ctype, 'weights', weights)]
+    result_def = format_array_definition(ctype, 'golden', linear_output)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for fused_concat_linear kernel')
+    parser.add_argument(
+        "-c", "--cfg",
+        type=pathlib.Path,
+        required=True,
+        help='Select param config file'
+    )
+    parser.add_argument(
+        '--section',
+        type=str,
+        help='Section to store matrices in')
+    parser.add_argument(
+        'output',
+        type=pathlib.Path,
+        help='Path of the output header file')
+    args = parser.parse_args()
+
+    # Load param config file
+    with args.cfg.open() as f:
+        param = hjson.loads(f.read())
+
+    # Emit header file
+    with open(args.output, 'w') as f:
+        f.write(emit_header(args.section, param))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sw/dnn/fused_concat_linear/data/params.hjson b/sw/dnn/fused_concat_linear/data/params.hjson
new file mode 100644
index 0000000000..2274863f41
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/data/params.hjson
@@ -0,0 +1,10 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+{
+    num_inputs: 1
+    input_shape: [32, 4]
+    output_shape: [32, 16]
+    dtype: FP64
+}
\ No newline at end of file
diff --git a/sw/dnn/fused_concat_linear/src/fused_concat_linear.h b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h
new file mode 100644
index 0000000000..09a7a9b7bb
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h
@@ -0,0 +1,53 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "snrt.h"
+
+/**
+ * @struct fused_concat_linear_layer_t
+ * @brief This structure contains all parameters necessary
+ * for computing a FusedConcatLinear layer.
+ * @var fused_concat_linear_layer_t::num_inputs
+ * Number of input tensors to concatenate
+ * @var fused_concat_linear_layer_t::input_shape
+ * Shape of the input tensors
+ * @var fused_concat_linear_layer_t::inputs
+ * Pointer to an array of pointers to the individual tensors to concatenate
+ * @var fused_concat_linear_layer_t::concat_output
+ * Pointer to the concatenated intermediate output tensor
+ */
+typedef struct {
+    uint32_t num_inputs;
+    uint32_t input_shape[2];
+    uint32_t output_shape[2];
+    void **inputs;
+    void *weights;
+    void *concat_output;
+    void *linear_output;
+    precision_t dtype;
+} fused_concat_linear_layer_t;
+
+static inline int fused_concat_linear_layer(fused_concat_linear_layer_t l) {
+    // Concat layer
+    concat_layer_t concat_layer_cfg = {
+        .num_inputs = l.num_inputs,
+        .input_shape = {l.input_shape[0], l.input_shape[1]},
+        .inputs = l.inputs,
+        .output = l.concat_output,
+        .dtype = l.dtype
+    };
+    int nerr = concat_layer(concat_layer_cfg);
+
+    // Linear layer
+    uint32_t m = l.input_shape[0];
+    uint32_t k = l.input_shape[1] * l.num_inputs;
+    uint32_t n = l.output_shape[1];
+    gemm(l.dtype, 0, 0, 0, 0, m, n, k, 1.0, l.concat_output, k, l.weights, n, 0.0, l.linear_output, n);
+
+    snrt_global_barrier();
+
+    return nerr;
+}
diff --git a/sw/dnn/fused_concat_linear/src/main.c b/sw/dnn/fused_concat_linear/src/main.c
new file mode 100644
index 0000000000..6d5f7adf03
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/src/main.c
@@ -0,0 +1,14 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande
+
+#include "dnn.h"
+
+#include "data.h"
+
+int main() {
+    uint32_t nerr = fused_concat_linear_layer(layer);
+    return nerr;
+}
diff --git a/sw/dnn/fused_concat_linear/verify.py b/sw/dnn/fused_concat_linear/verify.py
new file mode 100755
index 0000000000..fedf63a5cf
--- /dev/null
+++ b/sw/dnn/fused_concat_linear/verify.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from data.datagen import golden_model
+
+sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
+import verification  # noqa: E402
+from elf import Elf  # noqa: E402
+from data_utils import bytes_to_float, bytes_to_struct  # noqa: E402
+
+
+ERR_THRESHOLD = 1E-6
+
+PRECISION_T = {
+    8: '64',
+    4: '32',
+    2: '16',
+    1: '8'
+}
+
+NUMPY_T = {
+    '64': np.float64,
+    '32': np.float32,
+    '16': np.float16
+}
+
+
+def main():
+    # Run simulation and get outputs
+    args = verification.parse_args()
+    raw_results = verification.simulate(sim_bin=args.sim_bin,
+                                        snitch_bin=args.snitch_bin,
+                                        symbols_bin=args.symbols_bin,
+                                        log=args.log,
+                                        output_uids=['linear_output'])
+
+    # Extract input operands from ELF file
+    if args.symbols_bin:
+        elf = Elf(args.symbols_bin)
+    else:
+        elf = Elf(args.snitch_bin)
+
+    layer_struct = {
+        'num_inputs': 'I',
+        'in_height': 'I',
+        'in_width': 'I',
+        'out_height': 'I',
+        'out_width': 'I',
+        'inputs': 'I',
+        'weights': 'I',
+        'concat_output': 'I',
+        'linear_output': 'I',
+        'dtype': 'I'
+    }
+    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
+    num_inputs = layer['num_inputs']
+    input_shape = [layer['in_height'], layer['in_width']]
+    weights_shape = [layer['in_width']*num_inputs, layer['out_width']]
+    prec = PRECISION_T[layer['dtype']]
+
+    inputs = [np.array(bytes_to_float(elf.get_symbol_contents(f'input_{i}'), prec), dtype=NUMPY_T[prec]) for i in range(num_inputs)]
+    inputs = [torch.from_numpy(tensor.reshape(input_shape)) for tensor in inputs]
+    weights = np.array(bytes_to_float(elf.get_symbol_contents('weights'), prec), dtype=NUMPY_T[prec])
+    weights = torch.from_numpy(weights.reshape(weights_shape))
+
+    # Verify results
+    output_actual = np.array(bytes_to_float(raw_results['linear_output'], prec), dtype=NUMPY_T[prec])
+    output_golden, _ = golden_model(inputs, weights)
+    output_golden = output_golden.detach().numpy().flatten()
+
+    relative_err = np.absolute((output_golden - output_actual) / output_golden)
+    fail = np.any(relative_err > ERR_THRESHOLD)
+    if fail:
+        verification.dump_results_to_csv([output_golden, output_actual, relative_err],
+                                         Path.cwd() / 'results.csv')
+        print('Maximum relative error:', np.max(relative_err))
+
+    return int(fail)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 313220493a..56e62e6d44 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -205,4 +205,5 @@ typedef struct network_single_cluster_t_ {
 #include "../maxpool/src/maxpool.h"
 #include "../softmax/src/softmax.h"
 #include "../concat/src/concat.h"
+#include "../fused_concat_linear/src/fused_concat_linear.h"
 // #include "utils.h"
diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h
index 918037e646..033b0bf781 100644
--- a/sw/snRuntime/src/team.h
+++ b/sw/snRuntime/src/team.h
@@ -24,6 +24,10 @@ inline uint32_t __attribute__((const)) snrt_global_core_num() {
     return snrt_cluster_num() * snrt_cluster_core_num();
 }
 
+inline uint32_t __attribute__((const)) snrt_global_compute_core_num() {
+    return snrt_cluster_num() * snrt_cluster_compute_core_num();
+}
+
 inline uint32_t __attribute__((const)) snrt_global_core_idx() {
     return snrt_hartid() - snrt_global_core_base_hartid();
 }
diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile
index 222f75cd48..c1faab9fc3 100644
--- a/target/snitch_cluster/sw/apps/Makefile
+++ b/target/snitch_cluster/sw/apps/Makefile
@@ -20,6 +20,7 @@ SUBDIRS += dnn/maxpool
 SUBDIRS += dnn/softmax
 SUBDIRS += dnn/flashattention_2
 SUBDIRS += dnn/concat
+SUBDIRS += dnn/fused_concat_linear
 SUBDIRS += montecarlo/pi_estimation
 
 .PHONY: all clean $(SUBDIRS)
diff --git a/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile b/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile
new file mode 100644
index 0000000000..827014a08f
--- /dev/null
+++ b/target/snitch_cluster/sw/apps/dnn/fused_concat_linear/Makefile
@@ -0,0 +1,12 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande
+
+APP ?= fused_concat_linear
+
+include ../../../../../../sw/dnn/common.mk
+include ../../common.mk
+
+$(DEP): $(DATA_H)
diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml
index 4a8499a636..2746aea862 100644
--- a/target/snitch_cluster/sw/run.yaml
+++ b/target/snitch_cluster/sw/run.yaml
@@ -90,4 +90,6 @@ runs:
     cmd: ../../sw/dnn/flashattention_2/verify.py {sim_bin} {elf}
   - elf: apps/dnn/concat/build/concat.elf
     cmd: ../../sw/dnn/concat/verify.py {sim_bin} {elf}
+  - elf: apps/dnn/fused_concat_linear/build/fused_concat_linear.elf
+    cmd: ../../sw/dnn/fused_concat_linear/verify.py {sim_bin} {elf}
   - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf
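
For reference, the computation the new layer implements, and against which verify.py checks it, can be reproduced in plain PyTorch by mirroring golden_model() in datagen.py above. The sketch below is not part of the patch: the shapes follow data/params.hjson, except that num_inputs is raised to 2 purely to make the concatenation visible.

    import torch

    num_inputs = 2
    # Each input is [32, 4]; concatenating along the innermost dimension
    # yields a [32, 8] tensor, which a [8, 16] weight matrix maps to [32, 16].
    inputs = [torch.rand(32, 4, dtype=torch.float64) for _ in range(num_inputs)]
    weights = torch.rand(4 * num_inputs, 16, dtype=torch.float64)

    concat_output = torch.cat(inputs, dim=-1)              # concat stage
    linear_output = torch.matmul(concat_output, weights)   # linear (GEMM) stage
    assert linear_output.shape == (32, 16)

The data header is generated with data/datagen.py -c data/params.hjson <output.h>, and the kernel is checked against this golden model via verify.py {sim_bin} {elf}, as registered in target/snitch_cluster/sw/run.yaml above.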