From c1d5c754d290b1f6d1498f40757511cfed00e944 Mon Sep 17 00:00:00 2001 From: Viviane Potocnik Date: Wed, 7 Feb 2024 18:28:57 +0100 Subject: [PATCH] sw: verification framework for FusedConv (failing) --- sw/dnn/conv2d/src/conv2d.h | 50 +++++----- sw/dnn/fusedconv/data/datagen.py | 55 +++++++---- sw/dnn/fusedconv/data/params.hjson | 2 +- sw/dnn/fusedconv/src/main.c | 51 ++++------- sw/dnn/fusedconv/verify.py | 142 +++++++++++++++++++++++++++++ target/snitch_cluster/sw/run.yaml | 6 +- 6 files changed, 228 insertions(+), 78 deletions(-) create mode 100755 sw/dnn/fusedconv/verify.py diff --git a/sw/dnn/conv2d/src/conv2d.h b/sw/dnn/conv2d/src/conv2d.h index 976795f47d..823a980d26 100644 --- a/sw/dnn/conv2d/src/conv2d.h +++ b/sw/dnn/conv2d/src/conv2d.h @@ -144,35 +144,37 @@ typedef struct conv_layer_struct { */ typedef struct { + uint32_t ch_in; + uint32_t ch_out; + uint32_t dim_in_x; + uint32_t dim_in_y; + uint32_t dim_kernel_x; + uint32_t dim_kernel_y; + uint32_t dim_out_x; + uint32_t dim_out_y; + uint32_t padding_y_top; + uint32_t padding_y_bottom; + uint32_t padding_x_left; + uint32_t padding_x_right; + uint32_t stride_x; + uint32_t stride_y; + uint32_t flag_relu; + int flag_batch_norm; + int depthwise; + int chw_layer; + int flag_y_accumulate_start; + int flag_y_accumulate_end; float *pInBuffer; - uint16_t dim_in_x; - uint16_t dim_in_y; - uint16_t ch_in; float *pWeight; - uint16_t ch_out; - uint16_t dim_kernel_x; - uint16_t dim_kernel_y; - uint16_t padding_y_top; - uint16_t padding_y_bottom; - uint16_t padding_x_left; - uint16_t padding_x_right; - uint16_t stride_x; - uint16_t stride_y; - int8_t *bias; - uint16_t bias_shift; - uint16_t out_shift; - uint16_t out_mult; - float *pOutBuffer; - uint16_t dim_out_x; - uint16_t dim_out_y; - float *kappa; float *lambda; + float *kappa; + float *pOutBuffer; + int8_t *bias; uint8_t *pIm2ColBuffer; - int flag_relu; - int flag_batch_norm; - int flag_y_accumulate_start; - int flag_y_accumulate_end; unsigned int *memory_chan; + uint32_t bias_shift; + uint32_t out_shift; + uint32_t out_mult; precision_t dtype; } kernel_fp32; diff --git a/sw/dnn/fusedconv/data/datagen.py b/sw/dnn/fusedconv/data/datagen.py index 0d37991eec..340f098bd0 100755 --- a/sw/dnn/fusedconv/data/datagen.py +++ b/sw/dnn/fusedconv/data/datagen.py @@ -6,7 +6,6 @@ # Viviane Potocnik import argparse -import numpy as np import pathlib import hjson import sys @@ -17,8 +16,7 @@ import data_utils # noqa: E402 from data_utils import emit_license, \ format_struct_definition, format_array_definition, \ - format_scalar_definition, format_array_declaration, \ - format_ifdef_wrapper, NUMPY_T # noqa: E402 + format_scalar_definition, format_array_declaration # noqa: E402 torch.manual_seed(42) @@ -26,6 +24,7 @@ # the occurrence of these splits the data should be aligned to 4KB BURST_ALIGNMENT = 4096 + # FusedConv def golden_model(ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise): @@ -110,21 +109,20 @@ def emit_header(**kwargs): kwargs['dim_kernel_x'], kwargs['ch_out'], requires_grad=False, dtype=torch_type) - bn_k = torch.randn(kwargs['ch_out'], requires_grad=False, dtype=torch_type) bn_l = torch.randn(kwargs['ch_out'], requires_grad=False, dtype=torch_type) flag_y_accumulate_start = kwargs['flags']['flag_y_accumulate_start'] - ofmap, ofmap_before, ifmap_padded = golden_model(ifmap, kernel, + ofmap, ofmap_before, ifmap_padded = golden_model(ifmap, kernel, bn_k, bn_l, - kwargs['padding'], - kwargs['stride'], + kwargs['padding'], + kwargs['stride'], 
kwargs['flags']['flag_batch_norm'], kwargs['flags']['flag_relu'], not flag_y_accumulate_start, kwargs['depthwise']) - + if kwargs['chw_layer']: ifmap = ifmap.permute(2, 0, 1) ifmap_padded = ifmap_padded.permute(2, 0, 1) @@ -166,22 +164,47 @@ def emit_header(**kwargs): 'stride_y': kwargs['stride']['stride_y'], 'flag_relu': kwargs['flags']['flag_relu'], 'flag_batch_norm': kwargs['flags']['flag_batch_norm'], + 'depthwise': kwargs['depthwise'], + 'chw_layer': kwargs['chw_layer'], 'flag_y_accumulate_start': flag_y_accumulate_start, 'flag_y_accumulate_end': kwargs['flags']['flag_y_accumulate_end'], + 'pInBuffer': 'fusedconv_pInBuffer_dram', + 'pWeight': 'fusedconv_pWeight_dram', + 'lambda': 'fusedconv_lambda_dram', + 'kappa': 'fusedconv_kappa_dram', + 'pOutBuffer': 'fusedconv_pOutBuffer_dram', 'dtype': 'FP' + prec } data_str = [emit_license()] - data_str += [format_struct_definition('kernel_fp32', 'k', layer_cfg)] + data_str += [format_array_declaration(ctype, 'fusedconv_pInBuffer_dram', + ifmap_padded.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, 'fusedconv_pWeight_dram', + kernel.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, 'fusedconv_lambda_dram', + bn_l.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, 'fusedconv_kappa_dram', + bn_k.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, 'fusedconv_pOutBuffer_dram', + ofmap_before.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, 'fusedconv_pCheckOutBuffer_dram', + ofmap.numpy().shape, BURST_ALIGNMENT)] + data_str += [format_struct_definition('kernel_fp32', 'layer', layer_cfg)] data_str += [format_scalar_definition('uint32_t', 'dw', kwargs['depthwise'])] data_str += [format_scalar_definition('uint32_t', 'chw_layer', kwargs['chw_layer'])] - data_str += [format_array_definition(ctype, f'fusedconv_pInBuffer_dram', ifmap_padded.numpy(), BURST_ALIGNMENT)] - data_str += [format_array_definition(ctype, f'fusedconv_pWeight_dram', kernel.numpy(), BURST_ALIGNMENT)] - data_str += [format_array_definition(ctype, f'fusedconv_lambda_dram', bn_l.numpy(), BURST_ALIGNMENT)] - data_str += [format_array_definition(ctype, f'fusedconv_kappa_dram', bn_k.numpy(), BURST_ALIGNMENT)] - data_str += [format_array_definition(ctype, f'fusedconv_pOutBuffer_dram', ofmap_before.numpy(), BURST_ALIGNMENT)] - data_str += [format_array_definition(ctype, f'fusedconv_pCheckOutBuffer_dram', ofmap.numpy(), BURST_ALIGNMENT)] - + data_str += [format_array_definition(ctype, 'fusedconv_pInBuffer_dram', + ifmap_padded.numpy(), BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, 'fusedconv_pWeight_dram', + kernel.numpy(), BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, 'fusedconv_lambda_dram', + bn_l.numpy(), BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, 'fusedconv_kappa_dram', + bn_k.numpy(), BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, 'fusedconv_pOutBuffer_dram', + ofmap_before.numpy(), BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, 'fusedconv_pCheckOutBuffer_dram', + ofmap.numpy(), BURST_ALIGNMENT)] + data_str = '\n\n'.join(data_str) return data_str diff --git a/sw/dnn/fusedconv/data/params.hjson b/sw/dnn/fusedconv/data/params.hjson index 3c26ff566f..e566be0dd5 100644 --- a/sw/dnn/fusedconv/data/params.hjson +++ b/sw/dnn/fusedconv/data/params.hjson @@ -30,5 +30,5 @@ } depthwise: 0, chw_layer: 0, - prec: '32' + prec: '32', } \ No newline at end of file 
diff --git a/sw/dnn/fusedconv/src/main.c b/sw/dnn/fusedconv/src/main.c index d2504c3849..8d25be3668 100644 --- a/sw/dnn/fusedconv/src/main.c +++ b/sw/dnn/fusedconv/src/main.c @@ -10,15 +10,16 @@ void *share_ptr; int main() { - uint32_t ifmap_size = (k.dim_in_x + k.padding_x_left + k.padding_x_right) * - (k.dim_in_y + k.padding_y_top + k.padding_y_bottom) * - k.ch_in; + uint32_t ifmap_size = + (layer.dim_in_x + layer.padding_x_left + layer.padding_x_right) * + (layer.dim_in_y + layer.padding_y_top + layer.padding_y_bottom) * + layer.ch_in; uint32_t weights_size = - k.dim_kernel_x * k.dim_kernel_y * k.ch_in * k.ch_out; - uint32_t ofmap_size = k.dim_out_x * k.dim_out_y * k.ch_out; + layer.dim_kernel_x * layer.dim_kernel_y * layer.ch_in * layer.ch_out; + uint32_t ofmap_size = layer.dim_out_x * layer.dim_out_y * layer.ch_out; uint32_t total_size = - ifmap_size + weights_size + k.ch_out + k.ch_out + ofmap_size; + ifmap_size + weights_size + layer.ch_out + layer.ch_out + ofmap_size; float *ptr; @@ -36,9 +37,9 @@ int main() { float *pWeight = ptr; ptr += weights_size; float *kappa = ptr; - ptr += k.ch_out; + ptr += layer.ch_out; float *lambda = ptr; - ptr += k.ch_out; + ptr += layer.ch_out; float *pOutBuffer = ptr; ptr += ofmap_size; @@ -56,11 +57,11 @@ int main() { snrt_dma_wait_all(); } - k.pInBuffer = pInBuffer; - k.pWeight = pWeight; - k.pOutBuffer = pOutBuffer; - k.kappa = kappa; - k.lambda = lambda; + layer.pInBuffer = pInBuffer; + layer.pWeight = pWeight; + layer.pOutBuffer = pOutBuffer; + layer.kappa = kappa; + layer.lambda = lambda; snrt_cluster_hw_barrier(); @@ -68,16 +69,16 @@ int main() { if (snrt_is_compute_core() || (snrt_cluster_core_num() == 1)) { if (dw) { snrt_mcycle(); - conv2d_dw_fp32(&k); + conv2d_dw_fp32(&layer); snrt_mcycle(); } else if (chw_layer) { snrt_mcycle(); - conv2d_chw_fp32(&k); + conv2d_chw_fp32(&layer); snrt_mcycle(); } else { snrt_mcycle(); - conv2d_fp32(&k); + conv2d_fp32(&layer); snrt_mcycle(); } @@ -88,23 +89,5 @@ int main() { } snrt_cluster_hw_barrier(); - uint32_t errors = 0; - if (snrt_is_dm_core()) { - // Output feature map (H x W x Co) - const uint32_t output_w_stride = k.ch_out; - const uint32_t output_h_stride = output_w_stride * k.dim_out_x; - for (uint32_t i = 0; i < ofmap_size; i++) { - if (fabs(pOutBuffer[i] - - ((float *)fusedconv_pCheckOutBuffer_dram)[i]) > 0.01) { - errors++; - printf("Error at h %d w %d co %d\n", i / output_h_stride, - (i % output_h_stride) / output_w_stride, - i % output_w_stride); - printf("Expected: %f, Got: %f\n", ((float *)fusedconv_pCheckOutBuffer_dram)[i], pOutBuffer[i]); - } - } - printf("%d/%d Errors\n", errors, ofmap_size); - } - return 0; } \ No newline at end of file diff --git a/sw/dnn/fusedconv/verify.py b/sw/dnn/fusedconv/verify.py new file mode 100755 index 0000000000..96aa23fd93 --- /dev/null +++ b/sw/dnn/fusedconv/verify.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Viviane Potocnik + +import sys +from pathlib import Path +import numpy as np +import torch +from data.datagen import golden_model + +sys.path.append(str(Path(__file__).parent / '../../../util/sim/')) +import verification # noqa: E402 +from elf import Elf # noqa: E402 +from data_utils import bytes_to_float, bytes_to_struct, NUMPY_T, \ + PRECISION_T # noqa: E402 + + +ERR_THRESHOLD = 1E-6 + + +def main(): + # Run simulation and get outputs + args = verification.parse_args() + raw_results = verification.simulate(sim_bin=args.sim_bin, + snitch_bin=args.snitch_bin, + symbols_bin=args.symbols_bin, + log=args.log, + output_uids=['fusedconv_pOutBuffer_dram']) + + # Extract input operands from ELF file + if args.symbols_bin: + elf = Elf(args.symbols_bin) + else: + elf = Elf(args.snitch_bin) + + layer_struct = { + 'ch_in': 'I', + 'ch_out': 'I', + 'dim_in_x': 'I', + 'dim_in_y': 'I', + 'dim_kernel_x': 'I', + 'dim_kernel_y': 'I', + 'dim_out_x': 'I', + 'dim_out_y': 'I', + 'padding_y_top': 'I', + 'padding_y_bottom': 'I', + 'padding_x_left': 'I', + 'padding_x_right': 'I', + 'stride_x': 'I', + 'stride_y': 'I', + 'flag_relu': 'I', + 'flag_batch_norm': 'I', + 'depthwise': 'I', + 'chw_layer': 'I', + 'flag_y_accumulate_start': 'I', + 'flag_y_accumulate_end': 'I', + 'pInBuffer': 'I', + 'pWeight': 'I', + 'lambda': 'I', + 'kappa': 'I', + 'pOutBuffer': 'I', + 'pIm2ColBuffer': 'I', + 'memory_chan': 'I', + 'bias': 'I', + 'bias_shift': 'I', + 'out_shift': 'I', + 'out_mult': 'I', + 'dtype': 'I' + } + + layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct) + ifmap = [np.array(bytes_to_float( + elf.get_symbol_contents('fusedconv_pInBuffer_dram'), + PRECISION_T[layer['dtype']]), + dtype=NUMPY_T[PRECISION_T[layer['dtype']]])] + ifmap = torch.from_numpy( + ifmap[0].reshape(layer['dim_in_y'], layer['dim_in_x'], layer['ch_in'])) + kernel = [np.array(bytes_to_float( + elf.get_symbol_contents('fusedconv_pWeight_dram'), + PRECISION_T[layer['dtype']]), + dtype=NUMPY_T[PRECISION_T[layer['dtype']]])] + if not layer['depthwise']: + kernel = torch.from_numpy( + kernel[0].reshape(layer['ch_out'], layer['dim_kernel_y'], + layer['dim_kernel_x'], layer['ch_in'])) + else: + kernel = torch.from_numpy( + kernel[0].reshape(layer['dim_kernel_y'], layer['dim_kernel_x'], + layer['ch_out'])) + + bn_k = [np.array(bytes_to_float( + elf.get_symbol_contents('fusedconv_kappa_dram'), + PRECISION_T[layer['dtype']]), + dtype=NUMPY_T[PRECISION_T[layer['dtype']]])] + bn_k = torch.from_numpy(bn_k[0]) + bn_l = [np.array(bytes_to_float( + elf.get_symbol_contents('fusedconv_lambda_dram'), + PRECISION_T[layer['dtype']]), + dtype=NUMPY_T[PRECISION_T[layer['dtype']]])] + bn_l = torch.from_numpy(bn_l[0]) + + flag_y_accumulate_start = layer['flag_y_accumulate_start'] + + # Verify results + output_actual = np.array(bytes_to_float( + raw_results['fusedconv_pOutBuffer_dram'], + PRECISION_T[layer['dtype']]), + dtype=NUMPY_T[PRECISION_T[layer['dtype']]]) + output_golden, _, _ = golden_model(ifmap, kernel, + bn_k, bn_l, + layer, + layer, + layer['flag_batch_norm'], + layer['flag_relu'], + not flag_y_accumulate_start, + layer['depthwise']) + output_golden = output_golden.detach().numpy().flatten() + + # relative_err = np.absolute((output_golden - output_actual) / output_golden) + # compute relative error only for non-zero elements + relative_err = np.zeros_like(output_golden) + non_zero = output_golden != 0 + zero_idx = np.where(output_golden == 0) + relative_err[non_zero] = 
np.absolute((output_golden[non_zero] - output_actual[non_zero]) + / output_golden[non_zero]) + relative_err[zero_idx] = np.absolute(output_golden[zero_idx] - output_actual[zero_idx]) + + fail = np.any(relative_err > ERR_THRESHOLD) + if (fail): + verification.dump_results_to_csv( + [output_golden, output_actual, relative_err], + Path.cwd() / 'results.csv') + print('Maximum relative error:', np.max(relative_err)) + + return int(fail) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 22d9616afc..01097cd5ab 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -79,12 +79,12 @@ runs: - elf: apps/dnn/gemm/build/gemm.elf # - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV w/o FDIV unit # cmd: [../../../sw/dnn/layernorm/verify.py, "${sim_bin}", "${elf}"] - # - elf: apps/dnn/conv2d/build/conv2d.elf + # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results # cmd: [../../../sw/dnn/conv2d/verify.py, "${sim_bin}", "${elf}"] - # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly + # - elf: apps/dnn/fusedconv/build/fusedconv.elf # Fails with wrong results # - elf: apps/dnn/softmax/build/softmax.elf # Illegal FDIV without FDIV unit # cmd: [../../../sw/dnn/softmax/verify.py, "${sim_bin}", "${elf}"] - # - elf: apps/dnn/flashattention_2/build/flashattention_2.elf # stalling + # - elf: apps/dnn/flashattention_2/build/flashattention_2.elf # TODO: test # cmd: [../../../sw/dnn/flashattention_2/verify.py, "${sim_bin}", "${elf}"] - elf: apps/dnn/concat/build/concat.elf cmd: [../../../sw/dnn/concat/verify.py, "${sim_bin}", "${elf}"]
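
A note on the struct layout: bytes_to_struct() can only recover the fields of 'layer' if the 'I' entries in verify.py's layer_struct appear in exactly the same order as the members of kernel_fp32 in conv2d.h, which is why this patch groups all integer fields at the front of the struct. Below is a minimal sketch of that decoding, assuming a 32-bit little-endian target and using Python's struct module directly instead of the data_utils helper (field list copied from the reordered struct, helper name illustrative only):

    import struct

    INT_FIELDS = ['ch_in', 'ch_out', 'dim_in_x', 'dim_in_y',
                  'dim_kernel_x', 'dim_kernel_y', 'dim_out_x', 'dim_out_y',
                  'padding_y_top', 'padding_y_bottom', 'padding_x_left', 'padding_x_right',
                  'stride_x', 'stride_y', 'flag_relu', 'flag_batch_norm',
                  'depthwise', 'chw_layer', 'flag_y_accumulate_start', 'flag_y_accumulate_end']

    def decode_leading_ints(raw):
        # Decode only the leading uint32_t block of a packed kernel_fp32 struct;
        # the pointer and dtype fields that follow are ignored here.
        fmt = '<' + 'I' * len(INT_FIELDS)
        values = struct.unpack(fmt, raw[:struct.calcsize(fmt)])
        return dict(zip(INT_FIELDS, values))

    # Illustrative round trip with dummy values 0..19.
    raw = struct.pack('<20I', *range(20))
    print(decode_leading_ints(raw)['dim_kernel_x'])  # 4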
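
The error metric in verify.py computes a relative error where the golden value is non-zero and falls back to an absolute error where it is zero. The sketch below mirrors that logic in isolation so it can be sanity-checked without a simulation run; the helper name and the sample arrays are illustrative only:

    import numpy as np

    ERR_THRESHOLD = 1E-6  # same threshold as verify.py

    def fusedconv_error(golden, actual):
        # Relative error where the reference is non-zero, absolute error otherwise,
        # mirroring the non_zero/zero_idx split in verify.py.
        golden = np.asarray(golden, dtype=np.float64)
        actual = np.asarray(actual, dtype=np.float64)
        err = np.zeros_like(golden)
        non_zero = golden != 0
        err[non_zero] = np.abs((golden[non_zero] - actual[non_zero]) / golden[non_zero])
        err[~non_zero] = np.abs(golden[~non_zero] - actual[~non_zero])
        return err

    # Illustrative values: only the zero-reference element exceeds the threshold.
    golden = np.array([1.0, 0.0, -2.5])
    actual = np.array([1.0 + 1e-7, 1e-5, -2.5])
    print(np.any(fusedconv_error(golden, actual) > ERR_THRESHOLD))  # True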
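
The per-element report removed from main.c located a mismatch by its (h, w, co) coordinates in the H x W x Co output layout. The same index arithmetic can be reused when reading the flat rows of the results.csv that verify.py dumps on failure; the helper below is an illustrative sketch of that mapping, not part of the patch:

    def ofmap_coords(i, dim_out_x, ch_out):
        # Same arithmetic as the removed check in main.c: the output feature map
        # is stored H x W x Co, so the channel index is the fastest-moving one.
        w_stride = ch_out
        h_stride = w_stride * dim_out_x
        return i // h_stride, (i % h_stride) // w_stride, i % w_stride

    # Illustrative: element 13 of an 8x8x4 output map lives at h=0, w=3, co=1.
    print(ofmap_coords(13, dim_out_x=8, ch_out=4))  # (0, 3, 1)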