softmax: Add IPC verification
colluca committed Oct 30, 2023
1 parent e88ee12 commit 4f1ecf3
Showing 10 changed files with 247 additions and 76 deletions.
113 changes: 113 additions & 0 deletions sw/dnn/softmax/data/datagen.py
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Tim Fischer <[email protected]>
# Viviane Potocnik <[email protected]>
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
import data_utils # noqa: E402
from data_utils import emit_license, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

torch.manual_seed(42)

# AXI splits bursts crossing 4KB address boundaries. To minimize
# the occurrence of these splits, the data should be aligned to 4KB.
BURST_ALIGNMENT = 4096

PRECISION_T = {
'64': 'FP64',
'32': 'FP32',
'16': 'FP16',
'8': 'FP8'
}


def golden_model(ifmap, axis):
softmax = torch.nn.Softmax(dim=axis)
return softmax(ifmap)


def emit_header(**kwargs):
batch_size = kwargs['input_dim']['batch_size']
seq_len = kwargs['input_dim']['seq_len']
input_samples = kwargs['input_dim']['input_samples']
reduce_dim = kwargs['reduce_dim']
prec = str(kwargs['prec'])

torch_type = data_utils.floating_point_torch_type(prec)
ifmap = torch.randn(batch_size, seq_len, input_samples, requires_grad=False, dtype=torch_type)

ofmap = golden_model(ifmap, reduce_dim)
ofmap = ofmap.detach().numpy()

ctype = data_utils.floating_point_ctype(prec)

ifmap_uid = 'ifmap'
ofmap_uid = 'ofmap'

layer_cfg = {
**kwargs['input_dim'],
'reduce_dim': reduce_dim,
'ifmap': ifmap_uid,
'ofmap': ofmap_uid,
'dtype': PRECISION_T[prec]
}

data_str = [emit_license()]
data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
alignment=BURST_ALIGNMENT)]
data_str += [format_struct_definition('softmax_layer_t', 'layer', layer_cfg)]
data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
alignment=BURST_ALIGNMENT)]
result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

return data_str


def main():

    parser = argparse.ArgumentParser(description='Generate data for softmax kernel')
parser.add_argument(
"-c", "--cfg",
type=pathlib.Path,
required=True,
        help='Select param config file for the kernel'
)
parser.add_argument(
'--section',
type=str,
help='Section to store matrices in')
parser.add_argument(
'output',
type=pathlib.Path,
help='Path of the output header file')
args = parser.parse_args()

# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
param['section'] = args.section

# Emit header file
with open(args.output, 'w') as f:
f.write(emit_header(**param))


if __name__ == '__main__':
main()
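
For reference, emit_header() can also be exercised directly with a dictionary shaped like the hjson config this script parses; a minimal sketch (not part of the commit) follows. The batch_size and seq_len values match the params file below, while input_samples, reduce_dim, and prec are illustrative assumptions:

# Hypothetical config, shaped like the hjson file parsed by main().
# batch_size and seq_len follow the params file in this commit;
# input_samples, reduce_dim and prec are assumed values.
cfg = {
    'input_dim': {'batch_size': 3, 'seq_len': 16, 'input_samples': 256},
    'reduce_dim': -1,  # assumed: softmax over the last dimension
    'prec': '32',      # assumed: FP32
}
print(emit_header(**cfg))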
@@ -2,10 +2,7 @@
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

-// Parameters for a single SoftMax layer
-
{
-    kernel: "SoftMax"
input_dim: {
batch_size: 3,
seq_len: 16,
14 changes: 14 additions & 0 deletions sw/dnn/softmax/src/main.c
@@ -0,0 +1,14 @@
// Copyright 2023 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Luca Colagrande <[email protected]>

#include "dnn.h"

#include "data.h"

int main() {
softmax_layer(layer);
return 0;
}
66 changes: 28 additions & 38 deletions sw/dnn/softmax/softmax.h → sw/dnn/softmax/src/softmax.h
@@ -6,38 +6,31 @@

#include "math.h"
#include "snrt.h"
-// #include "printf.h"
#include "utils.h"

/**
 * @struct softmax_layer_struct
 * @brief This structure contains all parameters necessary
 *        for computing the Softmax activation function
- * @var softmax_layer_struct::BATCH_SIZE
+ * @var softmax_layer_struct::batch_size
 *    Number of input samples in the batch
- * @var softmax_layer_struct::SEQ_LEN
+ * @var softmax_layer_struct::seq_len
 *    Sequence length of each input sample
- * @var softmax_layer_struct::INPUT_SAMPLES
+ * @var softmax_layer_struct::input_samples
 *    Number of elements per sequence position (the softmax row length)
- * @var softmax_layer_struct::REDUCE_DIM
+ * @var softmax_layer_struct::reduce_dim
 *    Along which dimension to reduce
 * @var softmax_layer_struct::ifmap
 *    Pointer to input feature map
 * @var softmax_layer_struct::ofmap
 *    Pointer to output feature map
 * @var softmax_layer_struct::result
 *    Pointer to the golden model output
 */
typedef struct softmax_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t INPUT_SAMPLES;
-    uint32_t REDUCE_DIM;

+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t input_samples;
+    int32_t reduce_dim;
float *ifmap;
float *ofmap;
float *result;

precision_t dtype;
} softmax_layer_t;

@@ -50,9 +43,6 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI,
float max_core = 0.0; // max value of the current core
float sum = 0.0; // sum of the exp values of the current core

-    // uint32_t compute_id = snrt_global_core_idx();
-    // uint32_t num_cores = snrt_cluster_compute_core_num();

for (int32_t b = 0; b < batch_size; b++) {
for (int32_t s = 0; s < seq_len; s++) {
max_core = -INFINITY;
@@ -67,23 +57,13 @@
// compute the shifted value of the current row
for (int32_t i = 0; i < input_samples; i++) {
output[b * batch_offset + s * ldI + i] =
-                // FIXME: Below code is erroring due to the standard math
-                // lib conflict
-                // TODO: Try out with musl lib
-                // expf(input[b * batch_offset + s * ldI + i] - max_core);
-                // FIXME: actually there should be an exponentiation
-                input[b * batch_offset + s * ldI + i] - max_core;
+                expf(input[b * batch_offset + s * ldI + i] - max_core);
sum += output[b * batch_offset + s * ldI + i];
}

// compute the softmax value of the current row
for (int32_t i = 0; i < input_samples; i++) {
-            // INFO: DIVSQRT unit MUST be activated in the cluster
-            // configuration
            output[b * batch_offset + s * ldI + i] /= sum;
-            // printf("output[%d] = %f\n", compute_id * input_samples + b *
-            // batch_offset + s * ldI + i,
-            // output[b * batch_offset + s * ldI + i]);
}
}
}
@@ -97,14 +77,14 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI,
* @param l softmax_layer struct that holds addresses and parameters
*
*/
-static inline void softmax_layer(softmax_layer_t *const l) {
+static inline void softmax_layer(softmax_layer_t const l) {
uint32_t cluster_num = snrt_cluster_num();
uint32_t cluster_id = snrt_cluster_idx();
uint32_t compute_num = snrt_cluster_compute_core_num();
uint32_t compute_id = snrt_global_core_idx();

uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float);
+        l.batch_size * l.seq_len * l.input_samples * sizeof(float);
uint32_t ofmap_size = ifmap_size;

void *ptr = (float *)snrt_l1_next();
@@ -116,9 +96,9 @@ static inline void softmax_layer(softmax_layer_t *const l) {
// DMA transfer the ifmap into the cluster TCDM
if (snrt_is_dm_core()) {
snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
-            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
-            l->SEQ_LEN * l->INPUT_SAMPLES * sizeof(float));
+            ifmap, l.ifmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));

snrt_dma_wait_all();
}
@@ -127,21 +107,31 @@

if (snrt_is_compute_core()) {
// determine the row offset for each core
-        int32_t row_offset = compute_id * l->INPUT_SAMPLES;
+        int32_t row_offset = compute_id * l.input_samples;

// determine the row stride of each matrix
-        int32_t ldI = compute_num * l->INPUT_SAMPLES;
+        int32_t ldI = compute_num * l.input_samples;

// determine the batch offset for each core
-        int32_t batch_offset = l->SEQ_LEN * l->INPUT_SAMPLES;
+        int32_t batch_offset = l.seq_len * l.input_samples;

-        // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
        softmax_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI, batch_offset,
-                     l->BATCH_SIZE, l->SEQ_LEN / 8, l->INPUT_SAMPLES);
+                     l.batch_size, l.seq_len / 8, l.input_samples);

} else {
snrt_cluster_hw_barrier();
}

+    // DMA transfer the ofmap to DRAM
+    if (snrt_is_dm_core()) {
+        snrt_dma_txid_t txid_ofmap = snrt_dma_start_2d(
+            l.ofmap, ofmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.input_samples * sizeof(float));
+
+        snrt_dma_wait_all();
+    }
+
+    snrt_global_barrier();
}
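
As a reference for what the kernel above computes, here is a NumPy sketch (not part of the commit) of the same max-shifted, numerically stable softmax, including the row interleaving used to split work across compute cores. Names such as softmax_rows and num_cores are illustrative:

import numpy as np

def softmax_rows(x):
    # Shift each row by its max before exponentiating, as softmax_fp32 does:
    # exp(x - max) cannot overflow, and the common factor exp(-max)
    # cancels when dividing by the row sum.
    shifted = x - x.max(axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=-1, keepdims=True)

def softmax_interleaved(ifmap, num_cores):
    # Each compute core starts at row `compute_id` and strides by
    # `num_cores`, mirroring row_offset and ldI in softmax_layer().
    ofmap = np.empty_like(ifmap)
    for compute_id in range(num_cores):
        rows = ifmap[:, compute_id::num_cores, :]
        ofmap[:, compute_id::num_cores, :] = softmax_rows(rows)
    return ofmap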
85 changes: 85 additions & 0 deletions sw/dnn/softmax/verify.py
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import sys
from pathlib import Path
import numpy as np
import torch
from data.datagen import golden_model

sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
import verification # noqa: E402
from elf import Elf # noqa: E402
from data_utils import bytes_to_float, bytes_to_struct # noqa: E402


ERR_THRESHOLD = 0.003

PRECISION_T = {
8: '64',
4: '32',
2: '16',
1: '8'
}

NUMPY_T = {
'64': np.float64,
'32': np.float32,
'16': np.float16
}


def main():
# Run simulation and get outputs
args = verification.parse_args()
raw_results = verification.simulate(sim_bin=args.sim_bin,
snitch_bin=args.snitch_bin,
symbols_bin=args.symbols_bin,
log=args.log,
output_uids=['ofmap'])

# Extract input operands from ELF file
if args.symbols_bin:
elf = Elf(args.symbols_bin)
else:
elf = Elf(args.snitch_bin)

layer_struct = {
'batch_size': 'I',
'seq_len': 'I',
'input_samples': 'I',
'reduce_dim': 'i',
'ifmap_ptr': 'I',
'ofmap_ptr': 'I',
'dtype': 'I'
}
layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
batch_size = layer['batch_size']
seq_len = layer['seq_len']
input_samples = layer['input_samples']
reduce_dim = layer['reduce_dim']
prec = PRECISION_T[layer['dtype']]

ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec])
ifmap = ifmap.reshape(batch_size, seq_len, input_samples)
ifmap = torch.from_numpy(ifmap)

# Verify results
ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec])
ofmap_golden = golden_model(ifmap, reduce_dim).detach().numpy().flatten()

absolute_err = np.absolute(ofmap_golden - ofmap_actual)
fail = np.any(absolute_err > ERR_THRESHOLD)
    if fail:
verification.dump_results_to_csv([ofmap_golden, ofmap_actual, absolute_err],
Path.cwd() / 'softmax_results.csv')

return int(fail)


if __name__ == "__main__":
sys.exit(main())
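
For context, bytes_to_struct is assumed to unpack the raw bytes of the `layer` symbol according to the format characters above ('I' unsigned, 'i' signed 32-bit). A rough standalone equivalent using only the standard library (field order and little-endianness are assumptions; unpack_layer is a hypothetical helper):

import struct

def unpack_layer(raw: bytes) -> dict:
    # Seven little-endian 32-bit words, matching the layer_struct fields above.
    fields = ('batch_size', 'seq_len', 'input_samples', 'reduce_dim',
              'ifmap_ptr', 'ofmap_ptr', 'dtype')
    fmt = '<IIIiIII'
    values = struct.unpack(fmt, raw[:struct.calcsize(fmt)])
    return dict(zip(fields, values))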
2 changes: 1 addition & 1 deletion sw/dnn/src/dnn.h
@@ -202,5 +202,5 @@ typedef struct network_single_cluster_t_ {
#include "../layernorm/src/layernorm.h"
#include "../linear/src/linear.h"
#include "../maxpool/src/maxpool.h"
-// #include "softmax.h"
+#include "../softmax/src/softmax.h"
// #include "utils.h"
2 changes: 1 addition & 1 deletion target/snitch_cluster/sw/apps/Makefile
@@ -17,7 +17,7 @@ SUBDIRS += dnn/gemm
SUBDIRS += dnn/layernorm
SUBDIRS += dnn/linear
SUBDIRS += dnn/maxpool
-# SUBDIRS += dnn/softmax
+SUBDIRS += dnn/softmax
SUBDIRS += montecarlo/pi_estimation

.PHONY: all clean $(SUBDIRS)