dnn: Refactor and verify layernorm
colluca committed Oct 27, 2023
1 parent 5b937ad commit c3e418d
Showing 53 changed files with 1,596 additions and 585 deletions.
2 changes: 2 additions & 0 deletions .clang-format-ignore
@@ -4,3 +4,5 @@

# Ignore vendored third-party code
./sw/math/*
./target/snitch_cluster/sw/apps/transformer/src/transformer.c
./target/snitch_cluster/sw/apps/transformer/src/data.h
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
@@ -117,6 +117,7 @@ jobs:
with:
flake8-version: "6.0.0"
max-line-length: "100"
exclude: "target/snitch_cluster/sw/dnn/datagen.py"

######################
# Clang-Format Check #
14 changes: 7 additions & 7 deletions sw/blas/axpy/data/datagen.py
@@ -11,8 +11,8 @@
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import format_scalar_definition, format_vector_definition, \
format_vector_declaration, format_ifdef_wrapper # noqa: E402
from data_utils import format_scalar_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

MIN = -1000
MAX = +1000
@@ -47,16 +47,16 @@ def main():
a = np.random.uniform(MIN, MAX, 1)
x = np.random.uniform(MIN, MAX, length)
y = np.random.uniform(MIN, MAX, length)
z = np.zeros(length)
g = golden_model(a, x, y)

# Format header file
l_str = format_scalar_definition('const uint32_t', 'l', length)
a_str = format_scalar_definition('const double', 'a', a[0])
x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section)
g_str = format_vector_definition('double', 'g', g)
x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_array_declaration('double', 'z', [length],
alignment=BURST_ALIGNMENT, section=section)
g_str = format_array_definition('double', 'g', g)
g_str = format_ifdef_wrapper('BIST', g_str)
f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str])
f_str += '\n'
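The `format_vector_*` to `format_array_*` rename is the thread running through these datagen changes; note also that `z` switches from a zero-filled *definition* to a shape-only *declaration*, so the output buffer no longer bloats the generated header with initializer data. A minimal sketch of the distinction, using hypothetical simplified stand-ins (the real helpers in `util/sim/data_utils.py` additionally take `alignment` and `section` arguments):

```python
import numpy as np

# Hypothetical, simplified stand-ins for the data_utils helpers, for
# illustration only; the real implementations also emit alignment and
# section attributes.

def format_array_declaration(dtype, uid, shape):
    # Shape-only declaration: the linker reserves space, no initializer.
    dims = ''.join(f'[{d}]' for d in shape)
    return f'{dtype} {uid}{dims};'

def format_array_definition(dtype, uid, values):
    # Full definition: the array contents are baked into the header.
    values = np.asarray(values).flatten()
    body = ', '.join(str(v) for v in values)
    return f'{dtype} {uid}[{len(values)}] = {{{body}}};'

print(format_array_declaration('double', 'z', [4]))
# double z[4];
print(format_array_definition('double', 'g', np.zeros(2)))
# double g[2] = {0.0, 0.0};
```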
16 changes: 8 additions & 8 deletions sw/blas/gemm/data/datagen.py
@@ -15,7 +15,7 @@

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_scalar_definition, \
format_vector_definition, format_ifdef_wrapper # noqa: E402
format_array_definition, format_ifdef_wrapper # noqa: E402


np.random.seed(42)
@@ -100,18 +100,18 @@ def emit_header(**kwargs):
data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
if kwargs['prec'] == 8:
result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten())
else:
result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])],
'result',
result.flatten())
result_def = format_array_definition(C_TYPES[str(kwargs['prec'])],
'result',
result.flatten())
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

1 change: 1 addition & 0 deletions sw/dnn/.gitignore
@@ -0,0 +1 @@
*/data/data.h
136 changes: 136 additions & 0 deletions sw/dnn/batchnorm/data/datagen.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Tim Fischer <[email protected]>
# Viviane Potocnik <[email protected]>
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
import data_utils # noqa: E402
from data_utils import emit_license, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

torch.manual_seed(42)

# AXI splits bursts crossing 4 KB address boundaries. To minimize
# the occurrence of these splits, the data should be aligned to 4 KB.
BURST_ALIGNMENT = 4096

PRECISION_T = {
'64': 'FP64',
'32': 'FP32',
'16': 'FP16',
'8': 'FP8'
}


def golden_model(ifmap):
n, ci, ih, iw = ifmap.shape
bn = torch.nn.BatchNorm2d(ci)
bn.weight.requires_grad = False
bn.bias.requires_grad = False
running_mean = torch.randn_like(bn.running_mean, requires_grad=False)
running_var = torch.rand_like(bn.running_var, requires_grad=False)
gamma = bn.weight / torch.sqrt(running_var + bn.eps)
beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps)
ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1)
return ofmap, gamma, beta
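The folding in `golden_model` is exact: in inference mode, BatchNorm normalizes with fixed running statistics, so the whole layer collapses to a per-channel affine map y = gamma * x + beta. A standalone sanity check of that identity (shapes chosen arbitrarily):

```python
import torch

# Inference-mode BatchNorm: y = (x - mean) / sqrt(var + eps) * weight + bias.
# Folding the constants gives y = gamma * x + beta, as used in golden_model().
x = torch.randn(1, 4, 8, 8)
mean, var = torch.randn(4), torch.rand(4)
weight, bias = torch.ones(4), torch.zeros(4)  # BatchNorm2d defaults
eps = 1e-5

def per_channel(t):
    # Broadcast a per-channel vector over an NCHW tensor
    return t.view(1, -1, 1, 1)

reference = (x - per_channel(mean)) / torch.sqrt(per_channel(var) + eps) \
    * per_channel(weight) + per_channel(bias)
gamma = weight / torch.sqrt(var + eps)
beta = bias - mean * weight / torch.sqrt(var + eps)
folded = x * per_channel(gamma) + per_channel(beta)

assert torch.allclose(reference, folded, atol=1e-6)
```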


def emit_header(**kwargs):

in_channels = kwargs['input_dim']['channels']
in_height = kwargs['input_dim']['height']
in_width = kwargs['input_dim']['width']
tile_ci = kwargs['tile_ci']
prec = str(kwargs['prec'])

torch_type = data_utils.floating_point_torch_type(prec)
ctype = data_utils.floating_point_ctype(prec)

ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
ofmap, gamma, beta = golden_model(ifmap)

# convert from CHW to HWC format
ifmap = ifmap.permute(0, 2, 3, 1)
ofmap = ofmap.permute(0, 2, 3, 1)

n, ih, iw, ci = ifmap.shape

ifmap_uid = 'ifmap'
ofmap_uid = 'ofmap'
beta_uid = 'beta'
gamma_uid = 'gamma'

layer_cfg = {
'CI': ci,
'IH': ih,
'IW': iw,
'TILE_CI': tile_ci,
'ifmap': ifmap_uid,
'ofmap': ofmap_uid,
'beta': beta_uid,
'gamma': gamma_uid
}

data_str = [emit_license()]
# Array forward declarations
data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
data_str += [format_array_declaration(ctype, beta_uid, beta.shape)]
data_str += [format_array_declaration(ctype, gamma_uid, gamma.shape)]
# Layer struct
data_str += [format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)]
# Array definitions
data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
data_str += [format_array_definition(ctype, beta_uid, beta)]
data_str += [format_array_definition(ctype, gamma_uid, gamma)]
# Golden results for BIST
result_def = format_array_definition(ctype, 'golden', ofmap)
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

return data_str


def main():

parser = argparse.ArgumentParser(description='Generate data for batchnorm kernel')
parser.add_argument(
"-c", "--cfg",
type=pathlib.Path,
required=True,
help='Select the param config file for the kernel'
)
parser.add_argument(
'--section',
type=str,
help='Section to store matrices in')
parser.add_argument(
'output',
type=pathlib.Path,
help='Path of the output header file')
args = parser.parse_args()

# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
param['section'] = args.section

# Emit header file
with open(args.output, 'w') as f:
f.write(emit_header(**param))


if __name__ == '__main__':
main()
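Together with the `common.mk` rule further down, the generator is invoked with a config file and an output path; an illustrative call (run from a hypothetical `sw/dnn/batchnorm` working directory, with `--section` left empty as in the Makefile default):

```sh
./data/datagen.py -c data/params.hjson --section="" data/data.h
```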
@@ -2,17 +2,12 @@
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Parameters for a single BatchNorm layer

{
kernel: "BatchNorm"
channels: {
out: 32,
in: 32
}
input_dim: {
channels: 32
height: 8,
width: 8
}
tile_ci: 32
prec: 64
}
31 changes: 24 additions & 7 deletions sw/dnn/src/batchnorm.h → sw/dnn/batchnorm/src/batchnorm.h
@@ -4,6 +4,18 @@

#include "snrt.h"

typedef struct {
uint32_t CI;
uint32_t IH;
uint32_t IW;
uint32_t TILE_CI;
double *ifmap;
double *ofmap;
double *gamma;
double *beta;
precision_t dtype;
} batchnorm_layer_t;
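This typedef is the counterpart of `format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)` in the generator above. A plausible excerpt of what the generated `data.h` would then contain for the shipped 32x8x8 FP64 configuration (a sketch only; exact formatting and contents are up to datagen.py):

```c
// Hypothetical excerpt of a generated data.h (sketch, not real output).
double ifmap[1][8][8][32];   // NHWC, after the CHW -> HWC permute
double ofmap[1][8][8][32];
double beta[32];
double gamma[32];

batchnorm_layer_t layer = {
    .CI = 32,
    .IH = 8,
    .IW = 8,
    .TILE_CI = 32,
    .ifmap = (double *)ifmap,
    .ofmap = (double *)ofmap,
    .beta = (double *)beta,
    .gamma = (double *)gamma
};
```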

/**
* @brief Implementation of an FP64 batchnorm as a linear combination
* y = gamma * x + beta
@@ -50,12 +62,17 @@ static inline void batchnorm_fp64(double *ifmap, double *gamma, double *beta,
snrt_ssr_disable();
}

static inline void batchnorm_layer(const conv_layer *l) {
static inline void batchnorm_layer(const batchnorm_layer_t *l) {
const uint32_t cluster_num = snrt_cluster_num();
const uint32_t cluster_id = snrt_cluster_idx();
const uint32_t compute_num = snrt_cluster_compute_core_num();
const uint32_t compute_id = snrt_cluster_core_idx();

// Calculate output dimensions
uint32_t OH = l->IH;
uint32_t OW = l->IW;
uint32_t CO = l->CI;

// Each cluster loads one tile of a row
uint32_t ifmap_size = 2 * l->IW * l->TILE_CI;
uint32_t weights_size = l->CI;
@@ -78,7 +95,7 @@ static inline void batchnorm_layer(const conv_layer *l) {
uint32_t prev_ow;
uint32_t prev_ci;

for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) {
for (uint32_t oh = cluster_id; oh < OH; oh += cluster_num) {
for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) {
if (snrt_is_dm_core()) {
// Load weights once in the beginning
@@ -112,13 +129,13 @@
if (!(oh == cluster_id && ci == 0)) {
if (l->TILE_CI == l->CI) {
// data is stored consecutively
snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI],
snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI],
&ofmap[!read_buf * (ofmap_size / 2)],
sizeof(double) * l->IW * l->CI);
} else {
// data is stored in interleaved layout
snrt_dma_start_2d(
&l->ofmap[prev_oh * l->OW * l->CI +
&l->ofmap[prev_oh * OW * l->CI +
prev_ci], /* dst */
&ofmap[!read_buf * (ofmap_size / 2)], /* src */
sizeof(double) * l->TILE_CI, /* size */
@@ -146,7 +163,7 @@
batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id],
&gamma[ci + compute_id], &beta[ci + compute_id],
&ofmap[write_buf * ofmap_size / 2 + compute_id],
l->OW, l->TILE_CI, compute_num, setup_SSR);
OW, l->TILE_CI, compute_num, setup_SSR);

write_buf = !write_buf;
read_buf = !read_buf;
@@ -160,13 +177,13 @@
if (snrt_is_dm_core()) {
if (l->TILE_CI == l->CI) {
// data is stored consecutively
snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI],
snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI],
&ofmap[!read_buf * (ofmap_size / 2)],
sizeof(double) * l->IW * l->CI);
} else {
// data is stored in interleaved layout
snrt_dma_start_2d(
&l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */
&l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */
&ofmap[!read_buf * (ofmap_size / 2)], /* src */
sizeof(double) * l->TILE_CI, /* size */
sizeof(double) * l->CI, /* dst_stride */
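The `read_buf`/`write_buf` toggling above is a ping-pong (double-buffering) scheme: each TCDM buffer is split in two halves, and while the compute cores process one half, the DM core concurrently fetches the next tile and drains an already-computed one through the other half. A generic skeleton of the idea, with hypothetical `dma_in`/`dma_out`/`compute_tile` helpers (not the verbatim kernel):

```c
// Generic double-buffering skeleton (hypothetical helpers). In iteration i,
// the DM core fetches tile i and drains tile i-2 while the compute cores
// process tile i-1; the index parities guarantee they touch different halves.
for (uint32_t i = 0; i <= n_tiles + 1; i++) {
    if (snrt_is_dm_core()) {
        if (i < n_tiles) dma_in(&in_buf[(i % 2) * half], i);   // fetch tile i
        if (i >= 2) dma_out(&out_buf[(i % 2) * half], i - 2);  // drain tile i-2
        snrt_dma_wait_all();
    }
    if (snrt_is_compute_core() && i >= 1 && i <= n_tiles) {
        // Process tile i-1, fetched (and synchronized) one iteration earlier
        compute_tile(&in_buf[((i - 1) % 2) * half],
                     &out_buf[((i - 1) % 2) * half]);
    }
    snrt_cluster_hw_barrier();
}
```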
15 changes: 15 additions & 0 deletions sw/dnn/batchnorm/src/main.c
@@ -0,0 +1,15 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "dnn.h"

#include "data.h"

int main() {
batchnorm_layer(&layer);

snrt_global_barrier();

return 0;
}
31 changes: 31 additions & 0 deletions sw/dnn/common.mk
@@ -0,0 +1,31 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

# Usage of absolute paths is required to externally include this Makefile
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data)
SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src)
COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)

DATA_CFG ?= $(DATA_DIR)/params.hjson
SECTION ?=

SRCS ?= $(realpath $(SRC_DIR)/main.c)
INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR)

DATAGEN_PY := $(DATA_DIR)/datagen.py
DATA_H := $(DATA_DIR)/data.h

$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
$< -c $(DATA_CFG) --section="$(SECTION)" $@

.PHONY: clean-data clean

clean-data:
rm -f $(DATA_H)

clean: clean-data
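An application pulls all of this in by setting `APP` and including the file; a hypothetical per-app Makefile sketch:

```make
# Hypothetical sw/dnn/batchnorm Makefile fragment: APP names the
# subdirectory from which common.mk derives DATA_DIR and SRC_DIR.
APP = batchnorm
include ../common.mk
```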
File renamed without changes.