dnn: Refactor and verify layernorm
colluca committed Oct 27, 2023
1 parent 5b937ad commit c3e418d
Showing 53 changed files with 1,596 additions and 585 deletions.
2 changes: 2 additions & 0 deletions .clang-format-ignore
@@ -4,3 +4,5 @@

# Ignore vendored third-party code
./sw/math/*
./target/snitch_cluster/sw/apps/transformer/src/transformer.c
./target/snitch_cluster/sw/apps/transformer/src/data.h
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
@@ -117,6 +117,7 @@ jobs:
with:
flake8-version: "6.0.0"
max-line-length: "100"
exclude: "target/snitch_cluster/sw/dnn/datagen.py"

######################
# Clang-Format Check #
14 changes: 7 additions & 7 deletions sw/blas/axpy/data/datagen.py
@@ -11,8 +11,8 @@
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import format_scalar_definition, format_vector_definition, \
format_vector_declaration, format_ifdef_wrapper # noqa: E402
from data_utils import format_scalar_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

MIN = -1000
MAX = +1000
@@ -47,16 +47,16 @@ def main():
a = np.random.uniform(MIN, MAX, 1)
x = np.random.uniform(MIN, MAX, length)
y = np.random.uniform(MIN, MAX, length)
z = np.zeros(length)
g = golden_model(a, x, y)

# Format header file
l_str = format_scalar_definition('const uint32_t', 'l', length)
a_str = format_scalar_definition('const double', 'a', a[0])
x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section)
g_str = format_vector_definition('double', 'g', g)
x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
z_str = format_array_declaration('double', 'z', [length],
alignment=BURST_ALIGNMENT, section=section)
g_str = format_array_definition('double', 'g', g)
g_str = format_ifdef_wrapper('BIST', g_str)
f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str])
f_str += '\n'
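The `format_vector_*` to `format_array_*` rename is the thread running through these datagen changes; note also that `z` switches from a zero-filled *definition* to a shape-only *declaration*, so the output buffer no longer bloats the generated header with initializer data. A minimal sketch of the distinction, using hypothetical simplified stand-ins (the real helpers in `util/sim/data_utils.py` additionally take `alignment` and `section` arguments):

```python
import numpy as np

# Hypothetical, simplified stand-ins for the data_utils helpers, for
# illustration only; the real implementations also emit alignment and
# section attributes.

def format_array_declaration(dtype, uid, shape):
    # Shape-only declaration: the linker reserves space, no initializer.
    dims = ''.join(f'[{d}]' for d in shape)
    return f'{dtype} {uid}{dims};'

def format_array_definition(dtype, uid, values):
    # Full definition: the array contents are baked into the header.
    values = np.asarray(values).flatten()
    body = ', '.join(str(v) for v in values)
    return f'{dtype} {uid}[{len(values)}] = {{{body}}};'

print(format_array_declaration('double', 'z', [4]))
# double z[4];
print(format_array_definition('double', 'g', np.zeros(2)))
# double g[2] = {0.0, 0.0};
```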
16 changes: 8 additions & 8 deletions sw/blas/gemm/data/datagen.py
@@ -15,7 +15,7 @@

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
from data_utils import emit_license, format_scalar_definition, \
format_vector_definition, format_ifdef_wrapper # noqa: E402
format_array_definition, format_ifdef_wrapper # noqa: E402


np.random.seed(42)
@@ -100,18 +100,18 @@ def emit_header(**kwargs):
data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
alignment=BURST_ALIGNMENT, section=kwargs['section'])]
if kwargs['prec'] == 8:
result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten())
else:
result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])],
'result',
result.flatten())
result_def = format_array_definition(C_TYPES[str(kwargs['prec'])],
'result',
result.flatten())
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

1 change: 1 addition & 0 deletions sw/dnn/.gitignore
@@ -0,0 +1 @@
*/data/data.h
136 changes: 136 additions & 0 deletions sw/dnn/batchnorm/data/datagen.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Tim Fischer <[email protected]>
# Viviane Potocnik <[email protected]>
# Luca Colagrande <[email protected]>

import argparse
import pathlib
import hjson
import sys
import os
import torch

sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
import data_utils # noqa: E402
from data_utils import emit_license, \
format_struct_definition, format_array_definition, \
format_array_declaration, format_ifdef_wrapper # noqa: E402

torch.manual_seed(42)

# AXI splits bursts crossing 4 KB address boundaries. To minimize
# the occurrence of these splits, the data should be aligned to 4 KB.
BURST_ALIGNMENT = 4096

PRECISION_T = {
'64': 'FP64',
'32': 'FP32',
'16': 'FP16',
'8': 'FP8'
}


def golden_model(ifmap):
n, ci, ih, iw = ifmap.shape
bn = torch.nn.BatchNorm2d(ci)
bn.weight.requires_grad = False
bn.bias.requires_grad = False
running_mean = torch.randn_like(bn.running_mean, requires_grad=False)
running_var = torch.rand_like(bn.running_var, requires_grad=False)
gamma = bn.weight / torch.sqrt(running_var + bn.eps)
beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps)
ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1)
return ofmap, gamma, beta
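The folding in `golden_model` is exact: in inference mode, BatchNorm normalizes with fixed running statistics, so the whole layer collapses to a per-channel affine map y = gamma * x + beta. A standalone sanity check of that identity (shapes chosen arbitrarily):

```python
import torch

# Inference-mode BatchNorm: y = (x - mean) / sqrt(var + eps) * weight + bias.
# Folding the constants gives y = gamma * x + beta, as used in golden_model().
x = torch.randn(1, 4, 8, 8)
mean, var = torch.randn(4), torch.rand(4)
weight, bias = torch.ones(4), torch.zeros(4)  # BatchNorm2d defaults
eps = 1e-5

def per_channel(t):
    # Broadcast a per-channel vector over an NCHW tensor
    return t.view(1, -1, 1, 1)

reference = (x - per_channel(mean)) / torch.sqrt(per_channel(var) + eps) \
    * per_channel(weight) + per_channel(bias)
gamma = weight / torch.sqrt(var + eps)
beta = bias - mean * weight / torch.sqrt(var + eps)
folded = x * per_channel(gamma) + per_channel(beta)

assert torch.allclose(reference, folded, atol=1e-6)
```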


def emit_header(**kwargs):

in_channels = kwargs['input_dim']['channels']
in_height = kwargs['input_dim']['height']
in_width = kwargs['input_dim']['width']
tile_ci = kwargs['tile_ci']
prec = str(kwargs['prec'])

torch_type = data_utils.floating_point_torch_type(prec)
ctype = data_utils.floating_point_ctype(prec)

ifmap = torch.randn(1, in_channels, in_height, in_width, requires_grad=False, dtype=torch_type)
ofmap, gamma, beta = golden_model(ifmap)

# convert from CHW to HWC format
ifmap = ifmap.permute(0, 2, 3, 1)
ofmap = ofmap.permute(0, 2, 3, 1)

n, ih, iw, ci = ifmap.shape

ifmap_uid = 'ifmap'
ofmap_uid = 'ofmap'
beta_uid = 'beta'
gamma_uid = 'gamma'

layer_cfg = {
'CI': ci,
'IH': ih,
'IW': iw,
'TILE_CI': tile_ci,
'ifmap': ifmap_uid,
'ofmap': ofmap_uid,
'beta': beta_uid,
'gamma': gamma_uid
}

data_str = [emit_license()]
# Array forward declarations
data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape)]
data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape)]
data_str += [format_array_declaration(ctype, beta_uid, beta.shape)]
data_str += [format_array_declaration(ctype, gamma_uid, gamma.shape)]
# Layer struct
data_str += [format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)]
# Array definitions
data_str += [format_array_definition(ctype, ifmap_uid, ifmap)]
data_str += [format_array_definition(ctype, beta_uid, beta)]
data_str += [format_array_definition(ctype, gamma_uid, gamma)]
# Golden results for BIST
result_def = format_array_definition(ctype, 'golden', ofmap)
data_str += [format_ifdef_wrapper('BIST', result_def)]
data_str = '\n\n'.join(data_str)

return data_str


def main():

parser = argparse.ArgumentParser(description='Generate data for batchnorm kernel')
parser.add_argument(
"-c", "--cfg",
type=pathlib.Path,
required=True,
help='Select the param config file for the kernel'
)
parser.add_argument(
'--section',
type=str,
help='Section to store matrices in')
parser.add_argument(
'output',
type=pathlib.Path,
help='Path of the output header file')
args = parser.parse_args()

# Load param config file
with args.cfg.open() as f:
param = hjson.loads(f.read())
param['section'] = args.section

# Emit header file
with open(args.output, 'w') as f:
f.write(emit_header(**param))


if __name__ == '__main__':
main()
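Together with the `common.mk` rule further down, the generator is invoked with a config file and an output path; an illustrative call (run from a hypothetical `sw/dnn/batchnorm` working directory, with `--section` left empty as in the Makefile default):

```sh
./data/datagen.py -c data/params.hjson --section="" data/data.h
```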
@@ -2,17 +2,12 @@
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Parameters for a single BatchNorm layer

{
kernel: "BatchNorm"
channels: {
out: 32,
in: 32
}
input_dim: {
channels: 32
height: 8,
width: 8
}
tile_ci: 32
prec: 64
}
31 changes: 24 additions & 7 deletions sw/dnn/src/batchnorm.h → sw/dnn/batchnorm/src/batchnorm.h
@@ -4,6 +4,18 @@

#include "snrt.h"

typedef struct {
uint32_t CI;
uint32_t IH;
uint32_t IW;
uint32_t TILE_CI;
double *ifmap;
double *ofmap;
double *gamma;
double *beta;
precision_t dtype;
} batchnorm_layer_t;
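This typedef is the counterpart of `format_struct_definition('batchnorm_layer_t', 'layer', layer_cfg)` in the generator above. A plausible excerpt of what the generated `data.h` would then contain for the shipped 32x8x8 FP64 configuration (a sketch only; exact formatting and contents are up to datagen.py):

```c
// Hypothetical excerpt of a generated data.h (sketch, not real output).
double ifmap[1][8][8][32];   // NHWC, after the CHW -> HWC permute
double ofmap[1][8][8][32];
double beta[32];
double gamma[32];

batchnorm_layer_t layer = {
    .CI = 32,
    .IH = 8,
    .IW = 8,
    .TILE_CI = 32,
    .ifmap = (double *)ifmap,
    .ofmap = (double *)ofmap,
    .beta = (double *)beta,
    .gamma = (double *)gamma
};
```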

/**
* @brief Implementation of an FP64 batchnorm as a linear combination
* y = gamma * x + beta
@@ -50,12 +62,17 @@ static inline void batchnorm_fp64(double *ifmap, double *gamma, double *beta,
snrt_ssr_disable();
}

static inline void batchnorm_layer(const conv_layer *l) {
static inline void batchnorm_layer(const batchnorm_layer_t *l) {
const uint32_t cluster_num = snrt_cluster_num();
const uint32_t cluster_id = snrt_cluster_idx();
const uint32_t compute_num = snrt_cluster_compute_core_num();
const uint32_t compute_id = snrt_cluster_core_idx();

// Calculate output dimensions
uint32_t OH = l->IH;
uint32_t OW = l->IW;
uint32_t CO = l->CI;

// Each cluster loads one tile of a row
uint32_t ifmap_size = 2 * l->IW * l->TILE_CI;
uint32_t weights_size = l->CI;
@@ -78,7 +95,7 @@ static inline void batchnorm_layer(const conv_layer *l) {
uint32_t prev_ow;
uint32_t prev_ci;

for (uint32_t oh = cluster_id; oh < l->OH; oh += cluster_num) {
for (uint32_t oh = cluster_id; oh < OH; oh += cluster_num) {
for (uint32_t ci = 0; ci < l->CI; ci += l->TILE_CI) {
if (snrt_is_dm_core()) {
// Load weights once in the beginning
@@ -112,13 +129,13 @@
if (!(oh == cluster_id && ci == 0)) {
if (l->TILE_CI == l->CI) {
// data is stored consecutively
snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI],
snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI],
&ofmap[!read_buf * (ofmap_size / 2)],
sizeof(double) * l->IW * l->CI);
} else {
// data is stored in interleaved layout
snrt_dma_start_2d(
&l->ofmap[prev_oh * l->OW * l->CI +
&l->ofmap[prev_oh * OW * l->CI +
prev_ci], /* dst */
&ofmap[!read_buf * (ofmap_size / 2)], /* src */
sizeof(double) * l->TILE_CI, /* size */
@@ -146,7 +163,7 @@
batchnorm_fp64(&ifmap[read_buf * ofmap_size / 2 + compute_id],
&gamma[ci + compute_id], &beta[ci + compute_id],
&ofmap[write_buf * ofmap_size / 2 + compute_id],
l->OW, l->TILE_CI, compute_num, setup_SSR);
OW, l->TILE_CI, compute_num, setup_SSR);

write_buf = !write_buf;
read_buf = !read_buf;
@@ -160,13 +177,13 @@
if (snrt_is_dm_core()) {
if (l->TILE_CI == l->CI) {
// data is stored consecutively
snrt_dma_start_1d(&l->ofmap[prev_oh * l->OW * l->CI],
snrt_dma_start_1d(&l->ofmap[prev_oh * OW * l->CI],
&ofmap[!read_buf * (ofmap_size / 2)],
sizeof(double) * l->IW * l->CI);
} else {
// data is stored in interleaved layout
snrt_dma_start_2d(
&l->ofmap[prev_oh * l->OW * l->CI + prev_ci], /* dst */
&l->ofmap[prev_oh * OW * l->CI + prev_ci], /* dst */
&ofmap[!read_buf * (ofmap_size / 2)], /* src */
sizeof(double) * l->TILE_CI, /* size */
sizeof(double) * l->CI, /* dst_stride */
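The `read_buf`/`write_buf` toggling above is a ping-pong (double-buffering) scheme: each TCDM buffer is split in two halves, and while the compute cores process one half, the DM core concurrently fetches the next tile and drains an already-computed one through the other half. A generic skeleton of the idea, with hypothetical `dma_in`/`dma_out`/`compute_tile` helpers (not the verbatim kernel):

```c
// Generic double-buffering skeleton (hypothetical helpers). In iteration i,
// the DM core fetches tile i and drains tile i-2 while the compute cores
// process tile i-1; the index parities guarantee they touch different halves.
for (uint32_t i = 0; i <= n_tiles + 1; i++) {
    if (snrt_is_dm_core()) {
        if (i < n_tiles) dma_in(&in_buf[(i % 2) * half], i);   // fetch tile i
        if (i >= 2) dma_out(&out_buf[(i % 2) * half], i - 2);  // drain tile i-2
        snrt_dma_wait_all();
    }
    if (snrt_is_compute_core() && i >= 1 && i <= n_tiles) {
        // Process tile i-1, fetched (and synchronized) one iteration earlier
        compute_tile(&in_buf[((i - 1) % 2) * half],
                     &out_buf[((i - 1) % 2) * half]);
    }
    snrt_cluster_hw_barrier();
}
```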
15 changes: 15 additions & 0 deletions sw/dnn/batchnorm/src/main.c
@@ -0,0 +1,15 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include "dnn.h"

#include "data.h"

int main() {
batchnorm_layer(&layer);

snrt_global_barrier();

return 0;
}
31 changes: 31 additions & 0 deletions sw/dnn/common.mk
@@ -0,0 +1,31 @@
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

# Usage of absolute paths is required to externally include this Makefile
MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

DATA_DIR := $(realpath $(MK_DIR)/$(APP)/data)
SRC_DIR := $(realpath $(MK_DIR)/$(APP)/src)
COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)

DATA_CFG ?= $(DATA_DIR)/params.hjson
SECTION ?=

SRCS ?= $(realpath $(SRC_DIR)/main.c)
INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR)

DATAGEN_PY := $(DATA_DIR)/datagen.py
DATA_H := $(DATA_DIR)/data.h

$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
$< -c $(DATA_CFG) --section="$(SECTION)" $@

.PHONY: clean-data clean

clean-data:
rm -f $(DATA_H)

clean: clean-data
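An application pulls all of this in by setting `APP` and including the file; a hypothetical per-app Makefile sketch:

```make
# Hypothetical sw/dnn/batchnorm Makefile fragment: APP names the
# subdirectory from which common.mk derives DATA_DIR and SRC_DIR.
APP = batchnorm
include ../common.mk
```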
File renamed without changes.