Temporary DNN refactoring

pulp-platform · Oct 25, 2023 · 1eb0e45 · 1eb0e45
1 parent 5b937ad
commit 1eb0e45
Show file tree

Hide file tree

Showing 22 changed files with 355 additions and 141 deletions.
diff --git a/sw/blas/axpy/data/datagen.py b/sw/blas/axpy/data/datagen.py
@@ -11,8 +11,8 @@
 import os
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
-from data_utils import format_scalar_definition, format_vector_definition, \
-                       format_vector_declaration, format_ifdef_wrapper  # noqa: E402
+from data_utils import format_scalar_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
 
 MIN = -1000
 MAX = +1000
@@ -47,16 +47,15 @@ def main():
     a = np.random.uniform(MIN, MAX, 1)
     x = np.random.uniform(MIN, MAX, length)
     y = np.random.uniform(MIN, MAX, length)
-    z = np.zeros(length)
     g = golden_model(a, x, y)
 
     # Format header file
     l_str = format_scalar_definition('const uint32_t', 'l', length)
     a_str = format_scalar_definition('const double', 'a', a[0])
-    x_str = format_vector_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
-    y_str = format_vector_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
-    z_str = format_vector_declaration('double', 'z', z, alignment=BURST_ALIGNMENT, section=section)
-    g_str = format_vector_definition('double', 'g', g)
+    x_str = format_array_definition('double', 'x', x, alignment=BURST_ALIGNMENT, section=section)
+    y_str = format_array_definition('double', 'y', y, alignment=BURST_ALIGNMENT, section=section)
+    z_str = format_array_declaration('double', 'z', [length], alignment=BURST_ALIGNMENT, section=section)
+    g_str = format_array_definition('double', 'g', g)
     g_str = format_ifdef_wrapper('BIST', g_str)
     f_str = '\n\n'.join([l_str, a_str, x_str, y_str, z_str, g_str])
     f_str += '\n'

diff --git a/sw/blas/gemm/data/datagen.py b/sw/blas/gemm/data/datagen.py
@@ -15,7 +15,7 @@
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
 from data_utils import emit_license, format_scalar_definition, \
-                       format_vector_definition, format_ifdef_wrapper  # noqa: E402
+                       format_array_definition, format_ifdef_wrapper  # noqa: E402
 
 
 np.random.seed(42)
@@ -100,16 +100,16 @@ def emit_header(**kwargs):
     data_str += [format_scalar_definition('uint32_t', 'BETA', kwargs['beta'])]
     data_str += [format_scalar_definition('uint32_t', 'dtype_size', kwargs['prec']//8)]
     data_str += [format_scalar_definition('uint32_t', 'expand', kwargs['expand'])]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
+    data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'a', a.flatten(),
                  alignment=BURST_ALIGNMENT, section=kwargs['section'])]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
+    data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'b', b.flatten(),
                  alignment=BURST_ALIGNMENT, section=kwargs['section'])]
-    data_str += [format_vector_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
+    data_str += [format_array_definition(C_TYPES[str(kwargs['prec'])], 'c', c.flatten(),
                  alignment=BURST_ALIGNMENT, section=kwargs['section'])]
     if kwargs['prec'] == 8:
-        result_def = format_vector_definition(C_TYPES['64'], 'result', result.flatten())
+        result_def = format_array_definition(C_TYPES['64'], 'result', result.flatten())
     else:
-        result_def = format_vector_definition(C_TYPES[str(kwargs['prec'])],
+        result_def = format_array_definition(C_TYPES[str(kwargs['prec'])],
                                               'result',
                                               result.flatten())
     data_str += [format_ifdef_wrapper('BIST', result_def)]

diff --git a/sw/dnn/src/batchnorm.h → sw/dnn/batchnorm/batchnorm.h b/sw/dnn/src/batchnorm.h → sw/dnn/batchnorm/batchnorm.h
diff --git a/sw/dnn/common.mk b/sw/dnn/common.mk
@@ -0,0 +1,31 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <[email protected]>
+
+# Shared rules for generating the data header of one DNN application.
+# $(APP) is not defined in this file: it must be provided by the including
+# Makefile (or on the command line) and names the application subdirectory.
+
+# Usage of absolute paths is required to externally include this Makefile
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+
+# Per-application data/source directories, plus the sources shared by all apps
+DATA_DIR       := $(realpath $(MK_DIR)/$(APP)/data)
+SRC_DIR        := $(realpath $(MK_DIR)/$(APP)/src)
+COMMON_SRC_DIR := $(realpath $(MK_DIR)/src)
+
+# Defaults assigned with ?= so they can be overridden by the caller
+DATA_CFG ?= $(DATA_DIR)/params.hjson
+SECTION  ?=
+
+SRCS    ?= $(realpath $(SRC_DIR)/main.c)
+INCDIRS ?= $(DATA_DIR) $(SRC_DIR) $(COMMON_SRC_DIR)
+
+DATAGEN_PY := $(DATA_DIR)/datagen.py
+DATA_H     := $(DATA_DIR)/data.h
+
+# Rebuild the header when the generator script or its configuration changes.
+# The script (first prerequisite, $<) is executed directly and its standard
+# output is redirected into the header ($@).
+$(DATA_H): $(DATAGEN_PY) $(DATA_CFG)
+	$< -c $(DATA_CFG) --section="$(SECTION)" > $@
+
+.PHONY: clean-data clean
+
+clean-data:
+	rm -f $(DATA_H)
+
+# No recipe is attached to clean here; it only triggers clean-data
+clean: clean-data
diff --git a/sw/dnn/src/conv2d.h → sw/dnn/conv2d/conv2d.h b/sw/dnn/src/conv2d.h → sw/dnn/conv2d/conv2d.h
diff --git a/sw/dnn/src/gelu.h → sw/dnn/gelu/gelu.h b/sw/dnn/src/gelu.h → sw/dnn/gelu/gelu.h
diff --git a/sw/dnn/src/gemm.h → sw/dnn/gemm/gemm.h b/sw/dnn/src/gemm.h → sw/dnn/gemm/gemm.h
diff --git a/sw/dnn/layernorm/.gitignore b/sw/dnn/layernorm/.gitignore
@@ -0,0 +1 @@
+data/data.h
diff --git a/sw/dnn/layernorm/data/datagen.py b/sw/dnn/layernorm/data/datagen.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <[email protected]>
+
+import argparse
+import pathlib
+import hjson
+import sys
+import os
+import torch
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/"))
+from data_utils import emit_license, format_scalar_definition, \
+                       format_struct_definition, format_array_definition, \
+                       format_array_declaration, format_ifdef_wrapper  # noqa: E402
+import data_utils
+
+torch.manual_seed(42)
+
+# AXI splits bursts crossing 4KB address boundaries. To minimize
+# the occurrence of these splits the data should be aligned to 4KB
+BURST_ALIGNMENT = 4096
+
+
+def golden_model(ifmap, eps, shape, prec):
+    """Compute the reference LayerNorm output with PyTorch.
+
+    Args:
+        ifmap: Input tensor to normalize.
+        eps: Forwarded as ``eps`` to ``torch.nn.LayerNorm``.
+        shape: Forwarded as the normalized shape to ``torch.nn.LayerNorm``.
+        prec: Not used by the model; kept so existing callers keep working.
+
+    Returns:
+        The tensor obtained by applying the LayerNorm module to ``ifmap``.
+    """
+    # The module is created with PyTorch's default parameter dtype; the
+    # precision-derived dtype was never passed to it, so it is not looked up.
+    ln = torch.nn.LayerNorm(shape, eps=eps)
+    return ln(ifmap)
+
+
+def emit_header(**kwargs):
+    """Build the text of the layernorm data header.
+
+    Keys read from ``kwargs``:
+        input_dim: Mapping providing ``batch_size``, ``seq_len`` and
+            ``embeddings``; all of its entries are also copied into the
+            emitted ``layer`` struct.
+        eps: Epsilon handed to the golden model.
+        prec: Precision selector; converted to ``str`` before it is given
+            to the ``data_utils`` type helpers.
+
+    Returns:
+        A single string made of the license text, the array declarations,
+        the ``layer`` struct, the input definition and the ``BIST``-wrapped
+        golden output, separated by empty lines.
+    """
+    batch_size = kwargs['input_dim']['batch_size']
+    seq_len = kwargs['input_dim']['seq_len']
+    embeddings = kwargs['input_dim']['embeddings']
+    eps = kwargs['eps']
+    prec = str(kwargs['prec'])
+
+    # Random input of shape (batch_size, seq_len, embeddings)
+    torch_type = data_utils.floating_point_torch_type(prec)
+    ifmap = torch.randn(batch_size, seq_len, embeddings, requires_grad=False, dtype=torch_type)
+
+    # Reference output, converted to a NumPy array for formatting
+    ofmap = golden_model(ifmap, eps, embeddings, prec)
+    ofmap = ofmap.detach().numpy()
+
+    ctype = data_utils.floating_point_ctype(prec)
+
+    ifmap_uid = 'ifmap'
+    ofmap_uid = 'ofmap'
+
+    # NOTE(review): eps only feeds the golden model; neither eps nor a dtype
+    # entry is added to the struct initializer below - confirm this is intended.
+    layer_cfg = {
+        **kwargs['input_dim'],
+        'ifmap': ifmap_uid,
+        'ofmap': ofmap_uid
+    }
+
+    # NOTE(review): a 'section' entry in kwargs is never read here, so it has
+    # no effect on the emitted arrays - confirm this is intended.
+    data_str = [emit_license()]
+    # The arrays are declared ahead of the struct whose initializer names them
+    data_str += [format_array_declaration(ctype, ifmap_uid, ifmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_array_declaration(ctype, ofmap_uid, ofmap.shape,
+                 alignment=BURST_ALIGNMENT)]
+    data_str += [format_struct_definition('layernorm_layer_t', 'layer', layer_cfg)]
+    data_str += [format_array_definition(ctype, ifmap_uid, ifmap,
+                 alignment=BURST_ALIGNMENT)]
+    # The expected output is wrapped with the 'BIST' ifdef helper
+    result_def = format_array_definition(ctype, 'golden', ofmap, alignment=BURST_ALIGNMENT)
+    data_str += [format_ifdef_wrapper('BIST', result_def)]
+    data_str = '\n\n'.join(data_str)
+
+    return data_str
+
+
+def main():
+    """Parse the command line, load the hjson config and print the header."""
+    cli = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    cli.add_argument("-c", "--cfg", type=pathlib.Path, required=True,
+                     help='Select param config file kernel')
+    cli.add_argument('--section', type=str, help='Section to store matrices in')
+    opts = cli.parse_args()
+
+    # Read the parameter file and attach the requested section to it
+    with opts.cfg.open() as cfg_file:
+        cfg = hjson.loads(cfg_file.read())
+    cfg['section'] = opts.section
+
+    # The generated header is written to standard output
+    header = emit_header(**cfg)
+    print(header)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/...er/sw/apps/dnn/layernorm/src/params.hjson → sw/dnn/layernorm/data/params.hjson b/...er/sw/apps/dnn/layernorm/src/params.hjson → sw/dnn/layernorm/data/params.hjson
@@ -1,11 +1,8 @@
-// Copyright 2020 ETH Zurich and University of Bologna.
+// Copyright 2023 ETH Zurich and University of Bologna.
 // Solderpad Hardware License, Version 0.51, see LICENSE for details.
 // SPDX-License-Identifier: SHL-0.51
 
-// Parameters for a single LayerNorm layer
-
 {
-    kernel: "LayerNorm"
     input_dim: {
         batch_size: 1,
         seq_len: 32,

diff --git a/sw/dnn/src/layernorm.h → sw/dnn/layernorm/src/layernorm.h b/sw/dnn/src/layernorm.h → sw/dnn/layernorm/src/layernorm.h
@@ -7,7 +7,7 @@
 #include "math.h"
 #include "snrt.h"
 // #include "printf.h"
-#include "utils.h"
+#include "dnn.h"
 
 // add dump function for layernorm
 dump_float(ln, 5);
@@ -16,29 +16,24 @@ dump_float(ln, 5);
  * @struct layernorm_layer_struct
  * @brief This structure contains all parameters necessary
  *        for computing the LayerNorm activation function
- * @var layernorm_layer_struct::BATCH_SIZE
+ * @var layernorm_layer_struct::batch_size
  * Size of each input sample
- * @var layernorm_layer_struct::SEQ_LEN
+ * @var layernorm_layer_struct::seq_len
  * Size of each output sample
- * @var layernorm_layer_struct::EMBEDDINGS
+ * @var layernorm_layer_struct::embeddings
  * Number of hidden dimensions
  * @var layernorm_layer_struct::ifmap
  * Pointer to input feature map
  * @var layernorm_layer_struct::ofmap
  * Pointer to output feature map
- * @var layernorm_layer_struct::result
- * Pointer to the golden model output
  */
 typedef struct layernorm_layer_struct {
-    uint32_t BATCH_SIZE;
-    uint32_t SEQ_LEN;
-    uint32_t EMBEDDINGS;
-    uint32_t EPS;
-
-    float *ifmap;
-    float *ofmap;
-    float *result;
-
+    uint32_t batch_size;
+    uint32_t seq_len;
+    uint32_t embeddings;
+    // NOTE(review): eps is an unsigned integer, so a fractional epsilon
+    // cannot be stored in this field - confirm the intended type.
+    uint32_t eps;
+    // Untyped buffers; layernorm_layer() sizes its transfers with
+    // sizeof(float) regardless of dtype.
+    void *ifmap;
+    void *ofmap;
     precision_t dtype;
 } layernorm_layer_t;
 
@@ -95,44 +90,8 @@ static inline void layernorm_fp32(float *input, float *output, int32_t ldI,
  */
 static inline void transformer_layernorm_fp64(double *input, int32_t ldI,
                                               int32_t seq_len, int32_t embeddings,
-                                              int32_t eps) {
-    double mean = 0.0;  // max value of the current core
-    double var = 0.0;   // sum of the exp values of the current core
-
-    uint32_t compute_id = snrt_global_core_idx();
-    uint32_t num_cores = snrt_cluster_compute_core_num();
-
-    for (int32_t s = 0; s < seq_len; s++) {
-        mean = 0.0;
-        var = 0.0;
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            mean += input[s * ldI + i];
-        }
-        mean /= embeddings;
-
-        // printf("mean[%d] = %f\n", b, mean);
-
-        for (int32_t i = 0; i < embeddings; i++) {
-            var += (input[s * ldI + i] - mean) *
-                    (input[s * ldI + i] - mean);
-        }
-        var /= embeddings;
-
-        // printf("var[%d] = %f\n", b, var);
-
-        // compute the shifted value of the current row
-        for (int32_t i = 0; i < embeddings; i++) {
-            input[s * ldI + i] =
-                (input[s * ldI + i] - mean) /
-                sqrtf(var + eps);
-            // printf("output[%d][%d][%d] = %f\n", b, s + compute_id, i,
-            //        output[s * ldI + i]);
-            // dump_ln(input[s * ldI + i]);
-        }
-    }
-
-    snrt_cluster_hw_barrier();
+                                              int32_t eps) {    
+    // In-place variant: the same buffer is passed as both input and output.
+    // The literal 0 and 1 are presumably the batch offset and batch size,
+    // mirroring the layernorm_fp32 argument order - confirm against the
+    // layernorm_fp64 signature.
+    // NOTE(review): this wrapper issues no barrier itself; whether
+    // layernorm_fp64 synchronizes the cluster is not visible here.
+    layernorm_fp64(input, input, ldI, 0, 1, seq_len, embeddings, eps);
 }
 
 
@@ -187,14 +146,14 @@ static inline void transformer_layernorm_fp32(float *input, int32_t ldI,
  * @param l layernorm_layer struct that holds addresses and parameters
  *
  */
-static inline void layernorm_layer(const layernorm_layer_t *l) {
+static inline void layernorm_layer(layernorm_layer_t l) {
     uint32_t cluster_num = snrt_cluster_num();
     uint32_t cluster_id = snrt_cluster_idx();
     uint32_t compute_num = snrt_cluster_compute_core_num();
     uint32_t compute_id = snrt_global_core_idx();
 
     uint32_t ifmap_size =
-        l->BATCH_SIZE * l->SEQ_LEN * l->EMBEDDINGS * sizeof(float);
+        l.batch_size * l.seq_len * l.embeddings * sizeof(float);
     uint32_t ofmap_size = ifmap_size;
 
     void *ptr = (float *)snrt_l1_next();
@@ -206,9 +165,9 @@ static inline void layernorm_layer(const layernorm_layer_t *l) {
     // DMA transfer the ifmap into the cluster TCDM
     if (snrt_is_dm_core()) {
         snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->BATCH_SIZE * sizeof(float),
-            l->BATCH_SIZE * sizeof(float), l->BATCH_SIZE * sizeof(float),
-            l->SEQ_LEN * l->EMBEDDINGS * sizeof(float));
+            ifmap, l.ifmap, l.batch_size * sizeof(float),
+            l.batch_size * sizeof(float), l.batch_size * sizeof(float),
+            l.seq_len * l.embeddings * sizeof(float));
 
         snrt_dma_wait_all();
     }
@@ -217,18 +176,18 @@ static inline void layernorm_layer(const layernorm_layer_t *l) {
 
     if (snrt_is_compute_core()) {
         // determine the row offset for each core
-        int32_t row_offset = compute_id * l->EMBEDDINGS;
+        int32_t row_offset = compute_id * l.embeddings;
 
         // determine the row stride of each matrix
-        int32_t ldI = compute_num * l->EMBEDDINGS;
+        int32_t ldI = compute_num * l.embeddings;
 
         // determine the batch offset for each core
-        int32_t batch_offset = l->SEQ_LEN * l->EMBEDDINGS;
+        int32_t batch_offset = l.seq_len * l.embeddings;
 
         // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
         layernorm_fp32(&ifmap[row_offset], &ofmap[row_offset], ldI,
-                       batch_offset, l->BATCH_SIZE, l->SEQ_LEN / 8,
-                       l->EMBEDDINGS, l->EPS);
+                       batch_offset, l.batch_size, l.seq_len / 8,
+                       l.embeddings, l.eps);
 
     } else {
         snrt_cluster_hw_barrier();