dnn: Refactor and verify GeLU
colluca committed Nov 8, 2023
1 parent f3266ab commit f20ace0
Showing 6 changed files with 130 additions and 102 deletions.
14 changes: 5 additions & 9 deletions sw/dnn/gelu/data/datagen.py
@@ -35,30 +35,26 @@


 def golden_model(ifmap):
-    gelu = torch.nn.GELU()
+    gelu = torch.nn.GELU(approximate='tanh')
     return gelu(ifmap)
 
 
 def emit_header(**kwargs):
 
-    batch_size = kwargs['input_dim']['batch_size']
-    seq_len = kwargs['input_dim']['seq_len']
-    hidden_nodes = kwargs['input_dim']['hidden_nodes']
+    size = kwargs['size']
     prec = str(kwargs['prec'])
 
     torch_type = data_utils.floating_point_torch_type(prec)
     ctype = data_utils.floating_point_ctype(prec)
 
-    ifmap = torch.randn(batch_size, seq_len, hidden_nodes, requires_grad=False, dtype=torch_type)
+    ifmap = torch.randn(size, requires_grad=False, dtype=torch_type)
     ofmap = golden_model(ifmap)
 
     ifmap_uid = 'ifmap'
     ofmap_uid = 'ofmap'
 
     layer_cfg = {
-        'batch_size': batch_size,
-        'seq_len': seq_len,
-        'hidden_nodes': hidden_nodes,
+        'size': size,
         'ifmap': ifmap_uid,
         'ofmap': ofmap_uid,
         'dtype': PRECISION_T[prec]
@@ -82,7 +78,7 @@ def emit_header(**kwargs):

 def main():
 
-    parser = argparse.ArgumentParser(description='Generate data for layernorm kernel')
+    parser = argparse.ArgumentParser()
     parser.add_argument(
         "-c", "--cfg",
         type=pathlib.Path,
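
The golden model now uses torch's tanh-approximate GELU, matching the tanh-based kernel this commit introduces in sw/dnn/gelu/src/gelu.h. As a quick standalone sanity check (not part of the commit; it assumes only torch and the Python standard library), the two formulas agree to floating-point accuracy, while both deviate from the exact erf-based GELU by a few 1e-4 at most:

import math

import torch


def gelu_tanh(x):
    # Same expression as gelu_activation_fp64 in sw/dnn/gelu/src/gelu.h
    return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) *
                                      (x + 0.044715 * x ** 3)))


def gelu_exact(x):
    # Exact definition: x * Phi(x), with Phi the standard normal CDF
    return 0.5 * x * (1.0 + math.erf(x / math.sqrt(2.0)))


xs = torch.linspace(-4.0, 4.0, 1001, dtype=torch.float64)
golden = torch.nn.GELU(approximate='tanh')(xs)
manual = torch.tensor([gelu_tanh(v) for v in xs.tolist()], dtype=torch.float64)
assert torch.allclose(golden, manual, atol=1e-12)

approx_err = max(abs(gelu_tanh(v) - gelu_exact(v)) for v in xs.tolist())
assert approx_err < 1e-2  # the actual maximum is on the order of 3e-4

This also explains the tight ERR_THRESHOLD in verify.py below: golden model and kernel use the same approximation, so only numerical error remains between them.
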
8 changes: 2 additions & 6 deletions sw/dnn/gelu/data/params.hjson
@@ -3,10 +3,6 @@
 // SPDX-License-Identifier: SHL-0.51
 
 {
-    input_dim: {
-        batch_size: 3,
-        seq_len: 8,
-        hidden_nodes: 4
-    }
-    prec: 32
+    size: 128
+    prec: 64
 }
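
A sketch (not part of the commit; it assumes the hjson Python package and an illustrative cluster count) of how this config reaches datagen.py as kwargs, together with the divisibility property the refactored gelu_layer implicitly relies on when it splits l.size evenly across clusters:

import hjson

cfg = hjson.loads("""
{
    size: 128
    prec: 64
}
""")

n_clusters = 2  # illustrative assumption; the real value comes from the target
assert cfg['size'] % n_clusters == 0, \
    "gelu_layer divides l.size evenly over snrt_cluster_num() clusters"
print(cfg['size'], cfg['prec'])  # -> 128 64
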
126 changes: 41 additions & 85 deletions sw/dnn/gelu/src/gelu.h
@@ -11,116 +11,72 @@
  * @struct gelu_layer_struct
  * @brief This structure contains all parameters necessary
  *        for computing the GELU activation function
- * @var gelu_layer_struct::batch_size
- *      Size of each input sample
- * @var gelu_layer_struct::seq_len
- *      Size of each output sample
- * @var gelu_layer_struct::hidden_nodes
- *      Number of hidden dimensions
+ * @var gelu_layer_struct::size
+ *      Size of the feature map
  * @var gelu_layer_struct::ifmap
  *      Pointer to input feature map
  * @var gelu_layer_struct::ofmap
  *      Pointer to output feature map
  */
 typedef struct gelu_layer_struct {
-    uint32_t batch_size;
-    uint32_t seq_len;
-    uint32_t hidden_nodes;
-    float *ifmap;
-    float *ofmap;
+    uint32_t size;
+    double *ifmap;
+    double *ofmap;
     precision_t dtype;
 } gelu_layer_t;

-/**
- * Implementation of the GELU layer
- */
-static inline void gelu_fp32(float *input, float *output, int32_t ldI,
-                             uint32_t batch_size, uint32_t seq_len,
-                             uint32_t hidden_nodes) {
-    // uint32_t compute_id = snrt_cluster_compute_core_num();
-
-    for (int s = 0; s < seq_len; s++) {
-        for (int h = 0; h < hidden_nodes; h++) {
-            // if (compute_id == 1) {
-            //     printf("compute id: %d, input[%d][%d] = %f\n", compute_id, s,
-            //     h,
-            //     input[s * hidden_nodes + h]);
-            // }
-            float x = input[s * hidden_nodes + h];
-            float y =
-                0.5 * x *
-                (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * x * x * x)));
-            output[s * hidden_nodes + h] = y;
-            // if (compute_id == 1) {
-            //     printf("compute id: %d, output[%d][%d] = %f\n", compute_id,
-            //     s, h,
-            //     output[s * hidden_nodes + h]);
-            // }
-        }
-    }
-}
+// tanh based approximation of the GeLU activation function
+static inline double gelu_activation_fp64(double x) {
+    return 0.5 * x *
+           (1.0 + tanh(sqrt(2.0 / M_PI) * (x + 0.044715 * x * x * x)));
+}
+
+// Single-cluster GeLU
+static inline void gelu_fp64(double *input, double *output, uint32_t size) {
+    if (snrt_is_compute_core()) {
+        for (uint32_t i = 0; i < size; i++) {
+            snrt_mcycle();
+            output[i] = gelu_activation_fp64(input[i]);
+        }
+    }
+}

-/**
- * @brief GELU layer
- *
- * @param l gelu_layer_t struct that holds addresses and parameters
- *
- */
-static inline void gelu_layer(const gelu_layer_t *l) {
-    uint32_t cluster_num = snrt_cluster_num();
-    uint32_t cluster_id = snrt_cluster_idx();
-    uint32_t compute_num = snrt_cluster_compute_core_num();
-    uint32_t compute_id = snrt_cluster_compute_core_num();
-
-    uint32_t ifmap_size =
-        l->batch_size * l->seq_len * l->hidden_nodes * sizeof(float);
-    uint32_t ofmap_size = ifmap_size;
-
-    void *ptr = (float *)snrt_l1_next();
-    float *ifmap = ptr;
-    ptr += ifmap_size;
-    float *ofmap = ptr;
-    ptr += ofmap_size;
-
-    // DMA transfer the ifmap into the cluster TCDM
-    if (snrt_is_dm_core()) {
-        snrt_dma_txid_t txid_ifmap = snrt_dma_start_2d(
-            ifmap, l->ifmap, l->batch_size * sizeof(float),
-            l->batch_size * sizeof(float), l->batch_size * sizeof(float),
-            l->seq_len * l->hidden_nodes * sizeof(float));
-
-        snrt_dma_wait_all();
-    }
-
-    snrt_cluster_hw_barrier();
-
-    if (snrt_is_compute_core()) {
-        // determine the row offset for each core
-        int32_t row_offset = compute_id * l->hidden_nodes;
-
-        // determine the row stride of each matrix
-        int32_t ldI = compute_num * l->hidden_nodes;
-
-        // determine the batch offset for each core
-        int32_t batch_offset = l->seq_len * l->hidden_nodes;
-
-        // printf("row_offset: %d, ldI: %d\n", row_offset, ldI);
-
-        for (int b = 0; b < l->batch_size; b++) {
-            // if (compute_id == 1) {
-            //     printf("BATCH: %d\n", b);
-            // }
-            gelu_fp32(&ifmap[row_offset + b * batch_offset],
-                      &ofmap[row_offset + b * batch_offset], ldI, l->batch_size,
-                      l->seq_len / 8, l->hidden_nodes);
-        }
-
-        snrt_cluster_hw_barrier();
-    } else {
-        snrt_cluster_hw_barrier();
-    }
-    snrt_cluster_hw_barrier();
-}
+// Parallel GeLU layer with DMA transfers
+static inline void gelu_layer(const gelu_layer_t l) {
+    // Parallelize the computation over clusters
+    uint32_t cluster_fmap_size = l.size / snrt_cluster_num();
+    uint32_t cluster_fmap_bytes = cluster_fmap_size * sizeof(double);
+
+    // Allocate memory in TCDM
+    void *ptr = (double *)snrt_l1_next();
+    double *l1_ifmap = ptr;
+    ptr += cluster_fmap_bytes;
+    double *l1_ofmap = ptr;
+    ptr += cluster_fmap_bytes;
+
+    // Get pointer to feature maps in L3
+    uint32_t cluster_offset = cluster_fmap_bytes * snrt_cluster_idx();
+    double *l3_ifmap = ((void *)l.ifmap) + cluster_offset;
+    double *l3_ofmap = ((void *)l.ofmap) + cluster_offset;
+
+    // DMA transfer the ifmap into the cluster TCDM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(l1_ifmap, l3_ifmap, cluster_fmap_bytes);
+        snrt_dma_wait_all();
+    }
+
+    snrt_cluster_hw_barrier();
+
+    // Cluster computation
+    gelu_fp64(l1_ifmap, l1_ofmap, cluster_fmap_size);
+
+    snrt_cluster_hw_barrier();
+
+    // DMA transfer the ofmap to DRAM
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d(l3_ofmap, l1_ofmap, cluster_fmap_bytes);
+        snrt_dma_wait_all();
+    }
+
+    snrt_global_barrier();
+}
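
To make the addressing concrete, here is a host-side Python mirror of gelu_layer's partitioning arithmetic (illustration only; the cluster count is an assumption, and the real values come from snrt_cluster_num() and params.hjson):

size = 128       # l.size
n_clusters = 2   # snrt_cluster_num(), assumed for this example
elem_bytes = 8   # sizeof(double)

cluster_fmap_size = size // n_clusters              # elements per cluster
cluster_fmap_bytes = cluster_fmap_size * elem_bytes

for cluster_idx in range(n_clusters):
    # Byte offset of this cluster's contiguous slice in the L3 feature maps
    cluster_offset = cluster_fmap_bytes * cluster_idx
    first = cluster_offset // elem_bytes
    last = first + cluster_fmap_size - 1
    print(f"cluster {cluster_idx}: bytes [{cluster_offset}, "
          f"{cluster_offset + cluster_fmap_bytes}), elements {first}..{last}")

Each cluster thus copies in, transforms, and writes back a disjoint contiguous slice; the snrt_cluster_hw_barrier() calls separate the DMA-in, compute, and DMA-out phases, and snrt_global_barrier() synchronizes all clusters before the function returns.
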
2 changes: 1 addition & 1 deletion sw/dnn/gelu/src/main.c
@@ -6,4 +6,4 @@

 #include "data.h"
 
-int main() { gelu_layer(&layer); }
+int main() { gelu_layer(layer); }
79 changes: 79 additions & 0 deletions sw/dnn/gelu/verify.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Luca Colagrande <[email protected]>

import sys
from pathlib import Path
import numpy as np
import torch
from data.datagen import golden_model

sys.path.append(str(Path(__file__).parent / '../../../util/sim/'))
import verification # noqa: E402
from elf import Elf # noqa: E402
from data_utils import bytes_to_float, bytes_to_struct # noqa: E402


ERR_THRESHOLD = 1E-6

PRECISION_T = {
    8: '64',
    4: '32',
    2: '16',
    1: '8'
}

NUMPY_T = {
    '64': np.float64,
    '32': np.float32,
    '16': np.float16
}


def main():
    # Run simulation and get outputs
    args = verification.parse_args()
    raw_results = verification.simulate(sim_bin=args.sim_bin,
                                        snitch_bin=args.snitch_bin,
                                        symbols_bin=args.symbols_bin,
                                        log=args.log,
                                        output_uids=['ofmap'])

    # Extract input operands from ELF file
    if args.symbols_bin:
        elf = Elf(args.symbols_bin)
    else:
        elf = Elf(args.snitch_bin)

    layer_struct = {
        'size': 'I',
        'ifmap': 'I',
        'ofmap': 'I',
        'dtype': 'I'
    }
    layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct)
    prec = PRECISION_T[layer['dtype']]

    ifmap = np.array(bytes_to_float(elf.get_symbol_contents('ifmap'), prec), dtype=NUMPY_T[prec])
    ifmap = torch.from_numpy(ifmap)

    # Verify results
    ofmap_actual = np.array(bytes_to_float(raw_results['ofmap'], prec), dtype=NUMPY_T[prec])
    ofmap_golden = golden_model(ifmap).detach().numpy().flatten()
    relative_err = np.absolute((ofmap_golden - ofmap_actual) / ofmap_golden)
    fail = np.any(relative_err > ERR_THRESHOLD)

    # Print results
    if (fail):
        verification.dump_results_to_csv([ofmap_golden, ofmap_actual, relative_err],
                                         Path.cwd() / 'gelu_results.csv')
        print('Maximum relative error:', np.max(relative_err))

    return int(fail)


if __name__ == "__main__":
sys.exit(main())
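
bytes_to_struct decodes the raw bytes of the layer symbol according to the field-to-format mapping above. A minimal stdlib-only sketch of the same idea (the repo's actual helper lives in util/sim/data_utils.py; the pointer addresses below are hypothetical):

import struct


def bytes_to_struct_sketch(raw, fields):
    # Assumes little-endian, packed 32-bit fields
    fmt = '<' + ''.join(fields.values())
    values = struct.unpack(fmt, raw[:struct.calcsize(fmt)])
    return dict(zip(fields.keys(), values))


layer_struct = {'size': 'I', 'ifmap': 'I', 'ofmap': 'I', 'dtype': 'I'}
# Four uint32 fields: size=128, two 32-bit pointers, dtype=8 (i.e. FP64)
raw = struct.pack('<IIII', 128, 0x80000000, 0x80000400, 8)
layer = bytes_to_struct_sketch(raw, layer_struct)
assert layer == {'size': 128, 'ifmap': 0x80000000,
                 'ofmap': 0x80000400, 'dtype': 8}
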
3 changes: 2 additions & 1 deletion target/snitch_cluster/sw/run.yaml
@@ -79,7 +79,8 @@ runs:
   - elf: apps/dnn/gemm/build/gemm.elf
   - elf: apps/dnn/layernorm/build/layernorm.elf # Illegal FDIV without FDIV unit
     cmd: ../../sw/dnn/layernorm/verify.py {sim_bin} {elf}
-  # - elf: apps/dnn/gelu/build/gelu.elf # seems like it stalls
+  - elf: apps/dnn/gelu/build/gelu.elf
+    cmd: ../../sw/dnn/gelu/verify.py {sim_bin} {elf}
   # - elf: apps/dnn/conv2d/build/conv2d.elf # fails with exit code 32
   # - elf: apps/dnn/fusedconv/build/fusedconv.elf # fails newly
   - elf: apps/dnn/softmax/build/softmax.elf # Illegal FDIV without FDIV unit
