Add bias; works for all conv2d, not dense
Aba committed Sep 15, 2023
1 parent e4b7930 commit 8b7d62b
Showing 4 changed files with 106 additions and 55 deletions.
26 changes: 14 additions & 12 deletions c/model.h
@@ -1,22 +1,24 @@
-#define N_BUNDLES 7
+#define N_BUNDLES 6
Bundle_t bundles [N_BUNDLES] = {
-{.n=2, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=152, .w_bpt_p0=152, .x_bpt=216, .x_bpt_p0=216, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414376340184104960, .w_header_p0=414341061322735616 },
-{.n=2, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=19, .cm_p0=16, .w_bpt=212, .w_bpt_p0=212, .x_bpt=3336, .x_bpt_p0=3336, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8700999697495490560, .w_header_p0=8700964375684448256 },
-{.n=2, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=188, .w_bpt_p0=188, .x_bpt=424, .x_bpt_p0=424, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846721930181476352, .w_header_p0=846686625550303232 },
-{.n=2, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=6, .cm=3, .cm_p0=1, .w_bpt=200, .w_bpt_p0=80, .x_bpt=632, .x_bpt_p0=216, .x_header=1351089783815798784, .x_header_p0=198168279208951808, .w_header=1351125097036906496, .w_header_p0=198168279208951808 },
-{.n=2, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=236, .w_bpt_p0=164, .x_bpt=1256, .x_bpt_p0=840, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008449785679052800, .w_header_p0=1855492942081294336 },
-{.n=2, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=19, .cm_p0=5, .w_bpt=248, .w_bpt_p0=80, .x_bpt=3960, .x_bpt_p0=1048, .x_header=10430346632594718720, .x_header_p0=2359896100346789888, .w_header=10430381980175564800, .w_header_p0=2359896100346789888 },
-{.n=2, .l=1, .kw=1, .coe=24, .coe_tl=0, .r_ll=1, .h=1, .w=1, .w_kw2=1, .t=1, .p=337, .cm=19, .cm_p0=16, .w_bpt=248, .w_bpt_p0=212, .x_bpt=255, .x_bpt_p0=216, .x_header=10376293541461622784, .x_header_p0=8646911284551352320, .w_header=10376328889042468864, .w_header_p0=8646911284551352320 }
+{.n=8, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=152, .w_bpt_p0=152, .x_bpt=840, .x_bpt_p0=840, .is_bias=1, .b_offset=0, .b_val_shift=9, .b_bias_shift=0, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414587446416637952, .w_header_p0=414341061322735616 },
+{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=19, .cm_p0=16, .w_bpt=212, .w_bpt_p0=212, .x_bpt=13320, .x_bpt_p0=13320, .is_bias=1, .b_offset=16, .b_val_shift=9, .b_bias_shift=0, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8701210803728023552, .w_header_p0=8700964375684448256 },
+{.n=8, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=188, .w_bpt_p0=188, .x_bpt=1672, .x_bpt_p0=1672, .is_bias=1, .b_offset=40, .b_val_shift=9, .b_bias_shift=0, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846933036414009344, .w_header_p0=846686625550303232 },
+{.n=8, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=6, .cm=3, .cm_p0=1, .w_bpt=200, .w_bpt_p0=80, .x_bpt=2504, .x_bpt_p0=840, .is_bias=1, .b_offset=58, .b_val_shift=9, .b_bias_shift=0, .x_header=1351089783815798784, .x_header_p0=198168279208951808, .w_header=1351336203269439488, .w_header_p0=198168279208951808 },
+{.n=8, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=236, .w_bpt_p0=164, .x_bpt=5000, .x_bpt_p0=3336, .is_bias=1, .b_offset=74, .b_val_shift=9, .b_bias_shift=0, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008660891911585792, .w_header_p0=1855492942081294336 },
+{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=19, .cm_p0=5, .w_bpt=248, .w_bpt_p0=80, .x_bpt=15816, .x_bpt_p0=4168, .is_bias=1, .b_offset=98, .b_val_shift=9, .b_bias_shift=0, .x_header=10430346632594718720, .x_header_p0=2359896100346789888, .w_header=10430593086408097792, .w_header_p0=2359896100346789888 }
};

#define X_BITS_L2 2
#define W_BITS_L2 2
#define PE_ROWS 8
#define PE_COLS 24

-#define W_BYTES 103636
-#define X_BYTES 648
-#define X_BYTES_ALL 105008
-#define Y_BYTES 73736
+#define WB_BYTES 20436
+#define W_BYTES 20096
+#define X_BYTES 2520
+#define X_BYTES_ALL 75896
+#define Y_BYTES 294920
+#define B_TYPE signed short
+#define B_WORDS 170
#define DATA_DIR "D:/dnn-engine/test/vectors"
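
The bias bookkeeping in the new header is self-consistent: each bundle consumes .coe * .t bias words, so .b_offset is a running sum (0, 16, 40, 58, 74, 98), the six bundles total B_WORDS = 170, and with B_TYPE a 2-byte signed short, WB_BYTES = W_BYTES + 2*B_WORDS = 20096 + 340 = 20436. (The seventh, dense bundle from the old header is gone, matching the "not dense" in the commit message.) A minimal standalone check of that arithmetic — the constants are copied from the header above; nothing here is part of the commit:

#include <assert.h>
#include <stdio.h>

// Each bundle consumes coe*t bias words (filters per tile * tiles),
// so .b_offset is a running sum and B_WORDS is the grand total.
int main(void) {
    const int coe[6]      = {2, 24, 3, 4, 8, 24};
    const int t[6]        = {8,  1, 6, 4, 3,  3};
    const int b_offset[6] = {0, 16, 40, 58, 74, 98};

    int off = 0;
    for (int ib = 0; ib < 6; ib++) {
        assert(off == b_offset[ib]);                   // running sum of coe*t
        off += coe[ib] * t[ib];
    }
    assert(off == 170);                                // == B_WORDS
    assert(20096 + 170 * (int)sizeof(short) == 20436); // W_BYTES + bias bytes == WB_BYTES
    printf("bias layout consistent: %d words\n", off);
    return 0;
}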

53 changes: 31 additions & 22 deletions c/runtime.h
@@ -1,15 +1,18 @@
typedef struct {
const int n, l, kw, coe, coe_tl, r_ll, h, w, w_kw2, t, p, cm, cm_p0;
const int w_bpt, w_bpt_p0, x_bpt, x_bpt_p0; // bytes per transfer
+const char is_bias;
+const int b_offset, b_val_shift, b_bias_shift;
const unsigned long long x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least)
} Bundle_t;

#include "model.h"

typedef struct {
-char w [W_BYTES ];
-char x [X_BYTES_ALL ];
-int y [Y_BYTES/4 ];
+char w [W_BYTES ];
+B_TYPE b [B_WORDS ]; // keep next to w. weights are loaded to w_ptr
+char x [X_BYTES_ALL ];
+int y [Y_BYTES/4 ];
} Memory_st;
Memory_st mem;

@@ -24,23 +27,23 @@ Memory_st mem;
#endif


-static inline void process_y(int val, int p_y, int ib, int ip, int it, int in, int il, int iw, int icoe, int iw_last, int ir){
+static inline void process_y(int val, int p_y, Bundle_t *p_bundle, int ib, int ip, int it, int in, int il, int iw, int icoe, int iw_last, int ir){

// ------ ADD P PASSES ------

-if (bundles[ib].p == 1) {} // only p : proceed with value
-else if (ip == bundles[ib].p-1) // last p : read, add, proceed
+if (p_bundle->p == 1) { // only p : proceed with value
+} else if (ip == p_bundle->p-1) {// last p : read, add, proceed
val += mem.y[p_y];
-else if (ip == 0) { // first p : overwrite memory, return
+} else if (ip == 0) { // first p : overwrite memory, return
mem.y[p_y] = val;
return;
-}
-else { // middle p: read, add, store, return
+} else { // middle p: read, add, store, return
mem.y[p_y] += val;
return;
}

+// ------ ADD BIAS ------
+if (p_bundle->is_bias)
+val = (val << p_bundle->b_val_shift) + (mem.b[p_bundle->b_offset + p_bundle->coe*it + icoe] << p_bundle-> b_bias_shift);

// ------ RELU + QUANT ------

@@ -58,6 +61,7 @@ static inline void process_y(int val, int p_y, int ib, int ip, int it, int in, i

extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, const unsigned int *p_sram_u32) {

+static Bundle_t *p_bundle = &bundles[0];
static int p_y=0;
static int ib=0, ip=0, it=0, in=0, il=0, iw=0;
const int *p_sram = (const int *)p_sram_u32;
@@ -67,16 +71,16 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
sprintf(path, "%s/%0d_%0d_%0d_y_sim.txt", DATA_DIR, ib, ip, it);
fp = fopen(path, "a");

-int w_last = iw == bundles[ib].w_kw2-1 ? bundles[ib].kw/2+1 : 1;
+int w_last = iw == p_bundle->w_kw2-1 ? p_bundle->kw/2+1 : 1;
int sram_addr=0;
-for (int icoe=0; icoe<bundles[ib].coe; icoe++)
+for (int icoe=0; icoe<p_bundle->coe; icoe++)
for (int iw_last=0; iw_last<w_last; iw_last++)
for (int ir=0; ir<PE_ROWS; ir++) {

int val = p_sram[sram_addr];
fprintf(fp,"%d\n", val);

-process_y(val, p_y, ib, ip, it, in, il, iw, icoe, iw_last, ir);
+process_y(val, p_y, p_bundle, ib, ip, it, in, il, iw, icoe, iw_last, ir);

p_y += 1;
sram_addr += 1;
@@ -87,20 +91,20 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

int p_y_prev;
// Nested for loop [for(ib) for(ip) for(it) for(il) for(in) for(iw) {}] inverted to increment once per call
-++ iw; if (iw >= bundles[ib].w_kw2) { iw = 0;
-++ in; if (in >= bundles[ib].n) { in = 0;
-++ il; if (il >= bundles[ib].l) { il = 0;
-++ it; if (it >= bundles[ib].t) { it = 0;
+++ iw; if (iw >= p_bundle->w_kw2) { iw = 0;
+++ in; if (in >= p_bundle->n) { in = 0;
+++ il; if (il >= p_bundle->l) { il = 0;
+++ it; if (it >= p_bundle->t) { it = 0;

// After each p
printf("done p!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
p_y_prev = p_y;
p_y=0;

-++ ip; if (ip >= bundles[ib].p) { ip = 0;
+++ ip; if (ip >= p_bundle->p) { ip = 0;

-// After each bundle
-printf("done bundle!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
+// After each p_bundle
+printf("done p_bundle!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
// Write to file at every it_done
sprintf(path, "%s/%0d_y_sim.txt", DATA_DIR, ib);
fp = fopen(path, "w");
@@ -110,7 +114,9 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

++ ib; if (ib >= N_BUNDLES) { ib = 0;
*p_done =1;
-}}}}}}
+}
+p_bundle = &bundles[ib];
+}}}}}
*pt_done_proc = !(*pt_done_proc);
}
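
The "nested for loop inverted to increment once per call" comment above is doing real work: load_y is called once per hardware transfer, so the loop indices live in static variables and each call runs one body iteration, then advances the indices like an odometer (and refreshes p_bundle when ib wraps). A toy sketch of the idiom, not the committed code:

#include <stdio.h>

// Each call to step() executes one iteration of what would otherwise be
// [for(i) for(j) { body }], with the indices persisting across calls.
static int i = 0, j = 0;                  // outer 0..2, inner 0..3

static void step(void) {
    printf("i=%d j=%d\n", i, j);          // the original loop body
    ++ j; if (j >= 4) { j = 0;            // inner index wraps...
    ++ i; if (i >= 3) { i = 0;            // ...carrying into the outer index
    }}
}

int main(void) {
    for (int k = 0; k < 12; k++) step();  // visits the same 3x4 sequence
    return 0;
}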

@@ -167,7 +173,7 @@ extern EXT_C void fill_memory (){
printf("ABORT! File not found: %s \n", path);
exit(1);
}
-fread(mem.w, 1, W_BYTES, fp);
+fread(mem.w, 1, WB_BYTES, fp);
fclose(fp);

sprintf(path, "%s/x_all.bin", DATA_DIR);
@@ -178,6 +184,9 @@
}
fread(mem.x, 1, X_BYTES_ALL, fp);
fclose(fp);

+for (int i=0; i<B_WORDS; i++)
+printf("i:%d, bias:%d\n", i, mem.b[i]);
}


43 changes: 33 additions & 10 deletions test/py/bundle.py
@@ -89,15 +89,15 @@ def extract_act(signature):
self.core['layer'] = QConv2DBatchnorm(
filters=self.core['filters'], kernel_size=self.core['kernel_size'], strides=self.core['strides'],
padding=self.core['padding'], kernel_quantizer=self.core['kernel_quantizer'],
-bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'])
+bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'], bias_initializer='glorot_uniform')

else:
for i in ['units', 'kernel_quantizer', 'bias_quantizer', 'use_bias', 'act_str']:
assert i in self.core, f"'{i}' must be provided for dense"

self.core['layer'] = QDense(
units=self.core['units'], kernel_quantizer=self.core['kernel_quantizer'],
-bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'])
+bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'], bias_initializer='glorot_uniform')

'''
CORE ACT LAYER
@@ -201,7 +201,7 @@ def load_weight_bias(self):
self.b = {'tensor':b_tensor, 'int':b_int, 'bits':b_config['bits'], 'frac':b_frac}


-def process(self, inp = None):
+def process(self, inp, c):

''' Integer test for output '''
self.out['int'] = self.out['tensor'].numpy() * 2**self.out['frac']
@@ -227,19 +227,40 @@ def process(self, inp, c):

self.y = copy.deepcopy(self.proc)

-self.post_process()
+self.post_process(c)


-def post_process(self):
+def post_process(self, c):

+def add (p, p_frac, p_bits, q, q_frac, q_bits):
+    '''
+    Add p,q while preserving precision
+    '''
+    p_intb, q_intb = p_bits-p_frac, q_bits-q_frac
+
+    r_frac = max(p_frac,q_frac)
+    r_intb = max(p_intb,q_intb)
+    r_bits = 1 + r_intb + r_frac # +1 to allow overflow
+
+    p_shift = r_frac-p_frac
+    q_shift = r_frac-q_frac
+
+    r = (p << p_shift) + (q << q_shift)
+    return (r, r_frac, r_bits), (p_shift, q_shift)
+
clog2_add = int(np.ceil(np.log2(np.prod(self.w['int'].shape[:-1]))))
self.proc['bits'] = self.inp['bits'] + self.w['bits'] + clog2_add
self.proc['frac'] = self.inp['frac'] + self.w['frac']

if self.b is not None:
-    self.proc['int'] += self.b['int'] * 2** (self.proc['frac'] - self.b['frac'])
-self.b_frac_shift = self.proc['frac'] - self.b['frac'] if self.b else None
-self.y_int_b = self.proc['int'] if self.b else None
+    (self.proc['int'], self.proc['frac'], self.proc['bits']), (self.bias_val_shift, self.bias_b_shift) = add(
+        self.proc['int'], self.proc['frac'], self.proc['bits'],
+        self.b ['int'], self.b ['frac'], self.b ['bits']
+    )
+    assert self.proc['bits'] <= c.INT_BITS, f"After bias addition, resulting bits {self.proc['bits']} are more than bits for integer in CPU {c.INT_BITS}. Reduce bits or increase integer bits of bias to continue"
+else:
+    self.bias_val_shift, self.bias_b_shift = 0, 0
+    self.y_int_b = self.proc['int']


if 'strides' in self.core and self.core['strides'] != (1,1):
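
This add() helper is where the header's shift constants come from: it promotes both fixed-point operands to a common format wide enough for either (plus one overflow bit) and returns the two alignment shifts, which bundle.py keeps as bias_val_shift/bias_b_shift and which appear to be what model.h emits as .b_val_shift/.b_bias_shift; process_y in runtime.h then replays the same (val << b_val_shift) + (bias << b_bias_shift). A C sketch of that logic — an assumption-level transliteration, not committed code; the operand widths in main() are invented, chosen so the shifts come out 9 and 0 as in the header above:

#include <stdio.h>

// A fixed-point value: raw integer, fractional bits, total bits.
typedef struct { long long val; int frac, bits; } fixp_t;

// Align p and q to a common format covering both operands' integer and
// fractional widths, plus one overflow bit; report the shifts applied.
static fixp_t fixp_add(fixp_t p, fixp_t q, int *p_shift, int *q_shift) {
    int p_intb = p.bits - p.frac, q_intb = q.bits - q.frac;
    int r_frac = p.frac > q.frac ? p.frac : q.frac;   // keep the finer fraction
    int r_intb = p_intb > q_intb ? p_intb : q_intb;   // keep the wider integer part
    *p_shift = r_frac - p.frac;                       // -> b_val_shift
    *q_shift = r_frac - q.frac;                       // -> b_bias_shift
    fixp_t r = { (p.val << *p_shift) + (q.val << *q_shift),
                 r_frac, 1 + r_intb + r_frac };       // +1 to allow overflow
    return r;
}

int main(void) {
    // e.g. a 12-bit accumulator with 4 fractional bits, plus a 29-bit
    // bias word with 13 fractional bits (made-up widths):
    fixp_t acc = { 25, 4, 12 }, bias = { 7, 13, 29 };
    int vs, bs;
    fixp_t sum = fixp_add(acc, bias, &vs, &bs);
    printf("val_shift=%d bias_shift=%d bits=%d\n", vs, bs, sum.bits); // 9, 0, 30
    return 0;
}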
@@ -394,6 +415,7 @@ def export (self, c):
y_int = self.y['int'].reshape(XN,1,1,CO) # (XN,CI) -> (XN, XH, XW, CI)
else:
y_int = self.y['int']
+p_int = self.y_int_b
w_int, x_int = self.w['int'], self.inp['int']

r = self.get_runtime_params(c, w_int.shape, x_int.shape, y_int.shape)
@@ -411,13 +433,15 @@
print(r)
self.check_sparsity(w_int, x_int)

-self.be = self.reorder_b_q2e_conv(self.b, c, r) if self.b else None
+self.be = self.reorder_b_q2e_conv(self.b['int'], c, r) if self.b else None
self.we = self.reorder_w_q2e_conv(w_int, c, r)
self.ye_exp_shape = (r.IT, r.XN, r.L, r.XW*r.CO_PRL, c.ROWS)
self.ye_hw = np.zeros(self.ye_exp_shape)

self.xe = self.reorder_x_q2e_conv(x_int, c, r)
self.ye_exp = self.reorder_y_q2e_conv(y_int, c, r)
+self.pe_exp = self.reorder_y_q2e_conv(p_int, c, r)
+print(f"x reshape: [int]:{self.inp['int'].shape}, int:{x_int.shape}. xe:{self.xe[0].shape}")

'''
Prepare expected outputs for each pass
@@ -549,7 +573,6 @@ def check_sparsity(w, x):
def reorder_b_q2e_conv(b, c, r):
b = np.pad(b, ((0,r.CO_PAD-r.CO)))
b = b.reshape(r.IT, r.CO_PRL)
-b = np.flip(b, axis=1)
return b

