From d6df90410ed11ebb1b1a85bbe9c840fd7b746991 Mon Sep 17 00:00:00 2001 From: Aba Date: Sat, 11 Nov 2023 01:59:33 -0800 Subject: [PATCH] Fix avg pool & act with shift=0 --- c/model.h | 18 +++++++++--------- c/runtime.h | 25 +++++++++++++++++++------ test/py/bundle.py | 35 +++++++++++++++++++---------------- test/py/param_test.py | 22 +++++++++++++--------- 4 files changed, 60 insertions(+), 40 deletions(-) diff --git a/c/model.h b/c/model.h index 6fc04ca..82cc673 100644 --- a/c/model.h +++ b/c/model.h @@ -1,12 +1,12 @@ #define N_BUNDLES 7 Bundle_t bundles [N_BUNDLES] = { - {.n=8 , .l=3 , .kw=11 , .coe=2 , .coe_tl=2 , .r_ll=2 , .h=18 , .w=18 , .ci=3 , .co=8 , .w_kw2=13 , .t=4 , .p=3 , .cm=1 , .cm_p0=1 , .xp_words=6048, .out_buffer_idx=0 , .w_bpt=140 , .w_bpt_p0=140 , .x_bpt=3032 , .x_bpt_p0=3032 , .o_words=5376 , .o_bytes=2696 , .is_bias=1 , .is_flatten=0 , .b_offset=0 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=2 , .ch=9 , .csh_shift=1 , .pkh=3 , .psh=2 , .ph=5 , .psh_shift=1 , .csw=1 , .cw=18 , .csw_shift=0 , .pkw=4 , .psw=3 , .pw=6 , .psw_shift=0 , .pool=POOL_MAX , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 17055749u, .x_header_p0= 17055749u, .w_header= 347372535813u, .w_header_p0= 17055749u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=6 , .t=1 , .p=1 , .cm=20 , .cm_p0=8 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=104 , .w_bpt_p0=104 , .x_bpt=2696 , .x_bpt_p0=2696 , .o_words=5376 , .o_bytes=2720 , .is_bias=0 , .is_flatten=0 , .b_offset=8 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=3 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81976u, .x_header_p0= 81976u, .w_header= 244276346936u, .w_header_p0= 81976u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , 
.kw=7 , .coe=3 , .coe_tl=2 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=3 , .t=3 , .p=4 , .cm=2 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .w_bpt=176 , .w_bpt_p0=176 , .x_bpt=680 , .x_bpt_p0=680 , .o_words=5376 , .o_bytes=2704 , .is_bias=1 , .is_flatten=0 , .b_offset=8 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81931u, .x_header_p0= 81931u, .w_header= 450434777099u, .w_header_p0= 81931u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=5 , .coe=4 , .coe_tl=4 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=4 , .t=2 , .p=2 , .cm=4 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=1352 , .x_bpt_p0=1352 , .o_words=5376 , .o_bytes=2704 , .is_bias=0 , .is_flatten=0 , .b_offset=17 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81946u, .x_header_p0= 81946u, .w_header= 656593207322u, .w_header_p0= 81946u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=3 , .coe=8 , .coe_tl=8 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=24 , .w_kw2=5 , .t=3 , .p=2 , .cm=6 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .w_bpt=224 , .w_bpt_p0=80 , .x_bpt=2024 , .x_bpt_p0=680 , .o_words=16128, .o_bytes=8080 , .is_bias=1 , .is_flatten=0 , .b_offset=17 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=24 , .x_header= 81961u, .x_header_p0= 81929u, .w_header= 
587873730601u, .w_header_p0= 81929u , .debug_nhwc_words=5760 }, - {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=24 , .co=10 , .w_kw2=6 , .t=1 , .p=2 , .cm=20 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=248 , .w_bpt_p0=56 , .x_bpt=6728 , .x_bpt_p0=1352 , .o_words=4200 , .o_bytes=2220 , .is_bias=0 , .is_flatten=1 , .b_offset=41 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=300, .x_header= 82072u, .x_header_p0= 81944u, .w_header= 656593207448u, .w_header_p0= 81944u , .debug_nhwc_words=2400 }, - {.n=1 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=8 , .h=8 , .w=1 , .ci=300 , .co=10 , .w_kw2=1 , .t=1 , .p=15 , .cm=20 , .cm_p0=20 , .xp_words=14 , .out_buffer_idx=-1, .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=148 , .x_bpt_p0=148 , .o_words=80 , .o_bytes=320 , .is_bias=1 , .is_flatten=0 , .b_offset=41 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .csh=1 , .ch=8 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=8 , .psh_shift=0 , .csw=1 , .cw=1 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=1 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=10 , .x_header= 152u, .x_header_p0= 152u, .w_header= 652835029144u, .w_header_p0= 152u , .debug_nhwc_words=80 } + {.n=8 , .l=3 , .kw=11 , .coe=2 , .coe_tl=2 , .r_ll=2 , .h=18 , .w=18 , .ci=3 , .co=8 , .w_kw2=13 , .t=4 , .p=3 , .cm=1 , .cm_p0=1 , .xp_words=6048, .out_buffer_idx=0 , .add_buffer_idx=-1, .w_bpt=140 , .w_bpt_p0=140 , .x_bpt=3032 , .x_bpt_p0=3032 , .o_words=5376 , .o_bytes=2696 , .is_bias=1 , .is_flatten=0 , .b_offset=0 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .add_act_shift=0 , .pool_act_shift=0 , .csh=2 , .ch=9 , .csh_shift=1 , .pkh=3 , .psh=2 , .ph=5 , .psh_shift=1 , .csw=1 , .cw=18 , .csw_shift=0 , 
.pkw=4 , .psw=3 , .pw=6 , .psw_shift=0 , .pool=POOL_AVG , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 17055749u, .x_header_p0= 17055749u, .w_header= 347372535813u, .w_header_p0= 17055749u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=6 , .t=1 , .p=1 , .cm=20 , .cm_p0=8 , .xp_words=672, .out_buffer_idx=1 , .add_buffer_idx=-1, .w_bpt=104 , .w_bpt_p0=104 , .x_bpt=2696 , .x_bpt_p0=2696 , .o_words=5376 , .o_bytes=2720 , .is_bias=1 , .is_flatten=0 , .b_offset=8 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=12 , .ca_pl_scale=0 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81976u, .x_header_p0= 81976u, .w_header= 244276346936u, .w_header_p0= 81976u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=7 , .coe=3 , .coe_tl=2 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=3 , .t=3 , .p=4 , .cm=2 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .add_buffer_idx=-1, .w_bpt=176 , .w_bpt_p0=176 , .x_bpt=680 , .x_bpt_p0=680 , .o_words=5376 , .o_bytes=2704 , .is_bias=0 , .is_flatten=0 , .b_offset=32 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=3 , .ca_pl_scale=0 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81931u, .x_header_p0= 81931u, .w_header= 450434777099u, .w_header_p0= 81931u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=5 , .coe=4 , .coe_tl=4 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=4 , .t=2 , .p=2 , .cm=4 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .add_buffer_idx=-1, .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=1352 , .x_bpt_p0=1352 , .o_words=5376 , .o_bytes=2704 , 
.is_bias=1 , .is_flatten=0 , .b_offset=32 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81946u, .x_header_p0= 81946u, .w_header= 656593207322u, .w_header_p0= 81946u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=3 , .coe=8 , .coe_tl=8 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=24 , .w_kw2=5 , .t=3 , .p=2 , .cm=6 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .add_buffer_idx=-1, .w_bpt=224 , .w_bpt_p0=80 , .x_bpt=2024 , .x_bpt_p0=680 , .o_words=16128, .o_bytes=8080 , .is_bias=1 , .is_flatten=0 , .b_offset=40 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=24 , .x_header= 81961u, .x_header_p0= 81929u, .w_header= 587873730601u, .w_header_p0= 81929u , .debug_nhwc_words=5760 }, + {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=24 , .co=10 , .w_kw2=6 , .t=1 , .p=2 , .cm=20 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .add_buffer_idx=-1, .w_bpt=248 , .w_bpt_p0=56 , .x_bpt=6728 , .x_bpt_p0=1352 , .o_words=4200 , .o_bytes=2220 , .is_bias=1 , .is_flatten=1 , .b_offset=64 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=300, .x_header= 82072u, .x_header_p0= 81944u, .w_header= 656593207448u, .w_header_p0= 81944u , .debug_nhwc_words=2400 }, + {.n=1 , 
.l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=8 , .h=8 , .w=1 , .ci=300 , .co=10 , .w_kw2=1 , .t=1 , .p=15 , .cm=20 , .cm_p0=20 , .xp_words=14 , .out_buffer_idx=-1, .add_buffer_idx=-1, .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=148 , .x_bpt_p0=148 , .o_words=80 , .o_bytes=320 , .is_bias=1 , .is_flatten=0 , .b_offset=88 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .add_act_shift=0 , .pool_act_shift=0 , .csh=1 , .ch=8 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=8 , .psh_shift=0 , .csw=1 , .cw=1 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=1 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=10 , .x_header= 152u, .x_header_p0= 152u, .w_header= 652835029144u, .w_header_p0= 152u , .debug_nhwc_words=80 } }; #define X_BITS_L2 2 @@ -17,7 +17,7 @@ Bundle_t bundles [N_BUNDLES] = { #define PE_COLS 24 #define N_BUF 2 -#define WB_BYTES 9954 +#define WB_BYTES 10048 #define W_BYTES 9824 #define X_BYTES 9096 #define O_WORDS 80 @@ -26,7 +26,7 @@ Bundle_t bundles [N_BUNDLES] = { #define X_BYTES_ALL 30220 #define Y_BYTES 110600 #define B_TYPE int16_t -#define B_WORDS 65 +#define B_WORDS 112 #define DATA_DIR "D:/dnn-engine/test/vectors" static const uint8_t X_POSITION_INVERTED_MASKS [] = { 240, 15 }; diff --git a/c/runtime.h b/c/runtime.h index 8ab5e29..b855ef6 100644 --- a/c/runtime.h +++ b/c/runtime.h @@ -11,11 +11,11 @@ #endif typedef const struct { - const int32_t n, l, kw, coe, coe_tl, r_ll, h, w, ci, co, w_kw2, t, p, cm, cm_p0, xp_words, out_buffer_idx; + const int32_t n, l, kw, coe, coe_tl, r_ll, h, w, ci, co, w_kw2, t, p, cm, cm_p0, xp_words, out_buffer_idx, add_buffer_idx; const int32_t w_bpt, w_bpt_p0, x_bpt, x_bpt_p0, o_words, o_bytes; // bytes per transfer const int8_t is_bias, is_pool, is_flatten; const int32_t b_offset, b_val_shift, b_bias_shift; - const int8_t ca_nzero, ca_shift, ca_pl_scale; + const int8_t ca_nzero, ca_shift, ca_pl_scale, add_act_shift, pool_act_shift; const int32_t csh, ch, csh_shift, pkh, psh, ph, psh_shift, csw, cw, 
csw_shift, pkw, psw, pw, psw_shift, pool, on, oh, ow, oc; const uint64_t x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least) const int32_t debug_nhwc_words; @@ -52,7 +52,7 @@ volatile char is_bundle_write_done = 1; #define max(x, y) ((x) > (y) ? (x) : (y)) #define min(x, y) ((x) < (y) ? (x) : (y)) #define clip(x, xmin, xmax) (((x) < (xmin)) ? (xmin) : ((x) > (xmax)) ? (xmax) : (x)) -#define shift_round(n, s) (((n) + (1<<((s)-1)) - (~((n)>>(s))&1) ) >> s) // === np.around(n/2**s).astype(int32_t) +#define shift_round(n, s) (((n) + ((s)>0 ? (1<<((s)-1)) - (~((n)>>(s))&1) : 0)) >> s) // === np.around(n/2**s).astype(int32_t) #define div_round(a, b) (((a)+((b)/2) - (~((b)|(a)/(b)) &1))/(b)) @@ -88,6 +88,14 @@ static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t assert_printf (packed_index , <, bundles[ib].o_bytes, "write_x", WRITEX_DEBUG_INFO); + // // ------ RESIDUAL ADD ---- + // if (bundles[ib].add_buffer_idx != -1){ + // uint8_t add_byte = mem.buffers[bundles[ib].add_buffer_idx][packed_index]; + // uint8_t add_byte_cleaned = X_POSITION_INVERTED_MASKS[packed_position] & add_byte; + // uint8_t add_byte_unpacked = (add_byte_cleaned >> (packed_position * X_BITS)) & X_BITS_MASK; + // int8_t add_val = add_byte_unpacked | ~X_BITS_MASK; + // } + uint8_t packed_val = ((uint8_t)val & X_BITS_MASK) << (packed_position * X_BITS); uint8_t mem_val = p_out_buffer[packed_index]; uint8_t mem_val_cleaned = X_POSITION_INVERTED_MASKS[packed_position] & mem_val; @@ -322,10 +330,15 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ result = pb->pool==POOL_MAX ? max(result, read_val) : (result + read_val); } } - int32_t count = (ph_end-ph_beg)*(pw_end-pw_beg); - result = pb->pool==POOL_MAX ?
result : div_round(result, count); - // ------ POOL ACTIVATION ------ + // ------ AVG POOL: Divide & Activation ------ + if (pb->pool == POOL_AVG) { + int32_t count = (ph_end-ph_beg)*(pw_end-pw_beg); + result = div_round(result, count); + result = shift_round(result, pb->pool_act_shift); + result = clip(result, -(1<<(X_BITS-1)), (1<<(X_BITS-1))-1); + } + tile_write(result, p_out_buffer, ib, pb, i_yn, ixh, ixw, i_yc, yn, pb->ph, pb->pw, yc); // Write } } diff --git a/test/py/bundle.py b/test/py/bundle.py index f263089..8b26dc3 100644 --- a/test/py/bundle.py +++ b/test/py/bundle.py @@ -175,7 +175,7 @@ def call(self, x, x_1=None): if x_1 is not None: if hasattr(x_1, "bundle"): self.add['bundle'] = x_1.bundle - self.x_1.out_tensor_dest += [self.idx] + x_1.bundle.out_tensor_dest += [self.idx] else: self.add['bundle'] = None x = Add()([x, x_1]) @@ -304,7 +304,8 @@ def add (p, p_frac, p_bits, q, q_frac, q_bits): def shift_round(n,s): '''Performs integer division with round-to-nearest-even. Eq: np.around(n/2**s).astype(int)''' - return (n + (1<<(s-1)) - (~(n>>s)&1) ) >> s + half_b = 1<<(s-1) if s>0 else 0 + return (n + half_b - (s>0)*(~(n>>s)&1) ) >> s def div_round(n,d): '''Performs integer division with round-to-nearest-even for d>0. 
@@ -330,16 +331,16 @@ def apply_act(act_dict): if self.add is not None: a = self.add['bundle'] - out_frac_add, out_bits_add = max(self.proc['frac'], a.out['frac']), max(self.proc['bits'], a.out['bits']) - a_arr_cast = a.out['int'] * 2** (out_frac_add - a.out['frac']) - out_arr_cast = self.proc['int'] * 2 **(out_frac_add - self.proc['frac']) - - self.proc['int'] = out_arr_cast.astype(np.int64) + a_arr_cast.astype(np.int64) - self.proc['bits'], self.proc['frac'] = out_bits_add, out_frac_add + (self.proc['int'], self.proc['frac'], self.proc['bits']), (self.add_val_shift, self.add_a_shift) = add( + self.proc['int'] , self.proc['frac'], self.proc['bits'], + a.out ['int'].astype(int), a.out ['frac'], a.out ['bits'] + ) + assert self.proc['bits'] <= c.INT_BITS, f"After residual addition, resulting bits {self.proc['bits']} are more than bits for integer in CPU {c.INT_BITS}. Reduce bits or increase integer bits of bias to continue" apply_act(self.add['act']) - assert np.all(self.proc['int'] == self.add['tensor'].numpy() * 2**self.proc['frac']), f"Add + act output of bundle {self.idx} is not a fixed point" + else: + self.add_val_shift, self.add_a_shift = 0, 0 if self.pool_layer: @@ -368,7 +369,7 @@ def apply_act(act_dict): q_st = max((PSW*(PXW-1)+PKW-YW)//2, 0) for n in range(YN): - for c in range(YC): + for ic in range(YC): for iyh in range(YH): for iyw in range(YW): @@ -404,14 +405,14 @@ def apply_act(act_dict): for ipyw in range(pw_end, pw_beg,-1): if self.pool['type']=='max': - result = max(result, in_arr[n,ipyh,ipyw,c]) + result = max(result, in_arr[n,ipyh,ipyw,ic]) else: - result += in_arr[n,ipyh,ipyw,c] + result += in_arr[n,ipyh,ipyw,ic] count = (ph_end-ph_beg)*(pw_end-pw_beg) - result = result if self.pool['type']=='max' else result/count + result = result if self.pool['type']=='max' else div_round(result, count) ''' Writing ''' - out_arr[n,ixh,ixw,c] = result + out_arr[n,ixh,ixw,ic] = result pw_beg += PSW # move pooling window by stride pw_end = min(pw_end+PSW, 
YW-1) @@ -419,8 +420,10 @@ def apply_act(act_dict): ph_end = min(ph_end+PSH, YH-1) self.proc['int'] = out_arr - self.proc['bits'] += 4 - # apply_act(self.pool['act']) + if self.pool['type'] == 'avg': + self.proc['bits'] += int(np.ceil(np.log2(PKH*PKW))) + assert self.proc['bits'] <= c.INT_BITS, f"When summing avg pool, resulting bits {self.proc['bits']} are more than bits for integer in CPU {c.INT_BITS}. Reduce bits or increase integer bits of bias to continue" + apply_act(self.pool['act']) assert np.all(self.proc['int'] == self.pool['tensor'].numpy() * 2**self.proc['frac']), f"Pool + act output of bundle {self.idx} is not a fixed point" if self.flatten: diff --git a/test/py/param_test.py b/test/py/param_test.py index 3ac4592..1eb9970 100644 --- a/test/py/param_test.py +++ b/test/py/param_test.py @@ -190,13 +190,13 @@ def test_dnn_engine(COMPILE): x = x_in = Input(input_shape[1:], name='input') x = QActivation(xq)(x) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0)' }, pool= {'type':'max', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({c.X_BITS},0,False,False,1)'})(x) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_bits({c.X_BITS},0,False,False,1)' },)(x) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':f'quantized_bits({c.X_BITS},0,False,True,1)' },)(x) - x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 
'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'},)(x) - x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0)' },)(x) - x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'}, flatten= True)(x) - x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'})(x) + x = x_skip = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':(11,11), 'strides':(2,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0)' }, pool= {'type':'avg', 'size':(3,4), 'strides':(2,3), 'padding':'same', 'act_str':f'quantized_bits({c.X_BITS},0,False,False,1)'})(x) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_bits({c.X_BITS},0,False,False,1)' },)(x) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 7, 7), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':False, 'act_str':f'quantized_bits({c.X_BITS},0,False,True,1)' },)(x) # add = {'act_str':f'quantized_bits({c.X_BITS},0,False,True,1)'})(x, x_skip) + x = Bundle( core= {'type':'conv' , 'filters':8 , 'kernel_size':( 5, 5), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'},)(x) + x = Bundle( core= {'type':'conv' , 'filters':24, 'kernel_size':( 3, 3), 'strides':(1,1), 
'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0)' },)(x) + x = Bundle( core= {'type':'conv' , 'filters':10, 'kernel_size':( 1, 1), 'strides':(1,1), 'padding':'same', 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'}, flatten= True)(x) + x = Bundle( core= {'type':'dense', 'units' :10, 'kernel_quantizer':kq, 'bias_quantizer':bq, 'use_bias':True , 'act_str':f'quantized_relu({c.X_BITS},0,negative_slope=0.125)'})(x) model = Model(inputs=x_in, outputs=x) @@ -309,18 +309,22 @@ def test_dnn_engine(COMPILE): ca_nzero, ca_shift, ca_pl_scale = b.core['act']['non_zero'], b.core['act']['shift_bits'], b.core['act']['plog_slope'] + add_act_shift = b.add['act']['shift_bits'] if b.add is not None else 0 + add_buffer_idx = b.add['bundle'].idx if b.add is not None else -1 + if b.pool is None: pool_type = 'POOL_NONE' elif b.pool['type'] == 'max': pool_type = 'POOL_MAX' elif b.pool['type'] == 'avg': pool_type = 'POOL_AVG' + pool_act_shift = b.pool['act']['shift_bits'] if b.pool is not None else 0 - ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, .out_buffer_idx={b.buffer_idx:<2}, ") + ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, .out_buffer_idx={b.buffer_idx:<2}, .add_buffer_idx={add_buffer_idx:<2}, ") ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, 
.x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") - ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, ") + ch.write( f".ca_nzero={ca_nzero:<3}, .ca_shift={ca_shift:<3}, .ca_pl_scale={ca_pl_scale:<3}, .add_act_shift={add_act_shift:<3}, .pool_act_shift={pool_act_shift:<3}, ") ch.write( f".csh={b.r.CSH:<3}, .ch={b.r.CYH:<3}, .csh_shift={b.r.CSH_SHIFT:<3}, .pkh={b.r.PKH:<3}, .psh={b.r.PSH:<3}, .ph={b.r.PYH:<3}, .psh_shift={b.r.PSH_SHIFT:<3}, .csw={b.r.CSW:<3}, .cw={b.r.CYW:<3}, .csw_shift={b.r.CSW_SHIFT:<3}, .pkw={b.r.PKW:<3}, .psw={b.r.PSW:<3}, .pw={b.r.PYW:<3}, .psw_shift={b.r.PSW_SHIFT:<3}, .pool={pool_type:<10}, .on={b.r.ON:<3}, .oh={b.r.OH:<3}, .ow={b.r.OW:<3}, .oc={b.r.OC:<3}, ") ch.write( f".x_header={b.r.x_header_le_p[-1][0]:>23}u, .x_header_p0={b.r.x_header_le_p[0][0]:>23}u, .w_header={b.r.w_header_le_p[-1][0]:>23}u, .w_header_p0={b.r.x_header_le_p[0][0]:>25}u , ") ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size:<5} }}")