diff --git a/c/runtime.h b/c/runtime.h index 9a05f20..003fb1d 100644 --- a/c/runtime.h +++ b/c/runtime.h @@ -15,7 +15,7 @@ typedef struct { const char is_bias, is_pool, is_flatten; const int b_offset, b_val_shift, b_bias_shift; const signed char ca_nzero, ca_shift, ca_pl_scale; - const int csh, ch, csh_shift, pkh, psh, ph, psh_shift, csw, cw, csw_shift, pkw, psw, pw, psw_shift, p_type, on, oh, ow, oc; + const int csh, ch, csh_shift, pkh, psh, ph, psh_shift, csw, cw, csw_shift, pkw, psw, pw, psw_shift, pool, on, oh, ow, oc; const unsigned long long x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least) const int debug_nhwc_words; } Bundle_t; @@ -36,42 +36,42 @@ typedef struct { } Memory_st; Memory_st mem; -#define max(x, y) (x > y ? x : y) -#define min(x, y) (x < y ? x : y) -#define clip(x, min, max) ((x < min) ? min : (x > max) ? max : x) -#define shift_round(n, s) ((n + (1<<(s-1)) - (~(n>>s)&1) ) >> s) // === np.around(n/2**s).astype(int) -#define div_round(a, b) ((a+(b/2) - (~(b|a/b) &1))/b) +#define max(x, y) ((x) > (y) ? (x) : (y)) +#define min(x, y) ((x) < (y) ? (x) : (y)) +#define clip(x, xmin, xmax) (((x) < (xmin)) ? (xmin) : ((x) > (xmax)) ? (xmax) : (x)) +#define shift_round(n, s) (((n) + (1<<((s)-1)) - (~((n)>>(s))&1) ) >> s) // === np.around(n/2**s).astype(int) +#define div_round(a, b) (((a)+((b)/2) - (~((b)|(a)/(b)) &1))/(b)) #define assert_printf(debug_info, condition,...) ((condition) || (printf(#condition), printf(__VA_ARGS__), printf(debug_info), assert(condition), 0)) static inline int quant_lrelu(int x, signed char nzero, signed char shift, signed char pl_scale){ - x = ((x<0)*x)*nzero + (((x>0)*x) << pl_scale); + x = x < 0 ? x*nzero : x << pl_scale; x = shift_round(x, shift); x = clip(x, -(1<<(X_BITS-pl_scale-1)), (1<<(X_BITS-1))-1); return x; } -static inline void write_x(signed char val, int ib, int ixp, int ixn, int ixl, int ixw, int ixcm, int ixr, Bundle_t *p_bo, int xcm ){ +static inline void write_x(signed char val, int ib, int ixp, int ixn, int ixl, int ixw, int ixcm, int ixr, Bundle_t *pb_out, int xcm ){ #define DBG "--- ib:%d ixp:%d ixn:%d ixl:%d ixw:%d ixcm:%d ixr:%d xcm :%d \n",ib,ixp,ixn,ixl,ixw,ixcm,ixr,xcm assert_printf(DBG, ixr < PE_ROWS+X_PAD, "ixr < PE_ROWS+X_PAD"); - assert_printf(DBG, ixcm < xcm , "ixcm < xcm "); - assert_printf(DBG, ixw < p_bo->w , "ixw < p_bo->w "); - assert_printf(DBG, ixl < p_bo->l , "ixl < p_bo->l "); - assert_printf(DBG, ixn < p_bo->n , "ixn < p_bo->n "); - assert_printf(DBG, ixp < p_bo->p , "ixp < p_bo->p "); - - int p_offset = (ixp == 0) ? 0 : (p_bo->cm_p0 + (ixp-1)*p_bo->cm) *p_bo->n*p_bo->l*p_bo->w*(PE_ROWS+X_PAD); - int flat_index_n2r = (((ixn*p_bo->l + ixl)*p_bo->w + ixw)*xcm + ixcm)*(PE_ROWS+X_PAD) + ixr; // multidim_index -> flat_index [n,l,w,cm,r] + assert_printf(DBG, ixcm < xcm , "ixcm < xcm "); + assert_printf(DBG, ixw < pb_out->w , "ixw < pb_out->w "); + assert_printf(DBG, ixl < pb_out->l , "ixl < pb_out->l "); + assert_printf(DBG, ixn < pb_out->n , "ixn < pb_out->n "); + assert_printf(DBG, ixp < pb_out->p , "ixp < pb_out->p "); + + int p_offset = (ixp == 0) ? 0 : (pb_out->cm_p0 + (ixp-1)*pb_out->cm) *pb_out->n*pb_out->l*pb_out->w*(PE_ROWS+X_PAD); + int flat_index_n2r = (((ixn*pb_out->l + ixl)*pb_out->w + ixw)*xcm + ixcm)*(PE_ROWS+X_PAD) + ixr; // multidim_index -> flat_index [n,l,w,cm,r] mem.nx[p_offset + flat_index_n2r] = val; } -static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn, int i_yh, int i_yw, int i_yc, int yn, int yh, int yw, int yc ) { +static inline void tile_write( int out_val, int ib, Bundle_t *pb, int i_yn, int i_yh, int i_yw, int i_yc, int yn, int yh, int yw, int yc ) { // ------ FLATTEN ------ - if (p_bundle->is_flatten) { + if (pb->is_flatten) { i_yc = (i_yh*yw + i_yw)*yc + i_yc; // (H*W*C) -> C i_yw = 0; // W=1 i_yh = i_yn; // N -> H @@ -84,25 +84,25 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn } // Check - assert_printf ("", yn == p_bundle->on, ": yn"); - assert_printf ("", yh == p_bundle->oh, ": yh"); - assert_printf ("", yw == p_bundle->ow, ": yw"); - assert_printf ("", yc == p_bundle->oc, ": yc"); + assert_printf ("", yn == pb->on, ": yn"); + assert_printf ("", yh == pb->oh, ": yh"); + assert_printf ("", yw == pb->ow, ": yw"); + assert_printf ("", yc == pb->oc, ": yc"); // ------ TILING: Calculate X coordinates ------ // y [n,h,w,c] -> x[p, n, l, w,cmp, r+pad] - Bundle_t* p_bo = ib == N_BUNDLES-1 ? &bundles[ib] : &bundles[ib+1]; - char yp_first = i_yc < p_bo->cm_p0; + Bundle_t* pb_out = ib == N_BUNDLES-1 ? &bundles[ib] : &bundles[ib+1]; + char yp_first = i_yc < pb_out->cm_p0; div_t div_oh = div(i_yh, PE_ROWS); int i_yr = div_oh.rem; int i_yl = div_oh.quot; - div_t div_oc = div(i_yc-p_bo->cm_p0, p_bo->cm); + div_t div_oc = div(i_yc-pb_out->cm_p0, pb_out->cm); int i_yp = yp_first ? 0 : div_oc.quot + 1; int i_ycm = yp_first ? i_yc : div_oc.rem; - int ycm = yp_first ? p_bo->cm_p0 : p_bo->cm ; + int ycm = yp_first ? pb_out->cm_p0 : pb_out->cm ; // ------ STORE ------ @@ -123,13 +123,13 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn int yr_sweep = i_yh==yh-1 ? PE_ROWS : i_yr + 1; for (int i_yr_dest = i_yr; i_yr_dest < yr_sweep; i_yr_dest++) { - write_x(out_val, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, p_bo, ycm); + write_x(out_val, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, pb_out, ycm); // --- PADDING: the [bottom X_PAD rows of previous block (l-1)] with [first X_PAD rows of this block (l)] if (i_yr_dest < X_PAD) { int pad_val = (i_yl == 0) ? 0 : out_val; - int dest_yl = (i_yl == 0) ? p_bo->l-1 : i_yl-1; - write_x(pad_val, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, p_bo, ycm); + int dest_yl = (i_yl == 0) ? pb_out->l-1 : i_yl-1; + write_x(pad_val, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, pb_out, ycm); } out_val = 0; } @@ -139,7 +139,7 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, const unsigned int *p_sram_u32) { - static Bundle_t *p_bundle = &bundles[0]; + static Bundle_t *pb = &bundles[0]; static int it_bias=0; static int ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0; const int *p_sram = (const int *)p_sram_u32; @@ -156,9 +156,9 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c FILE *fp_sum = fopen(f_path_sum, "a"); //New iw_kw2: - int w_last = iw_kw2 == p_bundle->w_kw2-1 ? p_bundle->kw/2+1 : 1; + int w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1; int sram_addr=0; - for (int icoe=0; icoecoe; icoe++) { + for (int icoe=0; icoecoe; icoe++) { int i_bias = it_bias + icoe; for (int iw_last=0; iw_lastcoe*it + icoe; + int i_yc = pb->coe*it + icoe; // Save y_dims - int yn = p_bundle->n; - int yh = p_bundle->h; - int yw = p_bundle->w; - int yc = p_bundle->co; - + int yn = pb->n; + int yh = pb->h; + int yw = pb->w; + int yc = pb->co; // if out of bounds, early return if (i_yh >= yh || i_yc >= yc) { - if (ip == p_bundle->p-1) + if (ip == pb->p-1) fprintf(fp_sum,"%d\n", 0); // Save summed output goto PROCESS_AND_STORE_DONE; } @@ -198,8 +197,8 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c // ------ ADD P PASSES ------ iy_nhwc = ((i_yn*yh + i_yh)*yw + i_yw)*yc + i_yc; - if (p_bundle->p == 1) { // only p : proceed with value - } else if (ip == p_bundle->p-1) {// last p : read, add, proceed + if (pb->p == 1) { // only p : proceed with value + } else if (ip == pb->p-1) {// last p : read, add, proceed out_val += mem.nhwc[iy_nhwc]; } else if (ip == 0) { // first p : overwrite memory, return mem.nhwc[iy_nhwc] = out_val; @@ -212,24 +211,24 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c // ------ CONV STRIDING ------ - div_ch = div(i_yh-p_bundle->csh_shift, p_bundle->csh); - div_cw = div(i_yw-p_bundle->csw_shift, p_bundle->csw); + div_ch = div(i_yh-pb->csh_shift, pb->csh); + div_cw = div(i_yw-pb->csw_shift, pb->csw); if (div_ch.rem != 0 || div_cw.rem != 0) goto PROCESS_AND_STORE_DONE; i_yh = div_ch.quot; // update indices and dimensions i_yw = div_cw.quot; - yh = p_bundle->ch; - yw = p_bundle->cw; + yh = pb->ch; + yw = pb->cw; // ------ ADD BIAS ------ - if (p_bundle->is_bias) - out_val = (out_val << p_bundle->b_val_shift) + (mem.b[i_bias] << p_bundle->b_bias_shift); + if (pb->is_bias) + out_val = (out_val << pb->b_val_shift) + (mem.b[i_bias] << pb->b_bias_shift); // ------ CORE ACT ------ - out_val = quant_lrelu(out_val, p_bundle->ca_nzero, p_bundle->ca_shift, p_bundle->ca_pl_scale); + out_val = quant_lrelu(out_val, pb->ca_nzero, pb->ca_shift, pb->ca_pl_scale); @@ -238,8 +237,8 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c // ------ MAX/AVG POOL --- - if (p_bundle->p_type == POOL_NONE) { - tile_write(out_val, ib, p_bundle, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc); + if (pb->pool == POOL_NONE) { + tile_write(out_val, ib, pb, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc); goto PROCESS_AND_STORE_DONE; } @@ -251,14 +250,15 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c iy_nhwc = ((i_yn*yh + i_yh)*yw + i_yw)*yc + i_yc; // store as nhwc for pooling mem.nhwc[iy_nhwc] = out_val; - div_ixh = div(i_yh+p_bundle->psh_shift-p_bundle->pkh+1, p_bundle->psh); - div_ixw = div(i_yw+p_bundle->psw_shift-p_bundle->pkw+1, p_bundle->psw); + div_ixh = div(i_yh+pb->psh_shift-pb->pkh+1, pb->psh); + div_ixw = div(i_yw+pb->psw_shift-pb->pkw+1, pb->psw); ixh_beg = div_ixh.quot; // ix(hw) that corresponds to the pooling window ixw_beg = div_ixw.quot; if (ixh_beg < 0 || ixw_beg < 0) // skip when target ix(h,w) < 0 goto PROCESS_AND_STORE_DONE; + // Pool Striding if (div_ixh.rem != 0) // invalid ixh if (i_yh==yh-1) ixh_beg += 1; //but last yh. start sweeping else goto PROCESS_AND_STORE_DONE; // not last yh. skip @@ -269,18 +269,18 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c ph_end = i_yh; // iy(h,w) is the bottom-right of pooling window -> All values in pooling window have been computed pw_end = i_yw; - ph_beg_const = max(p_bundle->psh*ixh_beg-p_bundle->psh_shift, 0)-1; // p(h,w)_beg is the index of top left corner of pooling window. If negative, set to zero - pw_beg_const = max(p_bundle->psw*ixw_beg-p_bundle->psw_shift, 0)-1; + ph_beg_const = max(pb->psh*ixh_beg-pb->psh_shift, 0)-1; // p(h,w)_beg is the index of top left corner of pooling window. If negative, set to zero + pw_beg_const = max(pb->psw*ixw_beg-pb->psw_shift, 0)-1; - xh_sweep = i_yh == yh-1 ? p_bundle->ph : ixh_beg+1; // ix(hw) is sweeped from ix(hw)_beg to x(h,w)_sweep. Normally sweep is 1. - xw_sweep = i_yw == yw-1 ? p_bundle->pw : ixw_beg+1; // But when iy(h,w) is at its edges, need to compute remaining ix(hw) pixels by sweeping + xh_sweep = i_yh == yh-1 ? pb->ph : ixh_beg+1; // ix(hw) is sweeped from ix(hw)_beg to x(h,w)_sweep. Normally sweep is 1. + xw_sweep = i_yw == yw-1 ? pb->pw : ixw_beg+1; // But when iy(h,w) is at its edges, need to compute remaining ix(hw) pixels by sweeping // Sweep the pooling window - for (int ixh = ixh_beg, ph_beg = ph_beg_const; ixh < xh_sweep; ixh++, ph_beg += p_bundle->psh) { - for (int ixw = ixw_beg, pw_beg = pw_beg_const; ixw < xw_sweep; ixw++, pw_beg += p_bundle->psw) { + for (int ixh = ixh_beg, ph_beg = ph_beg_const; ixh < xh_sweep; ixh++, ph_beg += pb->psh) { + for (int ixw = ixw_beg, pw_beg = pw_beg_const; ixw < xw_sweep; ixw++, pw_beg += pb->psw) { // Traverse each pool window & perform pooling - int result = p_bundle->p_type == POOL_MAX ? INT_MIN : 0; + int result = pb->pool == POOL_MAX ? INT_MIN : 0; for (int ipyh = ph_end; ipyh > ph_beg; ipyh--){ for (int ipyw = pw_end; ipyw > pw_beg; ipyw--){ @@ -290,18 +290,18 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c assert_printf ("read", i_yc < yc, ": i_yc < yc"); int read_val = mem.nhwc[((i_yn*yh + ipyh)*yw + ipyw)*yc + i_yc]; - result = p_bundle->p_type==POOL_MAX ? max(result, read_val) : (result + read_val); + result = pb->pool==POOL_MAX ? max(result, read_val) : (result + read_val); } } int count = (ph_end-ph_beg)*(pw_end-pw_beg); - result = p_bundle->p_type==POOL_MAX ? result : div_round(result, count); + result = pb->pool==POOL_MAX ? result : div_round(result, count); // ------ POOL ACTIVATION ------ - tile_write(result, ib, p_bundle, i_yn, ixh, ixw, i_yc, yn, p_bundle->ph, p_bundle->pw, yc); // Write + tile_write(result, ib, pb, i_yn, ixh, ixw, i_yc, yn, pb->ph, pb->pw, yc); // Write } } - yh = p_bundle->ph; - yw = p_bundle->pw; + yh = pb->ph; + yw = pb->pw; PROCESS_AND_STORE_DONE: @@ -317,25 +317,25 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c //Nested for loop [for(ib) for(ip) for(it) for(il) for(in) for(iw_kw2) {}] // inverted to increment once per call - ++iw_kw2; if (iw_kw2 >= p_bundle->w_kw2) { iw_kw2 = 0; //after_each(in) = after_all(iw_kw2): - ++il; if (il >= p_bundle->l) { il = 0; //after_each(in) = after_all(il): - ++in; if (in >= p_bundle->n) { in = 0; //after_each(it) = after_all(in): - ++it; if (it >= p_bundle->t) { it = 0; //after_each(ip) = after_all(it): - ++ip; if (ip >= p_bundle->p) { ip = 0; //after_each(ib) = after_all(ip): + ++iw_kw2; if (iw_kw2 >= pb->w_kw2) { iw_kw2 = 0; //after_each(in) = after_all(iw_kw2): + ++il; if (il >= pb->l) { il = 0; //after_each(in) = after_all(il): + ++in; if (in >= pb->n) { in = 0; //after_each(it) = after_all(in): + ++it; if (it >= pb->t) { it = 0; //after_each(ip) = after_all(it): + ++ip; if (ip >= pb->p) { ip = 0; //after_each(ib) = after_all(ip): printf("done bundle!! iw_kw2:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw_kw2, in, il, it, ip, ib); char f_path_tiled [1000]; sprintf(f_path_tiled, "%s/%0d_y_tiled_sim.txt", DATA_DIR, ib); FILE *fp_tiled = fopen(f_path_tiled, "w"); - for (int i=0; io_bytes; i++) + for (int i=0; io_bytes; i++) fprintf(fp_tiled,"%d\n", ib == N_BUNDLES-1 ? mem.y[i] : mem.nx[i]); fclose(fp_tiled); char f_path_debug [1000]; sprintf(f_path_debug, "%s/%0d_y_nhwc_sim.txt", DATA_DIR, ib); FILE *fp_debug = fopen(f_path_debug, "w"); - for (int i=0; idebug_nhwc_words; i++) + for (int i=0; idebug_nhwc_words; i++) fprintf(fp_debug,"%d\n", mem.debug_nhwc[i]); fclose(fp_debug); @@ -343,10 +343,10 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c ++ib; if (ib >= N_BUNDLES) { ib = 0; // after_all(ib): *p_done = 1; }//new(ib): - p_bundle = &bundles[ib]; + pb = &bundles[ib]; }//new(ip): }//new(it): - it_bias = p_bundle->b_offset + p_bundle->coe*it; + it_bias = pb->b_offset + pb->coe*it; }//new(in): }//new(il): }//new(iw_kw2): diff --git a/test/py/param_test.py b/test/py/param_test.py index b198f2f..f6425c2 100644 --- a/test/py/param_test.py +++ b/test/py/param_test.py @@ -307,7 +307,7 @@ def test_dnn_engine(COMPILE): ch.write( f".is_bias={1*(b.b is not None)}, .is_flatten={1*b.flatten}, ") ch.write( f".b_offset={b_words}, .b_val_shift={b.bias_val_shift}, .b_bias_shift={b.bias_b_shift}, ") ch.write( f".ca_nzero={ca_nzero}, .ca_shift={ca_shift}, .ca_pl_scale={ca_pl_scale}, ") - ch.write( f".csh={b.r.CSH}, .ch={b.r.CYH}, .csh_shift={b.r.CSH_SHIFT}, .pkh={b.r.PKH}, .psh={b.r.PSH}, .ph={b.r.PYH}, .psh_shift={b.r.PSH_SHIFT}, .csw={b.r.CSW}, .cw={b.r.CYW}, .csw_shift={b.r.CSW_SHIFT}, .pkw={b.r.PKW}, .psw={b.r.PSW}, .pw={b.r.PYW}, .psw_shift={b.r.PSW_SHIFT}, .p_type={pool_type}, .on={b.r.ON}, .oh={b.r.OH}, .ow={b.r.OW}, .oc={b.r.OC}, ") + ch.write( f".csh={b.r.CSH}, .ch={b.r.CYH}, .csh_shift={b.r.CSH_SHIFT}, .pkh={b.r.PKH}, .psh={b.r.PSH}, .ph={b.r.PYH}, .psh_shift={b.r.PSH_SHIFT}, .csw={b.r.CSW}, .cw={b.r.CYW}, .csw_shift={b.r.CSW_SHIFT}, .pkw={b.r.PKW}, .psw={b.r.PSW}, .pw={b.r.PYW}, .psw_shift={b.r.PSW_SHIFT}, .pool={pool_type}, .on={b.r.ON}, .oh={b.r.OH}, .ow={b.r.OW}, .oc={b.r.OC}, ") ch.write( f".x_header={b.r.x_header_be_p[-1][0]}, .x_header_p0={b.r.x_header_be_p[0][0]}, .w_header={b.r.w_header_be_p[-1][0]}, .w_header_p0={b.r.x_header_be_p[0][0]} , ") ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size} }}")