Skip to content

Commit

Permalink
Refactor: rename p_bundle, p_bo; change relu to conditional (for arm)
Browse files Browse the repository at this point in the history
  • Loading branch information
Aba committed Oct 31, 2023
1 parent 2370ab0 commit 1d5a15d
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 74 deletions.
146 changes: 73 additions & 73 deletions c/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ typedef struct {
const char is_bias, is_pool, is_flatten;
const int b_offset, b_val_shift, b_bias_shift;
const signed char ca_nzero, ca_shift, ca_pl_scale;
const int csh, ch, csh_shift, pkh, psh, ph, psh_shift, csw, cw, csw_shift, pkw, psw, pw, psw_shift, p_type, on, oh, ow, oc;
const int csh, ch, csh_shift, pkh, psh, ph, psh_shift, csw, cw, csw_shift, pkw, psw, pw, psw_shift, pool, on, oh, ow, oc;
const unsigned long long x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least)
const int debug_nhwc_words;
} Bundle_t;
Expand All @@ -36,42 +36,42 @@ typedef struct {
} Memory_st;
Memory_st mem;

// Arithmetic helper macros.
// Every argument is parenthesized so compound expressions expand as written.
// Arguments are evaluated more than once: pass only side-effect-free expressions.

// Larger / smaller of two values.
#define max(x, y) ((x) > (y) ? (x) : (y))
#define min(x, y) ((x) < (y) ? (x) : (y))

// Saturate x into the closed range [xmin, xmax].
#define clip(x, xmin, xmax) (((x) < (xmin)) ? (xmin) : ((x) > (xmax)) ? (xmax) : (x))

// Rounding right shift: === np.around(n/2**s).astype(int)  (ties go to the even result).
// Requires s >= 1, because (1<<((s)-1)) is undefined for s == 0.
#define shift_round(n, s) (((n) + (1<<((s)-1)) - (~((n)>>(s))&1) ) >> (s))

// Rounding integer division a/b, ties to even.
// NOTE(review): checked for non-negative a and b > 0 only; confirm before using with negative operands.
#define div_round(a, b) (((a)+((b)/2) - (~((b)|(a)/(b)) &1))/(b))

// Assertion with diagnostics. Evaluates to non-zero when `condition` holds.
// On failure it prints, in order: the stringified condition, the caller's message
// (__VA_ARGS__, a printf format plus arguments), and `debug_info`, then calls assert(condition).
// `debug_info` is pasted directly into printf(), so it may be a macro that expands to
// a format string followed by its arguments (as DBG does in write_x), or just "".
// The condition text is printed through "%s" so that a '%' inside the condition
// (e.g. `a % b == 0`) is not interpreted as a conversion specifier.
#define assert_printf(debug_info, condition,...) ((condition) || (printf("%s", #condition), printf(__VA_ARGS__), printf(debug_info), assert(condition), 0))

// Quantized leaky-ReLU style activation (per the function name).
//   x        : input value
//   nzero    : multiplier applied to negative inputs
//   shift    : right-shift amount for the rounding requantization (shift_round needs shift >= 1)
//   pl_scale : left-shift applied to non-negative inputs; also reduces the magnitude of the lower clip bound
// Returns the scaled, rounded value saturated to
//   [-(1<<(X_BITS-pl_scale-1)), (1<<(X_BITS-1))-1].
static inline int quant_lrelu(int x, signed char nzero, signed char shift, signed char pl_scale){
  // Scale exactly once. The branch form replaces the earlier mask-multiply
  // expression; keeping both statements would apply the scaling twice.
  x = x < 0 ? x*nzero : x << pl_scale;
  x = shift_round(x, shift);
  x = clip(x, -(1<<(X_BITS-pl_scale-1)), (1<<(X_BITS-1))-1);
  return x;
}


// Store one value into mem.nx at the position given by a tiled multi-dimensional index.
//   val    : value to store
//   ib     : bundle index (used only in the debug message)
//   ixp, ixn, ixl, ixw, ixcm, ixr : indices along the [p, n, l, w, cm, r] axes
//   pb_out : bundle whose p, n, l, w, cm and cm_p0 fields give the axis sizes
//   xcm    : size of the cm axis for the current p (stride used when flattening)
// Each index is bounds-checked with assert_printf before the write.
static inline void write_x(signed char val, int ib, int ixp, int ixn, int ixl, int ixw, int ixcm, int ixr, Bundle_t *pb_out, int xcm ){

  // Debug text: a printf format followed by its arguments, pasted into assert_printf.
  #define DBG "--- ib:%d ixp:%d ixn:%d ixl:%d ixw:%d ixcm:%d ixr:%d xcm :%d \n",ib,ixp,ixn,ixl,ixw,ixcm,ixr,xcm
  assert_printf(DBG, ixr < PE_ROWS+X_PAD, "ixr < PE_ROWS+X_PAD");
  assert_printf(DBG, ixcm < xcm         , "ixcm < xcm         ");
  assert_printf(DBG, ixw  < pb_out->w   , "ixw  < pb_out->w   ");
  assert_printf(DBG, ixl  < pb_out->l   , "ixl  < pb_out->l   ");
  assert_printf(DBG, ixn  < pb_out->n   , "ixn  < pb_out->n   ");
  assert_printf(DBG, ixp  < pb_out->p   , "ixp  < pb_out->p   ");

  // Offset of slice ixp: slice 0 holds cm_p0 channels, every later slice holds cm channels.
  int p_offset = (ixp == 0) ? 0 : (pb_out->cm_p0 + (ixp-1)*pb_out->cm) *pb_out->n*pb_out->l*pb_out->w*(PE_ROWS+X_PAD);
  int flat_index_n2r = (((ixn*pb_out->l + ixl)*pb_out->w + ixw)*xcm + ixcm)*(PE_ROWS+X_PAD) + ixr; // multidim_index -> flat_index [n,l,w,cm,r]
  mem.nx[p_offset + flat_index_n2r] = val;
}


static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn, int i_yh, int i_yw, int i_yc, int yn, int yh, int yw, int yc ) {
static inline void tile_write( int out_val, int ib, Bundle_t *pb, int i_yn, int i_yh, int i_yw, int i_yc, int yn, int yh, int yw, int yc ) {

// ------ FLATTEN ------
if (p_bundle->is_flatten) {
if (pb->is_flatten) {
i_yc = (i_yh*yw + i_yw)*yc + i_yc; // (H*W*C) -> C
i_yw = 0; // W=1
i_yh = i_yn; // N -> H
Expand All @@ -84,25 +84,25 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn
}

// Check
assert_printf ("", yn == p_bundle->on, ": yn");
assert_printf ("", yh == p_bundle->oh, ": yh");
assert_printf ("", yw == p_bundle->ow, ": yw");
assert_printf ("", yc == p_bundle->oc, ": yc");
assert_printf ("", yn == pb->on, ": yn");
assert_printf ("", yh == pb->oh, ": yh");
assert_printf ("", yw == pb->ow, ": yw");
assert_printf ("", yc == pb->oc, ": yc");

// ------ TILING: Calculate X coordinates ------
// y [n,h,w,c] -> x[p, n, l, w,cmp, r+pad]

Bundle_t* p_bo = ib == N_BUNDLES-1 ? &bundles[ib] : &bundles[ib+1];
char yp_first = i_yc < p_bo->cm_p0;
Bundle_t* pb_out = ib == N_BUNDLES-1 ? &bundles[ib] : &bundles[ib+1];
char yp_first = i_yc < pb_out->cm_p0;

div_t div_oh = div(i_yh, PE_ROWS);
int i_yr = div_oh.rem;
int i_yl = div_oh.quot;

div_t div_oc = div(i_yc-p_bo->cm_p0, p_bo->cm);
div_t div_oc = div(i_yc-pb_out->cm_p0, pb_out->cm);
int i_yp = yp_first ? 0 : div_oc.quot + 1;
int i_ycm = yp_first ? i_yc : div_oc.rem;
int ycm = yp_first ? p_bo->cm_p0 : p_bo->cm ;
int ycm = yp_first ? pb_out->cm_p0 : pb_out->cm ;


// ------ STORE ------
Expand All @@ -123,13 +123,13 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn
int yr_sweep = i_yh==yh-1 ? PE_ROWS : i_yr + 1;

for (int i_yr_dest = i_yr; i_yr_dest < yr_sweep; i_yr_dest++) {
write_x(out_val, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, p_bo, ycm);
write_x(out_val, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, pb_out, ycm);

// --- PADDING: the [bottom X_PAD rows of previous block (l-1)] with [first X_PAD rows of this block (l)]
if (i_yr_dest < X_PAD) {
int pad_val = (i_yl == 0) ? 0 : out_val;
int dest_yl = (i_yl == 0) ? p_bo->l-1 : i_yl-1;
write_x(pad_val, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, p_bo, ycm);
int dest_yl = (i_yl == 0) ? pb_out->l-1 : i_yl-1;
write_x(pad_val, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, pb_out, ycm);
}
out_val = 0;
}
Expand All @@ -139,7 +139,7 @@ static inline void tile_write( int out_val, int ib, Bundle_t *p_bundle, int i_yn

extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, const unsigned int *p_sram_u32) {

static Bundle_t *p_bundle = &bundles[0];
static Bundle_t *pb = &bundles[0];
static int it_bias=0;
static int ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0;
const int *p_sram = (const int *)p_sram_u32;
Expand All @@ -156,9 +156,9 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
FILE *fp_sum = fopen(f_path_sum, "a");

//New iw_kw2:
int w_last = iw_kw2 == p_bundle->w_kw2-1 ? p_bundle->kw/2+1 : 1;
int w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1;
int sram_addr=0;
for (int icoe=0; icoe<p_bundle->coe; icoe++) {
for (int icoe=0; icoe<pb->coe; icoe++) {
int i_bias = it_bias + icoe;

for (int iw_last=0; iw_last<w_last; iw_last++) {
Expand All @@ -173,18 +173,17 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
int i_yn = in;
int i_yh = il*PE_ROWS + ir;
int i_yw = iw_kw2 + iw_last;
int i_yc = p_bundle->coe*it + icoe;
int i_yc = pb->coe*it + icoe;

// Save y_dims
int yn = p_bundle->n;
int yh = p_bundle->h;
int yw = p_bundle->w;
int yc = p_bundle->co;

int yn = pb->n;
int yh = pb->h;
int yw = pb->w;
int yc = pb->co;

// if out of bounds, early return
if (i_yh >= yh || i_yc >= yc) {
if (ip == p_bundle->p-1)
if (ip == pb->p-1)
fprintf(fp_sum,"%d\n", 0); // Save summed output
goto PROCESS_AND_STORE_DONE;
}
Expand All @@ -198,8 +197,8 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
// ------ ADD P PASSES ------
iy_nhwc = ((i_yn*yh + i_yh)*yw + i_yw)*yc + i_yc;

if (p_bundle->p == 1) { // only p : proceed with value
} else if (ip == p_bundle->p-1) {// last p : read, add, proceed
if (pb->p == 1) { // only p : proceed with value
} else if (ip == pb->p-1) {// last p : read, add, proceed
out_val += mem.nhwc[iy_nhwc];
} else if (ip == 0) { // first p : overwrite memory, return
mem.nhwc[iy_nhwc] = out_val;
Expand All @@ -212,24 +211,24 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c


// ------ CONV STRIDING ------
div_ch = div(i_yh-p_bundle->csh_shift, p_bundle->csh);
div_cw = div(i_yw-p_bundle->csw_shift, p_bundle->csw);
div_ch = div(i_yh-pb->csh_shift, pb->csh);
div_cw = div(i_yw-pb->csw_shift, pb->csw);

if (div_ch.rem != 0 || div_cw.rem != 0)
goto PROCESS_AND_STORE_DONE;

i_yh = div_ch.quot; // update indices and dimensions
i_yw = div_cw.quot;
yh = p_bundle->ch;
yw = p_bundle->cw;
yh = pb->ch;
yw = pb->cw;

// ------ ADD BIAS ------
if (p_bundle->is_bias)
out_val = (out_val << p_bundle->b_val_shift) + (mem.b[i_bias] << p_bundle->b_bias_shift);
if (pb->is_bias)
out_val = (out_val << pb->b_val_shift) + (mem.b[i_bias] << pb->b_bias_shift);


// ------ CORE ACT ------
out_val = quant_lrelu(out_val, p_bundle->ca_nzero, p_bundle->ca_shift, p_bundle->ca_pl_scale);
out_val = quant_lrelu(out_val, pb->ca_nzero, pb->ca_shift, pb->ca_pl_scale);



Expand All @@ -238,8 +237,8 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

// ------ MAX/AVG POOL ---

if (p_bundle->p_type == POOL_NONE) {
tile_write(out_val, ib, p_bundle, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc);
if (pb->pool == POOL_NONE) {
tile_write(out_val, ib, pb, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc);
goto PROCESS_AND_STORE_DONE;
}

Expand All @@ -251,14 +250,15 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
iy_nhwc = ((i_yn*yh + i_yh)*yw + i_yw)*yc + i_yc; // store as nhwc for pooling
mem.nhwc[iy_nhwc] = out_val;

div_ixh = div(i_yh+p_bundle->psh_shift-p_bundle->pkh+1, p_bundle->psh);
div_ixw = div(i_yw+p_bundle->psw_shift-p_bundle->pkw+1, p_bundle->psw);
div_ixh = div(i_yh+pb->psh_shift-pb->pkh+1, pb->psh);
div_ixw = div(i_yw+pb->psw_shift-pb->pkw+1, pb->psw);
ixh_beg = div_ixh.quot; // ix(hw) that corresponds to the pooling window
ixw_beg = div_ixw.quot;

if (ixh_beg < 0 || ixw_beg < 0) // skip when target ix(h,w) < 0
goto PROCESS_AND_STORE_DONE;

// Pool Striding
if (div_ixh.rem != 0) // invalid ixh
if (i_yh==yh-1) ixh_beg += 1; //but last yh. start sweeping
else goto PROCESS_AND_STORE_DONE; // not last yh. skip
Expand All @@ -269,18 +269,18 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

ph_end = i_yh; // iy(h,w) is the bottom-right of pooling window -> All values in pooling window have been computed
pw_end = i_yw;
ph_beg_const = max(p_bundle->psh*ixh_beg-p_bundle->psh_shift, 0)-1; // p(h,w)_beg is the index of top left corner of pooling window. If negative, set to zero
pw_beg_const = max(p_bundle->psw*ixw_beg-p_bundle->psw_shift, 0)-1;
ph_beg_const = max(pb->psh*ixh_beg-pb->psh_shift, 0)-1; // p(h,w)_beg is the index of top left corner of pooling window. If negative, set to zero
pw_beg_const = max(pb->psw*ixw_beg-pb->psw_shift, 0)-1;

xh_sweep = i_yh == yh-1 ? p_bundle->ph : ixh_beg+1; // ix(hw) is sweeped from ix(hw)_beg to x(h,w)_sweep. Normally sweep is 1.
xw_sweep = i_yw == yw-1 ? p_bundle->pw : ixw_beg+1; // But when iy(h,w) is at its edges, need to compute remaining ix(hw) pixels by sweeping
xh_sweep = i_yh == yh-1 ? pb->ph : ixh_beg+1; // ix(hw) is sweeped from ix(hw)_beg to x(h,w)_sweep. Normally sweep is 1.
xw_sweep = i_yw == yw-1 ? pb->pw : ixw_beg+1; // But when iy(h,w) is at its edges, need to compute remaining ix(hw) pixels by sweeping

// Sweep the pooling window
for (int ixh = ixh_beg, ph_beg = ph_beg_const; ixh < xh_sweep; ixh++, ph_beg += p_bundle->psh) {
for (int ixw = ixw_beg, pw_beg = pw_beg_const; ixw < xw_sweep; ixw++, pw_beg += p_bundle->psw) {
for (int ixh = ixh_beg, ph_beg = ph_beg_const; ixh < xh_sweep; ixh++, ph_beg += pb->psh) {
for (int ixw = ixw_beg, pw_beg = pw_beg_const; ixw < xw_sweep; ixw++, pw_beg += pb->psw) {

// Traverse each pool window & perform pooling
int result = p_bundle->p_type == POOL_MAX ? INT_MIN : 0;
int result = pb->pool == POOL_MAX ? INT_MIN : 0;
for (int ipyh = ph_end; ipyh > ph_beg; ipyh--){
for (int ipyw = pw_end; ipyw > pw_beg; ipyw--){

Expand All @@ -290,18 +290,18 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
assert_printf ("read", i_yc < yc, ": i_yc < yc");

int read_val = mem.nhwc[((i_yn*yh + ipyh)*yw + ipyw)*yc + i_yc];
result = p_bundle->p_type==POOL_MAX ? max(result, read_val) : (result + read_val);
result = pb->pool==POOL_MAX ? max(result, read_val) : (result + read_val);
}
}
int count = (ph_end-ph_beg)*(pw_end-pw_beg);
result = p_bundle->p_type==POOL_MAX ? result : div_round(result, count);
result = pb->pool==POOL_MAX ? result : div_round(result, count);

// ------ POOL ACTIVATION ------
tile_write(result, ib, p_bundle, i_yn, ixh, ixw, i_yc, yn, p_bundle->ph, p_bundle->pw, yc); // Write
tile_write(result, ib, pb, i_yn, ixh, ixw, i_yc, yn, pb->ph, pb->pw, yc); // Write
}
}
yh = p_bundle->ph;
yw = p_bundle->pw;
yh = pb->ph;
yw = pb->pw;


PROCESS_AND_STORE_DONE:
Expand All @@ -317,36 +317,36 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

//Nested for loop [for(ib) for(ip) for(it) for(il) for(in) for(iw_kw2) {}]
// inverted to increment once per call
++iw_kw2; if (iw_kw2 >= p_bundle->w_kw2) { iw_kw2 = 0; //after_each(in) = after_all(iw_kw2):
++il; if (il >= p_bundle->l) { il = 0; //after_each(in) = after_all(il):
++in; if (in >= p_bundle->n) { in = 0; //after_each(it) = after_all(in):
++it; if (it >= p_bundle->t) { it = 0; //after_each(ip) = after_all(it):
++ip; if (ip >= p_bundle->p) { ip = 0; //after_each(ib) = after_all(ip):
++iw_kw2; if (iw_kw2 >= pb->w_kw2) { iw_kw2 = 0; //after_each(in) = after_all(iw_kw2):
++il; if (il >= pb->l) { il = 0; //after_each(in) = after_all(il):
++in; if (in >= pb->n) { in = 0; //after_each(it) = after_all(in):
++it; if (it >= pb->t) { it = 0; //after_each(ip) = after_all(it):
++ip; if (ip >= pb->p) { ip = 0; //after_each(ib) = after_all(ip):

printf("done bundle!! iw_kw2:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw_kw2, in, il, it, ip, ib);

char f_path_tiled [1000];
sprintf(f_path_tiled, "%s/%0d_y_tiled_sim.txt", DATA_DIR, ib);
FILE *fp_tiled = fopen(f_path_tiled, "w");
for (int i=0; i<p_bundle->o_bytes; i++)
for (int i=0; i<pb->o_bytes; i++)
fprintf(fp_tiled,"%d\n", ib == N_BUNDLES-1 ? mem.y[i] : mem.nx[i]);
fclose(fp_tiled);

char f_path_debug [1000];
sprintf(f_path_debug, "%s/%0d_y_nhwc_sim.txt", DATA_DIR, ib);
FILE *fp_debug = fopen(f_path_debug, "w");
for (int i=0; i<p_bundle->debug_nhwc_words; i++)
for (int i=0; i<pb->debug_nhwc_words; i++)
fprintf(fp_debug,"%d\n", mem.debug_nhwc[i]);
fclose(fp_debug);


++ib; if (ib >= N_BUNDLES) { ib = 0; // after_all(ib):
*p_done = 1;
}//new(ib):
p_bundle = &bundles[ib];
pb = &bundles[ib];
}//new(ip):
}//new(it):
it_bias = p_bundle->b_offset + p_bundle->coe*it;
it_bias = pb->b_offset + pb->coe*it;
}//new(in):
}//new(il):
}//new(iw_kw2):
Expand Down
2 changes: 1 addition & 1 deletion test/py/param_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def test_dnn_engine(COMPILE):
ch.write( f".is_bias={1*(b.b is not None)}, .is_flatten={1*b.flatten}, ")
ch.write( f".b_offset={b_words}, .b_val_shift={b.bias_val_shift}, .b_bias_shift={b.bias_b_shift}, ")
ch.write( f".ca_nzero={ca_nzero}, .ca_shift={ca_shift}, .ca_pl_scale={ca_pl_scale}, ")
ch.write( f".csh={b.r.CSH}, .ch={b.r.CYH}, .csh_shift={b.r.CSH_SHIFT}, .pkh={b.r.PKH}, .psh={b.r.PSH}, .ph={b.r.PYH}, .psh_shift={b.r.PSH_SHIFT}, .csw={b.r.CSW}, .cw={b.r.CYW}, .csw_shift={b.r.CSW_SHIFT}, .pkw={b.r.PKW}, .psw={b.r.PSW}, .pw={b.r.PYW}, .psw_shift={b.r.PSW_SHIFT}, .p_type={pool_type}, .on={b.r.ON}, .oh={b.r.OH}, .ow={b.r.OW}, .oc={b.r.OC}, ")
ch.write( f".csh={b.r.CSH}, .ch={b.r.CYH}, .csh_shift={b.r.CSH_SHIFT}, .pkh={b.r.PKH}, .psh={b.r.PSH}, .ph={b.r.PYH}, .psh_shift={b.r.PSH_SHIFT}, .csw={b.r.CSW}, .cw={b.r.CYW}, .csw_shift={b.r.CSW_SHIFT}, .pkw={b.r.PKW}, .psw={b.r.PSW}, .pw={b.r.PYW}, .psw_shift={b.r.PSW_SHIFT}, .pool={pool_type}, .on={b.r.ON}, .oh={b.r.OH}, .ow={b.r.OW}, .oc={b.r.OC}, ")
ch.write( f".x_header={b.r.x_header_be_p[-1][0]}, .x_header_p0={b.r.x_header_be_p[0][0]}, .w_header={b.r.w_header_be_p[-1][0]}, .w_header_p0={b.r.x_header_be_p[0][0]} , ")
ch.write( f".debug_nhwc_words={b.oe_exp_nhwc.size} }}")

Expand Down

0 comments on commit 1d5a15d

Please sign in to comment.