Add bias; works for all conv2d, not dense
Aba committed Sep 15, 2023
1 parent e4b7930 commit 8b7d62b
Showing 4 changed files with 106 additions and 55 deletions.
26 changes: 14 additions & 12 deletions c/model.h
@@ -1,22 +1,24 @@
-#define N_BUNDLES 7
+#define N_BUNDLES 6
Bundle_t bundles [N_BUNDLES] = {
-{.n=2, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=152, .w_bpt_p0=152, .x_bpt=216, .x_bpt_p0=216, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414376340184104960, .w_header_p0=414341061322735616 },
-{.n=2, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=19, .cm_p0=16, .w_bpt=212, .w_bpt_p0=212, .x_bpt=3336, .x_bpt_p0=3336, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8700999697495490560, .w_header_p0=8700964375684448256 },
-{.n=2, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=188, .w_bpt_p0=188, .x_bpt=424, .x_bpt_p0=424, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846721930181476352, .w_header_p0=846686625550303232 },
-{.n=2, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=6, .cm=3, .cm_p0=1, .w_bpt=200, .w_bpt_p0=80, .x_bpt=632, .x_bpt_p0=216, .x_header=1351089783815798784, .x_header_p0=198168279208951808, .w_header=1351125097036906496, .w_header_p0=198168279208951808 },
-{.n=2, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=236, .w_bpt_p0=164, .x_bpt=1256, .x_bpt_p0=840, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008449785679052800, .w_header_p0=1855492942081294336 },
-{.n=2, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=19, .cm_p0=5, .w_bpt=248, .w_bpt_p0=80, .x_bpt=3960, .x_bpt_p0=1048, .x_header=10430346632594718720, .x_header_p0=2359896100346789888, .w_header=10430381980175564800, .w_header_p0=2359896100346789888 },
-{.n=2, .l=1, .kw=1, .coe=24, .coe_tl=0, .r_ll=1, .h=1, .w=1, .w_kw2=1, .t=1, .p=337, .cm=19, .cm_p0=16, .w_bpt=248, .w_bpt_p0=212, .x_bpt=255, .x_bpt_p0=216, .x_header=10376293541461622784, .x_header_p0=8646911284551352320, .w_header=10376328889042468864, .w_header_p0=8646911284551352320 }
+{.n=8, .l=2, .kw=11, .coe=2, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=3, .t=8, .p=3, .cm=1, .cm_p0=1, .w_bpt=152, .w_bpt_p0=152, .x_bpt=840, .x_bpt_p0=840, .is_bias=1, .b_offset=0, .b_val_shift=9, .b_bias_shift=0, .x_header=414341061322735616, .x_header_p0=414341061322735616, .w_header=414587446416637952, .w_header_p0=414341061322735616 },
+{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=0, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=1, .p=1, .cm=19, .cm_p0=16, .w_bpt=212, .w_bpt_p0=212, .x_bpt=13320, .x_bpt_p0=13320, .is_bias=1, .b_offset=16, .b_val_shift=9, .b_bias_shift=0, .x_header=8700964375684448256, .x_header_p0=8700964375684448256, .w_header=8701210803728023552, .w_header_p0=8700964375684448256 },
+{.n=8, .l=2, .kw=7, .coe=3, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=5, .t=6, .p=8, .cm=2, .cm_p0=2, .w_bpt=188, .w_bpt_p0=188, .x_bpt=1672, .x_bpt_p0=1672, .is_bias=1, .b_offset=40, .b_val_shift=9, .b_bias_shift=0, .x_header=846686625550303232, .x_header_p0=846686625550303232, .w_header=846933036414009344, .w_header_p0=846686625550303232 },
+{.n=8, .l=2, .kw=5, .coe=4, .coe_tl=4, .r_ll=8, .h=16, .w=8, .w_kw2=6, .t=4, .p=6, .cm=3, .cm_p0=1, .w_bpt=200, .w_bpt_p0=80, .x_bpt=2504, .x_bpt_p0=840, .is_bias=1, .b_offset=58, .b_val_shift=9, .b_bias_shift=0, .x_header=1351089783815798784, .x_header_p0=198168279208951808, .w_header=1351336203269439488, .w_header_p0=198168279208951808 },
+{.n=8, .l=2, .kw=3, .coe=8, .coe_tl=8, .r_ll=8, .h=16, .w=8, .w_kw2=7, .t=3, .p=3, .cm=6, .cm_p0=4, .w_bpt=236, .w_bpt_p0=164, .x_bpt=5000, .x_bpt_p0=3336, .is_bias=1, .b_offset=74, .b_val_shift=9, .b_bias_shift=0, .x_header=3008414446688141312, .x_header_p0=1855492942081294336, .w_header=3008660891911585792, .w_header_p0=1855492942081294336 },
+{.n=8, .l=2, .kw=1, .coe=24, .coe_tl=2, .r_ll=8, .h=16, .w=8, .w_kw2=8, .t=3, .p=2, .cm=19, .cm_p0=5, .w_bpt=248, .w_bpt_p0=80, .x_bpt=15816, .x_bpt_p0=4168, .is_bias=1, .b_offset=98, .b_val_shift=9, .b_bias_shift=0, .x_header=10430346632594718720, .x_header_p0=2359896100346789888, .w_header=10430593086408097792, .w_header_p0=2359896100346789888 }
};

#define X_BITS_L2 2
#define W_BITS_L2 2
#define PE_ROWS 8
#define PE_COLS 24

-#define W_BYTES 103636
-#define X_BYTES 648
-#define X_BYTES_ALL 105008
-#define Y_BYTES 73736
+#define WB_BYTES 20436
+#define W_BYTES 20096
+#define X_BYTES 2520
+#define X_BYTES_ALL 75896
+#define Y_BYTES 294920
+#define B_TYPE signed short
+#define B_WORDS 170
#define DATA_DIR "D:/dnn-engine/test/vectors"
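
The bias bookkeeping in the new header is self-consistent: each bundle consumes .coe * .t bias words, so .b_offset is a running sum (0, 16, 40, 58, 74, 98), the six bundles total B_WORDS = 170, and with B_TYPE a 2-byte signed short, WB_BYTES = W_BYTES + 2*B_WORDS = 20096 + 340 = 20436. (The seventh, dense bundle from the old header is gone, matching the "not dense" in the commit message.) A minimal standalone check of that arithmetic — the constants are copied from the header above; nothing here is part of the commit:

#include <assert.h>
#include <stdio.h>

// Each bundle consumes coe*t bias words (filters per tile * tiles),
// so .b_offset is a running sum and B_WORDS is the grand total.
int main(void) {
    const int coe[6]      = {2, 24, 3, 4, 8, 24};
    const int t[6]        = {8,  1, 6, 4, 3,  3};
    const int b_offset[6] = {0, 16, 40, 58, 74, 98};

    int off = 0;
    for (int ib = 0; ib < 6; ib++) {
        assert(off == b_offset[ib]);                   // running sum of coe*t
        off += coe[ib] * t[ib];
    }
    assert(off == 170);                                // == B_WORDS
    assert(20096 + 170 * (int)sizeof(short) == 20436); // W_BYTES + bias bytes == WB_BYTES
    printf("bias layout consistent: %d words\n", off);
    return 0;
}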

53 changes: 31 additions & 22 deletions c/runtime.h
@@ -1,15 +1,18 @@
typedef struct {
const int n, l, kw, coe, coe_tl, r_ll, h, w, w_kw2, t, p, cm, cm_p0;
const int w_bpt, w_bpt_p0, x_bpt, x_bpt_p0; // bytes per transfer
+const char is_bias;
+const int b_offset, b_val_shift, b_bias_shift;
const unsigned long long x_header, x_header_p0, w_header, w_header_p0; // 64 bits (at least)
} Bundle_t;

#include "model.h"

typedef struct {
-char w [W_BYTES ];
-char x [X_BYTES_ALL ];
-int y [Y_BYTES/4 ];
+char w [W_BYTES ];
+B_TYPE b [B_WORDS ]; // keep next to w. weights are loaded to w_ptr
+char x [X_BYTES_ALL ];
+int y [Y_BYTES/4 ];
} Memory_st;
Memory_st mem;

@@ -24,23 +27,23 @@ Memory_st mem;
#endif


-static inline void process_y(int val, int p_y, int ib, int ip, int it, int in, int il, int iw, int icoe, int iw_last, int ir){
+static inline void process_y(int val, int p_y, Bundle_t *p_bundle, int ib, int ip, int it, int in, int il, int iw, int icoe, int iw_last, int ir){

// ------ ADD P PASSES ------

-if (bundles[ib].p == 1) {} // only p : proceed with value
-else if (ip == bundles[ib].p-1) // last p : read, add, proceed
+if (p_bundle->p == 1) { // only p : proceed with value
+} else if (ip == p_bundle->p-1) {// last p : read, add, proceed
val += mem.y[p_y];
-else if (ip == 0) { // first p : overwrite memory, return
+} else if (ip == 0) { // first p : overwrite memory, return
mem.y[p_y] = val;
return;
-}
-else { // middle p: read, add, store, return
+} else { // middle p: read, add, store, return
mem.y[p_y] += val;
return;
}

+// ------ ADD BIAS ------
+if (p_bundle->is_bias)
+val = (val << p_bundle->b_val_shift) + (mem.b[p_bundle->b_offset + p_bundle->coe*it + icoe] << p_bundle-> b_bias_shift);

// ------ RELU + QUANT ------

@@ -58,6 +61,7 @@ static inline void process_y(int val, int p_y, int ib, int ip, int it, int in, i

extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, const unsigned int *p_sram_u32) {

+static Bundle_t *p_bundle = &bundles[0];
static int p_y=0;
static int ib=0, ip=0, it=0, in=0, il=0, iw=0;
const int *p_sram = (const int *)p_sram_u32;
@@ -67,16 +71,16 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c
sprintf(path, "%s/%0d_%0d_%0d_y_sim.txt", DATA_DIR, ib, ip, it);
fp = fopen(path, "a");

-int w_last = iw == bundles[ib].w_kw2-1 ? bundles[ib].kw/2+1 : 1;
+int w_last = iw == p_bundle->w_kw2-1 ? p_bundle->kw/2+1 : 1;
int sram_addr=0;
-for (int icoe=0; icoe<bundles[ib].coe; icoe++)
+for (int icoe=0; icoe<p_bundle->coe; icoe++)
for (int iw_last=0; iw_last<w_last; iw_last++)
for (int ir=0; ir<PE_ROWS; ir++) {

int val = p_sram[sram_addr];
fprintf(fp,"%d\n", val);

-process_y(val, p_y, ib, ip, it, in, il, iw, icoe, iw_last, ir);
+process_y(val, p_y, p_bundle, ib, ip, it, in, il, iw, icoe, iw_last, ir);

p_y += 1;
sram_addr += 1;
@@ -87,20 +91,20 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

int p_y_prev;
// Nested for loop [for(ib) for(ip) for(it) for(il) for(in) for(iw) {}] inverted to increment once per call
-++ iw; if (iw >= bundles[ib].w_kw2) { iw = 0;
-++ in; if (in >= bundles[ib].n) { in = 0;
-++ il; if (il >= bundles[ib].l) { il = 0;
-++ it; if (it >= bundles[ib].t) { it = 0;
+++ iw; if (iw >= p_bundle->w_kw2) { iw = 0;
+++ in; if (in >= p_bundle->n) { in = 0;
+++ il; if (il >= p_bundle->l) { il = 0;
+++ it; if (it >= p_bundle->t) { it = 0;

// After each p
printf("done p!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
p_y_prev = p_y;
p_y=0;

-++ ip; if (ip >= bundles[ib].p) { ip = 0;
+++ ip; if (ip >= p_bundle->p) { ip = 0;

-// After each bundle
-printf("done bundle!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
+// After each p_bundle
+printf("done p_bundle!! iw:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw, in, il, it, ip, ib);
// Write to file at every it_done
sprintf(path, "%s/%0d_y_sim.txt", DATA_DIR, ib);
fp = fopen(path, "w");
@@ -110,7 +114,9 @@ extern EXT_C void load_y (unsigned char *p_done, unsigned char *pt_done_proc, c

++ ib; if (ib >= N_BUNDLES) { ib = 0;
*p_done =1;
-}}}}}}
+}
+p_bundle = &bundles[ib];
+}}}}}
*pt_done_proc = !(*pt_done_proc);
}
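
The "nested for loop inverted to increment once per call" comment above is doing real work: load_y is called once per hardware transfer, so the loop indices live in static variables and each call runs one body iteration, then advances the indices like an odometer (and refreshes p_bundle when ib wraps). A toy sketch of the idiom, not the committed code:

#include <stdio.h>

// Each call to step() executes one iteration of what would otherwise be
// [for(i) for(j) { body }], with the indices persisting across calls.
static int i = 0, j = 0;                  // outer 0..2, inner 0..3

static void step(void) {
    printf("i=%d j=%d\n", i, j);          // the original loop body
    ++ j; if (j >= 4) { j = 0;            // inner index wraps...
    ++ i; if (i >= 3) { i = 0;            // ...carrying into the outer index
    }}
}

int main(void) {
    for (int k = 0; k < 12; k++) step();  // visits the same 3x4 sequence
    return 0;
}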

@@ -167,7 +173,7 @@ extern EXT_C void fill_memory (){
printf("ABORT! File not found: %s \n", path);
exit(1);
}
-fread(mem.w, 1, W_BYTES, fp);
+fread(mem.w, 1, WB_BYTES, fp);
fclose(fp);

sprintf(path, "%s/x_all.bin", DATA_DIR);
@@ -178,6 +184,9 @@
}
fread(mem.x, 1, X_BYTES_ALL, fp);
fclose(fp);

+for (int i=0; i<B_WORDS; i++)
+printf("i:%d, bias:%d\n", i, mem.b[i]);
}


43 changes: 33 additions & 10 deletions test/py/bundle.py
@@ -89,15 +89,15 @@ def extract_act(signature):
self.core['layer'] = QConv2DBatchnorm(
filters=self.core['filters'], kernel_size=self.core['kernel_size'], strides=self.core['strides'],
padding=self.core['padding'], kernel_quantizer=self.core['kernel_quantizer'],
-bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'])
+bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'], bias_initializer='glorot_uniform')

else:
for i in ['units', 'kernel_quantizer', 'bias_quantizer', 'use_bias', 'act_str']:
assert i in self.core, f"'{i}' must be provided for dense"

self.core['layer'] = QDense(
units=self.core['units'], kernel_quantizer=self.core['kernel_quantizer'],
-bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'])
+bias_quantizer=self.core['bias_quantizer'], use_bias=self.core['use_bias'], bias_initializer='glorot_uniform')

'''
CORE ACT LAYER
@@ -201,7 +201,7 @@ def load_weight_bias(self):
self.b = {'tensor':b_tensor, 'int':b_int, 'bits':b_config['bits'], 'frac':b_frac}


-def process(self, inp = None):
+def process(self, inp, c):

''' Integer test for output '''
self.out['int'] = self.out['tensor'].numpy() * 2**self.out['frac']
@@ -227,19 +227,40 @@ def process(self, inp, c):

self.y = copy.deepcopy(self.proc)

-self.post_process()
+self.post_process(c)


-def post_process(self):
+def post_process(self, c):

+def add (p, p_frac, p_bits, q, q_frac, q_bits):
+    '''
+    Add p,q while preserving precision
+    '''
+    p_intb, q_intb = p_bits-p_frac, q_bits-q_frac
+
+    r_frac = max(p_frac,q_frac)
+    r_intb = max(p_intb,q_intb)
+    r_bits = 1 + r_intb + r_frac # +1 to allow overflow
+
+    p_shift = r_frac-p_frac
+    q_shift = r_frac-q_frac
+
+    r = (p << p_shift) + (q << q_shift)
+    return (r, r_frac, r_bits), (p_shift, q_shift)
+
clog2_add = int(np.ceil(np.log2(np.prod(self.w['int'].shape[:-1]))))
self.proc['bits'] = self.inp['bits'] + self.w['bits'] + clog2_add
self.proc['frac'] = self.inp['frac'] + self.w['frac']

if self.b is not None:
-    self.proc['int'] += self.b['int'] * 2** (self.proc['frac'] - self.b['frac'])
-self.b_frac_shift = self.proc['frac'] - self.b['frac'] if self.b else None
-self.y_int_b = self.proc['int'] if self.b else None
+    (self.proc['int'], self.proc['frac'], self.proc['bits']), (self.bias_val_shift, self.bias_b_shift) = add(
+        self.proc['int'], self.proc['frac'], self.proc['bits'],
+        self.b ['int'], self.b ['frac'], self.b ['bits']
+    )
+    assert self.proc['bits'] <= c.INT_BITS, f"After bias addition, resulting bits {self.proc['bits']} are more than bits for integer in CPU {c.INT_BITS}. Reduce bits or increase integer bits of bias to continue"
+else:
+    self.bias_val_shift, self.bias_b_shift = 0, 0
+    self.y_int_b = self.proc['int']


if 'strides' in self.core and self.core['strides'] != (1,1):
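
This add() helper is where the header's shift constants come from: it promotes both fixed-point operands to a common format wide enough for either (plus one overflow bit) and returns the two alignment shifts, which bundle.py keeps as bias_val_shift/bias_b_shift and which appear to be what model.h emits as .b_val_shift/.b_bias_shift; process_y in runtime.h then replays the same (val << b_val_shift) + (bias << b_bias_shift). A C sketch of that logic — an assumption-level transliteration, not committed code; the operand widths in main() are invented, chosen so the shifts come out 9 and 0 as in the header above:

#include <stdio.h>

// A fixed-point value: raw integer, fractional bits, total bits.
typedef struct { long long val; int frac, bits; } fixp_t;

// Align p and q to a common format covering both operands' integer and
// fractional widths, plus one overflow bit; report the shifts applied.
static fixp_t fixp_add(fixp_t p, fixp_t q, int *p_shift, int *q_shift) {
    int p_intb = p.bits - p.frac, q_intb = q.bits - q.frac;
    int r_frac = p.frac > q.frac ? p.frac : q.frac;   // keep the finer fraction
    int r_intb = p_intb > q_intb ? p_intb : q_intb;   // keep the wider integer part
    *p_shift = r_frac - p.frac;                       // -> b_val_shift
    *q_shift = r_frac - q.frac;                       // -> b_bias_shift
    fixp_t r = { (p.val << *p_shift) + (q.val << *q_shift),
                 r_frac, 1 + r_intb + r_frac };       // +1 to allow overflow
    return r;
}

int main(void) {
    // e.g. a 12-bit accumulator with 4 fractional bits, plus a 29-bit
    // bias word with 13 fractional bits (made-up widths):
    fixp_t acc = { 25, 4, 12 }, bias = { 7, 13, 29 };
    int vs, bs;
    fixp_t sum = fixp_add(acc, bias, &vs, &bs);
    printf("val_shift=%d bias_shift=%d bits=%d\n", vs, bs, sum.bits); // 9, 0, 30
    return 0;
}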
@@ -394,6 +415,7 @@ def export (self, c):
y_int = self.y['int'].reshape(XN,1,1,CO) # (XN,CI) -> (XN, XH, XW, CI)
else:
y_int = self.y['int']
+p_int = self.y_int_b
w_int, x_int = self.w['int'], self.inp['int']

r = self.get_runtime_params(c, w_int.shape, x_int.shape, y_int.shape)
@@ -411,13 +433,15 @@
print(r)
self.check_sparsity(w_int, x_int)

-self.be = self.reorder_b_q2e_conv(self.b, c, r) if self.b else None
+self.be = self.reorder_b_q2e_conv(self.b['int'], c, r) if self.b else None
self.we = self.reorder_w_q2e_conv(w_int, c, r)
self.ye_exp_shape = (r.IT, r.XN, r.L, r.XW*r.CO_PRL, c.ROWS)
self.ye_hw = np.zeros(self.ye_exp_shape)

self.xe = self.reorder_x_q2e_conv(x_int, c, r)
self.ye_exp = self.reorder_y_q2e_conv(y_int, c, r)
+self.pe_exp = self.reorder_y_q2e_conv(p_int, c, r)
+print(f"x reshape: [int]:{self.inp['int'].shape}, int:{x_int.shape}. xe:{self.xe[0].shape}")

'''
Prepare expected outputs for each pass
@@ -549,7 +573,6 @@ def check_sparsity(w, x):
def reorder_b_q2e_conv(b, c, r):
b = np.pad(b, ((0,r.CO_PAD-r.CO)))
b = b.reshape(r.IT, r.CO_PRL)
-b = np.flip(b, axis=1)
return b

