diff --git a/c/model.h b/c/model.h index f84c41b..6fc04ca 100644 --- a/c/model.h +++ b/c/model.h @@ -1,12 +1,12 @@ #define N_BUNDLES 7 Bundle_t bundles [N_BUNDLES] = { - {.n=8 , .l=3 , .kw=11 , .coe=2 , .coe_tl=2 , .r_ll=2 , .h=18 , .w=18 , .ci=3 , .co=8 , .w_kw2=13 , .t=4 , .p=3 , .cm=1 , .cm_p0=1 , .xp_words=6048, .w_bpt=140 , .w_bpt_p0=140 , .x_bpt=3032 , .x_bpt_p0=3032 , .o_words=5376 , .o_bytes=2696 , .is_bias=1 , .is_flatten=0 , .b_offset=0 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=2 , .ch=9 , .csh_shift=1 , .pkh=3 , .psh=2 , .ph=5 , .psh_shift=1 , .csw=1 , .cw=18 , .csw_shift=0 , .pkw=4 , .psw=3 , .pw=6 , .psw_shift=0 , .pool=POOL_MAX , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 17055749u, .x_header_p0= 17055749u, .w_header= 347372535813u, .w_header_p0= 17055749u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=6 , .t=1 , .p=1 , .cm=20 , .cm_p0=8 , .xp_words=672, .w_bpt=104 , .w_bpt_p0=104 , .x_bpt=2696 , .x_bpt_p0=2696 , .o_words=5376 , .o_bytes=2720 , .is_bias=0 , .is_flatten=0 , .b_offset=8 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=3 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81976u, .x_header_p0= 81976u, .w_header= 244276346936u, .w_header_p0= 81976u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=7 , .coe=3 , .coe_tl=2 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=3 , .t=3 , .p=4 , .cm=2 , .cm_p0=2 , .xp_words=672, .w_bpt=176 , .w_bpt_p0=176 , .x_bpt=680 , .x_bpt_p0=680 , .o_words=5376 , .o_bytes=2704 , .is_bias=1 , .is_flatten=0 , .b_offset=8 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81931u, .x_header_p0= 81931u, .w_header= 450434777099u, .w_header_p0= 81931u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=5 , .coe=4 , .coe_tl=4 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=4 , .t=2 , .p=2 , .cm=4 , .cm_p0=4 , .xp_words=672, .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=1352 , .x_bpt_p0=1352 , .o_words=5376 , .o_bytes=2704 , .is_bias=0 , .is_flatten=0 , .b_offset=17 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81946u, .x_header_p0= 81946u, .w_header= 656593207322u, .w_header_p0= 81946u , .debug_nhwc_words=1920 }, - {.n=8 , .l=1 , .kw=3 , .coe=8 , .coe_tl=8 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=24 , .w_kw2=5 , .t=3 , .p=2 , .cm=6 , .cm_p0=2 , .xp_words=672, .w_bpt=224 , .w_bpt_p0=80 , .x_bpt=2024 , .x_bpt_p0=680 , .o_words=16128, .o_bytes=8080 , .is_bias=1 , .is_flatten=0 , .b_offset=17 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=24 , .x_header= 81961u, .x_header_p0= 81929u, .w_header= 587873730601u, .w_header_p0= 81929u , .debug_nhwc_words=5760 }, - {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=24 , .co=10 , .w_kw2=6 , .t=1 , .p=2 , .cm=20 , .cm_p0=4 , .xp_words=672, .w_bpt=248 , .w_bpt_p0=56 , .x_bpt=6728 , .x_bpt_p0=1352 , .o_words=4200 , .o_bytes=2220 , .is_bias=0 , .is_flatten=1 , .b_offset=41 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=300, .x_header= 82072u, .x_header_p0= 81944u, .w_header= 656593207448u, .w_header_p0= 81944u , .debug_nhwc_words=2400 }, - {.n=1 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=8 , .h=8 , .w=1 , .ci=300, .co=10 , .w_kw2=1 , .t=1 , .p=15 , .cm=20 , .cm_p0=20 , .xp_words=14 , .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=148 , .x_bpt_p0=148 , .o_words=80 , .o_bytes=320 , .is_bias=1 , .is_flatten=0 , .b_offset=41 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .csh=1 , .ch=8 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=8 , .psh_shift=0 , .csw=1 , .cw=1 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=1 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=10 , .x_header= 152u, .x_header_p0= 152u, .w_header= 652835029144u, .w_header_p0= 152u , .debug_nhwc_words=80 } + {.n=8 , .l=3 , .kw=11 , .coe=2 , .coe_tl=2 , .r_ll=2 , .h=18 , .w=18 , .ci=3 , .co=8 , .w_kw2=13 , .t=4 , .p=3 , .cm=1 , .cm_p0=1 , .xp_words=6048, .out_buffer_idx=0 , .w_bpt=140 , .w_bpt_p0=140 , .x_bpt=3032 , .x_bpt_p0=3032 , .o_words=5376 , .o_bytes=2696 , .is_bias=1 , .is_flatten=0 , .b_offset=0 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=2 , .ch=9 , .csh_shift=1 , .pkh=3 , .psh=2 , .ph=5 , .psh_shift=1 , .csw=1 , .cw=18 , .csw_shift=0 , .pkw=4 , .psw=3 , .pw=6 , .psw_shift=0 , .pool=POOL_MAX , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 17055749u, .x_header_p0= 17055749u, .w_header= 347372535813u, .w_header_p0= 17055749u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=6 , .t=1 , .p=1 , .cm=20 , .cm_p0=8 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=104 , .w_bpt_p0=104 , .x_bpt=2696 , .x_bpt_p0=2696 , .o_words=5376 , .o_bytes=2720 , .is_bias=0 , .is_flatten=0 , .b_offset=8 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=3 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81976u, .x_header_p0= 81976u, .w_header= 244276346936u, .w_header_p0= 81976u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=7 , .coe=3 , .coe_tl=2 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=3 , .t=3 , .p=4 , .cm=2 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .w_bpt=176 , .w_bpt_p0=176 , .x_bpt=680 , .x_bpt_p0=680 , .o_words=5376 , .o_bytes=2704 , .is_bias=1 , .is_flatten=0 , .b_offset=8 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81931u, .x_header_p0= 81931u, .w_header= 450434777099u, .w_header_p0= 81931u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=5 , .coe=4 , .coe_tl=4 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=8 , .w_kw2=4 , .t=2 , .p=2 , .cm=4 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=1352 , .x_bpt_p0=1352 , .o_words=5376 , .o_bytes=2704 , .is_bias=0 , .is_flatten=0 , .b_offset=17 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=8 , .x_header= 81946u, .x_header_p0= 81946u, .w_header= 656593207322u, .w_header_p0= 81946u , .debug_nhwc_words=1920 }, + {.n=8 , .l=1 , .kw=3 , .coe=8 , .coe_tl=8 , .r_ll=5 , .h=5 , .w=6 , .ci=8 , .co=24 , .w_kw2=5 , .t=3 , .p=2 , .cm=6 , .cm_p0=2 , .xp_words=672, .out_buffer_idx=0 , .w_bpt=224 , .w_bpt_p0=80 , .x_bpt=2024 , .x_bpt_p0=680 , .o_words=16128, .o_bytes=8080 , .is_bias=1 , .is_flatten=0 , .b_offset=17 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=0 , .ca_shift=12 , .ca_pl_scale=0 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=8 , .oh=5 , .ow=6 , .oc=24 , .x_header= 81961u, .x_header_p0= 81929u, .w_header= 587873730601u, .w_header_p0= 81929u , .debug_nhwc_words=5760 }, + {.n=8 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=5 , .h=5 , .w=6 , .ci=24 , .co=10 , .w_kw2=6 , .t=1 , .p=2 , .cm=20 , .cm_p0=4 , .xp_words=672, .out_buffer_idx=1 , .w_bpt=248 , .w_bpt_p0=56 , .x_bpt=6728 , .x_bpt_p0=1352 , .o_words=4200 , .o_bytes=2220 , .is_bias=0 , .is_flatten=1 , .b_offset=41 , .b_val_shift=0 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=6 , .ca_pl_scale=3 , .csh=1 , .ch=5 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=5 , .psh_shift=0 , .csw=1 , .cw=6 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=6 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=300, .x_header= 82072u, .x_header_p0= 81944u, .w_header= 656593207448u, .w_header_p0= 81944u , .debug_nhwc_words=2400 }, + {.n=1 , .l=1 , .kw=1 , .coe=24 , .coe_tl=0 , .r_ll=8 , .h=8 , .w=1 , .ci=300 , .co=10 , .w_kw2=1 , .t=1 , .p=15 , .cm=20 , .cm_p0=20 , .xp_words=14 , .out_buffer_idx=-1, .w_bpt=248 , .w_bpt_p0=248 , .x_bpt=148 , .x_bpt_p0=148 , .o_words=80 , .o_bytes=320 , .is_bias=1 , .is_flatten=0 , .b_offset=41 , .b_val_shift=9 , .b_bias_shift=0 , .ca_nzero=1 , .ca_shift=15 , .ca_pl_scale=3 , .csh=1 , .ch=8 , .csh_shift=0 , .pkh=1 , .psh=1 , .ph=8 , .psh_shift=0 , .csw=1 , .cw=1 , .csw_shift=0 , .pkw=1 , .psw=1 , .pw=1 , .psw_shift=0 , .pool=POOL_NONE , .on=1 , .oh=8 , .ow=1 , .oc=10 , .x_header= 152u, .x_header_p0= 152u, .w_header= 652835029144u, .w_header_p0= 152u , .debug_nhwc_words=80 } }; #define X_BITS_L2 2 @@ -16,6 +16,7 @@ Bundle_t bundles [N_BUNDLES] = { #define PE_ROWS 8 #define PE_COLS 24 +#define N_BUF 2 #define WB_BYTES 9954 #define W_BYTES 9824 #define X_BYTES 9096 diff --git a/c/runtime.h b/c/runtime.h index 21fafc4..9bbd956 100644 --- a/c/runtime.h +++ b/c/runtime.h @@ -11,7 +11,7 @@ #endif typedef const struct { - const int32_t n, l, kw, coe, coe_tl, r_ll, h, w, ci, co, w_kw2, t, p, cm, cm_p0, xp_words; + const int32_t n, l, kw, coe, coe_tl, r_ll, h, w, ci, co, w_kw2, t, p, cm, cm_p0, xp_words, out_buffer_idx; const int32_t w_bpt, w_bpt_p0, x_bpt, x_bpt_p0, o_words, o_bytes; // bytes per transfer const int8_t is_bias, is_pool, is_flatten; const int32_t b_offset, b_val_shift, b_bias_shift; @@ -31,15 +31,17 @@ typedef enum {POOL_NONE, POOL_MAX, POOL_AVG} Pool_t; typedef struct { int8_t w [W_BYTES ]; B_TYPE b [B_WORDS ]; // keep next to w. weights are loaded to w_ptr + int8_t buffers [N_BUF][O_BYTES_MAX ]; int8_t x [X_BYTES_ALL ]; int32_t y [O_WORDS ]; int32_t nhwc [Y_BYTES/4 ]; int8_t debug_tiled [O_WORDS_MAX ]; - uint8_t debug_packed [O_BYTES_MAX ]; int32_t debug_nhwc [Y_BYTES/4 ]; } Memory_st; Memory_st mem; +int8_t *p_in_buffer = (int8_t*)&mem.x; +volatile char is_bundle_write_done = 1; #define assert_printf(v1, op, v2, optional_debug_info,...) ((v1 op v2) || (printf("ASSERT FAILED: \n CONDITION: "), printf("( " #v1 " " #op " " #v2 " )"), printf(", VALUES: ( %d %s %d ), ", v1, #op, v2), printf("DEBUG_INFO: " optional_debug_info), printf(" " __VA_ARGS__), printf("\n\n"), assert(v1 op v2), 0)) @@ -62,7 +64,7 @@ static inline int32_t quant_lrelu(int32_t x, int8_t nzero, int8_t shift, int8_t } -static inline void write_x(int8_t val, int32_t ib, int32_t ixp, int32_t ixn, int32_t ixl, int32_t ixw, int32_t ixcm, int32_t ixr, Bundle_t *pb_out, int32_t xcm ){ +static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t ixp, int32_t ixn, int32_t ixl, int32_t ixw, int32_t ixcm, int32_t ixr, Bundle_t *pb_out, int32_t xcm ){ #define WRITEX_DEBUG_INFO "--- ib:%d ixp:%d ixn:%d ixl:%d ixw:%d ixcm:%d ixr:%d xcm :%d \n",ib,ixp,ixn,ixl,ixw,ixcm,ixr,xcm assert_printf (ixr , <, PE_ROWS+X_PAD, "write_x", WRITEX_DEBUG_INFO); @@ -87,15 +89,15 @@ static inline void write_x(int8_t val, int32_t ib, int32_t ixp, int32_t ixn, int assert_printf (packed_index , <, bundles[ib].o_bytes, "write_x", WRITEX_DEBUG_INFO); uint8_t packed_val = ((uint8_t)val & X_BITS_MASK) << (packed_position * X_BITS); - uint8_t mem_val = mem.debug_packed[packed_index]; + uint8_t mem_val = p_out_buffer[packed_index]; uint8_t mem_val_cleaned = X_POSITION_INVERTED_MASKS[packed_position] & mem_val; - mem.debug_packed[packed_index] = mem_val_cleaned | packed_val; + p_out_buffer[packed_index] = mem_val_cleaned | packed_val; // if (ib==1 && packed_index >= 356) printf("index:%d, final_val:%d --- position:%d value:%d packed_val:%d, mem_val:%d, mem_val_cleaned:%d, clean_mask:%d, pos_mask:%d \n", packed_index, mem.debug_packed[packed_index], packed_position, val, packed_val, mem_val, mem_val_cleaned, X_BITS_MASK, X_POSITION_INVERTED_MASKS[packed_position]); } -static inline void tile_write( int32_t out_val, int32_t ib, Bundle_t *pb, int32_t i_yn, int32_t i_yh, int32_t i_yw, int32_t i_yc, int32_t yn, int32_t yh, int32_t yw, int32_t yc ) { +static inline void tile_write( int32_t out_val, int8_t *p_out_buffer, int32_t ib, Bundle_t *pb, int32_t i_yn, int32_t i_yh, int32_t i_yw, int32_t i_yc, int32_t yn, int32_t yh, int32_t yw, int32_t yc ) { // ------ FLATTEN ------ if (pb->is_flatten) { @@ -145,13 +147,13 @@ static inline void tile_write( int32_t out_val, int32_t ib, Bundle_t *pb, int32_ int32_t yr_sweep = i_yh==yh-1 ? PE_ROWS : i_yr + 1; for (int32_t i_yr_dest = i_yr; i_yr_dest < yr_sweep; i_yr_dest++) { - write_x(out_val, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, pb_out, ycm); + write_x(out_val, p_out_buffer, ib, i_yp, i_yn, i_yl, i_yw, i_ycm, i_yr_dest, pb_out, ycm); // --- PADDING: the [bottom X_PAD rows of previous block (l-1)] with [first X_PAD rows of this block (l)] if (i_yr_dest < X_PAD) { int32_t pad_val = (i_yl == 0) ? 0 : out_val; int32_t dest_yl = (i_yl == 0) ? pb_out->l-1 : i_yl-1; - write_x(pad_val, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, pb_out, ycm); + write_x(pad_val, p_out_buffer, ib, i_yp, i_yn, dest_yl, i_yw, i_ycm, i_yr_dest+PE_ROWS, pb_out, ycm); } out_val = 0; } @@ -164,6 +166,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ static Bundle_t *pb = &bundles[0]; static int32_t it_bias=0; static int32_t ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0; + static int8_t *p_out_buffer = (int8_t*)&mem.buffers[0]; const int32_t *p_sram = (const int32_t *)p_sram_u32; int32_t iy_nhwc; @@ -185,8 +188,8 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ for (int ixp=0; ixp < pb_out->p; ixp++) { int32_t offset_words = (ixp == 0) ? 0 : (pb_out->cm_p0 + (ixp-1)*pb_out->cm)*pb_out->xp_words; int32_t offset_bytes = offset_words/X_WORDS_PER_BYTE + ixp*8; - uint64_t *ptr_x_header = (uint64_t*)&(mem.debug_packed[offset_bytes]); - *ptr_x_header = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header; + + *(uint64_t*)&(p_out_buffer[offset_bytes]) = ixp == 0 ? pb_out->x_header_p0 : pb_out->x_header; // printf("--------ib:%d, ixp:%d offset_bytes:%d\n", ib, ixp, offset_bytes); } write_x_header = 0; @@ -273,7 +276,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ // ------ MAX/AVG POOL --- if (pb->pool == POOL_NONE) { - tile_write(out_val, ib, pb, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc); + tile_write(out_val, p_out_buffer, ib, pb, i_yn, i_yh, i_yw, i_yc, yn, yh, yw, yc); goto PROCESS_AND_STORE_DONE; } @@ -323,7 +326,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ result = pb->pool==POOL_MAX ? result : div_round(result, count); // ------ POOL ACTIVATION ------ - tile_write(result, ib, pb, i_yn, ixh, ixw, i_yc, yn, pb->ph, pb->pw, yc); // Write + tile_write(result, p_out_buffer, ib, pb, i_yn, ixh, ixw, i_yc, yn, pb->ph, pb->pw, yc); // Write } } yh = pb->ph; @@ -350,6 +353,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ ++ip; if (ip >= pb->p) { ip = 0; //after_each(ib) = after_all(ip): printf("done bundle!! iw_kw2:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw_kw2, in, il, it, ip, ib); + is_bundle_write_done = 1; char f_path_debug [1000]; @@ -370,7 +374,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ char f_path_packed [1000]; sprintf(f_path_packed, "%s/%0d_y_packed_sim.bin", DATA_DIR, ib); FILE *fp_packed = fopen(f_path_packed, "wb"); - fwrite(&mem.debug_packed, 1, pb->o_bytes, fp_packed); + fwrite(p_out_buffer, 1, pb->o_bytes, fp_packed); fclose(fp_packed); } @@ -379,6 +383,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ }//new(ib): pb = &bundles[ib]; + p_out_buffer = (int8_t*)&mem.buffers[pb->out_buffer_idx]; if (ib != N_BUNDLES-1) write_x_header = 1; // Make write_x write new headers }//new(ip): @@ -391,7 +396,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ } -extern EXT_C void load_x (uint8_t *p_done, int32_t *p_offset, int32_t *p_bpt) { +extern EXT_C void load_x (uint8_t *p_done, uint8_t *bundle_read_done, int32_t *p_offset, int32_t *p_bpt) { static int32_t ib=0, ip=0, it=0, offset_next=0; int32_t offset = offset_next; @@ -402,13 +407,12 @@ extern EXT_C void load_x (uint8_t *p_done, int32_t *p_offset, int32_t *p_bpt) { // Nested for loop [for ib: for ip: for it: {}] inverted to increment once per call ++ it; if (it >= bundles[ib].t) { it = 0; + offset_next += bpt; ++ ip; if (ip >= bundles[ib].p) { ip = 0; ++ ib; if (ib >= N_BUNDLES) { ib = 0; *p_done =1; offset_next = 0; - }} - offset_next += bpt; - } + }}} } @@ -455,8 +459,11 @@ extern EXT_C void fill_memory (){ printf("i:%d, bias:%d\n", i, mem.b[i]); } - extern EXT_C int8_t get_byte_wx (int32_t addr, int32_t mode){ if (mode==0) return mem.w[addr]; else if (mode==1) return mem.x[addr]; +} + +extern EXT_C char get_is_bundle_write_done(){ + return is_bundle_write_done; } \ No newline at end of file diff --git a/test/py/bundle.py b/test/py/bundle.py index b7c5504..f263089 100644 --- a/test/py/bundle.py +++ b/test/py/bundle.py @@ -64,6 +64,8 @@ def __init__(self, # Store reference to bundle object here, not just a idx number self.prev_bundle = None self.add_bundle = None + self.out_tensor_dest = [] + self.buffer_idx = None def extract_act(signature): ilayer = QActivation(signature) @@ -159,6 +161,7 @@ def call(self, x, x_1=None): if hasattr(x, "bundle"): self.prev_bundle = x.bundle self.idx = self.prev_bundle.idx + 1 + self.prev_bundle.out_tensor_dest += [self.idx] else: self.prev_bundle = None self.idx = 0 @@ -172,6 +175,7 @@ def call(self, x, x_1=None): if x_1 is not None: if hasattr(x_1, "bundle"): self.add['bundle'] = x_1.bundle + self.x_1.out_tensor_dest += [self.idx] else: self.add['bundle'] = None x = Add()([x, x_1]) @@ -622,7 +626,6 @@ def get_runtime_params(c, w_shape, x_shape, o_shape, core_d, pool_d, flatten): print('final output', o_shape) - ''' Pack all local variables into a namedtuple ''' diff --git a/test/py/param_test.py b/test/py/param_test.py index 86bc98c..93d9203 100644 --- a/test/py/param_test.py +++ b/test/py/param_test.py @@ -9,6 +9,7 @@ import pytest import itertools import pickle +from copy import deepcopy from collections import namedtuple from dataclasses import dataclass from bundle import Bundle @@ -244,11 +245,41 @@ def test_dnn_engine(COMPILE): ''' Export ''' + buffer_map = [] for ib, b in enumerate(bundles): print(f'-----------------{b.idx}-----------------------') b.process(inp if b.idx==0 else None, c) b.export(c, False) #ib==len(bundles)-1 + ''' + Buffer allocation + ''' + if ib == len(bundles)-1: + b.buffer_idx = -1 + continue + + print(f'input_map:{buffer_map}') + + '''Find and assign a free buffer. If not, add new buffer''' + for im in range(len(buffer_map)): + if buffer_map[im] is None: + buffer_map[im] = {'in':ib, 'out':b.out_tensor_dest} + b.buffer_idx = im + break + else: #m if break is not hit + b.buffer_idx = len(buffer_map) + buffer_map += [{'in':ib, 'out':b.out_tensor_dest}] + + print('buffer_idx:', b.buffer_idx) + + '''Free the buffers whose last destination is current bundle''' + for im in range(len(buffer_map)): + buf = buffer_map[im] + if buf is not None: + if buf['out'][-1] == ib: + buffer_map[im] = None + + print(f'output_map:{buffer_map}') ''' @@ -309,7 +340,7 @@ def test_dnn_engine(COMPILE): elif b.pool['type'] == 'avg': pool_type = 'POOL_AVG' - ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<3}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, ") + ch.write(f" {{.n={b.r.XN:<3}, .l={b.r.XL:<3}, .kw={b.r.KW:<3}, .coe={y_coe:<3}, .coe_tl={y_coe_tl:<3}, .r_ll={y_r_ll:<3}, .h={b.r.XH:<3}, .w={b.r.XW:<3}, .ci={b.r.CI:<4}, .co={b.r.CO:<3}, .w_kw2={b.r.XW-b.r.KW//2:<3}, .t={b.r.IT:<3}, .p={b.r.CP:<3}, .cm={b.r.CM:<3}, .cm_p0={b.r.CM_0:<3}, .xp_words={xp_words:<3}, .out_buffer_idx={b.buffer_idx:<2}, ") ch.write( f".w_bpt={w_bpt:<5}, .w_bpt_p0={w_bpt_p0:<5}, .x_bpt={x_bpt:<5}, .x_bpt_p0={x_bpt_p0:<5}, .o_words={o_words_b:<5}, .o_bytes={o_bytes_b:<5}, ") ch.write( f".is_bias={1*(b.b is not None):<3}, .is_flatten={1*b.flatten:<3}, ") ch.write( f".b_offset={b_words:<3}, .b_val_shift={b.bias_val_shift:<3}, .b_bias_shift={b.bias_b_shift:<3}, ") @@ -333,6 +364,7 @@ def test_dnn_engine(COMPILE): ch.write(f"#define PE_ROWS {c.ROWS}\n") ch.write(f"#define PE_COLS {c.COLS}\n\n") + ch.write(f"#define N_BUF {len(buffer_map)}\n") ch.write(f"#define WB_BYTES {w_bytes + (b_words*c.B_BITS)//8}\n") ch.write(f"#define W_BYTES {w_bytes}\n") ch.write(f"#define X_BYTES {x_bytes}\n") diff --git a/test/sv/dnn_engine_tb.sv b/test/sv/dnn_engine_tb.sv index a4aa4d1..49a777a 100644 --- a/test/sv/dnn_engine_tb.sv +++ b/test/sv/dnn_engine_tb.sv @@ -49,14 +49,15 @@ module dnn_engine_tb; DMA_M2S #(S_PIXELS_WIDTH_LF , VALID_PROB, 1) source_x (aclk, aresetn, s_axis_pixels_tready , s_axis_pixels_tvalid , s_axis_pixels_tlast , s_axis_pixels_tdata , s_axis_pixels_tkeep ); DMA_M2S #(S_WEIGHTS_WIDTH_LF, VALID_PROB, 0) source_k (aclk, aresetn, s_axis_weights_tready, s_axis_weights_tvalid, s_axis_weights_tlast, s_axis_weights_tdata, s_axis_weights_tkeep); - bit y_done=0, x_done=0, w_done=0; + bit y_done=0, x_done=0, w_done=0, bundle_read_done=0, bundle_write_done=0; int w_offset=0, w_bpt=0, x_offset=0, x_bpt=0; - import "DPI-C" function void load_x(inout bit x_done, inout int x_offset, x_bpt); + import "DPI-C" function void load_x(inout bit x_done, bundle_read_done, inout int x_offset, x_bpt); import "DPI-C" function void load_w(inout bit w_done, inout int w_offset, w_bpt); import "DPI-C" function void load_y(inout bit y_done, inout bit m_t_done_proc, inout bit [31:0] y_sram [ROWS*COLS-1:0]); import "DPI-C" function void fill_memory(); import "DPI-C" function byte get_byte_wx (int addr, int mode); + import "DPI-C" function byte get_is_bundle_write_done(); // W DMA @@ -71,8 +72,9 @@ module dnn_engine_tb; // X DMA initial while (1) begin - load_x (x_done, x_offset, x_bpt); + load_x (x_done, bundle_read_done, x_offset, x_bpt); source_x.axis_push(x_offset, x_bpt); + while(bundle_read_done && !get_is_bundle_write_done()) #10ps; $display("Done input dma at offset=%d, bpt=%d \n", x_offset, x_bpt); if (x_done) break; end