diff --git a/deepsocflow/c/runtime.h b/deepsocflow/c/runtime.h index 17c3779..4494f0c 100644 --- a/deepsocflow/c/runtime.h +++ b/deepsocflow/c/runtime.h @@ -1,16 +1,9 @@ -#include #include #include #include #include #include -#ifdef VERILATOR - #define EXT_C "C" -#else - #define EXT_C -#endif - typedef const struct { const int32_t n, l, kw, coe, coe_tl, r_ll, h, w, ci, co, w_kw2, t, p, cm, cm_p0, xp_words; const int32_t w_bpt, w_bpt_p0, x_bpt, x_bpt_p0, o_words, o_bytes; // bytes per transfer @@ -43,13 +36,29 @@ typedef struct { int8_t out_buffers [2 ][O_BYTES_MAX ]; int8_t add_buffers [N_ADD_BUF ][NHWC_WORDS ]; // should be last, since N_ADD_BUF can be empty } Memory_st; -Memory_st mem; -int8_t *p_in_buffer = (int8_t*)&mem.x; -volatile char is_bundle_write_done = 1; + +#ifdef __x86_64__ + #define SIM + #include + #define fprintf fprintf + Memory_st mem; + #define p_mem (&mem) +#else + #define fprintf(...) + #define p_mem ((Memory_st*)MEM_BASEADDR) +#endif + +#ifdef __cplusplus + #define EXT_C "C" +#else + #define EXT_C +#endif #define assert_printf(v1, op, v2, optional_debug_info,...) ((v1 op v2) || (printf("ASSERT FAILED: \n CONDITION: "), printf("( " #v1 " " #op " " #v2 " )"), printf(", VALUES: ( %d %s %d ), ", v1, #op, v2), printf("DEBUG_INFO: " optional_debug_info), printf(" " __VA_ARGS__), printf("\n\n"), assert(v1 op v2), 0)) +volatile char is_bundle_write_done = 1; + #define flatten_nhwc(in,ih,iw,ic, N,H,W,C, optional_debug_info,...)\ ((in*H + ih)*W + iw)*C + ic;\ assert_printf (in, <, N, optional_debug_info,__VA_ARGS__); assert_printf (ih, <, H, optional_debug_info,__VA_ARGS__); assert_printf (iw, <, W, optional_debug_info,__VA_ARGS__); assert_printf (ic, <, C, optional_debug_info,__VA_ARGS__); assert_printf ((((in*H + ih)*W + iw)*C + ic), <, NHWC_WORDS, optional_debug_info,__VA_ARGS__); @@ -84,7 +93,7 @@ static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t // Debug tiled output int32_t flat_index = p_offset + flat_index_n2r; - mem.debug_tiled[flat_index] = val; + p_mem->debug_tiled[flat_index] = val; // Pack bits and store int32_t flat_index_with_header = p_offset + flat_index_n2r + (ixp+1)*64/X_BITS; @@ -98,7 +107,7 @@ static inline void write_x(int8_t val, int8_t *p_out_buffer, int32_t ib, int32_t uint8_t mem_val_cleaned = X_POSITION_INVERTED_MASKS[packed_position] & mem_val; p_out_buffer[packed_index] = mem_val_cleaned | packed_val; - // if (ib==1 && packed_index >= 356) printf("index:%d, final_val:%d --- position:%d value:%d packed_val:%d, mem_val:%d, mem_val_cleaned:%d, clean_mask:%d, pos_mask:%d \n", packed_index, mem.debug_packed[packed_index], packed_position, val, packed_val, mem_val, mem_val_cleaned, X_BITS_MASK, X_POSITION_INVERTED_MASKS[packed_position]); + // if (ib==1 && packed_index >= 356) printf("index:%d, final_val:%d --- position:%d value:%d packed_val:%d, mem_val:%d, mem_val_cleaned:%d, clean_mask:%d, pos_mask:%d \n", packed_index, p_mem->debug_packed[packed_index], packed_position, val, packed_val, mem_val, mem_val_cleaned, X_BITS_MASK, X_POSITION_INVERTED_MASKS[packed_position]); } @@ -142,14 +151,14 @@ static inline void tile_write( int32_t out_val, int8_t *p_out_buffer, int32_t ib // ------ STORE ------ int32_t iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i_yc, yn,yh,yw,yc,,); - mem.debug_nhwc[iy_nhwc] = out_val; + p_mem->debug_nhwc[iy_nhwc] = out_val; // Store for residual add if (pb->add_out_buffer_idx != -1) - mem.add_buffers[pb->add_out_buffer_idx][iy_nhwc] = (int8_t)out_val; + p_mem->add_buffers[pb->add_out_buffer_idx][iy_nhwc] = (int8_t)out_val; if (ib == N_BUNDLES-1) - mem.y[iy_nhwc] = out_val; // Last bundle: save as NHWC + p_mem->y[iy_nhwc] = out_val; // Last bundle: save as NHWC else { // Other bundles: pad & save as tiled @@ -169,33 +178,33 @@ static inline void tile_write( int32_t out_val, int8_t *p_out_buffer, int32_t ib } } - extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t *p_bpt_next) { static Bundle_t *pb = &bundles[0]; static int32_t it_bias=0; static int32_t ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0; - static int8_t *p_out_buffer = (int8_t*)&mem.out_buffers[0]; + static int8_t *p_out_buffer = (int8_t*)&p_mem->out_buffers[0]; int32_t iy_nhwc; div_t div_ch, div_cw, div_ixh, div_ixw; - int32_t ph_end, ph_beg_const, ph_beg, ixh_beg, xh_sweep; - int32_t pw_end, pw_beg_const, pw_beg, ixw_beg, xw_sweep; + int32_t ph_end, ph_beg_const, ixh_beg, xh_sweep; + int32_t pw_end, pw_beg_const, ixw_beg, xw_sweep; +#ifdef SIM char f_path_raw [1000], f_path_sum [1000]; // make sure full f_path_raw is shorter than 1000 sprintf(f_path_raw, "%s/%0d_%0d_%0d_y_raw_sim.txt", DATA_DIR, ib, ip, it); sprintf(f_path_sum, "%s/%0d_y_sum_sim.txt", DATA_DIR, ib); - FILE *fp_raw = fopen(f_path_raw, "a"); - FILE *fp_sum = fopen(f_path_sum, "a"); + FILE *fp_raw = fopen(f_path_raw, "a"); + FILE *fp_sum = fopen(f_path_sum, "a"); +#endif - static char ocm_bank = 1; + static int8_t ocm_bank = 1; int32_t w_last, sram_addr; - static uint8_t write_x_header = N_BUNDLES == 1 ? 0 : 1; /** * ---------- WAIT FOR S2MM DMA DONE ---------- - * + * * When running on hardware, we wait for DMA's interrupt at "DMA_WAIT" * But Verilator cannot pass simulation time when "waiting" * Therefore, @@ -206,13 +215,13 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * */ static char is_first_call = 1; if (is_first_call) is_first_call = 0; - else goto DMA_WAIT; - + else goto DMA_WAIT; + for (ib = 0; ib < N_BUNDLES; ib++) { pb = &bundles[ib]; - p_out_buffer = (int8_t*)&mem.out_buffers[pb->out_buffer_idx]; + p_out_buffer = (int8_t*)&p_mem->out_buffers[pb->out_buffer_idx]; // Init - add headers to out buffer if (ib != N_BUNDLES-1) { @@ -238,7 +247,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * ocm_bank = !ocm_bank; w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1; - *p_base_addr_next = (uint64_t)&mem.ocm[ocm_bank]; + *p_base_addr_next = (uint64_t)&p_mem->ocm[ocm_bank]; *p_bpt_next = PE_ROWS * pb->coe * w_last * sizeof(Y_TYPE); return; @@ -257,7 +266,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * #define DEBUG_INFO "--- ib:%d ip:%d it:%d in:%d il:%d iw_kw2:%d icoe:%d iw_last:%d ir:%d \n",ib,ip,it,in,il,iw_kw2,icoe,iw_last,ir int32_t raw_val=0, out_val=0; - + // Caculate y_index int32_t i_yn = in; int32_t i_yh = il*PE_ROWS + ir; @@ -271,28 +280,28 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * int32_t yc = pb->co; // if out of bounds, early return - if (i_yh >= yh || i_yc >= yc) { + if (i_yh >= yh || i_yc >= yc) { if (ip == pb->p-1) fprintf(fp_sum,"%d\n", 0); // Save summed output goto PROCESS_AND_STORE_DONE; } - raw_val = mem.ocm[ocm_bank][sram_addr]; + raw_val = p_mem->ocm[ocm_bank][sram_addr]; out_val = raw_val; PROCESS_START: - // ------ ADD P PASSES ------ + // ------ ADD P PASSES ------ iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i_yc, yn,yh,yw,yc, "Before add P passes", DEBUG_INFO); if (pb->p == 1) { // only p : proceed with value } else if (ip == pb->p-1) {// last p : read, add, proceed - out_val += mem.nhwc[iy_nhwc]; + out_val += p_mem->nhwc[iy_nhwc]; } else if (ip == 0) { // first p : overwrite memory, return - mem.nhwc[iy_nhwc] = out_val; + p_mem->nhwc[iy_nhwc] = out_val; goto PROCESS_AND_STORE_DONE; } else { // middle p: read, add, store, return - mem.nhwc[iy_nhwc] += out_val; + p_mem->nhwc[iy_nhwc] += out_val; goto PROCESS_AND_STORE_DONE; } fprintf(fp_sum,"%d\n", out_val); // Save summed output @@ -310,10 +319,10 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * yh = pb->ch; yw = pb->cw; - // ------ ADD BIAS ------ + // ------ ADD BIAS ------ if (pb->is_bias) - out_val = (out_val << pb->b_val_shift) + (mem.b[i_bias] << pb->b_bias_shift); - + out_val = (out_val << pb->b_val_shift) + (p_mem->b[i_bias] << pb->b_bias_shift); + // ------ CORE ACT ------ out_val = quant_lrelu(out_val, pb->ca_nzero, pb->ca_shift, pb->ca_pl_scale); @@ -322,11 +331,11 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * if (pb->add_in_buffer_idx != -1) { iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i_yc, yn,yh,yw,yc, "Before add", DEBUG_INFO);// store as nhwc for pooling - out_val += mem.add_buffers[pb->add_in_buffer_idx][iy_nhwc]; + out_val += p_mem->add_buffers[pb->add_in_buffer_idx][iy_nhwc]; out_val = shift_round(out_val, pb->add_act_shift); out_val = clip(out_val, -(1<<(X_BITS-1)), (1<<(X_BITS-1))-1); } - + // ------ SOFTMAX ------ if (pb->is_softmax) { @@ -335,20 +344,21 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * float val = (float)out_val; val = val / (float)(1 << pb->softmax_frac); val = val - pb->softmax_max_f; +#ifdef SIM val = (float)exp(val); - - mem.y[iy_nhwc] = val; +#endif + p_mem->y[iy_nhwc] = val; if (i_yc == pb->co-1) { float sum = 0; int32_t iy_nhwc; for (int i=0; ico; i++){ iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i, yn,yh,yw,yc, "Before softmax sum", DEBUG_INFO); - sum += mem.y[iy_nhwc]; + sum += p_mem->y[iy_nhwc]; } for (int i=0; ico; i++){ iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i, yn,yh,yw,yc, "After softmax sum", DEBUG_INFO); - mem.y[iy_nhwc] = mem.y[iy_nhwc] / sum; + p_mem->y[iy_nhwc] = p_mem->y[iy_nhwc] / sum; } } goto PROCESS_AND_STORE_DONE; @@ -362,24 +372,26 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * } iy_nhwc = flatten_nhwc(i_yn,i_yh,i_yw,i_yc, yn,yh,yw,yc, "Before maxpool", DEBUG_INFO);// store as nhwc for pooling - mem.nhwc[iy_nhwc] = out_val; + p_mem->nhwc[iy_nhwc] = out_val; div_ixh = div(i_yh+pb->psh_shift-pb->pkh+1, pb->psh); div_ixw = div(i_yw+pb->psw_shift-pb->pkw+1, pb->psw); ixh_beg = div_ixh.quot; // ix(hw) that corresponds to the pooling window ixw_beg = div_ixw.quot; - + if (ixh_beg < 0 || ixw_beg < 0) // skip when target ix(h,w) < 0 goto PROCESS_AND_STORE_DONE; // Pool Striding - if (div_ixh.rem != 0) // invalid ixh + if (div_ixh.rem != 0) { // invalid ixh if (i_yh==yh-1) ixh_beg += 1; //but last yh. start sweeping else goto PROCESS_AND_STORE_DONE; // not last yh. skip - - if (div_ixw.rem != 0) + } + + if (div_ixw.rem != 0) { if (i_yw==yw-1) ixw_beg += 1; else goto PROCESS_AND_STORE_DONE; + } ph_end = i_yh; // iy(h,w) is the bottom-right of pooling window -> All values in pooling window have been computed pw_end = i_yw; @@ -399,7 +411,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * for (int32_t ipyw = pw_end; ipyw > pw_beg; ipyw--){ int32_t read_idx = flatten_nhwc(i_yn, ipyh, ipyw, i_yc, yn, yh, yw, yc, "Inside pool window", DEBUG_INFO); - int32_t read_val = mem.nhwc[read_idx]; + int32_t read_val = p_mem->nhwc[read_idx]; result = pb->pool==POOL_MAX ? max(result, read_val) : (result + read_val); } } @@ -417,7 +429,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * } yh = pb->ph; yw = pb->pw; - + PROCESS_AND_STORE_DONE: @@ -426,9 +438,10 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * } } } +#ifdef SIM fclose(fp_sum); fclose(fp_raw); - +#endif } // iw_kw2 iw_kw2 = 0; } // il @@ -438,17 +451,18 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * } // it it = 0; } // ip - ip = 0; + ip = 0; //after_each(ib) = after_all(ip): is_bundle_write_done = 1; - + printf("done bundle!! iw_kw2:%d in:%d il:%d it:%d ip:%d ib:%d\n", iw_kw2, in, il, it, ip, ib); +#ifdef SIM char f_path_debug [1000]; sprintf(f_path_debug, "%s/%0d_y_nhwc_sim.txt", DATA_DIR, ib); FILE *fp_debug = fopen(f_path_debug, "w"); for (int32_t i=0; idebug_nhwc_words; i++) - fprintf(fp_debug,"%d\n", mem.debug_nhwc[i]); + fprintf(fp_debug,"%d\n", p_mem->debug_nhwc[i]); fclose(fp_debug); char f_path_tiled [1000]; @@ -456,9 +470,9 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * FILE *fp_tiled = fopen(f_path_tiled, "w"); for (int32_t i=0; io_words; i++) if (ib == N_BUNDLES-1) - if (pb->is_softmax) fprintf(fp_tiled,"%f\n", mem.y[i]); - else fprintf(fp_tiled,"%d\n", mem.y[i]); - else fprintf(fp_tiled,"%d\n", mem.debug_tiled[i]); + if (pb->is_softmax) fprintf(fp_tiled,"%f\n", p_mem->y[i]); + else fprintf(fp_tiled,"%d\n", p_mem->y[i]); + else fprintf(fp_tiled,"%d\n", p_mem->debug_tiled[i]); fclose(fp_tiled); if (ib != N_BUNDLES-1){ @@ -468,6 +482,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t * fwrite(p_out_buffer, 1, pb->o_bytes, fp_packed); fclose(fp_packed); } +#endif } // ib ib = 0; @@ -480,7 +495,7 @@ extern EXT_C void load_x (uint8_t *p_done, uint8_t *bundle_read_done, uint64_t * static int32_t ib=0, ip=0, it=0, offset_next=0; - int8_t *p_buffer_base = (ib==0) ? mem.x : mem.out_buffers[bundles[ib-1].out_buffer_idx]; + int8_t *p_buffer_base = (ib==0) ? p_mem->x : p_mem->out_buffers[bundles[ib-1].out_buffer_idx]; *p_base_addr = (uint64_t)p_buffer_base + offset_next; *p_bpt = ip == 0 ? bundles[ib].x_bpt_p0 : bundles[ib].x_bpt; @@ -507,7 +522,7 @@ extern EXT_C void load_w (uint8_t *p_done, uint64_t *p_base_addr, int32_t *p_bpt int32_t offset = offset_next; int32_t bpt = ip == 0 ? bundles[ib].w_bpt_p0 : bundles[ib].w_bpt; - *p_base_addr = (uint64_t)&mem.w + offset; + *p_base_addr = (uint64_t)&p_mem->w + offset; *p_bpt = bpt; // Nested for loop [for ib: for ip: for it: {}] inverted to increment once per call @@ -520,7 +535,7 @@ extern EXT_C void load_w (uint8_t *p_done, uint64_t *p_base_addr, int32_t *p_bpt offset_next += bpt; } - +#ifdef SIM extern EXT_C void fill_memory (uint64_t *p_w_base, uint64_t *p_x_base){ FILE *fp; char f_path [1000]; @@ -529,22 +544,23 @@ extern EXT_C void fill_memory (uint64_t *p_w_base, uint64_t *p_x_base){ fp = fopen(f_path, "rb"); if(!fp) printf("ERROR! File not found: %s \n", f_path); - fread(mem.w, 1, WB_BYTES, fp); + fread(p_mem->w, 1, WB_BYTES, fp); fclose(fp); sprintf(f_path, "%s/x_all.bin", DATA_DIR); fp = fopen(f_path, "rb"); if(!fp) printf("ERROR! File not found: %s \n", f_path); - fread(mem.x, 1, X_BYTES_ALL, fp); + fread(p_mem->x, 1, X_BYTES_ALL, fp); fclose(fp); for (int32_t i=0; ib[i]); - *p_w_base = (uint64_t)&mem.w; - *p_x_base = (uint64_t)&mem.x; + *p_w_base = (uint64_t)&p_mem->w; + *p_x_base = (uint64_t)&p_mem->x; } +#endif extern EXT_C int8_t get_byte (uint64_t addr){ return *(int8_t*)addr; @@ -559,4 +575,4 @@ extern EXT_C char get_is_bundle_write_done(){ } extern EXT_C void set_is_bundle_write_done(uint8_t val){ is_bundle_write_done = val; -} \ No newline at end of file +} diff --git a/deepsocflow/py/hardware.py b/deepsocflow/py/hardware.py index 6a68d1a..cd9e2bc 100644 --- a/deepsocflow/py/hardware.py +++ b/deepsocflow/py/hardware.py @@ -199,7 +199,7 @@ def simulate(self, SIM='verilator', SIM_PATH=''): assert subprocess.run(cmd).returncode == 0 if SIM == "verilator": - cmd = f'{SIM_PATH}verilator --binary -j 0 -Wno-fatal --trace --relative-includes --top {self.TB_MODULE} -I../ -F ../sources.txt -CFLAGS -DVERILATOR -CFLAGS -I../ {self.MODULE_DIR}/c/example.c --Mdir ./' + cmd = f'{SIM_PATH}verilator --binary -j 0 -Wno-fatal --trace --relative-includes --top {self.TB_MODULE} -I../ -F ../sources.txt -CFLAGS -I../ {self.MODULE_DIR}/c/example.c --Mdir ./' print(cmd) assert subprocess.run(cmd.split(' '), cwd='build').returncode == 0 diff --git a/deepsocflow/tcl/fpga/vivado.tcl b/deepsocflow/tcl/fpga/vivado.tcl index 5094407..d00cd31 100644 --- a/deepsocflow/tcl/fpga/vivado.tcl +++ b/deepsocflow/tcl/fpga/vivado.tcl @@ -74,15 +74,15 @@ reset_run impl_1 reset_run synth_1 launch_runs impl_1 -to_step write_bitstream -jobs 10 wait_on_run -timeout 360 impl_1 -write_hw_platform -fixed -include_bit -force -file design_1_wrapper.xsa +write_hw_platform -fixed -include_bit -force -file $PROJECT_NAME/design_1_wrapper.xsa # Reports open_run impl_1 if {![file exists $PROJECT_NAME/reports]} {exec mkdir $PROJECT_NAME/reports} report_timing_summary -delay_type min_max -report_unconstrained -check_timing_verbose -max_paths 100 -input_pins -routable_nets -name timing_1 -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_timing_report.txt report_utilization -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_utilization_report.txt -name utilization_1 -report_power -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_power_1.txt -name {power_1} -report_drc -name drc_1 -file reports/${PROJECT_NAME}_${BOARD}_${FREQ}_drc_1.txt -ruledecks {default opt_checks placer_checks router_checks bitstream_checks incr_eco_checks eco_checks abs_checks} +report_power -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_power_1.txt -name {power_1} +report_drc -name drc_1 -file $PROJECT_NAME/reports/${PROJECT_NAME}_${BOARD}_${FREQ}_drc_1.txt -ruledecks {default opt_checks placer_checks router_checks bitstream_checks incr_eco_checks eco_checks abs_checks} exec mkdir -p $PROJECT_NAME/output exec cp "$PROJECT_NAME/$PROJECT_NAME.gen/sources_1/bd/design_1/hw_handoff/design_1.hwh" $PROJECT_NAME/output/