From c1834f74885015f5c2e6f325eb0014adb728776f Mon Sep 17 00:00:00 2001 From: Aba Date: Mon, 20 Nov 2023 20:57:15 -0800 Subject: [PATCH] Replace OUT_RAM_SWITCH with regular S2MM DMA --- deepsocflow/c/runtime.h | 30 ++++++++++-- deepsocflow/py/hardware.py | 1 + deepsocflow/py/model.py | 1 + deepsocflow/rtl/dnn_engine.v | 71 ++++++++++++++++++---------- deepsocflow/tcl/fpga/pynq_z2.tcl | 3 +- deepsocflow/tcl/fpga/vivado.tcl | 31 +++++------- deepsocflow/tcl/fpga/zcu104.tcl | 5 +- deepsocflow/test/sv/dma.sv | 42 +++++++++++++++- deepsocflow/test/sv/dnn_engine_tb.sv | 43 +++++++---------- run/param_test.py | 2 +- run/work/config_fw.h | 1 + run/work/config_tb.svh | 2 +- run/work/hardware.json | 2 +- 13 files changed, 148 insertions(+), 86 deletions(-) diff --git a/deepsocflow/c/runtime.h b/deepsocflow/c/runtime.h index db1f8aa..4c8af40 100644 --- a/deepsocflow/c/runtime.h +++ b/deepsocflow/c/runtime.h @@ -32,6 +32,7 @@ typedef enum {POOL_NONE, POOL_MAX, POOL_AVG} Pool_t; #define X_BITS_MASK ((1 << X_BITS) -1) typedef struct { + Y_TYPE ocm [2][PE_COLS*PE_ROWS]; int8_t w [W_BYTES ]; B_TYPE b [B_WORDS ]; // keep next to w. weights are loaded to w_ptr int8_t x [X_BYTES_ALL ]; @@ -40,7 +41,7 @@ typedef struct { int8_t debug_tiled [O_WORDS_MAX ]; int32_t debug_nhwc [NHWC_WORDS ]; int8_t out_buffers [2 ][O_BYTES_MAX ]; - int8_t add_buffers [N_ADD_BUF ][NHWC_WORDS ]; + int8_t add_buffers [N_ADD_BUF ][NHWC_WORDS ]; // should be last, since N_ADD_BUF can be empty } Memory_st; Memory_st mem; @@ -169,13 +170,12 @@ static inline void tile_write( int32_t out_val, int8_t *p_out_buffer, int32_t ib } -extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_t *p_sram_u32) { +extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t *p_bpt_next) { static Bundle_t *pb = &bundles[0]; static int32_t it_bias=0; static int32_t ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0; static int8_t *p_out_buffer = (int8_t*)&mem.out_buffers[0]; - const int32_t *p_sram = (const int32_t *)p_sram_u32; int32_t iy_nhwc; div_t div_ch, div_cw, div_ixh, div_ixw; @@ -206,6 +206,16 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ //New iw_kw2: int32_t w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1; int32_t sram_addr=0; + + static char is_first_call = 1; + static char ocm_bank = 0; + if (is_first_call) { + is_first_call = 0; + *p_base_addr_next = (uint64_t)&mem.ocm[ocm_bank]; + *p_bpt_next = PE_ROWS * pb->coe * w_last * sizeof(Y_TYPE); + return; + } + for (int32_t icoe=0; icoe < pb->coe; icoe++) { int32_t i_bias = it_bias + icoe; @@ -236,7 +246,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ goto PROCESS_AND_STORE_DONE; } - raw_val = p_sram[sram_addr]; + raw_val = mem.ocm[ocm_bank][sram_addr]; out_val = raw_val; PROCESS_START: @@ -428,6 +438,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ ++ib; if (ib >= N_BUNDLES) { ib = 0; // after_all(ib): *p_done = 1; + is_first_call = 1; }//new(ib): pb = &bundles[ib]; @@ -440,7 +451,12 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_ }//new(in): }//new(il): }//new(iw_kw2): - *pt_done_proc = !(*pt_done_proc); + // *pt_done_proc = !(*pt_done_proc); + + ocm_bank = !ocm_bank; + w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1; + *p_base_addr_next = (uint64_t)&mem.ocm[ocm_bank]; + *p_bpt_next = PE_ROWS * pb->coe * w_last * sizeof(Y_TYPE); } @@ -518,6 +534,10 @@ extern EXT_C int8_t get_byte (uint64_t addr){ return *(int8_t*)addr; } +extern EXT_C void set_byte (uint64_t addr, int8_t data){ + *(int8_t*)addr = data; +} + extern EXT_C char get_is_bundle_write_done(){ return is_bundle_write_done; } diff --git a/deepsocflow/py/hardware.py b/deepsocflow/py/hardware.py index a1c0c36..6a68d1a 100644 --- a/deepsocflow/py/hardware.py +++ b/deepsocflow/py/hardware.py @@ -93,6 +93,7 @@ def __init__( self.BITS_BLOCKS_MAX = clog2(self.L_MAX) self.BITS_XN_MAX = clog2(self.XN_MAX) self.BITS_RAM_WEIGHTS_ADDR = clog2(self.RAM_WEIGHTS_DEPTH) + self.Y_OUT_BITS = 2**clog2(self.Y_BITS) self.MODULE_DIR = os.path.normpath(os.path.dirname(deepsocflow.__file__)).replace('\\', '/') self.TB_MODULE = "dnn_engine_tb" diff --git a/deepsocflow/py/model.py b/deepsocflow/py/model.py index b77915d..bc872a8 100644 --- a/deepsocflow/py/model.py +++ b/deepsocflow/py/model.py @@ -181,6 +181,7 @@ def export_inference(self, x, hw): ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n") ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n") ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n") + ch.write(f"#define Y_TYPE int{hw.Y_OUT_BITS}_t\n") ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n") ch.write(f"#define O_TYPE {out_type}\n") ch.write(f"#define B_WORDS {b_words}\n") diff --git a/deepsocflow/rtl/dnn_engine.v b/deepsocflow/rtl/dnn_engine.v index 5dc4da5..6fc8335 100644 --- a/deepsocflow/rtl/dnn_engine.v +++ b/deepsocflow/rtl/dnn_engine.v @@ -14,6 +14,7 @@ module dnn_engine #( S_PIXELS_WIDTH_LF = `S_PIXELS_WIDTH_LF , S_WEIGHTS_WIDTH_LF = `S_WEIGHTS_WIDTH_LF , + M_OUTPUT_WIDTH_LF = `M_OUTPUT_WIDTH_LF , OUT_ADDR_WIDTH = 10, OUT_BITS = 32 @@ -33,11 +34,10 @@ module dnn_engine #( input wire [S_WEIGHTS_WIDTH_LF -1:0] s_axis_weights_tdata, input wire [S_WEIGHTS_WIDTH_LF/8-1:0] s_axis_weights_tkeep, - input wire [(OUT_ADDR_WIDTH+2)-1:0] m_ram_addr_a, - output wire [ OUT_BITS -1:0] m_ram_rddata_a, - input wire m_ram_en_a, - output wire m_done_fill, - input wire m_t_done_proc + input wire m_axis_tready, + output wire m_axis_tvalid, m_axis_tlast, + output wire [M_OUTPUT_WIDTH_LF -1:0] m_axis_tdata, + output wire [M_OUTPUT_WIDTH_LF/8 -1:0] m_axis_tkeep ); localparam TUSER_WIDTH = `TUSER_WIDTH; @@ -51,9 +51,6 @@ module dnn_engine #( wire [K_BITS*COLS -1:0] weights_m_data; wire [TUSER_WIDTH -1:0] weights_m_user; - wire out_s_ready, out_s_valid, out_s_last; - wire [M_DATA_WIDTH_HF_CONV_DW -1:0] out_s_data; - // Unpack tkeep_bytes into tkeep_words wire [S_PIXELS_WIDTH_LF /X_BITS-1:0] s_axis_pixels_tkeep_words; @@ -108,6 +105,9 @@ module dnn_engine #( .pixels_m_ready (pixels_m_ready ) ); + wire m_ready, m_valid, m_last; + wire [M_DATA_WIDTH_HF_CONV_DW -1:0] m_data; + proc_engine_out PROC_OUT ( .aclk (aclk ), .aresetn (aresetn ), @@ -117,25 +117,46 @@ module dnn_engine #( .s_user (weights_m_user ), .s_data_pixels (pixels_m_data ), .s_data_weights (weights_m_data ), - .m_ready (out_s_ready ), - .m_valid (out_s_valid ), - .m_data (out_s_data ), - .m_last (out_s_last ) + .m_ready (m_ready ), + .m_valid (m_valid ), + .m_data (m_data ), + .m_last (m_last ) ); - out_ram_switch OUT_RAM ( - .clk (aclk ), - .rstn (aresetn ), - .s_ready (out_s_ready ), - .s_valid (out_s_valid ), - .s_data (out_s_data ), - .s_last (out_s_last ), - - .m_ram_addr_a (m_ram_addr_a ), - .m_ram_rddata_a (m_ram_rddata_a), - .m_ram_en_a (m_ram_en_a ), - .m_done_fill (m_done_fill ), - .m_t_done_proc (m_t_done_proc ) + localparam Y_BITS_PADDED = 2**$clog2(Y_BITS); + genvar iy; + + wire [Y_BITS_PADDED*ROWS-1:0] m_data_padded; + generate + for (iy=0; iy