Skip to content

Commit

Permalink
Replace OUT_RAM_SWITCH with regular S2MM DMA
Browse files Browse the repository at this point in the history
  • Loading branch information
Aba committed Nov 21, 2023
1 parent ea04128 commit c1834f7
Show file tree
Hide file tree
Showing 13 changed files with 148 additions and 86 deletions.
30 changes: 25 additions & 5 deletions deepsocflow/c/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ typedef enum {POOL_NONE, POOL_MAX, POOL_AVG} Pool_t;
#define X_BITS_MASK ((1 << X_BITS) -1)

typedef struct {
Y_TYPE ocm [2][PE_COLS*PE_ROWS];
int8_t w [W_BYTES ];
B_TYPE b [B_WORDS ]; // keep next to w. weights are loaded to w_ptr
int8_t x [X_BYTES_ALL ];
Expand All @@ -40,7 +41,7 @@ typedef struct {
int8_t debug_tiled [O_WORDS_MAX ];
int32_t debug_nhwc [NHWC_WORDS ];
int8_t out_buffers [2 ][O_BYTES_MAX ];
int8_t add_buffers [N_ADD_BUF ][NHWC_WORDS ];
int8_t add_buffers [N_ADD_BUF ][NHWC_WORDS ]; // should be last, since N_ADD_BUF can be empty
} Memory_st;
Memory_st mem;

Expand Down Expand Up @@ -169,13 +170,12 @@ static inline void tile_write( int32_t out_val, int8_t *p_out_buffer, int32_t ib
}


extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_t *p_sram_u32) {
extern EXT_C void load_y (uint8_t *p_done, uint64_t *p_base_addr_next, int32_t *p_bpt_next) {

static Bundle_t *pb = &bundles[0];
static int32_t it_bias=0;
static int32_t ib=0, ip=0, it=0, in=0, il=0, iw_kw2=0;
static int8_t *p_out_buffer = (int8_t*)&mem.out_buffers[0];
const int32_t *p_sram = (const int32_t *)p_sram_u32;

int32_t iy_nhwc;
div_t div_ch, div_cw, div_ixh, div_ixw;
Expand Down Expand Up @@ -206,6 +206,16 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_
//New iw_kw2:
int32_t w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1;
int32_t sram_addr=0;

static char is_first_call = 1;
static char ocm_bank = 0;
if (is_first_call) {
is_first_call = 0;
*p_base_addr_next = (uint64_t)&mem.ocm[ocm_bank];
*p_bpt_next = PE_ROWS * pb->coe * w_last * sizeof(Y_TYPE);
return;
}

for (int32_t icoe=0; icoe < pb->coe; icoe++) {
int32_t i_bias = it_bias + icoe;

Expand Down Expand Up @@ -236,7 +246,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_
goto PROCESS_AND_STORE_DONE;
}

raw_val = p_sram[sram_addr];
raw_val = mem.ocm[ocm_bank][sram_addr];
out_val = raw_val;

PROCESS_START:
Expand Down Expand Up @@ -428,6 +438,7 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_

++ib; if (ib >= N_BUNDLES) { ib = 0; // after_all(ib):
*p_done = 1;
is_first_call = 1;
}//new(ib):

pb = &bundles[ib];
Expand All @@ -440,7 +451,12 @@ extern EXT_C void load_y (uint8_t *p_done, uint8_t *pt_done_proc, const uint32_
}//new(in):
}//new(il):
}//new(iw_kw2):
*pt_done_proc = !(*pt_done_proc);
// *pt_done_proc = !(*pt_done_proc);

ocm_bank = !ocm_bank;
w_last = iw_kw2 == pb->w_kw2-1 ? pb->kw/2+1 : 1;
*p_base_addr_next = (uint64_t)&mem.ocm[ocm_bank];
*p_bpt_next = PE_ROWS * pb->coe * w_last * sizeof(Y_TYPE);
}


Expand Down Expand Up @@ -518,6 +534,10 @@ extern EXT_C int8_t get_byte (uint64_t addr){
return *(int8_t*)addr;
}

// Store one byte at an absolute memory address.
//   addr : destination address, carried as a 64-bit integer
//   data : byte value written to that address
// The address is narrowed through uintptr_t before it becomes a pointer, so the
// integer-to-pointer conversion has matching width on targets whose pointers
// are narrower than 64 bits. The caller must pass an address that is valid to
// write; nothing is checked here.
extern EXT_C void set_byte (uint64_t addr, int8_t data){
  *(int8_t*)(uintptr_t)addr = data;
}

// Return the current value of the is_bundle_write_done flag.
// The parameter list is spelled (void) so the function has a real prototype:
// in C before C23 an empty () leaves the parameters unspecified and disables
// argument checking at call sites. (void) has the same meaning in C++.
extern EXT_C char get_is_bundle_write_done(void){
  return is_bundle_write_done;
}
Expand Down
1 change: 1 addition & 0 deletions deepsocflow/py/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def __init__(
self.BITS_BLOCKS_MAX = clog2(self.L_MAX)
self.BITS_XN_MAX = clog2(self.XN_MAX)
self.BITS_RAM_WEIGHTS_ADDR = clog2(self.RAM_WEIGHTS_DEPTH)
self.Y_OUT_BITS = 2**clog2(self.Y_BITS)

self.MODULE_DIR = os.path.normpath(os.path.dirname(deepsocflow.__file__)).replace('\\', '/')
self.TB_MODULE = "dnn_engine_tb"
Expand Down
1 change: 1 addition & 0 deletions deepsocflow/py/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def export_inference(self, x, hw):
ch.write(f"#define O_BYTES_MAX {o_bytes_max}\n")
ch.write(f"#define X_BYTES_ALL {x_bytes_all}\n")
ch.write(f"#define NHWC_WORDS {nhwc_words_max}\n")
ch.write(f"#define Y_TYPE int{hw.Y_OUT_BITS}_t\n")
ch.write(f"#define B_TYPE int{hw.B_BITS}_t\n")
ch.write(f"#define O_TYPE {out_type}\n")
ch.write(f"#define B_WORDS {b_words}\n")
Expand Down
71 changes: 46 additions & 25 deletions deepsocflow/rtl/dnn_engine.v
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module dnn_engine #(

S_PIXELS_WIDTH_LF = `S_PIXELS_WIDTH_LF ,
S_WEIGHTS_WIDTH_LF = `S_WEIGHTS_WIDTH_LF ,
M_OUTPUT_WIDTH_LF = `M_OUTPUT_WIDTH_LF ,

OUT_ADDR_WIDTH = 10,
OUT_BITS = 32
Expand All @@ -33,11 +34,10 @@ module dnn_engine #(
input wire [S_WEIGHTS_WIDTH_LF -1:0] s_axis_weights_tdata,
input wire [S_WEIGHTS_WIDTH_LF/8-1:0] s_axis_weights_tkeep,

input wire [(OUT_ADDR_WIDTH+2)-1:0] m_ram_addr_a,
output wire [ OUT_BITS -1:0] m_ram_rddata_a,
input wire m_ram_en_a,
output wire m_done_fill,
input wire m_t_done_proc
input wire m_axis_tready,
output wire m_axis_tvalid, m_axis_tlast,
output wire [M_OUTPUT_WIDTH_LF -1:0] m_axis_tdata,
output wire [M_OUTPUT_WIDTH_LF/8 -1:0] m_axis_tkeep
);

localparam TUSER_WIDTH = `TUSER_WIDTH;
Expand All @@ -51,9 +51,6 @@ module dnn_engine #(
wire [K_BITS*COLS -1:0] weights_m_data;
wire [TUSER_WIDTH -1:0] weights_m_user;

wire out_s_ready, out_s_valid, out_s_last;
wire [M_DATA_WIDTH_HF_CONV_DW -1:0] out_s_data;


// Unpack tkeep_bytes into tkeep_words
wire [S_PIXELS_WIDTH_LF /X_BITS-1:0] s_axis_pixels_tkeep_words;
Expand Down Expand Up @@ -108,6 +105,9 @@ module dnn_engine #(
.pixels_m_ready (pixels_m_ready )
);

wire m_ready, m_valid, m_last;
wire [M_DATA_WIDTH_HF_CONV_DW -1:0] m_data;

proc_engine_out PROC_OUT (
.aclk (aclk ),
.aresetn (aresetn ),
Expand All @@ -117,25 +117,46 @@ module dnn_engine #(
.s_user (weights_m_user ),
.s_data_pixels (pixels_m_data ),
.s_data_weights (weights_m_data ),
.m_ready (out_s_ready ),
.m_valid (out_s_valid ),
.m_data (out_s_data ),
.m_last (out_s_last )
.m_ready (m_ready ),
.m_valid (m_valid ),
.m_data (m_data ),
.m_last (m_last )
);

out_ram_switch OUT_RAM (
.clk (aclk ),
.rstn (aresetn ),
.s_ready (out_s_ready ),
.s_valid (out_s_valid ),
.s_data (out_s_data ),
.s_last (out_s_last ),

.m_ram_addr_a (m_ram_addr_a ),
.m_ram_rddata_a (m_ram_rddata_a),
.m_ram_en_a (m_ram_en_a ),
.m_done_fill (m_done_fill ),
.m_t_done_proc (m_t_done_proc )
localparam Y_BITS_PADDED = 2**$clog2(Y_BITS);
genvar iy;

wire [Y_BITS_PADDED*ROWS-1:0] m_data_padded;
generate
for (iy=0; iy<ROWS; iy=iy+1) begin
assign m_data_padded[Y_BITS_PADDED*(iy+1)-1:Y_BITS_PADDED*iy] = $signed(m_data[Y_BITS*(iy+1)-1:Y_BITS*iy]);
end
endgenerate


alex_axis_adapter_any #(
.S_DATA_WIDTH (Y_BITS_PADDED*ROWS),
.M_DATA_WIDTH (M_OUTPUT_WIDTH_LF ),
.S_KEEP_ENABLE (1),
.M_KEEP_ENABLE (1),
.S_KEEP_WIDTH (Y_BITS_PADDED*ROWS/8),
.M_KEEP_WIDTH (M_OUTPUT_WIDTH_LF/8),
.ID_ENABLE (0),
.DEST_ENABLE (0),
.USER_ENABLE (0)
) DW (
.clk (aclk ),
.rst (~aresetn ),
.s_axis_tready (m_ready ),
.s_axis_tvalid (m_valid ),
.s_axis_tdata (m_data_padded),
.s_axis_tlast (m_last ),
.s_axis_tkeep ({(Y_BITS_PADDED*ROWS/8){1'b1}}),
.m_axis_tready (m_axis_tready),
.m_axis_tvalid (m_axis_tvalid),
.m_axis_tdata (m_axis_tdata ),
.m_axis_tlast (m_axis_tlast ),
.m_axis_tkeep (m_axis_tkeep )
);
endmodule

Expand Down
3 changes: 1 addition & 2 deletions deepsocflow/tcl/fpga/pynq_z2.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@ set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
create_bd_design "design_1"
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
set_property -dict [list CONFIG.PCW_USE_S_AXI_GP0 {0} CONFIG.PCW_USE_M_AXI_GP1 {1} CONFIG.PCW_USE_S_AXI_HP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1} CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ $FREQ CONFIG.PCW_UART0_PERIPHERAL_ENABLE {1}] [get_bd_cells processing_system7_0]
set_property -dict [list CONFIG.PCW_USE_S_AXI_GP0 {0} CONFIG.PCW_USE_S_AXI_HP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1} CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ $FREQ CONFIG.PCW_UART0_PERIPHERAL_ENABLE {1}] [get_bd_cells processing_system7_0]

set PS_IRQ "processing_system7_0/IRQ_F2P"
set PS_M_AXI_LITE "/processing_system7_0/M_AXI_GP0"
set PS_M_AXI_BRAM "/processing_system7_0/M_AXI_GP1"
set PS_S_AXI "/processing_system7_0/S_AXI_HP0"
set PS_CLK "/processing_system7_0/FCLK_CLK0"
31 changes: 11 additions & 20 deletions deepsocflow/tcl/fpga/vivado.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,27 @@ set IP_NAME "dma_pixels"
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 $IP_NAME
set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_m_axi_mm2s_data_width $S_PIXELS_WIDTH_LF CONFIG.c_m_axis_mm2s_tdata_width $S_PIXELS_WIDTH_LF CONFIG.c_include_mm2s_dre {1} CONFIG.c_mm2s_burst_size {64} CONFIG.c_include_s2mm {0}] [get_bd_cells $IP_NAME]

set IP_NAME "axi_bram_ctrl"
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_bram_ctrl:4.1 $IP_NAME
set_property -dict [list CONFIG.SINGLE_PORT_BRAM {1}] [get_bd_cells $IP_NAME]

set IP_NAME "axi_gpio_out"
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_out
set_property -dict [list CONFIG.C_GPIO_WIDTH {1} CONFIG.C_ALL_OUTPUTS {1}] [get_bd_cells axi_gpio_out]

set IP_NAME "dma_output"
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 $IP_NAME
set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_s2mm_burst_size {8} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_include_s2mm_dre {1} CONFIG.c_m_axi_s2mm_data_width $M_OUTPUT_WIDTH_LF CONFIG.c_s_axis_s2mm_tdata_width $M_OUTPUT_WIDTH_LF CONFIG.c_include_s2mm {1} CONFIG.c_include_mm2s {0}] [get_bd_cells $IP_NAME]

# Interrupts
create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_0
set_property -dict [list CONFIG.NUM_PORTS {3}] [get_bd_cells xlconcat_0]
connect_bd_net [get_bd_pins dma_pixels/mm2s_introut] [get_bd_pins xlconcat_0/In0]
connect_bd_net [get_bd_pins dma_weights/mm2s_introut] [get_bd_pins xlconcat_0/In1]
connect_bd_net [get_bd_pins dma_output/s2mm_introut] [get_bd_pins xlconcat_0/In2]
connect_bd_net [get_bd_pins xlconcat_0/dout] [get_bd_pins ${PS_IRQ}]

# AXI Lite
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master $PS_M_AXI_LITE Slave {/dma_pixels/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}" [get_bd_intf_pins dma_pixels/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master $PS_M_AXI_LITE Slave {/dma_weights/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}" [get_bd_intf_pins dma_weights/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master $PS_M_AXI_BRAM Slave {/axi_bram_ctrl/S_AXI} ddr_seg {Auto} intc_ip {Auto} master_apm {0}" [get_bd_intf_pins axi_bram_ctrl/S_AXI]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master $PS_M_AXI_BRAM Slave {/axi_gpio_out/S_AXI} ddr_seg {Auto} intc_ip {Auto} master_apm {0}" [get_bd_intf_pins axi_gpio_out/S_AXI]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master $PS_M_AXI_LITE Slave {/dma_pixels/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}" [get_bd_intf_pins dma_pixels/S_AXI_LITE ]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master $PS_M_AXI_LITE Slave {/dma_weights/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}" [get_bd_intf_pins dma_weights/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master $PS_M_AXI_LITE Slave {/dma_output/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}" [get_bd_intf_pins dma_output/S_AXI_LITE ]

# AXI Full
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/dma_pixels/M_AXI_MM2S} Slave $PS_S_AXI ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}" [get_bd_intf_pins ${PS_S_AXI}]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master {Auto} Clk_slave $PS_CLK Clk_xbar $PS_CLK Master {/dma_weights/M_AXI_MM2S} Slave $PS_S_AXI ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}" [get_bd_intf_pins dma_weights/M_AXI_MM2S]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master {/dma_pixels/M_AXI_MM2S} Slave $PS_S_AXI ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}" [get_bd_intf_pins ${PS_S_AXI} ]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master {/dma_weights/M_AXI_MM2S} Slave $PS_S_AXI ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}" [get_bd_intf_pins dma_weights/M_AXI_MM2S]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config "Clk_master $PS_CLK Clk_slave $PS_CLK Clk_xbar $PS_CLK Master {/dma_output/M_AXI_S2MM} Slave $PS_S_AXI ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}" [get_bd_intf_pins dma_output/M_AXI_S2MM ]

# Engine
add_files [glob $CONFIG_DIR/*.svh] [glob $RTL_DIR/*] [glob $RTL_DIR/ext/*]
Expand All @@ -63,12 +58,8 @@ create_bd_cell -type module -reference dnn_engine dnn_engine_0
connect_bd_net [get_bd_pins $PS_CLK] [get_bd_pins dnn_engine_0/aclk]
connect_bd_intf_net [get_bd_intf_pins dma_pixels/M_AXIS_MM2S] [get_bd_intf_pins dnn_engine_0/s_axis_pixels]
connect_bd_intf_net [get_bd_intf_pins dma_weights/M_AXIS_MM2S] [get_bd_intf_pins dnn_engine_0/s_axis_weights]
connect_bd_intf_net [get_bd_intf_pins dma_output/S_AXIS_S2MM] [get_bd_intf_pins dnn_engine_0/m_axis]
connect_bd_net [get_bd_pins dnn_engine_0/aresetn] [get_bd_pins axi_smc/aresetn]
connect_bd_net [get_bd_pins dnn_engine_0/m_done_fill] [get_bd_pins xlconcat_0/In2]
connect_bd_net [get_bd_pins axi_bram_ctrl/bram_addr_a] [get_bd_pins dnn_engine_0/m_ram_addr_a]
connect_bd_net [get_bd_pins axi_bram_ctrl/bram_rddata_a] [get_bd_pins dnn_engine_0/m_ram_rddata_a]
connect_bd_net [get_bd_pins axi_bram_ctrl/bram_en_a] [get_bd_pins dnn_engine_0/m_ram_en_a]
connect_bd_net [get_bd_pins axi_gpio_out/gpio_io_o] [get_bd_pins dnn_engine_0/m_t_done_proc]

validate_bd_design

Expand Down
5 changes: 2 additions & 3 deletions deepsocflow/tcl/fpga/zcu104.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@ set_property board_part xilinx.com:zcu104:part0:1.1 [current_project]
create_bd_design "design_1"
create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ultra_ps_e_0
apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0]
set_property -dict [list CONFIG.PSU__USE__M_AXI_HPM0_LPD {0} CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__USE__M_AXI_GP2 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0}] [get_bd_cells zynq_ultra_ps_e_0]
set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $FREQ CONFIG.PSU__USE__M_AXI_GP0 {0}] [get_bd_cells zynq_ultra_ps_e_0]

set PS_IRQ "zynq_ultra_ps_e_0/pl_ps_irq0"
set PS_M_AXI_LITE "/zynq_ultra_ps_e_0/M_AXI_HPM0_LPD"
set PS_M_AXI_BRAM "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD"
set PS_M_AXI_LITE "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD"
set PS_S_AXI "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD"
set PS_CLK "/zynq_ultra_ps_e_0/pl_clk0"
42 changes: 41 additions & 1 deletion deepsocflow/test/sv/dma.sv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

module DMA_M2S #(
parameter BUS_WIDTH=8, PROB_VALID=20,
parameter MODE=0,
parameter BYTES_PER_BEAT = BUS_WIDTH/8
)(
input logic aclk, aresetn, s_ready,
Expand Down Expand Up @@ -67,4 +66,45 @@ module DMA_M2S #(
prev_handshake = 1;
@(posedge aclk);
endtask
endmodule


// Testbench sink for an AXI-Stream style interface: accepts beats with a
// randomly toggling ready signal and writes every kept byte to memory through
// the DPI-C function set_byte.
//
// Parameters:
//   BUS_WIDTH      : data bus width in bits
//   PROB_READY     : m_ready is raised when $urandom_range(0,999) < PROB_READY,
//                    i.e. the value is on a 0..1000 scale, not a percentage
//   BYTES_PER_BEAT : bytes carried per beat, derived from BUS_WIDTH
module DMA_S2M #(
  parameter BUS_WIDTH=8, PROB_READY=20,
  parameter BYTES_PER_BEAT = BUS_WIDTH/8
)(
  input  logic aclk, aresetn,
  output logic m_ready,
  input  logic m_valid, m_last,
  input  logic [BYTES_PER_BEAT-1:0][7:0] m_data,
  input  logic [BYTES_PER_BEAT-1:0]      m_keep
);

  // Number of bytes stored so far in the current transfer; also the write offset.
  longint unsigned i_bytes = 0;
  // Set once a beat with m_last has been accepted.
  bit done = 0;

  import "DPI-C" function void set_byte (longint unsigned addr, byte data);

  // Receive one packet (up to and including the beat carrying m_last) and store
  // its kept bytes contiguously starting at base_addr.
  //   base_addr          : address of the first byte to write
  //   bytes_per_transfer : not used by this task body; kept for the callers'
  //                        existing call signature
  task axis_pull (input longint unsigned base_addr, input int bytes_per_transfer);
    m_ready = 0;
    wait(aresetn);

    while (!done) begin

      @(posedge aclk)
      if (m_ready && m_valid) begin // handshake: sample the beat at the clock edge
        for (int i=0; i < BYTES_PER_BEAT; i=i+1)
          if (m_keep[i]) begin // bytes with keep low are skipped, not stored
            set_byte(base_addr + i_bytes, m_data[i]);
            i_bytes += 1;
          end
        if (m_last) done <= 1;
      end

      #10ps // drive ready shortly after the edge, not on it
      m_ready = $urandom_range(0,999) < PROB_READY;
    end

    // The loop re-randomises m_ready once more after the last beat. Drive it low
    // before returning so no beat can be accepted (and dropped) while no
    // axis_pull call is active.
    m_ready = 0;
    {done, i_bytes} = 0; // reset state for the next call
  endtask
endmodule
Loading

0 comments on commit c1834f7

Please sign in to comment.