Traffic generation and mesh testbench update -- summary of the hunks below.

  * .gitignore, .gitlab-ci.yml, Makefile: ignore `measurements`, rename the CI
    traffic types `random` -> `uniform` and `single_dest_center` -> `hotspot`,
    and forward TRAFFIC_INJ_RATIO to the simulator as a plusarg.
  * hw/floo_nw_join.sv: AxiIdConvWidth, AxiNarrowMaxTxns and AxiWideMaxTxns
    become overridable `parameter`s instead of `localparam`s.
  * hw/tb/tb_floo_{axi,nw}_mesh.sv: iterate the y dimension over NumY (was
    NumX), compute Index with NumY, and insert a fall-through axi_fifo between
    each DMA node and the NoC; the bandwidth monitors keep observing the
    unbuffered DMA-side signals.
  * hw/test/floo_axi_rand_slave.sv, hw/test/floo_test_pkg.sv: add an
    IdealSlave type whose wait-cycle parameters are all zero.
  * hw/test/floo_dma_test_node.sv: new SlaveType parameter and a
    TRAFFIC_INJ_RATIO plusarg (default 1.0). Each loop iteration draws a
    number from [0, 99] and launches a job only if it is below ratio * 100,
    so a ratio of 1.0 never stalls; otherwise it waits one clock cycle.
  * util/gen_jobs.py: fixed random seed, per-traffic-type `accesses` lists of
    (address, direction, length) tuples, non-square `transpose`, new `matmul`
    traffic. Wide jobs use the per-access length; narrow jobs keep using
    narrow_length as before this patch.

diff --git a/.gitignore b/.gitignore
index cf61c035..397ff3cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,6 +89,7 @@ scripts/spyglass/*
 
 # experiment outputs
 util/output/*
+measurements
 
 # Misc
 *.csv
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e13f5004..2682c4e6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -111,7 +111,7 @@ run-traffic:
     matrix:
       - DUT: [axi_mesh, nw_mesh]
         ROUTE_ALGO: [xy, src, id]
-        TRAFFIC_TYPE: [random, hbm, shuffle, single_dest_center]
+        TRAFFIC_TYPE: [uniform, hbm, shuffle, hotspot]
         TRAFFIC_RW: [read, write]
   script:
     - make jobs
diff --git a/Makefile b/Makefile
index 74ae93fa..035e1d03 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,9 @@ VCS_ARGS += -j 8
 ifdef JOB_NAME
 	VSIM_FLAGS += +JOB_NAME=$(JOB_NAME)
 endif
+ifdef TRAFFIC_INJ_RATIO
+	VSIM_FLAGS += +TRAFFIC_INJ_RATIO=$(TRAFFIC_INJ_RATIO)
+endif
 ifdef JOB_DIR
 	VSIM_FLAGS += +JOB_DIR=$(JOB_DIR)
 endif
diff --git a/hw/floo_nw_join.sv b/hw/floo_nw_join.sv
index 11620869..1cfd8a40 100644
--- a/hw/floo_nw_join.sv
+++ b/hw/floo_nw_join.sv
@@ -28,9 +28,9 @@ module floo_nw_join #(
   /// max(`AxiNarrowIdWidth` and `AxiWideIdWidth`) == AxidOutWidth - 1
   parameter int unsigned AxiIdOutWidth = AxiCfgJoin.InIdWidth,
   /// ID width of the busses before muxing them together.
-  localparam int unsigned AxiIdConvWidth = AxiIdOutWidth - 1,
+  parameter int unsigned AxiIdConvWidth = AxiIdOutWidth - 1,
   /// Default parameter for number of inflight narrow transactions
-  localparam int unsigned AxiNarrowMaxTxns = 8,
+  parameter int unsigned AxiNarrowMaxTxns = 8,
   /// Maximum number of in-flight AXI narrow write transactions
   parameter int unsigned AxiNarrowMaxWriteTxns = AxiNarrowMaxTxns,
   /// Maximum number of in-flight AXI narrow read transactions
@@ -46,7 +46,7 @@ module floo_nw_join #(
   /// Maximum number of in-flight transactions with the same ID at the narrow master port.
   parameter int unsigned AxiNarrowMstPortMaxTxnsPerId = AxiNarrowMaxTxns,
   /// Default parameter for number of inflight wide transactions
-  localparam int unsigned AxiWideMaxTxns = 32,
+  parameter int unsigned AxiWideMaxTxns = 32,
   /// Maximum number of in-flight AXI wide write transactions
   parameter int unsigned AxiWideMaxWriteTxns = AxiWideMaxTxns,
   /// Number of unique IDs on the wide AXI bus
diff --git a/hw/tb/tb_floo_axi_mesh.sv b/hw/tb/tb_floo_axi_mesh.sv
index d0705b39..4ccd1631 100644
--- a/hw/tb/tb_floo_axi_mesh.sv
+++ b/hw/tb/tb_floo_axi_mesh.sv
@@ -21,6 +21,10 @@ module tb_floo_axi_mesh;
   localparam int unsigned NumHBMChannels = NumY;
   localparam int unsigned NumMax = (NumX > NumY) ? NumX : NumY;
 
+  // Add a buffer before the AXI monitors. Otherwise transactions
+  // are stalled which skews the latency measurements
+  localparam int unsigned FifoDepth = 100;
+
   typedef axi_in_addr_t addr_t;
   localparam int unsigned HBMLatency = 100;
   localparam addr_t HBMSize = 48'h10000; // 64KB
@@ -49,6 +53,10 @@ module tb_floo_axi_mesh;
   axi_out_req_t [NumHBMChannels-1:0] hbm_req;
   axi_out_rsp_t [NumHBMChannels-1:0] hbm_rsp;
 
+  axi_in_req_t [NumX-1:0][NumY-1:0] cluster_in_buf_req;
+  axi_in_rsp_t [NumX-1:0][NumY-1:0] cluster_in_buf_rsp;
+
+
   ///////////////////
   //   HBM Model   //
   ///////////////////
@@ -81,10 +89,10 @@ module tb_floo_axi_mesh;
   ////////////////////////
 
   for (genvar x = 0; x < NumX; x++) begin : gen_x
-    for (genvar y = 0; y < NumX; y++) begin : gen_y
+    for (genvar y = 0; y < NumY; y++) begin : gen_y
       localparam string DmaName = $sformatf("dma_%0d_%0d", x, y);
 
-      localparam int unsigned Index = x * NumX + y;
+      localparam int unsigned Index = x * NumY + y;
       localparam addr_t MemBaseAddr = Sam[ClusterNi00+Index].start_addr;
 
       floo_dma_test_node #(
@@ -109,6 +117,26 @@ module tb_floo_axi_mesh;
         .end_of_sim_o   ( end_of_sim[x][y]      )
       );
 
+      axi_fifo #(
+        .Depth        ( FifoDepth         ),
+        .FallThrough  ( 1'b1              ),
+        .aw_chan_t    ( axi_in_aw_chan_t  ),
+        .w_chan_t     ( axi_in_w_chan_t   ),
+        .b_chan_t     ( axi_in_b_chan_t   ),
+        .ar_chan_t    ( axi_in_ar_chan_t  ),
+        .r_chan_t     ( axi_in_r_chan_t   ),
+        .axi_req_t    ( axi_in_req_t      ),
+        .axi_resp_t   ( axi_in_rsp_t      )
+      ) i_axi_narrow_buffer (
+        .clk_i        ( clk                       ),
+        .rst_ni       ( rst_n                     ),
+        .test_i       ( 1'b0                      ),
+        .slv_req_i    ( cluster_in_req[x][y]      ),
+        .slv_resp_o   ( cluster_in_rsp[x][y]      ),
+        .mst_req_o    ( cluster_in_buf_req[x][y]  ),
+        .mst_resp_i   ( cluster_in_buf_rsp[x][y]  )
+      );
+
       axi_bw_monitor #(
         .req_t      ( axi_in_req_t ),
         .rsp_t      ( axi_in_rsp_t ),
@@ -132,15 +160,15 @@ module tb_floo_axi_mesh;
   /////////////////////////
 
   floo_axi_mesh_noc i_floo_axi_mesh_noc (
-    .clk_i                  ( clk             ),
-    .rst_ni                 ( rst_n           ),
-    .test_enable_i          ( 1'b0            ),
-    .cluster_axi_in_req_i   ( cluster_in_req  ),
-    .cluster_axi_in_rsp_o   ( cluster_in_rsp  ),
-    .cluster_axi_out_req_o  ( cluster_out_req ),
-    .cluster_axi_out_rsp_i  ( cluster_out_rsp ),
-    .hbm_axi_out_req_o      ( hbm_req         ),
-    .hbm_axi_out_rsp_i      ( hbm_rsp         )
+    .clk_i                  ( clk                 ),
+    .rst_ni                 ( rst_n               ),
+    .test_enable_i          ( 1'b0                ),
+    .cluster_axi_in_req_i   ( cluster_in_buf_req  ),
+    .cluster_axi_in_rsp_o   ( cluster_in_buf_rsp  ),
+    .cluster_axi_out_req_o  ( cluster_out_req     ),
+    .cluster_axi_out_rsp_i  ( cluster_out_rsp     ),
+    .hbm_axi_out_req_o      ( hbm_req             ),
+    .hbm_axi_out_rsp_i      ( hbm_rsp             )
   );
 
 
diff --git a/hw/tb/tb_floo_nw_mesh.sv b/hw/tb/tb_floo_nw_mesh.sv
index c84c5890..ffbec10b 100644
--- a/hw/tb/tb_floo_nw_mesh.sv
+++ b/hw/tb/tb_floo_nw_mesh.sv
@@ -21,6 +21,10 @@ module tb_floo_nw_mesh;
   localparam int unsigned NumHBMChannels = NumY;
   localparam int unsigned NumMax = (NumX > NumY) ? NumX : NumY;
 
+  // Add a buffer before the AXI monitors. Otherwise transactions
+  // are stalled which skews the latency measurements
+  localparam int unsigned FifoDepth = 100;
+
   typedef axi_narrow_in_addr_t addr_t;
   localparam int unsigned HBMLatency = 100;
   localparam addr_t HBMSize = 48'h10000; // 64KB
@@ -55,6 +59,11 @@ module tb_floo_nw_mesh;
   axi_wide_out_req_t [NumHBMChannels-1:0] hbm_wide_req;
   axi_wide_out_rsp_t [NumHBMChannels-1:0] hbm_wide_rsp;
 
+  axi_narrow_in_req_t [NumX-1:0][NumY-1:0] cluster_narrow_in_buf_req;
+  axi_narrow_in_rsp_t [NumX-1:0][NumY-1:0] cluster_narrow_in_buf_rsp;
+  axi_wide_in_req_t [NumX-1:0][NumY-1:0] cluster_wide_in_buf_req;
+  axi_wide_in_rsp_t [NumX-1:0][NumY-1:0] cluster_wide_in_buf_rsp;
+
   ///////////////////
   //   HBM Model   //
   ///////////////////
@@ -119,11 +128,11 @@ module tb_floo_nw_mesh;
   ////////////////////////
 
   for (genvar x = 0; x < NumX; x++) begin : gen_x
-    for (genvar y = 0; y < NumX; y++) begin : gen_y
+    for (genvar y = 0; y < NumY; y++) begin : gen_y
       localparam string NarrowDmaName = $sformatf("narrow_dma_%0d_%0d", x, y);
       localparam string WideDmaName   = $sformatf("wide_dma_%0d_%0d", x, y);
 
-      localparam int unsigned Index = x * NumX + y;
+      localparam int unsigned Index = x * NumY + y;
       localparam addr_t MemBaseAddr = Sam[ClusterNi00+Index].start_addr;
 
       floo_dma_test_node #(
@@ -170,6 +179,46 @@ module tb_floo_nw_mesh;
         .end_of_sim_o   ( end_of_sim[x][y][1]       )
       );
 
+      axi_fifo #(
+        .Depth        ( FifoDepth                 ),
+        .FallThrough  ( 1'b1                      ),
+        .aw_chan_t    ( axi_narrow_in_aw_chan_t   ),
+        .w_chan_t     ( axi_narrow_in_w_chan_t    ),
+        .b_chan_t     ( axi_narrow_in_b_chan_t    ),
+        .ar_chan_t    ( axi_narrow_in_ar_chan_t   ),
+        .r_chan_t     ( axi_narrow_in_r_chan_t    ),
+        .axi_req_t    ( axi_narrow_in_req_t       ),
+        .axi_resp_t   ( axi_narrow_in_rsp_t       )
+      ) i_axi_narrow_buffer (
+        .clk_i        ( clk                             ),
+        .rst_ni       ( rst_n                           ),
+        .test_i       ( 1'b0                            ),
+        .slv_req_i    ( cluster_narrow_in_req[x][y]     ),
+        .slv_resp_o   ( cluster_narrow_in_rsp[x][y]     ),
+        .mst_req_o    ( cluster_narrow_in_buf_req[x][y] ),
+        .mst_resp_i   ( cluster_narrow_in_buf_rsp[x][y] )
+      );
+
+      axi_fifo #(
+        .Depth        ( FifoDepth               ),
+        .FallThrough  ( 1'b1                    ),
+        .aw_chan_t    ( axi_wide_in_aw_chan_t   ),
+        .w_chan_t     ( axi_wide_in_w_chan_t    ),
+        .b_chan_t     ( axi_wide_in_b_chan_t    ),
+        .ar_chan_t    ( axi_wide_in_ar_chan_t   ),
+        .r_chan_t     ( axi_wide_in_r_chan_t    ),
+        .axi_req_t    ( axi_wide_in_req_t       ),
+        .axi_resp_t   ( axi_wide_in_rsp_t       )
+      ) i_axi_wide_buffer (
+        .clk_i        ( clk                           ),
+        .rst_ni       ( rst_n                         ),
+        .test_i       ( 1'b0                          ),
+        .slv_req_i    ( cluster_wide_in_req[x][y]     ),
+        .slv_resp_o   ( cluster_wide_in_rsp[x][y]     ),
+        .mst_req_o    ( cluster_wide_in_buf_req[x][y] ),
+        .mst_resp_i   ( cluster_wide_in_buf_rsp[x][y] )
+      );
+
       axi_bw_monitor #(
         .req_t      ( axi_narrow_in_req_t ),
         .rsp_t      ( axi_narrow_in_rsp_t ),
@@ -183,7 +232,7 @@ module tb_floo_nw_mesh;
         .rsp_i          ( cluster_narrow_in_rsp[x][y] ),
         .ar_in_flight_o (                             ),
         .aw_in_flight_o (                             )
-    );
+      );
 
       axi_bw_monitor #(
         .req_t      ( axi_wide_in_req_t ),
@@ -198,7 +247,7 @@ module tb_floo_nw_mesh;
         .rsp_i          ( cluster_wide_in_rsp[x][y] ),
         .ar_in_flight_o (                           ),
         .aw_in_flight_o (                           )
-    );
+      );
     end
   end
 
@@ -208,21 +257,21 @@ module tb_floo_nw_mesh;
   /////////////////////////
 
   floo_nw_mesh_noc i_floo_nw_mesh_noc (
-    .clk_i                    ( clk                     ),
-    .rst_ni                   ( rst_n                   ),
-    .test_enable_i            ( 1'b0                    ),
-    .cluster_narrow_in_req_i  ( cluster_narrow_in_req   ),
-    .cluster_narrow_in_rsp_o  ( cluster_narrow_in_rsp   ),
-    .cluster_narrow_out_req_o ( cluster_narrow_out_req  ),
-    .cluster_narrow_out_rsp_i ( cluster_narrow_out_rsp  ),
-    .cluster_wide_in_req_i    ( cluster_wide_in_req     ),
-    .cluster_wide_in_rsp_o    ( cluster_wide_in_rsp     ),
-    .cluster_wide_out_req_o   ( cluster_wide_out_req    ),
-    .cluster_wide_out_rsp_i   ( cluster_wide_out_rsp    ),
-    .hbm_narrow_out_req_o     ( hbm_narrow_req          ),
-    .hbm_narrow_out_rsp_i     ( hbm_narrow_rsp          ),
-    .hbm_wide_out_req_o       ( hbm_wide_req            ),
-    .hbm_wide_out_rsp_i       ( hbm_wide_rsp            )
+    .clk_i                    ( clk                       ),
+    .rst_ni                   ( rst_n                     ),
+    .test_enable_i            ( 1'b0                      ),
+    .cluster_narrow_in_req_i  ( cluster_narrow_in_buf_req ),
+    .cluster_narrow_in_rsp_o  ( cluster_narrow_in_buf_rsp ),
+    .cluster_narrow_out_req_o ( cluster_narrow_out_req    ),
+    .cluster_narrow_out_rsp_i ( cluster_narrow_out_rsp    ),
+    .cluster_wide_in_req_i    ( cluster_wide_in_buf_req   ),
+    .cluster_wide_in_rsp_o    ( cluster_wide_in_buf_rsp   ),
+    .cluster_wide_out_req_o   ( cluster_wide_out_req      ),
+    .cluster_wide_out_rsp_i   ( cluster_wide_out_rsp      ),
+    .hbm_narrow_out_req_o     ( hbm_narrow_req            ),
+    .hbm_narrow_out_rsp_i     ( hbm_narrow_rsp            ),
+    .hbm_wide_out_req_o       ( hbm_wide_req              ),
+    .hbm_wide_out_rsp_i       ( hbm_wide_rsp              )
   );
 
 
diff --git a/hw/test/floo_axi_rand_slave.sv b/hw/test/floo_axi_rand_slave.sv
index bee2d0fb..37fcaf0b 100644
--- a/hw/test/floo_axi_rand_slave.sv
+++ b/hw/test/floo_axi_rand_slave.sv
@@ -131,8 +131,15 @@ module floo_axi_rand_slave #(
     .UW                   ( AxiCfg.UserWidth ),
     // Stimuli application and test time
     .TA                   ( ApplTime         ),
-    .TT                   ( TestTime         )
-  ) axi_rand_slave_t;
+    .TT                   ( TestTime         ),
+    // Responsiveness
+    .AX_MIN_WAIT_CYCLES   (0),
+    .AX_MAX_WAIT_CYCLES   (0),
+    .R_MIN_WAIT_CYCLES    (0),
+    .R_MAX_WAIT_CYCLES    (0),
+    .RESP_MIN_WAIT_CYCLES (0),
+    .RESP_MAX_WAIT_CYCLES (0)
+  ) axi_rand_ideal_slave_t;
 
   typedef axi_test::axi_rand_slave #(
     // AXI interface parameters
@@ -173,6 +180,7 @@ module floo_axi_rand_slave #(
   // axi slave
   axi_rand_slow_slave_t axi_rand_slow_slave[NumSlaves];
   axi_rand_fast_slave_t axi_rand_fast_slave[NumSlaves];
+  axi_rand_ideal_slave_t axi_rand_ideal_slave[NumSlaves];
 
   if (SlaveType == floo_test_pkg::SlowSlave) begin : gen_slow_slaves
     for (genvar i = 0; i < NumSlaves; i++) begin : gen_slow_slaves
@@ -192,6 +200,15 @@ module floo_axi_rand_slave #(
         axi_rand_fast_slave[i].run();
       end
     end
+  end else if (SlaveType == floo_test_pkg::IdealSlave) begin : gen_ideal_slaves
+    for (genvar i = 0; i < NumSlaves; i++) begin : gen_ideal_slaves
+      initial begin
+        axi_rand_ideal_slave[i] = new( slave_dv[i] );
+        axi_rand_ideal_slave[i].reset();
+        @(posedge rst_ni)
+        axi_rand_ideal_slave[i].run();
+      end
+    end
   end else if (SlaveType == floo_test_pkg::MixedSlave) begin : gen_mixed_slaves
     for (genvar i = 0; i < NumSlaves; i++) begin : gen_mixed_slaves
       if (i % 2 == 0) begin : gen_slow_slaves
diff --git a/hw/test/floo_dma_test_node.sv b/hw/test/floo_dma_test_node.sv
index 39dc7f66..11425b78 100644
--- a/hw/test/floo_dma_test_node.sv
+++ b/hw/test/floo_dma_test_node.sv
@@ -15,6 +15,7 @@ module floo_dma_test_node #(
   parameter time TT = 9ns,
   parameter int unsigned BufferDepth = 16,
   parameter int unsigned NumAxInFlight = 16,
+  parameter floo_test_pkg::slave_type_e SlaveType = floo_test_pkg::FastSlave,
   parameter floo_pkg::axi_cfg_t AxiCfg = '{default:0},
   parameter type axi_req_t = logic,
   parameter type axi_rsp_t = logic,
@@ -250,7 +251,7 @@ module floo_dma_test_node #(
     .AxiCfg     ( AxiCfg                    ),
     .ApplTime   ( TA                        ),
     .TestTime   ( TT                        ),
-    .SlaveType  ( floo_test_pkg::FastSlave  ),
+    .SlaveType  ( SlaveType                 ),
     .NumSlaves  ( 1                         ),
     .axi_req_t  ( axi_xbar_req_t            ),
     .axi_rsp_t  ( axi_xbar_resp_t           )
@@ -267,7 +268,7 @@ module floo_dma_test_node #(
     .AxiCfg     ( floo_pkg::axi_cfg_swap_iw(AxiCfg) ),
     .ApplTime   ( TA                                ),
     .TestTime   ( TT                                ),
-    .SlaveType  ( floo_test_pkg::FastSlave          ),
+    .SlaveType  ( SlaveType                         ),
     .NumSlaves  ( 1                                 ),
     .axi_req_t  ( axi_in_req_t                      ),
     .axi_rsp_t  ( axi_in_rsp_t                      )
@@ -340,8 +341,8 @@ module floo_dma_test_node #(
   `include "tb_tasks.svh"
 
   //--------------------------------------
-    // Read Job queue from File
-    //--------------------------------------
+  // Read Job queue from File
+  //--------------------------------------
   initial begin
     string job_file, job_name, job_dir;
     if ($value$plusargs ("JOB_NAME=%s", job_name)) begin
@@ -374,6 +375,7 @@ module floo_dma_test_node #(
   axi_pkg::resp_t cause;
   addr_t burst_addr;
   int err_idx [$];
+  real injection_ratio;
 
   initial begin
     // reset driver
@@ -382,35 +384,51 @@ module floo_dma_test_node #(
     wait (rst_ni);
     // print a job summary
     print_summary(req_jobs);
+    // read the injection ratio from the plusargs (defaults to 1.0)
+    if (!$value$plusargs("TRAFFIC_INJ_RATIO=%f", injection_ratio)) begin
+      injection_ratio = 1.0;
+      $display("[DMA%0d] Using default injection ratio of 1.0", JobId + 1);
+    end else begin
+      $display("[DMA%0d] Using injection ratio of %f", JobId + 1, injection_ratio);
+    end
+    // wait some additional time
     #2ns;
 
     // run all requests in queue
     while (req_jobs.size() != 0) begin
-        // pop front to get a job
-        automatic tb_dma_job_t now = req_jobs.pop_front();
-        // print job to terminal
-        if (EnableDebug) $display("[DMA%0d]%s", JobId, now.pprint());
-        // launch DUT
-        drv.launch_tf(
-                      now.length,
-                      now.src_addr,
-                      now.dst_addr,
-                      now.src_protocol,
-                      now.dst_protocol,
-                      now.aw_decoupled,
-                      now.rw_decoupled,
-                      $clog2(now.max_src_len),
-                      $clog2(now.max_dst_len),
-                      now.max_src_len != 'd256,
-                      now.max_dst_len != 'd256,
-                      now.id
-                     );
+      automatic tb_dma_job_t now;
+      // Inject delay based on injection ratio
+      // Draw from [0, 99] (bounds are inclusive) so a ratio of 1.0 never stalls
+      if (!($urandom_range(0, 99) < (injection_ratio * 100))) begin
+        // Wait for the next cycle
+        if (EnableDebug && (JobId == 100)) $display("[DMA%0d] Delay", JobId + 1);
+        @(posedge clk_i);
+        continue;
+      end
+      // pop front to get a job
+      now = req_jobs.pop_front();
+      // print job to terminal
+      if (EnableDebug) $display("[DMA%0d]%s", JobId, now.pprint());
+      // launch DUT
+      drv.launch_tf(
+                    now.length,
+                    now.src_addr,
+                    now.dst_addr,
+                    now.src_protocol,
+                    now.dst_protocol,
+                    now.aw_decoupled,
+                    now.rw_decoupled,
+                    $clog2(now.max_src_len),
+                    $clog2(now.max_dst_len),
+                    now.max_src_len != 'd256,
+                    now.max_dst_len != 'd256,
+                    now.id
+                   );
     end
 
     // once done: launched all transfers
     $display("[DMA%0d] Launched all Transfers.", JobId + 1);
-
-end
+  end
 
   initial begin
     end_of_sim_o = 1'b0;
diff --git a/hw/test/floo_test_pkg.sv b/hw/test/floo_test_pkg.sv
index 1e480cfc..1cc38fa3 100644
--- a/hw/test/floo_test_pkg.sv
+++ b/hw/test/floo_test_pkg.sv
@@ -9,6 +9,7 @@ package floo_test_pkg;
 
 
   typedef enum {
+    IdealSlave,
     FastSlave,
     SlowSlave,
     MixedSlave
diff --git a/util/gen_jobs.py b/util/gen_jobs.py
index 4a2a669c..f3999ab6 100755
--- a/util/gen_jobs.py
+++ b/util/gen_jobs.py
@@ -17,6 +17,8 @@
 
 data_widths = {"wide": 512, "narrow": 64}
 
+random.seed(42)
+
 
 def clog2(x: int):
     """Compute the ceiling of the log2 of x."""
@@ -146,11 +148,13 @@ def gen_mesh_traffic(
                 # Tile x=0 are the HBM channels
                 # Each core read from the channel of its y coordinate
                 ext_addr = get_hbm_base_addr(y)
-            elif traffic_type == "random":
+                accesses = [(ext_addr, rw, wide_length)]
+            elif traffic_type == "uniform":
                 ext_addr = local_addr
                 while ext_addr == local_addr:
                     ext_addr = get_xy_base_addr(random.randint(0, NUM_X-1),
                                                 random.randint(0, NUM_Y-1))
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "onehop":
                 if not (x == 0 and y == 0):
                     wide_length = 0
@@ -159,9 +163,11 @@ def gen_mesh_traffic(
                     ext_addr = 0
                 else:
                     ext_addr = get_xy_base_addr(x, y + 1)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "bit_complement":
                 ext_addr = get_xy_base_addr(NUM_X - x - 1,
                                             NUM_Y - y - 1)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "bit_reverse":
                 # in order to achieve same result as garnet:
                 # change to space where addresses start at 0 and return afterwards
@@ -174,6 +180,7 @@ def gen_mesh_traffic(
                     straight >>= 1
                     reverse |= (straight & 1)  # LSB
                 ext_addr = get_xy_base_addr(reverse % NUM_X, reverse // NUM_X)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "bit_rotation":
                 source = x * NUM_Y + y
                 num_destinations = NUM_X * NUM_Y
@@ -182,8 +189,10 @@ def gen_mesh_traffic(
                 else:  # (source % 2 == 1)
                     ext = (source // 2) + (num_destinations // 2)
                 ext_addr = get_xy_base_addr(ext % NUM_X, ext // NUM_X)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "neighbor":
                 ext_addr = get_xy_base_addr((x + 1) % NUM_X, y)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "shuffle":
                 source = x * NUM_Y + y
                 num_destinations = NUM_X * NUM_Y
@@ -191,25 +200,52 @@ def gen_mesh_traffic(
                     ext = source * 2
                 else:
                     ext = (source * 2) - num_destinations + 1
                 ext_addr = get_xy_base_addr(ext % NUM_X, ext // NUM_X)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "transpose":
-                dest_x = y
-                dest_y = x
+                if NUM_X == NUM_Y:
+                    dest_x = y
+                    dest_y = x
+                elif NUM_Y > NUM_X:
+                    assert NUM_Y % NUM_X == 0, "NUM_Y must be divisible by NUM_X"
+                    dest_x = y - (y // NUM_X) * NUM_X
+                    dest_y = x + (y // NUM_X) * NUM_X
+                else:
+                    assert NUM_X % NUM_Y == 0, "NUM_X must be divisible by NUM_Y"
+                    dest_x = y + (x // NUM_Y) * NUM_Y
+                    dest_y = x - (x // NUM_Y) * NUM_Y
                 ext_addr = get_xy_base_addr(dest_x, dest_y)
+                accesses = [(ext_addr, rw, wide_length)]
             elif traffic_type == "tornado":
                 dest_x = (x + math.ceil(NUM_X / 2) - 1) % NUM_X
                 ext_addr = get_xy_base_addr(dest_x, y)
-            elif traffic_type == "single_dest_boundary":
+                accesses = [(ext_addr, rw, wide_length)]
+            elif traffic_type == "hotspot_boundary":
                 ext_addr = get_hbm_base_addr(NUM_Y//2)
-            elif traffic_type == "single_dest_center":
+                accesses = [(ext_addr, rw, wide_length)]
+            elif traffic_type == "hotspot":
                 ext_addr = get_xy_base_addr(NUM_X//2, NUM_Y//2)
+                accesses = [(ext_addr, rw, wide_length)]
+            elif traffic_type == "matmul":
+                # access matrix A from HBM
+                accesses = [(get_hbm_base_addr(y), "read", wide_length//2)]
+                # access matrix B from HBM
+                for i in range(NUM_Y):
+                    hbm_addr = get_hbm_base_addr((y + i) % NUM_Y)
+                    accesses += [(hbm_addr, "read", (wide_length//2)//NUM_Y)]
+                # Writeback of matrix C to HBM
+                accesses += [(get_hbm_base_addr(y), "write", wide_length//4)]
             else:
                 raise ValueError(f"Unknown traffic type: {traffic_type}")
-            src_addr = ext_addr if rw == "read" else local_addr
-            dst_addr = local_addr if rw == "read" else ext_addr
             for _ in range(num_wide_bursts):
-                wide_jobs += gen_job_str(wide_length, src_addr, dst_addr)
+                for access in accesses:
+                    src_addr = access[0] if access[1] == "read" else local_addr
+                    dst_addr = local_addr if access[1] == "read" else access[0]
+                    wide_jobs += gen_job_str(access[2], src_addr, dst_addr)
             for _ in range(num_narrow_bursts):
-                narrow_jobs += gen_job_str(narrow_length, src_addr, dst_addr)
+                for access in accesses:
+                    src_addr = access[0] if access[1] == "read" else local_addr
+                    dst_addr = local_addr if access[1] == "read" else access[0]
+                    narrow_jobs += gen_job_str(narrow_length, src_addr, dst_addr)
             emit_jobs(wide_jobs, out_dir, "mesh", x * NUM_Y + y)
             emit_jobs(narrow_jobs, out_dir, "mesh", x * NUM_Y + y + 100)