From 7fcf5863ebee1f63c7e36819480dd743ec1d747c Mon Sep 17 00:00:00 2001 From: Yichao Zhang Date: Fri, 8 Dec 2023 18:26:28 +0100 Subject: [PATCH] [TeraPool] Rebase on the main branch; Update the variable control style in Makefile and config files --- config/config.mk | 7 ++++ config/terapool.mk | 7 +++- hardware/Makefile | 16 ++++---- hardware/src/mempool_pkg.sv | 2 +- hardware/tb/mempool_tb.sv | 73 +++++++++++++++++++++++++++++-------- 5 files changed, 78 insertions(+), 27 deletions(-) diff --git a/config/config.mk b/config/config.mk index 0691071c8..6561824f5 100644 --- a/config/config.mk +++ b/config/config.mk @@ -68,3 +68,10 @@ xqueue_size ?= 0 # Enable the XpulpIMG extension xpulpimg ?= 1 + +# This parameter is only used for TeraPool configurations +num_sub_groups_per_group ?= 1 +remote_group_latency_cycles ?= 7 + +# Makefile RTL Filtering Control +subgroup_rtl ?= 0 diff --git a/config/terapool.mk b/config/terapool.mk index 631fb6d83..4a42225e6 100644 --- a/config/terapool.mk +++ b/config/terapool.mk @@ -31,13 +31,16 @@ banking_factor ?= 4 remote_group_latency_cycles ?= 7 # Radix for hierarchical AXI interconnect -axi_hier_radix ?= 10 +axi_hier_radix ?= 9 # Number of AXI masters per group axi_masters_per_group ?= 4 # Number of DMA backends in each group -dmas_per_group ?= 8 +dmas_per_group ?= 4 # L2 Banks/Channels l2_banks = 16 + +# Makefile RTL Filtering Control +subgroup_rtl = 1 diff --git a/hardware/Makefile b/hardware/Makefile index 83cd4077d..1fa5254af 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -87,6 +87,7 @@ endif vlog_args += -suppress vlog-2583 -suppress vlog-13314 -suppress vlog-13233 vlog_args += -work $(library) # Defines +vlog_defs += -D$(config) vlog_defs += -DNUM_CORES=$(num_cores) -DNUM_CORES_PER_TILE=$(num_cores_per_tile) -DNUM_GROUPS=$(num_groups) -DBANKING_FACTOR=$(banking_factor) vlog_defs += -DL2_BASE=32\'d$(l2_base) -DL2_SIZE=32\'d$(l2_size) -DL2_BANKS=$(l2_banks) vlog_defs += -DL1_BANK_SIZE=$(l1_bank_size) @@ -97,14 +98,8 @@ vlog_defs += -DRO_LINE_WIDTH=$(ro_line_width) vlog_defs += -DDMAS_PER_GROUP=$(dmas_per_group) vlog_defs += -DAXI_HIER_RADIX=$(axi_hier_radix) -DAXI_MASTERS_PER_GROUP=$(axi_masters_per_group) vlog_defs += -DSEQ_MEM_SIZE=$(seq_mem_size) -DXQUEUE_SIZE=$(xqueue_size) -ifdef terapool - subgroup_rtl = 1 - vlog_defs += -DTERAPOOL=$(terapool) - vlog_defs += -DNUM_SUB_GROUPS_PER_GROUP=$(num_sub_groups_per_group) - vlog_defs += -DREMOTE_GROUP_LATENCY_CYCLES=$(remote_group_latency_cycles) -else - subgroup_rtl = 0 -endif +# This parameter is only used for TeraPool configurations +vlog_defs += -DNUM_SUB_GROUPS_PER_GROUP=$(num_sub_groups_per_group) -DREMOTE_GROUP_LATENCY_CYCLES=$(remote_group_latency_cycles) # Traffic generation enabled ifdef tg @@ -150,13 +145,14 @@ compile: dpi lib $(buildpath) $(buildpath)/compile.tcl update_opcodes $(buildpath)/compile.tcl: $(bender) $(config_mk) Makefile $(MEMPOOL_DIR)/Bender.yml $(shell find {src,tb,deps} -type f) $(bender) script vsim --vlog-arg="$(vlog_args)" $(vlog_defs) -t rtl -t mempool_vsim > $(buildpath)/compile.tcl echo "exit" >> $(buildpath)/compile.tcl + if [ "${subgroup_rtl}" -eq "0" ]; then awk '!/mempool_sub_group/' $(buildpath)/compile.tcl > tmpfile && mv tmpfile $(buildpath)/compile.tcl; fi cd $(buildpath) && $(questa_cmd) vsim -work $(library) -c -do compile.tcl # Simulation .PHONY: sim sim: clean-dasm compile cd $(buildpath) && \ - $(questa_cmd) vsim -voptargs=+acc $(questa_args) $(library).$(top_level) -do ../scripts/questa/run.tcl + $(questa_cmd) vsim -voptargs=+acc $(questa_args) $(library).$(top_level) -do "set config ${config}" -do ../scripts/questa/run.tcl ./scripts/return_status.sh $(buildpath)/transcript .PHONY: simc @@ -187,6 +183,7 @@ elabvcs: dpivcs $(buildpath) $(buildpath)/compilevcs.sh update_opcodes $(buildpath)/compilevcs.sh: $(bender) $(config_mk) Makefile $(MEMPOOL_DIR)/Bender.yml $(shell find {src,tb,deps} -type f) $(bender) script vcs --vlogan-bin="$(vcs_cmd) vlogan" --vlog-arg="$(vlogan_args)" $(vlog_defs) -t rtl -t mempool_vsim > $(buildpath)/compilevcs.sh echo "exit" >> $(buildpath)/compilevcs.sh + if [ "${subgroup_rtl}" -eq "0" ]; then awk '!/mempool_sub_group/' $(buildpath)/compilevcs.sh > tmpfile && mv tmpfile $(buildpath)/compilevcs.sh; fi # Call VCS cd $(buildpath) && \ chmod +x compilevcs.sh && \ @@ -260,6 +257,7 @@ $(VERILATOR_MK): $(VERILATOR_CONF) $(VERILATOR_WAIVE) $(MEMPOOL_DIR)/Bender.yml $(eval boot_addr=$(l2_base)) # Create Bender script of all RTL files $(bender) script verilator $(vlog_defs) -t rtl -t mempool_verilator > $(verilator_files) + if [ "${subgroup_rtl}" -eq "0" ]; then awk '!/mempool_sub_group/' $(verilator_files) > tmpfile && mv tmpfile $(verilator_files); fi # Append the verilator library files @echo '' >> $(verilator_files) # Append the verilator library files: Includes diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 69589124b..38bfea217 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -301,7 +301,7 @@ package mempool_pkg; localparam integer unsigned NumBanksPerSubGroup = NumBanksPerGroup / NumSubGroupsPerGroup; // TeraPool Remote Groups Latency Control (in Cycles) - localparam integer unsigned RemoteGroupLatencyCycle = `ifdef REMOTE_GROUP_LATENCY_CYCLES `REMOTE_GROUP_LATENCY_CYCLES `else 9 `endif; + localparam integer unsigned RemoteGroupLatencyCycle = `ifdef REMOTE_GROUP_LATENCY_CYCLES `REMOTE_GROUP_LATENCY_CYCLES `else 7 `endif; //TeraPool AXI/DMA Config localparam integer unsigned NumAXIMastersPerSubGroup = NumAXIMastersPerGroup/NumSubGroupsPerGroup; diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv index 3261ff3dc..ad4e4f2a7 100644 --- a/hardware/tb/mempool_tb.sv +++ b/hardware/tb/mempool_tb.sv @@ -350,6 +350,7 @@ module mempool_tb; **************************************/ `ifndef TARGET_SYNTHESIS `ifndef TARGET_VERILATOR +`ifndef POSTLAYOUT // Cores logic [NumCores-1:0] instruction_handshake, lsu_request, lsu_handshake; @@ -357,29 +358,68 @@ module mempool_tb; assign snitch_utilization = $countones(instruction_handshake); assign lsu_utilization = $countones(lsu_handshake); assign lsu_pressure = $countones(lsu_request); - for (genvar g = 0; g < NumGroups; g++) begin - for (genvar t = 0; t < NumTilesPerGroup; t++) begin - for (genvar c = 0; c < NumCoresPerTile; c++) begin - logic valid_instr, stall; - logic lsu_valid, lsu_ready; - // Snitch - assign valid_instr = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.valid_instr; - assign stall = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.stall; - assign instruction_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = valid_instr & !stall; - // Interconnect - assign lsu_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qvalid_o; - assign lsu_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qready_i; - assign lsu_request[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & !lsu_ready; - assign lsu_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & lsu_ready; + `ifdef TERAPOOL + for (genvar g = 0; g < NumGroups; g++) begin + for (genvar sg = 0; sg < NumSubGroupsPerGroup; sg++) begin + for (genvar t = 0; t < NumTilesPerSubGroup; t++) begin + for (genvar c = 0; c < NumCoresPerTile; c++) begin + logic valid_instr, stall; + logic lsu_valid, lsu_ready; + // Snitch + assign valid_instr = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.valid_instr; + assign stall = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.stall; + assign instruction_handshake[g*NumSubGroupsPerGroup*NumTilesPerSubGroup*NumCoresPerTile + sg*NumTilesPerSubGroup*NumCoresPerTile + t*NumCoresPerTile + c] = valid_instr & !stall; + // Interconnect + assign lsu_valid = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qvalid_o; + assign lsu_ready = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qready_i; + assign lsu_request[g*NumSubGroupsPerGroup*NumTilesPerSubGroup*NumCoresPerTile + sg*NumTilesPerSubGroup*NumCoresPerTile + t*NumCoresPerTile + c] = lsu_valid & !lsu_ready; + assign lsu_handshake[g*NumSubGroupsPerGroup*NumTilesPerSubGroup*NumCoresPerTile + sg*NumTilesPerSubGroup*NumCoresPerTile + t*NumCoresPerTile + c] = lsu_valid & lsu_ready; + end + end end end - end + `else + for (genvar g = 0; g < NumGroups; g++) begin + for (genvar t = 0; t < NumTilesPerGroup; t++) begin + for (genvar c = 0; c < NumCoresPerTile; c++) begin + logic valid_instr, stall; + logic lsu_valid, lsu_ready; + // Snitch + assign valid_instr = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.valid_instr; + assign stall = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.stall; + assign instruction_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = valid_instr & !stall; + // Interconnect + assign lsu_valid = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qvalid_o; + assign lsu_ready = dut.i_mempool_cluster.gen_groups[g].i_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch.data_qready_i; + assign lsu_request[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & !lsu_ready; + assign lsu_handshake[g*NumTilesPerGroup*NumCoresPerTile+t*NumCoresPerTile+c] = lsu_valid & lsu_ready; + end + end + end + `endif + // DSPU if (snitch_pkg::XPULPIMG) begin: gen_utilization logic [NumCores-1:0] dspu_handshake, dspu_mac; int unsigned dspu_utilization, mac_utilization; assign dspu_utilization = $countones(dspu_handshake); assign mac_utilization = $countones(dspu_mac); + `ifdef TERAPOOL + for (genvar g = 0; g < NumGroups; g++) begin + for (genvar sg = 0; sg < NumSubGroupsPerGroup; sg++) begin + for (genvar t = 0; t < NumTilesPerSubGroup; t++) begin + for (genvar c = 0; c < NumCoresPerTile; c++) begin + logic dsp_valid, dsp_ready, mac; + assign dsp_valid = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_valid_i; + assign dsp_ready = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.in_ready_o; + assign mac = dut.i_mempool_cluster.gen_groups[g].gen_rtl_group.i_group.gen_sub_groups[sg].gen_rtl_sg.i_sub_group.gen_tiles[t].i_tile.gen_cores[c].gen_mempool_cc.riscv_core.i_snitch_ipu.gen_xpulpimg.i_dspu.operator_i ==? riscv_instr::P_MAC; + assign dspu_handshake[g*NumSubGroupsPerGroup*NumTilesPerSubGroup*NumCoresPerTile + sg*NumTilesPerSubGroup*NumCoresPerTile + t*NumCoresPerTile + c] = dsp_valid & dsp_ready; + assign dspu_mac[g*NumSubGroupsPerGroup*NumTilesPerSubGroup*NumCoresPerTile + sg*NumTilesPerSubGroup*NumCoresPerTile + t*NumCoresPerTile + c] = dsp_valid & dsp_ready & mac; + end + end + end + end + `else for (genvar g = 0; g < NumGroups; g++) begin for (genvar t = 0; t < NumTilesPerGroup; t++) begin for (genvar c = 0; c < NumCoresPerTile; c++) begin @@ -392,7 +432,9 @@ module mempool_tb; end end end + `endif end + // AXI logic [NumGroups*NumAXIMastersPerGroup-1:0] w_valid, w_ready, r_ready, r_valid; int unsigned axi_w_utilization, axi_r_utilization; @@ -407,5 +449,6 @@ module mempool_tb; `endif `endif +`endif endmodule : mempool_tb