diff --git a/hw/ip/spatz/src/spatz_vlsu.sv b/hw/ip/spatz/src/spatz_vlsu.sv
index 07d20131..38cf788c 100644
--- a/hw/ip/spatz/src/spatz_vlsu.sv
+++ b/hw/ip/spatz/src/spatz_vlsu.sv
@@ -35,12 +35,12 @@ module spatz_vlsu
   output logic      [NrInterfaces-1:0] vrf_we_o,
   output vrf_be_t   [NrInterfaces-1:0] vrf_wbe_o,
   input  logic      [NrInterfaces-1:0] vrf_wvalid_i,
-
-  output spatz_id_t [2:0] vrf_id_o,
-  output vrf_addr_t [1:0] vrf_raddr_o,
-  output logic      [1:0] vrf_re_o,
-  input  vrf_data_t [1:0] vrf_rdata_i,
-  input  logic      [1:0] vrf_rvalid_i,
+
+  output spatz_id_t [(3*NrInterfaces)-1:0] vrf_id_o,
+  output vrf_addr_t [(2*NrInterfaces)-1:0] vrf_raddr_o,
+  output logic      [(2*NrInterfaces)-1:0] vrf_re_o,
+  input  vrf_data_t [(2*NrInterfaces)-1:0] vrf_rdata_i,
+  input  logic      [(2*NrInterfaces)-1:0] vrf_rvalid_i,
   // Memory Request
   output spatz_mem_req_t [NrMemPorts-1:0] spatz_mem_req_o,
   output logic           [NrMemPorts-1:0] spatz_mem_req_valid_o,
@@ -168,72 +168,79 @@ module spatz_vlsu
   //  Reorder Buffer  //
   //////////////////////
 
+  // On the VRF side, each interface works independently because of the VRF structure
+
   typedef logic [int'(MAXEW)-1:0] addr_offset_t;
 
-  elen_t [NrMemPorts-1:0] rob_wdata;
-  id_t   [NrMemPorts-1:0] rob_wid;
-  logic  [NrMemPorts-1:0] rob_push;
-  logic  [NrMemPorts-1:0] rob_rvalid;
-  elen_t [NrMemPorts-1:0] rob_rdata;
-  logic  [NrMemPorts-1:0] rob_pop;
-  id_t   [NrMemPorts-1:0] rob_rid;
-  logic  [NrMemPorts-1:0] rob_req_id;
-  id_t   [NrMemPorts-1:0] rob_id;
-  logic  [NrMemPorts-1:0] rob_full;
-  logic  [NrMemPorts-1:0] rob_empty;
+  elen_t [NrInterfaces-1:0] [N_FU-1:0] rob_wdata;
+  id_t   [NrInterfaces-1:0] [N_FU-1:0] rob_wid;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_push;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_rvalid;
+  elen_t [NrInterfaces-1:0] [N_FU-1:0] rob_rdata;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_pop;
+  id_t   [NrInterfaces-1:0] [N_FU-1:0] rob_rid;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_req_id;
+  id_t   [NrInterfaces-1:0] [N_FU-1:0] rob_id;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_full;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] rob_empty;
 
   // The reorder buffer decouples the memory side from the register file side.
   // All elements from one side to the other go through it.
-  for (genvar port = 0; port < NrMemPorts; port++) begin : gen_rob
+  // Each interface works independently from the others.
+  for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_rob_intf
+    for (genvar fu = 0; fu < N_FU; fu++) begin : gen_rob_intf_fu
 `ifdef MEMPOOL_SPATZ
-    reorder_buffer #(
-      .DataWidth(ELEN              ),
-      .NumWords (NrOutstandingLoads)
-    ) i_reorder_buffer (
-      .clk_i    (clk_i           ),
-      .rst_ni   (rst_ni          ),
-      .data_i   (rob_wdata[port] ),
-      .id_i     (rob_wid[port]   ),
-      .push_i   (rob_push[port]  ),
-      .data_o   (rob_rdata[port] ),
-      .valid_o  (rob_rvalid[port]),
-      .id_read_o(rob_rid[port]   ),
-      .pop_i    (rob_pop[port]   ),
-      .id_req_i (rob_req_id[port]),
-      .id_o     (rob_id[port]    ),
-      .full_o   (rob_full[port]  ),
-      .empty_o  (rob_empty[port] )
-    );
+      reorder_buffer #(
+        .DataWidth(ELEN              ),
+        .NumWords (NrOutstandingLoads)
+      ) i_reorder_buffer (
+        .clk_i    (clk_i               ),
+        .rst_ni   (rst_ni              ),
+        .data_i   (rob_wdata[intf][fu] ),
+        .id_i     (rob_wid[intf][fu]   ),
+        .push_i   (rob_push[intf][fu]  ),
+        .data_o   (rob_rdata[intf][fu] ),
+        .valid_o  (rob_rvalid[intf][fu]),
+        .id_read_o(rob_rid[intf][fu]   ),
+        .pop_i    (rob_pop[intf][fu]   ),
+        .id_req_i (rob_req_id[intf][fu]),
+        .id_o     (rob_id[intf][fu]    ),
+        .full_o   (rob_full[intf][fu]  ),
+        .empty_o  (rob_empty[intf][fu] )
+      );
 `else
-    fifo_v3 #(
-      .DATA_WIDTH(ELEN              ),
-      .DEPTH     (NrOutstandingLoads)
-    ) i_reorder_buffer (
-      .clk_i     (clk_i           ),
-      .rst_ni    (rst_ni          ),
-      .flush_i   (1'b0            ),
-      .testmode_i(1'b0            ),
-      .data_i    (rob_wdata[port] ),
-      .push_i    (rob_push[port]  ),
-      .data_o    (rob_rdata[port] ),
-      .pop_i     (rob_pop[port]   ),
-      .full_o    (rob_full[port]  ),
-      .empty_o   (rob_empty[port] ),
-      .usage_o   (/* Unused */    )
-    );
-    assign rob_rvalid[port] = !rob_empty[port];
+      fifo_v3 #(
+        .DATA_WIDTH(ELEN              ),
+        .DEPTH     (NrOutstandingLoads)
+      ) i_reorder_buffer (
+        .clk_i     (clk_i               ),
+        .rst_ni    (rst_ni              ),
+        .flush_i   (1'b0                ),
+        .testmode_i(1'b0                ),
+        .data_i    (rob_wdata[intf][fu] ),
+        .push_i    (rob_push[intf][fu]  ),
+        .data_o    (rob_rdata[intf][fu] ),
+        .pop_i     (rob_pop[intf][fu]   ),
+        .full_o    (rob_full[intf][fu]  ),
+        .empty_o   (rob_empty[intf][fu] ),
+        .usage_o   (/* Unused */        )
+      );
+      assign rob_rvalid[intf][fu] = !rob_empty[intf][fu];
 `endif
-  end: gen_rob
+    end: gen_rob_intf_fu
+  end: gen_rob_intf
 
   //////////////////////
   //  Memory request  //
   //////////////////////
 
+  // On the memory side, the interface boundary is transparent
+
   // Is the memory operation valid and are we at the last one?
   logic [NrMemPorts-1:0] mem_operation_valid;
   logic [NrMemPorts-1:0] mem_operation_last;
 
-  // For each memory port we count how many elements we have already loaded/stored.
+  // For each memory port we count how many bytes we have already loaded/stored (VLSU <-> MEM).
   // Multiple counters are needed all memory ports can work independent of each other.
   vlen_t [NrMemPorts-1:0] mem_counter_max;
   logic  [NrMemPorts-1:0] mem_counter_en;
@@ -375,37 +382,40 @@ module spatz_vlsu
     end
   end
 
-  // For each FU that we have, count how many elements we have already loaded/stored.
-  // Multiple counters are necessary for the case where not every single FU will
+  // For each memory port that we have, count how many elements we have already loaded/stored (VRF <-> VLSU).
+  // Multiple counters are necessary for the case where not every single port will
   // receive the same number of elements to work through.
-  vlen_t [N_FU*NrMemPorts-1:0] commit_counter_max;
-  logic  [N_FU*NrMemPorts-1:0] commit_counter_en;
-  logic  [N_FU*NrMemPorts-1:0] commit_counter_load;
-  vlen_t [N_FU*NrMemPorts-1:0] commit_counter_delta;
-  vlen_t [N_FU*NrMemPorts-1:0] commit_counter_d;
-  vlen_t [N_FU*NrMemPorts-1:0] commit_counter_q;
-  logic  [N_FU*NrMemPorts-1:0] commit_finished_q;
-  logic  [N_FU*NrMemPorts-1:0] commit_finished_d;
-
-  for (genvar port = 0; port < N_FU*NrMemPorts; port++) begin: gen_vreg_counters
-    delta_counter #(
-      .WIDTH($bits(vlen_t))
-    ) i_delta_counter_vreg (
-      .clk_i     (clk_i                     ),
-      .rst_ni    (rst_ni                    ),
-      .clear_i   (1'b0                      ),
-      .en_i      (commit_counter_en[port]   ),
-      .load_i    (commit_counter_load[port] ),
-      .down_i    (1'b0                      ), // We always count up
-      .delta_i   (commit_counter_delta[port]),
-      .d_i       (commit_counter_d[port]    ),
-      .q_o       (commit_counter_q[port]    ),
-      .overflow_o(/* Unused */              )
-    );
-
-    assign commit_finished_q[port] = commit_insn_valid && (commit_counter_q[port] == commit_counter_max[port]);
-    assign commit_finished_d[port] = commit_insn_valid && ((commit_counter_q[port] + commit_counter_delta[port]) == commit_counter_max[port]);
-  end: gen_vreg_counters
+  vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_max;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] commit_counter_en;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] commit_counter_load;
+  vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_delta;
+  vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_d;
+  vlen_t [NrInterfaces-1:0] [N_FU-1:0] commit_counter_q;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] commit_finished_q;
+  logic  [NrInterfaces-1:0] [N_FU-1:0] commit_finished_d;
+
+  for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_vreg_counters_intf
+    for (genvar fu = 0; fu < N_FU; fu++) begin : gen_vreg_counters_intf_fu
+      delta_counter #(
+        .WIDTH($bits(vlen_t))
+      ) i_delta_counter_vreg (
+        .clk_i     (clk_i                         ),
+        .rst_ni    (rst_ni                        ),
+        .clear_i   (1'b0                          ),
+        .en_i      (commit_counter_en[intf][fu]   ),
+        .load_i    (commit_counter_load[intf][fu] ),
+        .down_i    (1'b0                          ), // We always count up
+        .delta_i   (commit_counter_delta[intf][fu]),
+        .d_i       (commit_counter_d[intf][fu]    ),
+        .q_o       (commit_counter_q[intf][fu]    ),
+        .overflow_o(/* Unused */                  )
+      );
+
+      assign commit_finished_q[intf][fu] = commit_insn_valid && (commit_counter_q[intf][fu] == commit_counter_max[intf][fu]);
+      assign commit_finished_d[intf][fu] = commit_insn_valid &&
+                                           ((commit_counter_q[intf][fu] + commit_counter_delta[intf][fu]) == commit_counter_max[intf][fu]);
+    end: gen_vreg_counters_intf_fu
+  end: gen_vreg_counters_intf
 
   ////////////////////////
   // Address Generation //
@@ -413,8 +423,8 @@ module spatz_vlsu
 
   elen_t [NrMemPorts-1:0] mem_req_addr;
 
-  vrf_addr_t vd_vreg_addr;
-  vrf_addr_t vs2_vreg_addr;
+  vrf_addr_t [NrInterfaces-1:0] vd_vreg_addr;
+  vrf_addr_t [NrInterfaces-1:0] vs2_vreg_addr;
 
   // Current element index and byte index that are being accessed at the register file
   vreg_elem_t vd_elem_id;
@@ -464,10 +474,12 @@ module spatz_vlsu
     end
   end: gen_mem_req_addr
 
-  // Calculate the register file address
+  // Calculate the register file addresses (consecutive words are interleaved across the interfaces)
   always_comb begin : gen_vreg_addr
-    vd_vreg_addr  = (commit_insn_q.vd << $clog2(NrWordsPerVector)) + $unsigned(vd_elem_id);
-    vs2_vreg_addr = (mem_spatz_req.vs2 << $clog2(NrWordsPerVector)) + $unsigned(vs2_elem_id_q);
+    for (int intf = 0; intf < NrInterfaces; intf++) begin : gen_vreg_addr_intf // procedural loop, so a genvar cannot be used here
+      vd_vreg_addr[intf]  = (commit_insn_q.vd << $clog2(NrWordsPerVector)) + ($unsigned(vd_elem_id) << $clog2(NrInterfaces)) + $unsigned(intf);
+      vs2_vreg_addr[intf] = (mem_spatz_req.vs2 << $clog2(NrWordsPerVector)) + ($unsigned(vs2_elem_id_q) << $clog2(NrInterfaces)) + $unsigned(intf);
+    end
   end
 
   ///////////////
@@ -510,8 +522,8 @@ module spatz_vlsu
   end: control_proc
 
   // Is the VRF operation valid and are we at the last one?
-  logic [N_FU*NrMemPorts-1:0] commit_operation_valid;
-  logic [N_FU*NrMemPorts-1:0] commit_operation_last;
+  logic [NrMemPorts-1:0] commit_operation_valid;
+  logic [NrMemPorts-1:0] commit_operation_last;
 
   // Is instruction a load?
   logic mem_is_load;
@@ -560,8 +572,8 @@ module spatz_vlsu
 
   // Store the offsets of all loads, for realigning
   addr_offset_t [N_FU*NrMemPorts-1:0] vreg_addr_offset;
-  logic [N_FU*NrMemPorts-1:0] offset_queue_full;
-  for (genvar port = 0; port < N_FU*NrMemPorts; port++) begin : gen_offset_queue
+  logic [NrMemPorts-1:0] offset_queue_full;
+  for (genvar port = 0; port < NrMemPorts; port++) begin : gen_offset_queue
     fifo_v3 #(
       .DATA_WIDTH(int'(MAXEW)       ),
       .DEPTH     (NrOutstandingLoads)
@@ -593,29 +605,32 @@ module spatz_vlsu
     logic rsp_valid;
   } vrf_req_t;
 
-  vrf_req_t vrf_req_d, vrf_req_q;
-  logic     vrf_req_valid_d, vrf_req_ready_d;
-  logic     vrf_req_valid_q, vrf_req_ready_q;
+  vrf_req_t [NrInterfaces-1:0] vrf_req_d, vrf_req_q;
+  logic     [NrInterfaces-1:0] vrf_req_valid_d, vrf_req_ready_d;
+  logic     [NrInterfaces-1:0] vrf_req_valid_q, vrf_req_ready_q;
 
-  spill_register #(
-    .T(vrf_req_t)
-  ) i_vrf_req_register (
-    .clk_i  (clk_i          ),
-    .rst_ni (rst_ni         ),
-    .data_i (vrf_req_d      ),
-    .valid_i(vrf_req_valid_d),
-    .ready_o(vrf_req_ready_d),
-    .data_o (vrf_req_q      ),
-    .valid_o(vrf_req_valid_q),
-    .ready_i(vrf_req_ready_q)
-  );
+  for (genvar intf = 0; intf < NrInterfaces; intf++) begin : gen_vrf_req_register_intf
+    spill_register #(
+      .T(vrf_req_t)
+    ) i_vrf_req_register (
+      .clk_i  (clk_i                ),
+      .rst_ni (rst_ni               ),
+      .data_i (vrf_req_d[intf]      ),
+      .valid_i(vrf_req_valid_d[intf]),
+      .ready_o(vrf_req_ready_d[intf]),
+      .data_o (vrf_req_q[intf]      ),
+      .valid_o(vrf_req_valid_q[intf]),
+      .ready_i(vrf_req_ready_q[intf])
+    );
+
+    assign vrf_waddr_o[intf]        = vrf_req_q[intf].waddr;
+    assign vrf_wdata_o[intf]        = vrf_req_q[intf].wdata;
+    assign vrf_wbe_o[intf]          = vrf_req_q[intf].wbe;
+    assign vrf_we_o[intf]           = vrf_req_valid_q[intf];
+    assign vrf_id_o[3*intf +: 3]    = {vrf_req_q[intf].rsp.id, mem_spatz_req.id, commit_insn_q.id}; // assumes three consecutive IDs per interface - TODO confirm against the VRF port map
+    assign vrf_req_ready_q[intf]    = vrf_wvalid_i[intf];
+  end
 
-  assign vrf_waddr_o     = vrf_req_q.waddr;
-  assign vrf_wdata_o     = vrf_req_q.wdata;
-  assign vrf_wbe_o       = vrf_req_q.wbe;
-  assign vrf_we_o        = vrf_req_valid_q;
-  assign vrf_id_o        = {vrf_req_q.rsp.id, mem_spatz_req.id, commit_insn_q.id};
-  assign vrf_req_ready_q = vrf_wvalid_i;
 
   // Ack when the vector store finishes, or when the vector load commits to the VRF
   assign vlsu_rsp_o = vrf_req_q.rsp_valid && vrf_req_valid_q ? vrf_req_q.rsp : '{id: commit_insn_q.id, default: '0};
@@ -634,18 +649,18 @@ module spatz_vlsu
   end: gen_catchup
 
   for (genvar fu = 0; fu < N_FU*NrMemPorts; fu++) begin: gen_vreg_counter_proc
-    // The total amount of elements we have to work through
-    vlen_t max_elements;
+    // The total amount of vector bytes we have to work through
+    vlen_t max_bytes;
 
     always_comb begin
       // Default value
-      max_elements = (commit_insn_q.vl >> $clog2(N_FU*NrMemPorts*ELENB)) << $clog2(ELENB);
+      max_bytes = (commit_insn_q.vl >> $clog2(N_FU*NrMemPorts*ELENB)) << $clog2(ELENB);
 
       // Full transfer
       if (commit_insn_q.vl[$clog2(ELENB) +: $clog2(N_FU*NrMemPorts)] > fu)
-        max_elements += ELENB;
+        max_bytes += ELENB;
       else if (commit_insn_q.vl[$clog2(N_FU*NrMemPorts*ELENB)-1:$clog2(ELENB)] == fu)
-        max_elements += commit_insn_q.vl[$clog2(ELENB)-1:0];
+        max_bytes += commit_insn_q.vl[$clog2(ELENB)-1:0];
 
       commit_counter_load[port] = commit_insn_pop;
       commit_counter_d[port]    = (commit_insn_q.vstart >> $clog2(N_FU*NrMemPorts*ELENB)) << $clog2(ELENB);
@@ -653,34 +668,34 @@ module spatz_vlsu
         commit_counter_d[port] += ELENB;
       else if (commit_insn_q.vstart[idx_width(N_FU*NrMemPorts*ELENB)-1:$clog2(ELENB)] == fu)
         commit_counter_d[port] += commit_insn_q.vstart[$clog2(ELENB)-1:0];
-      commit_operation_valid[port] = commit_insn_valid && (commit_counter_q[port] != max_elements) && (catchup[port] || (!catchup[port] && ~|catchup));
-      commit_operation_last[port]  = commit_operation_valid[port] && ((max_elements - commit_counter_q[port]) <= (commit_is_single_element_operation ? commit_single_element_size : ELENB));
-      commit_counter_delta[port]   = !commit_operation_valid[port] ? vlen_t'('d0) : commit_is_single_element_operation ? vlen_t'(commit_single_element_size) : commit_operation_last[port] ? (max_elements - commit_counter_q[port]) : vlen_t'(ELENB);
-      commit_counter_en[port]      = commit_operation_valid[port] && (commit_insn_q.is_load && vrf_req_valid_d && vrf_req_ready_d) || (!commit_insn_q.is_load && vrf_rvalid_i[0] && vrf_re_o[0] && (!mem_is_indexed || vrf_rvalid_i[1]));
-      commit_counter_max[port]     = max_elements;
+      commit_operation_valid[port] = commit_insn_valid && (commit_counter_q[port] != max_bytes) && (catchup[port] || (!catchup[port] && ~|catchup));
+      commit_operation_last[port]  = commit_operation_valid[port] && ((max_bytes - commit_counter_q[port]) <= (commit_is_single_element_operation ? commit_single_element_size : ELENB));
+      commit_counter_delta[port]   = !commit_operation_valid[port] ? vlen_t'('d0) : commit_is_single_element_operation ? vlen_t'(commit_single_element_size) : commit_operation_last[port] ? (max_bytes - commit_counter_q[port]) : vlen_t'(ELENB);
+      commit_counter_en[intf][fu]  = commit_operation_valid[port] && (commit_insn_q.is_load && vrf_req_valid_d && vrf_req_ready_d) || (!commit_insn_q.is_load && vrf_rvalid_i[0] && vrf_re_o[0] && (!mem_is_indexed || vrf_rvalid_i[1])); // NOTE(review): no declaration of 'intf' is visible for this generate scope and the sibling assignments index by [port] - confirm the intended index
+      commit_counter_max[port]     = max_bytes;
     end
   end
 
   assign vd_elem_id = (commit_counter_q[0] > vreg_start_0) ? commit_counter_q[0] >> $clog2(ELENB) : commit_counter_q[N_FU*NrMemPorts-1] >> $clog2(ELENB);
 
   for (genvar port = 0; port < NrMemPorts; port++) begin: gen_mem_counter_proc
-    // The total amount of elements we have to work through
-    vlen_t max_elements;
+    // The total amount of vector bytes we have to work through
+    vlen_t max_bytes;
 
     always_comb begin
       // Default value
-      max_elements = (mem_spatz_req.vl >> $clog2(NrMemPorts*MemDataWidthB)) << $clog2(MemDataWidthB);
+      max_bytes = (mem_spatz_req.vl >> $clog2(NrMemPorts*MemDataWidthB)) << $clog2(MemDataWidthB);
 
       if (NrMemPorts == 1)
-        max_elements = mem_spatz_req.vl;
+        max_bytes = mem_spatz_req.vl;
       else
         if (mem_spatz_req.vl[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] > port)
-          max_elements += MemDataWidthB;
+          max_bytes += MemDataWidthB;
         else if (mem_spatz_req.vl[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] == port)
-          max_elements += mem_spatz_req.vl[$clog2(MemDataWidthB)-1:0];
+          max_bytes += mem_spatz_req.vl[$clog2(MemDataWidthB)-1:0];
 
-      mem_operation_valid[port] = mem_spatz_req_valid && (max_elements != mem_counter_q[port]);
-      mem_operation_last[port]  = mem_operation_valid[port] && ((max_elements - mem_counter_q[port]) <= (mem_is_single_element_operation ? mem_single_element_size : MemDataWidthB));
+      mem_operation_valid[port] = mem_spatz_req_valid && (max_bytes != mem_counter_q[port]);
+      mem_operation_last[port]  = mem_operation_valid[port] && ((max_bytes - mem_counter_q[port]) <= (mem_is_single_element_operation ? mem_single_element_size : MemDataWidthB));
       mem_counter_load[port]    = mem_spatz_req_ready;
       mem_counter_d[port]       = (mem_spatz_req.vstart >> $clog2(NrMemPorts*MemDataWidthB)) << $clog2(MemDataWidthB);
       if (NrMemPorts == 1)
@@ -690,9 +705,9 @@ module spatz_vlsu
           mem_counter_d[port] += MemDataWidthB;
         else if (mem_spatz_req.vstart[$clog2(MemDataWidthB) +: $clog2(NrMemPorts)] == port)
           mem_counter_d[port] += mem_spatz_req.vstart[$clog2(MemDataWidthB)-1:0];
-      mem_counter_delta[port] = !mem_operation_valid[port] ? 'd0 : mem_is_single_element_operation ? mem_single_element_size : mem_operation_last[port] ? (max_elements - mem_counter_q[port]) : MemDataWidthB;
+      mem_counter_delta[port] = !mem_operation_valid[port] ? 'd0 : mem_is_single_element_operation ? mem_single_element_size : mem_operation_last[port] ? (max_bytes - mem_counter_q[port]) : MemDataWidthB;
       mem_counter_en[port]    = spatz_mem_req_ready[port] && spatz_mem_req_valid[port];
-      mem_counter_max[port]   = max_elements;
+      mem_counter_max[port]   = max_bytes;
 
       // Index counter
       mem_idx_counter_d[port] = mem_counter_d[port];