From ebffe93aa2eda7fa0164c04b2d729544ba1c4445 Mon Sep 17 00:00:00 2001 From: Paul Scheffler Date: Thu, 8 Feb 2024 23:11:50 +0100 Subject: [PATCH] hw: Fix load-store consistency between int and FP datapaths --- docs/schema/snitch_cluster.schema.json | 10 ++ hw/snitch/src/snitch.sv | 27 ++++- hw/snitch/src/snitch_lsu.sv | 104 +++++++++++++++++- hw/snitch_cluster/src/snitch_cc.sv | 10 ++ hw/snitch_cluster/src/snitch_cluster.sv | 5 + .../src/snitch_cluster_wrapper.sv.tpl | 2 + hw/snitch_cluster/src/snitch_fp_ss.sv | 16 ++- 7 files changed, 169 insertions(+), 5 deletions(-) diff --git a/docs/schema/snitch_cluster.schema.json b/docs/schema/snitch_cluster.schema.json index 6e8c1181ad..539085168a 100644 --- a/docs/schema/snitch_cluster.schema.json +++ b/docs/schema/snitch_cluster.schema.json @@ -136,6 +136,16 @@ "description": "Number of request entries the DMA can keep", "default": 3 }, + "caq_depth": { + "type": "number", + "description": "Depth of consistency address queue (CAQ). At most `num_fp_outstanding_mem` plus offloading round trip time makes sense.", + "default": 4 + }, + "caq_tag_width": { + "type": "number", + "description": "Tag width for consistency address queue (CAQ). Reordering of integer/FP loads/stores to words with colliding tags causes stalls.", + "default": 8 + }, "enable_debug": { "type": "boolean", "description": "Whether to provide a debug request input and external debug features", diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 58f582cc9e..7d1ad0cf33 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -55,6 +55,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( parameter int unsigned NumDTLBEntries = 0, parameter int unsigned NumITLBEntries = 0, parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0}, + /// Consistency Address Queue (CAQ) parameters + parameter int unsigned CaqDepth = 0, + parameter int unsigned CaqTagWidth = 0, /// Enable debug support. parameter bit DebugSupport = 1, /// Derived parameter *Do not override* @@ -103,6 +106,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( output fpnew_pkg::roundmode_e fpu_rnd_mode_o, output fpnew_pkg::fmt_mode_t fpu_fmt_mode_o, input fpnew_pkg::status_t fpu_status_i, + /// Consistency Address Queue (CAQ) interface. + /// Used by FPU to notify Snitch LSU of retired loads/stores. + input logic caq_pvalid_i, // Core events for performance counters output snitch_pkg::core_events_t core_events_o, // Cluster HW barrier @@ -146,7 +152,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( logic [31:0] alu_result; logic [RegWidth-1:0] rd, rs1, rs2; - logic stall, lsu_stall; + logic stall, lsu_stall, caq_stall; // Register connections logic [1:0][RegWidth-1:0] gpr_raddr; logic [1:0][31:0] gpr_rdata; @@ -453,6 +459,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( assign stall = ~valid_instr // The LSU is stalling. | lsu_stall + // The CAQ inside the LSU is stalling; + // this prevents a local load/store from overtaking a prior offloaded load/store. + | caq_stall // The accelerator port is stalling. | acc_stall // We are waiting on the `fence.i` flush. @@ -2804,6 +2813,12 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( // sign exten to appropriate length assign lsu_qdata = $unsigned(gpr_rdata[1]); + logic caq_qvalid, caq_qready; + // Make request to CAQ on offloading an FPU load or store + assign caq_qvalid = acc_qvalid_o & (is_fp_load | is_fp_store); + // CAQ stalls core when not ready to take valid request + assign caq_stall = caq_valid & ~caq_qready; + snitch_lsu #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), @@ -2811,7 +2826,10 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( .drsp_t (drsp_t), .tag_t (logic[RegWidth-1:0]), .NumOutstandingMem (NumIntOutstandingMem), - .NumOutstandingLoads (NumIntOutstandingLoads) + .NumOutstandingLoads (NumIntOutstandingLoads), + .Caq (1), + .CaqDepth (CaqDepth), + .CaqTagWidth (CaqTagWidth) ) i_snitch_lsu ( .clk_i (clk_i), .rst_i (rst_i), @@ -2830,6 +2848,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( .lsu_pvalid_o (lsu_pvalid), .lsu_pready_i (lsu_pready), .lsu_empty_o (lsu_empty), + .caq_qaddr_i (ls_paddr), + .caq_qwrite_i (is_fp_store), + .caq_qvalid_i (caq_qvalid), + .caq_qready_o (caq_qready), + .caq_pvalid_i, .data_req_o, .data_rsp_i ); diff --git a/hw/snitch/src/snitch_lsu.sv b/hw/snitch/src/snitch_lsu.sv index 653fd92bd4..3c6cdf2967 100644 --- a/hw/snitch/src/snitch_lsu.sv +++ b/hw/snitch/src/snitch_lsu.sv @@ -19,6 +19,17 @@ module snitch_lsu #( parameter int unsigned NumOutstandingLoads = 1, /// Whether to NaN Box values. Used for floating-point load/stores. parameter bit NaNBox = 0, + /// Whether to instantiate a consistency address queue (CAQ). The CAQ enables + /// consistency with another LSU in the same hart (i.e. in the FPSS) strictly + /// *downstream* of the issuing Snitch core. For all offloaded accesses, the + /// word address LSBs are pushed into the Snitch core's CAQ on offload and + /// popped when completed by the downstream LSU. Incoming accesses possibly + /// overtaking pending downstream accesses in the CAQ are stalled. + parameter bit Caq = 0, + /// CAQ Depth; should match the number of downstream LSU outstanding requests. + parameter int unsigned CaqDepth = 0, + /// Size of CAQ address LSB tags; provides a pessimism-complexity tradeoff. + parameter int unsigned CaqTagWidth = 0, parameter type dreq_t = logic, parameter type drsp_t = logic, /// Derived parameter *Do not override* @@ -43,6 +54,15 @@ module snitch_lsu #( output logic lsu_perror_o, output logic lsu_pvalid_o, input logic lsu_pready_i, + /// CAQ request snoop channel. Only some address bits will be read. + /// Fork offloaded loads/stores to here iff `Caq` is 1. + input addr_t caq_qaddr_i, + input logic caq_qwrite_i, + input logic caq_qvalid_i, + output logic caq_qready_o, + /// CAQ response snoop channel. + /// Fork responses to offloaded loads/stores to here iff `Caq` is 1. + input logic caq_pvalid_i, /// High if there is currently no transaction pending. output logic lsu_empty_o, // Memory Interface Channel @@ -51,6 +71,86 @@ module snitch_lsu #( ); localparam int unsigned DataAlign = $clog2(DataWidth/8); + + // ------------------------------- + // Consistency Address Queue (CAQ) + // ------------------------------- + + // TODO: What about exceptions? We *should* get a response for all offloaded + // loads/stores anyways as already issued instructions should conclude, but + // if this is not the case, things go south! + + logic lsu_postcaq_qvalid, lsu_postcaq_qready; + + if (Caq) begin : gen_caq + + // Fork request handshake between CAQ lookup and downstream (post-CAQ) LSU logic. + logic caq_lsu_qvalid, caq_lsu_qready; + + stream_fork #( + .N_OUP(2) + ) i_stream_fork_caq_lsu ( + .clk_i, + .rst_ni, + .valid_i ( lsu_qvalid_i ), + .ready_o ( lsu_qready_o ), + .valid_o ( {caq_lsu_qvalid, lsu_postcaq_qvalid} ), + .ready_i ( {caq_lsu_qready, lsu_postcaq_qready} ) + ); + + // CAQ unblocks request when ready to proceed and contains no collision. + logic caq_lsu_gnt, caq_lsu_exists; + logic caq_out_valid, caq_out_gnt; + + assign caq_lsu_qready = caq_lsu_gnt & ~caq_lsu_exists; + + id_queue #( + .data_t ( logic [CaqTagWidth:0] ), // Store address tag *and* write enable + .ID_WIDTH ( 0 ), // No reorder capability here + .CAPACITY ( CaqDepth ), + .FULL_BW ( 1 ), + ) i_caq ( + .clk_i, + .rst_ni, + // Push in snooped accesses offloaded to downstream LSU + .inp_id_i ( '0 ), + .inp_data_i ( {caq_qwrite_i, caq_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ), + .inp_req_i ( caq_qvalid_i ), + .inp_gnt_o ( caq_qready_o ), + // Check if currently presented request collides with any snooped ones. + // Check address tag in any case. Check the write enable only when it + // is necessary. If we receive a write, stall on any address match + // (mask MSB). If we receive a read, we stall only if a write collides. + .exists_mask_i ( {~lsu_qwrite_i, {(CaqDepth){1'b1}}} ), + .exists_data_i ( {1'b1, lsu_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ), + .exists_req_i ( caq_lsu_qvalid ), + .exists_gnt_o ( caq_lsu_gnt ), + .exists_o ( caq_lsu_exists ), + // Pop output whenever we get a response for a snooped request. + // This has no backpressure as we should snoop as many responses as requests. + .oup_id_i ( '0 ), + .oup_pop_i ( caq_pvalid_i ), + .oup_req_i ( caq_pvalid_i ), + .oup_data_o ( ), + .oup_data_valid_o( caq_out_valid ), + .oup_gnt_o ( caq_out_gnt ) + ); + + // Check that we do not pop more snooped responses than we pushed requests. + `ASSERT(CaqPopEmpty, (caq_qvalid_i |-> caq_out_gnt && caq_out_valid), clk_i, rst_i) + + end else begin : gen_no_caq + + // No CAQ can stall us; forward request handshake to LSU logic + assign lsu_postcaq_qvalid = lsu_qvalid_i; + assign lsu_postcaq_qready = lsu_qready_o; + + end + + // -------------- + // Downstream LSU + // -------------- + logic [63:0] ld_result; logic [63:0] lsu_qdata, data_qdata; @@ -115,7 +215,7 @@ module snitch_lsu #( // Only make a request when we got a valid request and if it is a load also // check that we can actually store the necessary information to process it in // the upcoming cycle(s). - assign data_req_o.q_valid = lsu_qvalid_i & (lsu_qwrite_i | ~laq_full) & ~mem_full; + assign data_req_o.q_valid = lsu_postcaq_qvalid & (lsu_qwrite_i | ~laq_full) & ~mem_full; assign data_req_o.q.write = lsu_qwrite_i; assign data_req_o.q.addr = lsu_qaddr_i; assign data_req_o.q.amo = lsu_qamo_i; @@ -152,7 +252,7 @@ module snitch_lsu #( /* verilator lint_on WIDTH */ // The interface didn't accept our request yet - assign lsu_qready_o = ~(data_req_o.q_valid & ~data_rsp_i.q_ready) + assign lsu_postcaq_qready = ~(data_req_o.q_valid & ~data_rsp_i.q_ready) & (lsu_qwrite_i | ~laq_full) & ~mem_full; assign laq_push = ~lsu_qwrite_i & data_rsp_i.q_ready & data_req_o.q_valid & ~laq_full; diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 5bb3b1b481..a66ea2759d 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -97,6 +97,9 @@ module snitch_cc #( /// Insert Pipeline registers immediately after FPU datapath parameter bit RegisterFPUOut = 0, parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0}, + /// Consistency Address Queue (CAQ) parameters. + parameter int unsigned CaqDepth = 0, + parameter int unsigned CaqTagWidth = 0, /// Enable debug support. parameter bit DebugSupport = 1, /// Derived parameter *Do not override* @@ -195,6 +198,9 @@ module snitch_cc #( logic wake_up; + // Consistency Address Queue (CAQ) interface + logic caq_pvalid; + `SNITCH_VM_TYPEDEF(AddrWidth) snitch #( @@ -228,6 +234,8 @@ module snitch_cc #( .XFDOTP (XFDOTP), .XFAUX (XFauxMerged), .FLEN (FLEN), + .CaqDepth (CaqDepth), + .CaqTagWidth (CaqTagWidth), .DebugSupport (DebugSupport) ) i_snitch ( .clk_i ( clk_d2_i ), // if necessary operate on half the frequency @@ -247,6 +255,7 @@ module snitch_cc #( .acc_prsp_i ( acc_demux_snitch ), .acc_pvalid_i ( acc_demux_snitch_valid ), .acc_pready_o ( acc_demux_snitch_ready ), + .caq_pvalid_i ( caq_pvalid ), .data_req_o ( snitch_dreq_d ), .data_rsp_i ( snitch_drsp_d ), .ptw_valid_o (hive_req_o.ptw_valid), @@ -494,6 +503,7 @@ module snitch_cc #( .acc_resp_o ( acc_seq ), .acc_resp_valid_o ( acc_pvalid ), .acc_resp_ready_i ( acc_pready ), + .caq_pvalid_o ( caq_pvalid ), .data_req_o ( fpu_dreq ), .data_rsp_i ( fpu_drsp ), .fpu_rnd_mode_i ( fpu_rnd_mode ), diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 4f666075ed..d2ac16ea9a 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -182,6 +182,9 @@ module snitch_cluster // In case you are using the `RegisterTCDMCuts` feature this adds an // additional cycle latency, which is taken into account here. parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, + /// Consistency Address Queue (CAQ) parameters. + parameter int unsigned CaqDepth = 0, + parameter int unsigned CaqTagWidth = 0, /// Enable debug support. parameter bit DebugSupport = 1 ) ( @@ -873,6 +876,8 @@ module snitch_cluster .RegisterFPUIn (RegisterFPUIn), .RegisterFPUOut (RegisterFPUOut), .TCDMAddrWidth (TCDMAddrWidth), + .CaqDepth (CaqDepth), + .CaqTagWidth (CaqTagWidth), .DebugSupport (DebugSupport) ) i_snitch_cc ( .clk_i, diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 6c1df0dc1f..bbd5e80cf6 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -332,6 +332,8 @@ module ${cfg['name']}_wrapper ( .NarrowMaxSlvTrans (${cfg['narrow_trans']}), .sram_cfg_t (${cfg['pkg_name']}::sram_cfg_t), .sram_cfgs_t (${cfg['pkg_name']}::sram_cfgs_t), + .CaqDepth (${int(cfg['caq_depth'])}), + .CaqTagWidth (${int(cfg['caq_tag_width'])}), .DebugSupport (${int(cfg['enable_debug'])}) ) i_cluster ( .clk_i, diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index fc75a386c8..c0a77331c0 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -73,6 +73,10 @@ module snitch_fp_ss import snitch_pkg::*; #( input logic streamctl_done_i, input logic streamctl_valid_i, output logic streamctl_ready_o, + // Consistency Address Queue (CAQ) interface. + // Notifies the issuing Snitch core of retired loads/stores. + // TODO: is it good enough to assert this at issuing time instead? + output logic caq_pvalid_o, // Core event strobes output core_events_t core_events_o ); @@ -2582,6 +2586,9 @@ module snitch_fp_ss import snitch_pkg::*; #( // ---------------------- assign lsu_qvalid = acc_req_valid_q & (&op_ready) & (is_load | is_store); + // TODO: verify this does not create timing loops! + assign caq_pvalid_o = lsu_pvalid & lsu_pready; + snitch_lsu #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), @@ -2590,7 +2597,8 @@ module snitch_fp_ss import snitch_pkg::*; #( .tag_t (logic [4:0]), .NumOutstandingMem (NumFPOutstandingMem), .NumOutstandingLoads (NumFPOutstandingLoads), - .NaNBox (1'b1) + .NaNBox (1'b1), + .Caq (1'b0) ) i_snitch_lsu ( .clk_i (clk_i), .rst_i (rst_i), @@ -2609,6 +2617,12 @@ module snitch_fp_ss import snitch_pkg::*; #( .lsu_pvalid_o (lsu_pvalid), .lsu_pready_i (lsu_pready), .lsu_empty_o (/* unused */), + // CAQ is not enabled here + .caq_qaddr_i ('0), + .caq_qwrite_i ('0), + .caq_qvalid_i (1'b0), + .caq_qready_o (), + .caq_pvalid_i (1'b0), .data_req_o, .data_rsp_i );