Skip to content

Commit

Permalink
hw: Fix load-store consistency between int and FP datapaths
Browse files Browse the repository at this point in the history
  • Loading branch information
paulsc96 committed Feb 9, 2024
1 parent ae02a03 commit 01634d2
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 5 deletions.
10 changes: 10 additions & 0 deletions docs/schema/snitch_cluster.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,16 @@
"description": "Number of request entries the DMA can keep",
"default": 3
},
"caq_depth": {
"type": "number",
"description": "Depth of consistency address queue (CAQ). At most `num_fp_outstanding_mem` plus offloading round trip time makes sense.",
"default": 4
},
"caq_tag_width": {
"type": "number",
"description": "Tag width for consistency address queue (CAQ). Reordering of integer/FP loads/stores to words with colliding tags causes stalls.",
"default": 8
},
"enable_debug": {
"type": "boolean",
"description": "Whether to provide a debug request input and external debug features",
Expand Down
27 changes: 25 additions & 2 deletions hw/snitch/src/snitch.sv
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
parameter int unsigned NumDTLBEntries = 0,
parameter int unsigned NumITLBEntries = 0,
parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0},
/// Consistency Address Queue (CAQ) parameters
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1,
/// Derived parameter *Do not override*
Expand Down Expand Up @@ -103,6 +106,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
output fpnew_pkg::roundmode_e fpu_rnd_mode_o,
output fpnew_pkg::fmt_mode_t fpu_fmt_mode_o,
input fpnew_pkg::status_t fpu_status_i,
/// Consistency Address Queue (CAQ) interface.
/// Used by FPU to notify Snitch LSU of retired loads/stores.
input logic caq_pvalid_i,
// Core events for performance counters
output snitch_pkg::core_events_t core_events_o,
// Cluster HW barrier
Expand Down Expand Up @@ -146,7 +152,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
logic [31:0] alu_result;

logic [RegWidth-1:0] rd, rs1, rs2;
logic stall, lsu_stall;
logic stall, lsu_stall, caq_stall;
// Register connections
logic [1:0][RegWidth-1:0] gpr_raddr;
logic [1:0][31:0] gpr_rdata;
Expand Down Expand Up @@ -453,6 +459,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
assign stall = ~valid_instr
// The LSU is stalling.
| lsu_stall
// The CAQ inside the LSU is stalling;
// this prevents a local load/store from overtaking a prior offloaded load/store.
| caq_stall
// The accelerator port is stalling.
| acc_stall
// We are waiting on the `fence.i` flush.
Expand Down Expand Up @@ -2804,14 +2813,23 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
// sign exten to appropriate length
assign lsu_qdata = $unsigned(gpr_rdata[1]);

logic caq_qvalid, caq_qready;
// Make request to CAQ on offloading an FPU load or store
assign caq_qvalid = acc_qvalid_o & (is_fp_load | is_fp_store);
// CAQ stalls core when not ready to take valid request
assign caq_stall = caq_qvalid & ~caq_qready;

snitch_lsu #(
.AddrWidth (AddrWidth),
.DataWidth (DataWidth),
.dreq_t (dreq_t),
.drsp_t (drsp_t),
.tag_t (logic[RegWidth-1:0]),
.NumOutstandingMem (NumIntOutstandingMem),
.NumOutstandingLoads (NumIntOutstandingLoads)
.NumOutstandingLoads (NumIntOutstandingLoads),
.Caq (FP_EN),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth)
) i_snitch_lsu (
.clk_i (clk_i),
.rst_i (rst_i),
Expand All @@ -2830,6 +2848,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
.lsu_pvalid_o (lsu_pvalid),
.lsu_pready_i (lsu_pready),
.lsu_empty_o (lsu_empty),
.caq_qaddr_i (ls_paddr),
.caq_qwrite_i (is_fp_store),
.caq_qvalid_i (caq_qvalid),
.caq_qready_o (caq_qready),
.caq_pvalid_i,
.data_req_o,
.data_rsp_i
);
Expand Down
104 changes: 102 additions & 2 deletions hw/snitch/src/snitch_lsu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ module snitch_lsu #(
parameter int unsigned NumOutstandingLoads = 1,
/// Whether to NaN Box values. Used for floating-point load/stores.
parameter bit NaNBox = 0,
/// Whether to instantiate a consistency address queue (CAQ). The CAQ enables
/// consistency with another LSU in the same hart (i.e. in the FPSS) strictly
/// *downstream* of the issuing Snitch core. For all offloaded accesses, the
/// word address LSBs are pushed into the Snitch core's CAQ on offload and
/// popped when completed by the downstream LSU. Incoming accesses possibly
/// overtaking pending downstream accesses in the CAQ are stalled.
parameter bit Caq = 0,
/// CAQ Depth; should match the number of downstream LSU outstanding requests.
parameter int unsigned CaqDepth = 0,
/// Size of CAQ address LSB tags; provides a pessimism-complexity tradeoff.
parameter int unsigned CaqTagWidth = 0,
parameter type dreq_t = logic,
parameter type drsp_t = logic,
/// Derived parameter *Do not override*
Expand All @@ -43,6 +54,15 @@ module snitch_lsu #(
output logic lsu_perror_o,
output logic lsu_pvalid_o,
input logic lsu_pready_i,
/// CAQ request snoop channel. Only some address bits will be read.
/// Fork offloaded loads/stores to here iff `Caq` is 1.
input addr_t caq_qaddr_i,
input logic caq_qwrite_i,
input logic caq_qvalid_i,
output logic caq_qready_o,
/// CAQ response snoop channel.
/// Fork responses to offloaded loads/stores to here iff `Caq` is 1.
input logic caq_pvalid_i,
/// High if there is currently no transaction pending.
output logic lsu_empty_o,
// Memory Interface Channel
Expand All @@ -51,6 +71,86 @@ module snitch_lsu #(
);

localparam int unsigned DataAlign = $clog2(DataWidth/8);

// -------------------------------
// Consistency Address Queue (CAQ)
// -------------------------------

// TODO: What about exceptions? We *should* get a response for all offloaded
// loads/stores anyways as already issued instructions should conclude, but
// if this is not the case, things go south!

logic lsu_postcaq_qvalid, lsu_postcaq_qready;

if (Caq) begin : gen_caq

// Fork request handshake between CAQ lookup and downstream (post-CAQ) LSU logic.
logic caq_lsu_qvalid, caq_lsu_qready;

stream_fork #(
.N_OUP(2)
) i_stream_fork_caq_lsu (
.clk_i,
.rst_ni ( ~rst_i ),
.valid_i ( lsu_qvalid_i ),
.ready_o ( lsu_qready_o ),
.valid_o ( {caq_lsu_qvalid, lsu_postcaq_qvalid} ),
.ready_i ( {caq_lsu_qready, lsu_postcaq_qready} )
);

// CAQ unblocks request when ready to proceed and contains no collision.
logic caq_lsu_gnt, caq_lsu_exists;
logic caq_out_valid, caq_out_gnt;

assign caq_lsu_qready = caq_lsu_gnt & ~caq_lsu_exists;

id_queue #(
.data_t ( logic [CaqTagWidth:0] ), // Store address tag *and* write enable
.ID_WIDTH ( 0 ), // No reorder capability here
.CAPACITY ( CaqDepth ),
.FULL_BW ( 1 )
) i_caq (
.clk_i,
.rst_ni ( ~rst_i ),
// Push in snooped accesses offloaded to downstream LSU
.inp_id_i ( '0 ),
.inp_data_i ( {caq_qwrite_i, caq_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ),
.inp_req_i ( caq_qvalid_i ),
.inp_gnt_o ( caq_qready_o ),
// Check if currently presented request collides with any snooped ones.
// Check address tag in any case. Check the write enable only when it
// is necessary. If we receive a write, stall on any address match
// (mask MSB). If we receive a read, we stall only if a write collides.
.exists_mask_i ( {~lsu_qwrite_i, {(CaqDepth){1'b1}}} ),
.exists_data_i ( {1'b1, lsu_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ),
.exists_req_i ( caq_lsu_qvalid ),
.exists_gnt_o ( caq_lsu_gnt ),
.exists_o ( caq_lsu_exists ),
// Pop output whenever we get a response for a snooped request.
// This has no backpressure as we should snoop as many responses as requests.
.oup_id_i ( '0 ),
.oup_pop_i ( caq_pvalid_i ),
.oup_req_i ( caq_pvalid_i ),
.oup_data_o ( ),
.oup_data_valid_o ( caq_out_valid ),
.oup_gnt_o ( caq_out_gnt )
);

// Check that we do not pop more snooped responses than we pushed requests.
`ASSERT(CaqPopEmpty, (caq_qvalid_i |-> caq_out_gnt && caq_out_valid), clk_i, rst_i)

end else begin : gen_no_caq

// No CAQ can stall us; forward request handshake to LSU logic
assign lsu_postcaq_qvalid = lsu_qvalid_i;
assign lsu_postcaq_qready = lsu_qready_o;

end

// --------------
// Downstream LSU
// --------------

logic [63:0] ld_result;
logic [63:0] lsu_qdata, data_qdata;

Expand Down Expand Up @@ -115,7 +215,7 @@ module snitch_lsu #(
// Only make a request when we got a valid request and if it is a load also
// check that we can actually store the necessary information to process it in
// the upcoming cycle(s).
assign data_req_o.q_valid = lsu_qvalid_i & (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign data_req_o.q_valid = lsu_postcaq_qvalid & (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign data_req_o.q.write = lsu_qwrite_i;
assign data_req_o.q.addr = lsu_qaddr_i;
assign data_req_o.q.amo = lsu_qamo_i;
Expand Down Expand Up @@ -152,7 +252,7 @@ module snitch_lsu #(
/* verilator lint_on WIDTH */

// The interface didn't accept our request yet
assign lsu_qready_o = ~(data_req_o.q_valid & ~data_rsp_i.q_ready)
assign lsu_postcaq_qready = ~(data_req_o.q_valid & ~data_rsp_i.q_ready)
& (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign laq_push = ~lsu_qwrite_i & data_rsp_i.q_ready & data_req_o.q_valid & ~laq_full;

Expand Down
12 changes: 12 additions & 0 deletions hw/snitch_cluster/src/snitch_cc.sv
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ module snitch_cc #(
/// Insert Pipeline registers immediately after FPU datapath
parameter bit RegisterFPUOut = 0,
parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0},
/// Consistency Address Queue (CAQ) parameters.
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1,
/// Derived parameter *Do not override*
Expand Down Expand Up @@ -195,6 +198,9 @@ module snitch_cc #(

logic wake_up;

// Consistency Address Queue (CAQ) interface
logic caq_pvalid;

`SNITCH_VM_TYPEDEF(AddrWidth)

snitch #(
Expand Down Expand Up @@ -228,6 +234,8 @@ module snitch_cc #(
.XFDOTP (XFDOTP),
.XFAUX (XFauxMerged),
.FLEN (FLEN),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth),
.DebugSupport (DebugSupport)
) i_snitch (
.clk_i ( clk_d2_i ), // if necessary operate on half the frequency
Expand All @@ -247,6 +255,7 @@ module snitch_cc #(
.acc_prsp_i ( acc_demux_snitch ),
.acc_pvalid_i ( acc_demux_snitch_valid ),
.acc_pready_o ( acc_demux_snitch_ready ),
.caq_pvalid_i ( caq_pvalid ),
.data_req_o ( snitch_dreq_d ),
.data_rsp_i ( snitch_drsp_d ),
.ptw_valid_o (hive_req_o.ptw_valid),
Expand Down Expand Up @@ -494,6 +503,7 @@ module snitch_cc #(
.acc_resp_o ( acc_seq ),
.acc_resp_valid_o ( acc_pvalid ),
.acc_resp_ready_i ( acc_pready ),
.caq_pvalid_o ( caq_pvalid ),
.data_req_o ( fpu_dreq ),
.data_rsp_i ( fpu_drsp ),
.fpu_rnd_mode_i ( fpu_rnd_mode ),
Expand Down Expand Up @@ -555,6 +565,8 @@ module snitch_cc #(
assign acc_seq.error = '0;
assign acc_pvalid = '0;

assign caq_pvalid = '0;

assign merged_dreq = snitch_dreq_q;
assign snitch_drsp_q = merged_drsp;

Expand Down
5 changes: 5 additions & 0 deletions hw/snitch_cluster/src/snitch_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ module snitch_cluster
// In case you are using the `RegisterTCDMCuts` feature this adds an
// additional cycle latency, which is taken into account here.
parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts,
/// Consistency Address Queue (CAQ) parameters.
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1
) (
Expand Down Expand Up @@ -873,6 +876,8 @@ module snitch_cluster
.RegisterFPUIn (RegisterFPUIn),
.RegisterFPUOut (RegisterFPUOut),
.TCDMAddrWidth (TCDMAddrWidth),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth),
.DebugSupport (DebugSupport)
) i_snitch_cc (
.clk_i,
Expand Down
2 changes: 2 additions & 0 deletions hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ module ${cfg['name']}_wrapper (
.NarrowMaxSlvTrans (${cfg['narrow_trans']}),
.sram_cfg_t (${cfg['pkg_name']}::sram_cfg_t),
.sram_cfgs_t (${cfg['pkg_name']}::sram_cfgs_t),
.CaqDepth (${int(cfg['caq_depth'])}),
.CaqTagWidth (${int(cfg['caq_tag_width'])}),
.DebugSupport (${int(cfg['enable_debug'])})
) i_cluster (
.clk_i,
Expand Down
16 changes: 15 additions & 1 deletion hw/snitch_cluster/src/snitch_fp_ss.sv
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ module snitch_fp_ss import snitch_pkg::*; #(
input logic streamctl_done_i,
input logic streamctl_valid_i,
output logic streamctl_ready_o,
// Consistency Address Queue (CAQ) interface.
// Notifies the issuing Snitch core of retired loads/stores.
// TODO: is it good enough to assert this at issuing time instead?
output logic caq_pvalid_o,
// Core event strobes
output core_events_t core_events_o
);
Expand Down Expand Up @@ -2582,6 +2586,9 @@ module snitch_fp_ss import snitch_pkg::*; #(
// ----------------------
assign lsu_qvalid = acc_req_valid_q & (&op_ready) & (is_load | is_store);

// TODO: verify this does not create timing loops!
assign caq_pvalid_o = lsu_pvalid & lsu_pready;

snitch_lsu #(
.AddrWidth (AddrWidth),
.DataWidth (DataWidth),
Expand All @@ -2590,7 +2597,8 @@ module snitch_fp_ss import snitch_pkg::*; #(
.tag_t (logic [4:0]),
.NumOutstandingMem (NumFPOutstandingMem),
.NumOutstandingLoads (NumFPOutstandingLoads),
.NaNBox (1'b1)
.NaNBox (1'b1),
.Caq (1'b0)
) i_snitch_lsu (
.clk_i (clk_i),
.rst_i (rst_i),
Expand All @@ -2609,6 +2617,12 @@ module snitch_fp_ss import snitch_pkg::*; #(
.lsu_pvalid_o (lsu_pvalid),
.lsu_pready_i (lsu_pready),
.lsu_empty_o (/* unused */),
// CAQ is not enabled here
.caq_qaddr_i ('0),
.caq_qwrite_i ('0),
.caq_qvalid_i (1'b0),
.caq_qready_o (),
.caq_pvalid_i (1'b0),
.data_req_o,
.data_rsp_i
);
Expand Down

0 comments on commit 01634d2

Please sign in to comment.