Skip to content

Commit

Permalink
hw: Fix load-store consistency between int and FP datapaths
Browse files Browse the repository at this point in the history
  • Loading branch information
paulsc96 committed Feb 9, 2024
1 parent 1542031 commit 476442f
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 14 deletions.
10 changes: 10 additions & 0 deletions docs/schema/snitch_cluster.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,16 @@
"description": "Number of request entries the DMA can keep",
"default": 3
},
"caq_depth": {
"type": "number",
"description": "Depth of consistency address queue (CAQ). At most `num_fp_outstanding_mem` plus offloading round trip time makes sense.",
"default": 4
},
"caq_tag_width": {
"type": "number",
"description": "Tag width for consistency address queue (CAQ). Reordering of integer/FP loads/stores to words with colliding tags causes stalls.",
"default": 8
},
"enable_debug": {
"type": "boolean",
"description": "Whether to provide a debug request input and external debug features",
Expand Down
43 changes: 32 additions & 11 deletions hw/snitch/src/snitch.sv
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
parameter int unsigned NumDTLBEntries = 0,
parameter int unsigned NumITLBEntries = 0,
parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0},
/// Consistency Address Queue (CAQ) parameters
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1,
/// Derived parameter *Do not override*
Expand Down Expand Up @@ -103,6 +106,9 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
output fpnew_pkg::roundmode_e fpu_rnd_mode_o,
output fpnew_pkg::fmt_mode_t fpu_fmt_mode_o,
input fpnew_pkg::status_t fpu_status_i,
/// Consistency Address Queue (CAQ) interface.
/// Used by FPU to notify Snitch LSU of retired loads/stores.
input logic caq_pvalid_i,
// Core events for performance counters
output snitch_pkg::core_events_t core_events_o,
// Cluster HW barrier
Expand Down Expand Up @@ -163,6 +169,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
logic st_addr_misaligned;
logic inst_addr_misaligned;

logic caq_qvalid, caq_qready, caq_ena;

logic itlb_valid, itlb_ready;
va_t itlb_va;
logic itlb_page_fault;
Expand Down Expand Up @@ -445,8 +453,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
& operands_ready
& dst_ready
& ((itlb_valid & itlb_ready) | ~trans_active);
// the accelerator interface stalled us
assign acc_stall = acc_qvalid_o & ~acc_qready_i;
// the accelerator interface stalled us. Also wait for CAQ if this is an FP load/store.
assign acc_stall = acc_qvalid_o & ~acc_qready_i | (caq_ena & ~caq_qready);
// the LSU Interface didn't accept our request yet
assign lsu_stall = lsu_tlb_qvalid & ~lsu_tlb_qready;
// Stall the stage if we either didn't get a valid instruction or the LSU/Accelerator is not ready
Expand Down Expand Up @@ -2001,7 +2009,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = IImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Word;
is_fp_load = 1'b1;
end else begin
Expand All @@ -2013,7 +2021,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = SFImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Word;
is_fp_store = 1'b1;
end else begin
Expand All @@ -2026,7 +2034,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = IImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Double;
is_fp_load = 1'b1;
end else begin
Expand All @@ -2038,7 +2046,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = SFImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Double;
is_fp_store = 1'b1;
end else begin
Expand All @@ -2051,7 +2059,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = IImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = HalfWord;
is_fp_load = 1'b1;
end else begin
Expand All @@ -2063,7 +2071,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = SFImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = HalfWord;
is_fp_store = 1'b1;
end else begin
Expand All @@ -2076,7 +2084,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = IImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Byte;
is_fp_load = 1'b1;
end else begin
Expand All @@ -2088,7 +2096,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
opa_select = Reg;
opb_select = SFImmediate;
write_rd = 1'b0;
acc_qvalid_o = valid_instr & trans_ready;
acc_qvalid_o = valid_instr & trans_ready & caq_qready;
ls_size = Byte;
is_fp_store = 1'b1;
end else begin
Expand Down Expand Up @@ -2804,14 +2812,22 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
// sign exten to appropriate length
assign lsu_qdata = $unsigned(gpr_rdata[1]);

// Consider CAQ in accelerator handshake when offloading an FPU load or store.
assign caq_ena = is_fp_load | is_fp_store;
// Make request to CAQ when offloading access and accelerator interface ready.
assign caq_qvalid = caq_ena & acc_qready_i;

snitch_lsu #(
.AddrWidth (AddrWidth),
.DataWidth (DataWidth),
.dreq_t (dreq_t),
.drsp_t (drsp_t),
.tag_t (logic[RegWidth-1:0]),
.NumOutstandingMem (NumIntOutstandingMem),
.NumOutstandingLoads (NumIntOutstandingLoads)
.NumOutstandingLoads (NumIntOutstandingLoads),
.Caq (FP_EN),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth)
) i_snitch_lsu (
.clk_i (clk_i),
.rst_i (rst_i),
Expand All @@ -2830,6 +2846,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
.lsu_pvalid_o (lsu_pvalid),
.lsu_pready_i (lsu_pready),
.lsu_empty_o (lsu_empty),
.caq_qaddr_i (ls_paddr),
.caq_qwrite_i (is_fp_store),
.caq_qvalid_i (caq_qvalid),
.caq_qready_o (caq_qready),
.caq_pvalid_i,
.data_req_o,
.data_rsp_i
);
Expand Down
105 changes: 103 additions & 2 deletions hw/snitch/src/snitch_lsu.sv
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ module snitch_lsu #(
parameter int unsigned NumOutstandingLoads = 1,
/// Whether to NaN Box values. Used for floating-point load/stores.
parameter bit NaNBox = 0,
/// Whether to instantiate a consistency address queue (CAQ). The CAQ enables
/// consistency with another LSU in the same hart (i.e. in the FPSS) strictly
/// *downstream* of the issuing Snitch core. For all offloaded accesses, the
/// word address LSBs are pushed into the Snitch core's CAQ on offload and
/// popped when completed by the downstream LSU. Incoming accesses possibly
/// overtaking pending downstream accesses in the CAQ are stalled.
parameter bit Caq = 0,
/// CAQ Depth; should match the number of downstream LSU outstanding requests.
parameter int unsigned CaqDepth = 0,
/// Size of CAQ address LSB tags; provides a pessimism-complexity tradeoff.
parameter int unsigned CaqTagWidth = 0,
parameter type dreq_t = logic,
parameter type drsp_t = logic,
/// Derived parameter *Do not override*
Expand All @@ -43,14 +54,104 @@ module snitch_lsu #(
output logic lsu_perror_o,
output logic lsu_pvalid_o,
input logic lsu_pready_i,
/// CAQ request snoop channel. Only some address bits will be read.
/// Fork offloaded loads/stores to here iff `Caq` is 1.
input addr_t caq_qaddr_i,
input logic caq_qwrite_i,
input logic caq_qvalid_i,
output logic caq_qready_o,
/// CAQ response snoop channel.
/// Fork responses to offloaded loads/stores to here iff `Caq` is 1.
input logic caq_pvalid_i,
/// High if there is currently no transaction pending.
output logic lsu_empty_o,
// Memory Interface Channel
output dreq_t data_req_o,
input drsp_t data_rsp_i
);

`include "common_cells/assertions.svh"

localparam int unsigned DataAlign = $clog2(DataWidth/8);

// -------------------------------
// Consistency Address Queue (CAQ)
// -------------------------------

// TODO: What about exceptions? We *should* get a response for all offloaded
// loads/stores anyways as already issued instructions should conclude, but
// if this is not the case, things go south!

logic lsu_postcaq_qvalid, lsu_postcaq_qready;

if (Caq) begin : gen_caq

logic caq_lsu_gnt, caq_lsu_exists;
logic caq_out_valid, caq_out_gnt;
logic caq_pass;

// CAQ passes requests to downstream LSU only once they are known not to collide.
// This is assumed to be *stable* once given as the Snitch core is stalled on a
// load/store and elements can only be popped from the queue, not pushed.
assign caq_pass = caq_lsu_gnt & ~caq_lsu_exists;

// Gate downstream LSU on CAQ pass
assign lsu_postcaq_qvalid = caq_pass & lsu_qvalid_i;
assign lsu_qready_o = caq_pass & lsu_postcaq_qready;

id_queue #(
.data_t ( logic [CaqTagWidth:0] ), // Store address tag *and* write enable
.ID_WIDTH ( 1 ), // De facto 0: no reorder capability here
.CAPACITY ( CaqDepth ),
.FULL_BW ( 1 )
) i_caq (
.clk_i,
.rst_ni ( ~rst_i ),
// Push in snooped accesses offloaded to downstream LSU
.inp_id_i ( '0 ),
.inp_data_i ( {caq_qwrite_i, caq_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ),
.inp_req_i ( caq_qvalid_i ),
.inp_gnt_o ( caq_qready_o ),
// Check if currently presented request collides with any snooped ones.
// Check address tag in any case. Check the write enable only when it
// is necessary. If we receive a write, stall on any address match
// (mask MSB). If we receive a read, we stall only if a write collides.
.exists_mask_i ( {~lsu_qwrite_i, {(CaqTagWidth){1'b1}}} ),
.exists_data_i ( {1'b1, lsu_qaddr_i[CaqTagWidth+DataAlign-1:DataAlign]} ),
.exists_req_i ( lsu_qvalid_i ),
.exists_gnt_o ( caq_lsu_gnt ),
.exists_o ( caq_lsu_exists ),
// Pop output whenever we get a response for a snooped request.
// This has no backpressure as we should snoop as many responses as requests.
.oup_id_i ( '0 ),
.oup_pop_i ( caq_pvalid_i ),
.oup_req_i ( caq_pvalid_i ),
.oup_data_o ( ),
.oup_data_valid_o ( caq_out_valid ),
.oup_gnt_o ( caq_out_gnt )
);

// Check that we do not pop more snooped responses than we pushed requests.
`ASSERT(CaqPopEmpty, (caq_pvalid_i |-> caq_out_gnt && caq_out_valid), clk_i, rst_i)

// Check that once asserted, `caq_pass` is stable until we handshake the load/store
`ASSERT(CaqPassStable, ($rose(caq_pass) |-> (caq_pass until_with lsu_qvalid_i & lsu_qready_o)), clk_i, rst_i)

Check warning on line 138 in hw/snitch/src/snitch_lsu.sv

View workflow job for this annotation

GitHub Actions / verible-verilog-lint

[verible-verilog-lint] hw/snitch/src/snitch_lsu.sv#L138

Line length exceeds max: 100; is: 113 [Style: line-length] [line-length]
Raw output
message:"Line length exceeds max: 100; is: 113 [Style: line-length] [line-length]" location:{path:"./hw/snitch/src/snitch_lsu.sv" range:{start:{line:138 column:101}}} severity:WARNING source:{name:"verible-verilog-lint" url:"https://github.com/chipsalliance/verible"}

end else begin : gen_no_caq

// No CAQ can stall us; forward request handshake to LSU logic
assign lsu_postcaq_qvalid = lsu_qvalid_i;
assign lsu_qready_o = lsu_postcaq_qready;

// Tie CAQ interface
assign caq_qready_o = '1;

end

// --------------
// Downstream LSU
// --------------

logic [63:0] ld_result;
logic [63:0] lsu_qdata, data_qdata;

Expand Down Expand Up @@ -115,7 +216,7 @@ module snitch_lsu #(
// Only make a request when we got a valid request and if it is a load also
// check that we can actually store the necessary information to process it in
// the upcoming cycle(s).
assign data_req_o.q_valid = lsu_qvalid_i & (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign data_req_o.q_valid = lsu_postcaq_qvalid & (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign data_req_o.q.write = lsu_qwrite_i;
assign data_req_o.q.addr = lsu_qaddr_i;
assign data_req_o.q.amo = lsu_qamo_i;
Expand Down Expand Up @@ -152,7 +253,7 @@ module snitch_lsu #(
/* verilator lint_on WIDTH */

// The interface didn't accept our request yet
assign lsu_qready_o = ~(data_req_o.q_valid & ~data_rsp_i.q_ready)
assign lsu_postcaq_qready = ~(data_req_o.q_valid & ~data_rsp_i.q_ready)
& (lsu_qwrite_i | ~laq_full) & ~mem_full;
assign laq_push = ~lsu_qwrite_i & data_rsp_i.q_ready & data_req_o.q_valid & ~laq_full;

Expand Down
12 changes: 12 additions & 0 deletions hw/snitch_cluster/src/snitch_cc.sv
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ module snitch_cc #(
/// Insert Pipeline registers immediately after FPU datapath
parameter bit RegisterFPUOut = 0,
parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0},
/// Consistency Address Queue (CAQ) parameters.
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1,
/// Optional fixed TCDM alias.
Expand Down Expand Up @@ -198,6 +201,9 @@ module snitch_cc #(

logic wake_up;

// Consistency Address Queue (CAQ) interface
logic caq_pvalid;

`SNITCH_VM_TYPEDEF(AddrWidth)

snitch #(
Expand Down Expand Up @@ -231,6 +237,8 @@ module snitch_cc #(
.XFDOTP (XFDOTP),
.XFAUX (XFauxMerged),
.FLEN (FLEN),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth),
.DebugSupport (DebugSupport)
) i_snitch (
.clk_i ( clk_d2_i ), // if necessary operate on half the frequency
Expand All @@ -250,6 +258,7 @@ module snitch_cc #(
.acc_prsp_i ( acc_demux_snitch ),
.acc_pvalid_i ( acc_demux_snitch_valid ),
.acc_pready_o ( acc_demux_snitch_ready ),
.caq_pvalid_i ( caq_pvalid ),
.data_req_o ( snitch_dreq_d ),
.data_rsp_i ( snitch_drsp_d ),
.ptw_valid_o (hive_req_o.ptw_valid),
Expand Down Expand Up @@ -496,6 +505,7 @@ module snitch_cc #(
.acc_resp_o ( acc_seq ),
.acc_resp_valid_o ( acc_pvalid ),
.acc_resp_ready_i ( acc_pready ),
.caq_pvalid_o ( caq_pvalid ),
.data_req_o ( fpu_dreq ),
.data_rsp_i ( fpu_drsp ),
.fpu_rnd_mode_i ( fpu_rnd_mode ),
Expand Down Expand Up @@ -557,6 +567,8 @@ module snitch_cc #(
assign acc_seq.error = '0;
assign acc_pvalid = '0;

assign caq_pvalid = '0;

assign merged_dreq = snitch_dreq_q;
assign snitch_drsp_q = merged_drsp;

Expand Down
5 changes: 5 additions & 0 deletions hw/snitch_cluster/src/snitch_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ module snitch_cluster
// In case you are using the `RegisterTCDMCuts` feature this adds an
// additional cycle latency, which is taken into account here.
parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts,
/// Consistency Address Queue (CAQ) parameters.
parameter int unsigned CaqDepth = 0,
parameter int unsigned CaqTagWidth = 0,
/// Enable debug support.
parameter bit DebugSupport = 1,
/// Optional fixed cluster alias region.
Expand Down Expand Up @@ -909,6 +912,8 @@ module snitch_cluster
.RegisterFPUIn (RegisterFPUIn),
.RegisterFPUOut (RegisterFPUOut),
.TCDMAddrWidth (TCDMAddrWidth),
.CaqDepth (CaqDepth),
.CaqTagWidth (CaqTagWidth),
.DebugSupport (DebugSupport),
.TCDMAliasEnable (AliasRegionEnable),
.TCDMAliasStart (TCDMAliasStart)
Expand Down
2 changes: 2 additions & 0 deletions hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ module ${cfg['name']}_wrapper (
.NarrowMaxSlvTrans (${cfg['narrow_trans']}),
.sram_cfg_t (${cfg['pkg_name']}::sram_cfg_t),
.sram_cfgs_t (${cfg['pkg_name']}::sram_cfgs_t),
.CaqDepth (${int(cfg['caq_depth'])}),
.CaqTagWidth (${int(cfg['caq_tag_width'])}),
.DebugSupport (${int(cfg['enable_debug'])}),
.AliasRegionEnable (${int(cfg['alias_region_enable'])}),
.AliasRegionBase (${int(cfg['alias_region_base'])})
Expand Down
Loading

0 comments on commit 476442f

Please sign in to comment.