From 97f0a67ec53477bc25e2b3cb9a88dbdd3ef7f7a6 Mon Sep 17 00:00:00 2001 From: Nils Wistoff Date: Fri, 11 Aug 2023 11:13:39 +0200 Subject: [PATCH] coherence: Pack Valid/Dirty SRAM (#36) * sram: Replace by sram_pulp Does not introduce cuts and allows for parametric byte width Signed-off-by: Nils Wistoff * std_cache: Pack valid/dirty SRAM Signed-off-by: Nils Wistoff * std_cache: Decouple Valid/Dirty WE from Data WE Signed-off-by: Nils Wistoff --------- Signed-off-by: Nils Wistoff --- Bender.yml | 2 +- common/local/util/sram_pulp.sv | 90 ++++++++++++++++++++++++++++ core/Flist.cva6 | 2 +- core/cache_subsystem/cache_ctrl.sv | 5 +- core/cache_subsystem/miss_handler.sv | 11 +++- core/cache_subsystem/std_nbdcache.sv | 28 +++------ core/include/std_cache_pkg.sv | 11 +++- corev_apu/tb/ariane_tb.cpp | 4 +- corev_apu/tb/ariane_tb.sv | 2 +- 9 files changed, 123 insertions(+), 32 deletions(-) create mode 100644 common/local/util/sram_pulp.sv diff --git a/Bender.yml b/Bender.yml index 0539239f2c..9a67411e0c 100644 --- a/Bender.yml +++ b/Bender.yml @@ -211,7 +211,7 @@ sources: - include_dirs: - common/local/util files: - - common/local/util/sram.sv + - common/local/util/sram_pulp.sv - target: not(all(fpga, xilinx)) include_dirs: diff --git a/common/local/util/sram_pulp.sv b/common/local/util/sram_pulp.sv new file mode 100644 index 0000000000..f3c4773bf4 --- /dev/null +++ b/common/local/util/sram_pulp.sv @@ -0,0 +1,90 @@ +// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba , ETH Zurich +// Michael Schaffner , ETH Zurich +// Nils Wistoff , ETH Zurich +// Date: 15.08.2018 +// Description: generic tc_sram wrapper for CVA6 +// +// Note: the wrapped module contains two different implementations for +// ALTERA and XILINX tools, since these follow different coding styles for +// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or +// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA) + +module sram #( + parameter DATA_WIDTH = 64, + parameter BYTE_WIDTH = 8, + parameter USER_WIDTH = 1, + parameter USER_EN = 0, + parameter NUM_WORDS = 1024, + parameter SIM_INIT = "none", + parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2) +)( + input logic clk_i, + input logic rst_ni, + input logic req_i, + input logic we_i, + input logic [$clog2(NUM_WORDS)-1:0] addr_i, + input logic [USER_WIDTH-1:0] wuser_i, + input logic [DATA_WIDTH-1:0] wdata_i, + input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i, + output logic [USER_WIDTH-1:0] ruser_o, + output logic [DATA_WIDTH-1:0] rdata_o +); + + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wdata_i ), + .addr_i ( addr_i ), + .rdata_o ( rdata_o ) + ); + + if (USER_EN > 0) begin : gen_mem_user + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram_user ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wuser_i ), + .addr_i ( addr_i ), + .rdata_o ( ruser_o ) + ); + + if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width + $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set)."); + end + + end else begin + assign ruser_o = '0; + end + +endmodule : sram diff --git a/core/Flist.cva6 b/core/Flist.cva6 index 1107f12764..e9d5992412 100644 --- a/core/Flist.cva6 +++ b/core/Flist.cva6 @@ -167,6 +167,6 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer_if.sv ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv ${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv -${CVA6_REPO_DIR}/common/local/util/sram.sv +${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv // end of manifest diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 9a8923a1ee..9c23fcbcfd 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -296,10 +296,11 @@ module cache_ctrl import ariane_pkg::*; import std_cache_pkg::*; #( addr_o = mem_req_q.index; we_o = 1'b1; - be_o.vldrty = hit_way_q; - // set the correct byte enable be_o.data[cl_offset>>3 +: 8] = mem_req_q.be; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data}; + end data_o.data[cl_offset +: 64] = mem_req_q.wdata; // ~> change the state data_o.dirty[cl_offset>>3 +: 8] = mem_req_q.be; diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index 1da09ccc7b..a39e31827d 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -300,8 +300,11 @@ module miss_handler import ariane_pkg::*; import std_cache_pkg::*; #( addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; req_o = evict_way_q; we_o = 1'b1; - be_o = '1; - be_o.vldrty = evict_way_q; + be_o.tag = '1; + be_o.data = '1; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; @@ -345,7 +348,9 @@ module miss_handler import ariane_pkg::*; import std_cache_pkg::*; #( we_o = 1'b1; data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1; // invalidate - be_o.vldrty = evict_way_q; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end // go back to handling the miss or flushing, depending on where we came from state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; end diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index 6f4e4d3e5b..17c5fdaaaf 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -89,7 +89,7 @@ import std_cache_pkg::*; cache_line_t wdata_ram; cache_line_t [DCACHE_SET_ASSOC-1:0] rdata_ram; cl_be_t be_ram; - logic [(DCACHE_LINE_WIDTH/8+1)*DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; + vldrty_t [DCACHE_SET_ASSOC-1:0] be_valid_dirty_ram; // Busy signals logic miss_handler_busy; @@ -222,30 +222,20 @@ import std_cache_pkg::*; // Valid/Dirty Regs // ---------------- - // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. - // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 17x overhead introduced by this workaround. - logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; + vldrty_t [DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - for (genvar j = 0; j < DCACHE_LINE_WIDTH/8; j++) begin - // dirty bits assignment - assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; - assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j]; - end - // valid bit assignment - assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; - assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i]; - end - - // be construction for valid_dirty_sram - for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; + assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid}; + assign rdata_ram[i].dirty = dirty_rdata[i].dirty; + assign rdata_ram[i].valid = dirty_rdata[i].valid; + assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid; + assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty; end sram #( .USER_WIDTH ( 1 ), - .DATA_WIDTH ( (DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC ), + .DATA_WIDTH ( DCACHE_SET_ASSOC*$bits(vldrty_t) ), + .BYTE_WIDTH ( 1 ), .NUM_WORDS ( DCACHE_NUM_WORDS ) ) valid_dirty_sram ( .clk_i ( clk_i ), diff --git a/core/include/std_cache_pkg.sv b/core/include/std_cache_pkg.sv index 740bf56cfe..de82701cab 100644 --- a/core/include/std_cache_pkg.sv +++ b/core/include/std_cache_pkg.sv @@ -60,6 +60,11 @@ package std_cache_pkg; logic [63:0] rdata; } bypass_rsp_t; + typedef struct packed { + logic [ariane_pkg::DCACHE_LINE_WIDTH/8-1:0] dirty; + logic valid; + } vldrty_t; + typedef struct packed { logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array @@ -69,9 +74,9 @@ package std_cache_pkg; // cache line byte enable typedef struct packed { - logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array - logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array - logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits) + logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array + logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array + vldrty_t [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array } cl_be_t; // convert one hot to bin for -> needed for cache replacement diff --git a/corev_apu/tb/ariane_tb.cpp b/corev_apu/tb/ariane_tb.cpp index a9805ee0b9..b5473d059e 100644 --- a/corev_apu/tb/ariane_tb.cpp +++ b/corev_apu/tb/ariane_tb.cpp @@ -338,10 +338,10 @@ int main(int argc, char **argv) { size_t mem_size = 0xFFFFFF; #if (VERILATOR_VERSION_INTEGER >= 5000000) // Verilator v5: Use rootp pointer and .data() accessor. - memif.read(0x80000000, mem_size, (void *)top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.data()); + memif.read(0x80000000, mem_size, (void *)top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.data()); #else // Verilator v4 - memif.read(0x80000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram); + memif.read(0x80000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram); #endif // memif.read(0x84000000, mem_size, (void *)top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__gen_mem__DOT__gen_mem_user__DOT__i_tc_sram_wrapper_user__DOT__i_tc_sram__DOT__sram); diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv index c0d9c32239..bbbfbaf65b 100644 --- a/corev_apu/tb/ariane_tb.sv +++ b/corev_apu/tb/ariane_tb.sv @@ -19,7 +19,7 @@ import uvm_pkg::*; `include "uvm_macros.svh" -`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)] +`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)] // `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem.gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)] import "DPI-C" function read_elf(input string filename);