diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index d339d4d62..b415628af 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -1133,6 +1133,7 @@ package riscv_instr; localparam logic [11:0] CSR_MHARTID = 12'hf14; localparam logic [11:0] CSR_SSR = 12'h7c0; localparam logic [11:0] CSR_FPMODE = 12'h7c1; + localparam logic [11:0] CSR_BARRIER = 12'h7c2; localparam logic [11:0] CSR_HTIMEDELTAH = 12'h615; localparam logic [11:0] CSR_CYCLEH = 12'hc80; localparam logic [11:0] CSR_TIMEH = 12'hc81; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 87815c1bc..b5b13dca3 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -102,7 +102,10 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( output fpnew_pkg::fmt_mode_t fpu_fmt_mode_o, input fpnew_pkg::status_t fpu_status_i, // Core events for performance counters - output snitch_pkg::core_events_t core_events_o + output snitch_pkg::core_events_t core_events_o, + // Cluster HW barrier + output logic barrier_o, + input logic barrier_i ); // Debug module's base address localparam logic [31:0] DmBaseAddress = 0; @@ -232,6 +235,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( // ----- logic [31:0] csr_rvalue; logic csr_en; + logic csr_stall_d, csr_stall_q; localparam logic M = 0; localparam logic S = 1; @@ -293,6 +297,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( `FFNR(dscratch_q, dscratch_d, clk_i) `FFAR(debug_q, debug_d, '0, clk_i, rst_i) // Debug mode + `FFAR(csr_stall_q, csr_stall_d, '0, clk_i, rst_i) + typedef struct packed { fpnew_pkg::fmt_mode_t fmode; fpnew_pkg::roundmode_e frm; @@ -398,7 +404,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( | (~{(PPNSize){trans_active}} & {{{AddrWidth-32}{1'b0}}, pc_q[31:PageShift]}); assign inst_addr_o[PageShift-1:0] = pc_q[PageShift-1:0]; assign inst_cacheable_o = snitch_pma_pkg::is_inside_cacheable_regions(SnitchPMACfg, inst_addr_o); - assign inst_valid_o = ~wfi_q; + assign inst_valid_o = ~wfi_q && ~csr_stall_q; // -------------------- // Control @@ -454,7 +460,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( pc_d = pc_q; npc = pc_q; // the next PC if we wouldn't be in debug mode // if we got a valid instruction word increment the PC unless we are waiting for an event - if (!stall && !wfi_q) begin + if (!stall && !wfi_q && !csr_stall_q) begin casez (next_pc) Consec: npc = consec_pc; Alu: npc = alu_result & {{31{1'b1}}, ~zero_lsb}; @@ -2273,6 +2279,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( dpc_d = dpc_q; dscratch_d = dscratch_q; + csr_stall_d = csr_stall_q; + + if (barrier_i) csr_stall_d = 1'b0; + barrier_o = 1'b0; + // DPC and DCSR update logic if (!debug_q) begin if (valid_instr && inst_data_i == EBREAK) begin @@ -2468,6 +2479,11 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( if (!exception) fcsr_d = fcsr_t'(alu_result[9:0]); end else illegal_csr = 1'b1; end + // HW cluster barrier + CSR_BARRIER: begin + barrier_o = 1'b1; + csr_stall_d = 1'b1; + end default: csr_rvalue = '0; endcase end else illegal_csr = 1'b1; diff --git a/hw/snitch_cluster/src/snitch_barrier.sv b/hw/snitch_cluster/src/snitch_barrier.sv index 5193fcd0a..32b8cfad3 100644 --- a/hw/snitch_cluster/src/snitch_barrier.sv +++ b/hw/snitch_cluster/src/snitch_barrier.sv @@ -1,80 +1,37 @@ -// Copyright 2020 ETH Zurich and University of Bologna. +// Copyright 2023 ETH Zurich and University of Bologna. // Solderpad Hardware License, Version 0.51, see LICENSE for details. // SPDX-License-Identifier: SHL-0.51 - -// Author: Florian Zaruba -// Author: Fabian Schuiki +// +// Author: Luca Colagrande `include "common_cells/registers.svh" /// Hardware barrier to synchronize all cores in a cluster. -module snitch_barrier - import snitch_pkg::*; - import snitch_cluster_peripheral_reg_pkg::*; -#( - parameter int unsigned AddrWidth = 0, - parameter int NrPorts = 0, - parameter type dreq_t = logic, - parameter type drsp_t = logic, - /// Derived parameter *Do not override* - parameter type addr_t = logic [AddrWidth-1:0] +module snitch_barrier #( + parameter int NrCores = 0 ) ( - input logic clk_i, - input logic rst_ni, - input dreq_t [NrPorts-1:0] in_req_i, - output drsp_t [NrPorts-1:0] in_rsp_o, - - output dreq_t [NrPorts-1:0] out_req_o, - input drsp_t [NrPorts-1:0] out_rsp_i, - - input addr_t cluster_periph_start_address_i + input logic clk_i, + input logic rst_ni, + input logic [NrCores-1:0] barrier_i, + output logic barrier_o ); - typedef enum logic [1:0] { - Idle, - Wait, - Take - } barrier_state_e; - barrier_state_e [NrPorts-1:0] state_d, state_q; - logic [NrPorts-1:0] is_barrier; - logic take_barrier; + logic [NrCores-1:0] arrival_d, arrival_q; + + generate + for (genvar i = 0; i < NrCores; i++) begin : gen_arrival_bit - assign take_barrier = &is_barrier; + `FF(arrival_q[i], arrival_d[i], 1'b0, clk_i, rst_ni) - always_comb begin - state_d = state_q; - is_barrier = '0; - out_req_o = in_req_i; - in_rsp_o = out_rsp_i; + always_comb begin + if (barrier_o) arrival_d[i] = 1'b0; + else if (barrier_i[i]) arrival_d[i] = 1'b1; + else arrival_d[i] = arrival_q[i]; + end - for (int i = 0; i < NrPorts; i++) begin - case (state_q[i]) - Idle: begin - if (in_req_i[i].q_valid && - (in_req_i[i].q.addr == - cluster_periph_start_address_i + - SNITCH_CLUSTER_PERIPHERAL_HW_BARRIER_OFFSET)) begin - state_d[i] = Wait; - out_req_o[i].q_valid = 0; - in_rsp_o[i].q_ready = 0; - end - end - Wait: begin - is_barrier[i] = 1; - out_req_o[i].q_valid = 0; - in_rsp_o[i].q_ready = 0; - if (take_barrier) state_d[i] = Take; - end - Take: begin - if (out_req_o[i].q_valid && in_rsp_o[i].q_ready) state_d[i] = Idle; - end - default: state_d[i] = Idle; - endcase end - end + endgenerate - for (genvar i = 0; i < NrPorts; i++) begin : gen_ff - `FFARN(state_q[i], state_d[i], Idle, clk_i, rst_ni) - end + assign barrier_o = &arrival_q; endmodule diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 0d290fb53..3fab544d0 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -126,7 +126,10 @@ module snitch_cc #( output dma_events_t axi_dma_events_o, // Core event strobes output snitch_pkg::core_events_t core_events_o, - input addr_t tcdm_addr_base_i + input addr_t tcdm_addr_base_i, + // Cluster HW barrier + output logic barrier_o, + input logic barrier_i ); // FMA architecture is "merged" -> mulexp and macexp instructions are supported @@ -252,7 +255,9 @@ module snitch_cc #( .fpu_rnd_mode_o ( fpu_rnd_mode ), .fpu_fmt_mode_o ( fpu_fmt_mode ), .fpu_status_i ( fpu_status ), - .core_events_o ( snitch_events) + .core_events_o ( snitch_events), + .barrier_o ( barrier_o ), + .barrier_i ( barrier_i ) ); reqrsp_iso #( diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index acaee3acc..00e0899e1 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -490,8 +490,8 @@ module snitch_cluster snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; // 4. Memory Subsystem (Core side). - reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req; - reqrsp_rsp_t [NrCores-1:0] core_rsp, filtered_core_rsp; + reqrsp_req_t [NrCores-1:0] core_req; + reqrsp_rsp_t [NrCores-1:0] core_rsp; reqrsp_req_t [NrHives-1:0] ptw_req; reqrsp_rsp_t [NrHives-1:0] ptw_rsp; @@ -502,6 +502,8 @@ module snitch_cluster // 5. Misc. Wires. logic icache_prefetch_enable; logic [NrCores-1:0] cl_interrupt; + logic [NrCores-1:0] barrier_in; + logic barrier_out; // ------------- // DMA Subsystem @@ -889,7 +891,9 @@ module snitch_cluster .axi_dma_perf_o (), .axi_dma_events_o (dma_core_events), .core_events_o (core_events[i]), - .tcdm_addr_base_i (tcdm_start_address) + .tcdm_addr_base_i (tcdm_start_address), + .barrier_o (barrier_in[i]), + .barrier_i (barrier_out) ); for (genvar j = 0; j < TcdmPorts; j++) begin : gen_tcdm_user always_comb begin @@ -1000,19 +1004,14 @@ module snitch_cluster // -------- // Coes SoC // -------- + snitch_barrier #( - .AddrWidth (PhysicalAddrWidth), - .NrPorts (NrCores), - .dreq_t (reqrsp_req_t), - .drsp_t (reqrsp_rsp_t) + .NrCores(NrCores) ) i_snitch_barrier ( .clk_i, .rst_ni, - .in_req_i (core_req), - .in_rsp_o (core_rsp), - .out_req_o (filtered_core_req), - .out_rsp_i (filtered_core_rsp), - .cluster_periph_start_address_i (cluster_periph_start_address) + .barrier_i(barrier_in), + .barrier_o(barrier_out) ); reqrsp_req_t core_to_axi_req; @@ -1032,8 +1031,8 @@ module snitch_cluster ) i_reqrsp_mux_core ( .clk_i, .rst_ni, - .slv_req_i (filtered_core_req), - .slv_rsp_o (filtered_core_rsp), + .slv_req_i (core_req), + .slv_rsp_o (core_rsp), .mst_req_o (core_to_axi_req), .mst_rsp_i (core_to_axi_rsp), .idx_o (/*unused*/) diff --git a/sw/deps/riscv-opcodes b/sw/deps/riscv-opcodes index 77c3baf1a..23650ceed 160000 --- a/sw/deps/riscv-opcodes +++ b/sw/deps/riscv-opcodes @@ -1 +1 @@ -Subproject commit 77c3baf1acd8290d7efc8dc94abb659f3b6381c2 +Subproject commit 23650ceedbd542dcb319c15a217e63ced398cb8b diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 77d5a0326..3593980c7 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -13,13 +13,6 @@ static inline uint32_t snrt_cls_base_addr() { return l1_end_addr - cdata_size - cbss_size; } -static inline void snrt_crt0_cluster_hw_barrier() { - uint32_t register r; - uint32_t hw_barrier = - SNRT_CLUSTER_HW_BARRIER_ADDR + snrt_cluster_idx() * SNRT_CLUSTER_OFFSET; - asm volatile("lw %0, 0(%1)" : "=r"(r) : "r"(hw_barrier) : "memory"); -} - static inline void snrt_init_tls() { extern volatile uint32_t __tdata_start, __tdata_end; extern volatile uint32_t __tbss_start, __tbss_end; @@ -130,7 +123,7 @@ void snrt_main() { #endif #ifdef SNRT_CRT0_PRE_BARRIER - snrt_crt0_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); #endif #ifdef SNRT_CRT0_CALLBACK5 @@ -147,7 +140,7 @@ void snrt_main() { #endif #ifdef SNRT_CRT0_POST_BARRIER - snrt_crt0_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); #endif #ifdef SNRT_CRT0_CALLBACK7 diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index 22093fdc0..07eea700f 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -56,12 +56,7 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) { /// Synchronize cores in a cluster with a hardware barrier inline void snrt_cluster_hw_barrier() { - uint32_t register r; - - asm volatile("lw %0, 0(%1)" - : "=r"(r) - : "r"((uint32_t)snrt_cluster_hw_barrier_addr()) - : "memory"); + asm volatile("csrr x0, 0x7C2" ::: "memory"); } /// Synchronize clusters globally with a global software barrier diff --git a/util/generate-opcodes.sh b/util/generate-opcodes.sh index 069504c1f..82f35f391 100755 --- a/util/generate-opcodes.sh +++ b/util/generate-opcodes.sh @@ -3,7 +3,7 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -# Generate the opcodes for the Snith system. +# Generate the opcodes for the Snitch system. set -e ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)