From eb67e8ac44890658047020574e6707a971d95bb9 Mon Sep 17 00:00:00 2001 From: Ryan Antonio Date: Fri, 29 Sep 2023 16:34:13 +0200 Subject: [PATCH] Add SNAX GEMM (#25) * bender: Add snax-gemm to bender * bender: Update bender with latest snax-gemm repo * hw: Add snax-gemm.sv * hw: Modify snitch_cluster and add snax-gemm instance * hw: Modify snitch cluster template * docs: Modify snitch cluster schema * lint: Exclude snax_gemm from license check * hw: Lint snax_gemm * cfg: Modify snax.hjson with new config * improve gemm * modify gemm.sv * hw: modify status CSR * cfg: Add snax-hjson configuration * tests: Renamed snax-run.yaml to snax-mac-run.yaml * cfg: Modify to use MAC generation * cfg: Rename snax.hjson to snax-mac.hjson * ci: Change build to SNAX-MAC runs * cfg: Modify snax-gemm.hjson * tests: Add snax-gemm-run.yaml * ci: Add the SNAX-GEMM in CI * ci: Fix lint indentation * cfg: Take out original Snitch compute core templates --------- Co-authored-by: xiaoling-yi --- .github/workflows/ci.yml | 32 +- .github/workflows/lint.yml | 1 + Bender.yml | 10 +- docs/schema/snitch_cluster.schema.json | 10 +- hw/snax_gemm/src/snax_gemm.sv | 361 ++++++++++++++++++ hw/snitch_cluster/src/snitch_cluster.sv | 45 ++- .../src/snitch_cluster_wrapper.sv.tpl | 3 +- target/snitch_cluster/cfg/snax-gemm.hjson | 122 ++++++ .../cfg/{snax.hjson => snax-mac.hjson} | 46 +-- .../snitch_cluster/sw/apps/snax-gemm-run.yaml | 17 + .../apps/{snax-run.yaml => snax-mac-run.yaml} | 0 .../{snax-run.yaml => snax-gemm-run.yaml} | 0 .../snitch_cluster/sw/tests/snax-mac-run.yaml | 71 ++++ 13 files changed, 656 insertions(+), 62 deletions(-) create mode 100644 hw/snax_gemm/src/snax_gemm.sv create mode 100644 target/snitch_cluster/cfg/snax-gemm.hjson rename target/snitch_cluster/cfg/{snax.hjson => snax-mac.hjson} (73%) create mode 100644 target/snitch_cluster/sw/apps/snax-gemm-run.yaml rename target/snitch_cluster/sw/apps/{snax-run.yaml => snax-mac-run.yaml} (100%) rename target/snitch_cluster/sw/tests/{snax-run.yaml => snax-gemm-run.yaml} (100%) create mode 100644 target/snitch_cluster/sw/tests/snax-mac-run.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db116c316..aa4a408e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,8 +50,8 @@ jobs: # Simulate SW on Snax Cluster w/ Verilator # ############################################ - sw-snax-cluster-vlt: - name: Simulate SW on SNAX Cluster w/ Verilator + sw-snax-mac-cluster-vlt: + name: Simulate SW on SNAX MAC Cluster w/ Verilator runs-on: ubuntu-22.04 container: image: ghcr.io/pulp-platform/snitch_cluster:main @@ -61,7 +61,7 @@ jobs: submodules: 'recursive' - name: Build Hardware run: | - make CFG_OVERRIDE=cfg/snax.hjson \ + make CFG_OVERRIDE=cfg/snax-mac.hjson \ -C target/snitch_cluster bin/snitch_cluster.vlt - name: Build Software run: | @@ -70,8 +70,30 @@ jobs: - name: Run Tests working-directory: target/snitch_cluster run: |- - ./run.py sw/tests/snax-run.yaml --simulator verilator - ./run.py sw/apps/snax-run.yaml --simulator verilator + ./run.py sw/tests/snax-mac-run.yaml --simulator verilator + ./run.py sw/apps/snax-mac-run.yaml --simulator verilator + + sw-snax-gemm-cluster-vlt: + name: Simulate SW on SNAX GEMM Cluster w/ Verilator + runs-on: ubuntu-22.04 + container: + image: ghcr.io/pulp-platform/snitch_cluster:main + steps: + - uses: actions/checkout@v2 + with: + submodules: 'recursive' + - name: Build Hardware + run: | + make CFG_OVERRIDE=cfg/snax-gemm.hjson \ + -C target/snitch_cluster bin/snitch_cluster.vlt + - name: 
Build Software + run: | + make -C target/snitch_cluster sw + - name: Run Tests + working-directory: target/snitch_cluster + run: |- + ./run.py sw/tests/snax-gemm-run.yaml --simulator verilator + ./run.py sw/apps/snax-gemm-run.yaml --simulator verilator ############################################ # Build SW on Snitch Cluster w/ Banshee # diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5a062ba60..68cb06e03 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -87,6 +87,7 @@ jobs: sw/math/Makefile hw/snitch/src/csr_snax_def.sv hw/snax_hwpe_mac/src/* + hw/snax_gemm/src/* target/snitch_cluster/sw/apps/snax-mac/* target/snitch_cluster/sw/apps/snax-mac-simple/* diff --git a/Bender.yml b/Bender.yml index d5f431a24..b9e93b664 100644 --- a/Bender.yml +++ b/Bender.yml @@ -17,6 +17,8 @@ package: - Nils Wistoff - Luca Bertaccini - Matheus Cavalcante + - Xiaoling Yi + - Ryan Antonio dependencies: axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 } @@ -27,6 +29,7 @@ dependencies: tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } hwpe-mac-engine: { git: https://github.com/KULeuven-MICAS/hwpe-mac-engine.git, rev: 5d3b4525b665169fc8321c8a811f3c83ad3c72e8 } + snax-gemm: { git: https://github.com/KULeuven-MICAS/snax-gemm.git, rev: 6b538447969cb0c00a4b749b38d68ae932c6aca1 } vendor_package: - name: musl @@ -184,13 +187,18 @@ sources: files: - hw/snitch/test/snitch_l0_tlb_tb.sv - # hwpe_mac + # snax mac - files: # Level 0 - hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv - hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv - hw/snax_hwpe_mac/src/snax_mac.sv + # snax gemm + - files: + # Level 0 + - hw/snax_gemm/src/snax_gemm.sv + # snitch_vm - files: - hw/snitch_vm/src/snitch_ptw.sv diff --git a/docs/schema/snitch_cluster.schema.json b/docs/schema/snitch_cluster.schema.json index 5ea872ec6..e1ad5ed51 100644 --- a/docs/schema/snitch_cluster.schema.json +++ b/docs/schema/snitch_cluster.schema.json @@ -416,11 +416,11 @@ "description": "Enable Smallfloat vector extension (SIMD).", "default": false }, - "snax": { - "type": "boolean", - "title": "SNAX Extension", - "description": "Enable Snitch Accelerator eXtension (SNAX)", - "default": false + "snax_acc": { + "type": "number", + "title": "SNAX Accelerator Extension", + "description": "Enable specific Snitch Accelerator eXtension (SNAX). 1 for MAC engine and 2 for GEMM engine.", + "default": 0 }, "snax_tcdm_ports": { "type": "number", diff --git a/hw/snax_gemm/src/snax_gemm.sv b/hw/snax_gemm/src/snax_gemm.sv new file mode 100644 index 000000000..6e005cfd7 --- /dev/null +++ b/hw/snax_gemm/src/snax_gemm.sv @@ -0,0 +1,361 @@ +//-------------------------------------------------------------------- +// Copyright 2023 Katolieke Universiteit Leuven (KUL) +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Xiaoling Yi (xiaoling.yi@kuleuven.be) +//-------------------------------------------------------------------- + +// verilog_lint: waive-start line-length +// verilog_lint: waive-start no-trailing-spaces + +import riscv_instr::*; +import reqrsp_pkg::*; + +module snax_gemm # ( + parameter int unsigned DataWidth = 64, + parameter int unsigned SnaxTcdmPorts = 16, + parameter type acc_req_t = logic, + parameter type acc_rsp_t = logic, + parameter type tcdm_req_t = logic, + parameter type tcdm_rsp_t = logic +)( + input logic clk_i, + input logic rst_ni, + + input logic snax_qvalid_i, + output logic snax_qready_o, + input acc_req_t snax_req_i, + + output acc_rsp_t snax_resp_o, + output logic snax_pvalid_o, + input logic snax_pready_i, + + output tcdm_req_t [SnaxTcdmPorts-1:0] snax_tcdm_req_o, + input tcdm_rsp_t [SnaxTcdmPorts-1:0] snax_tcdm_rsp_i +); + + // Local parameters for input and output sizes + localparam int unsigned InputMatrixSize = DataWidth*SnaxTcdmPorts/2; + localparam int unsigned OutputMatrixSize = InputMatrixSize*4; // x4 because of multiplication and addition considerations + + // CSRs + localparam int unsigned RegNum = 5; + localparam int unsigned CsrAddrOFfset = 32'h3c0; + + logic [31:0] CSRs [RegNum]; + logic [31:0] csr_addr; + + logic write_csr; + logic read_csr; + + + // Gemm wires + logic [ InputMatrixSize-1:0] io_a_io_in; + logic [ InputMatrixSize-1:0] io_b_io_in; + logic [OutputMatrixSize-1:0] io_c_io_out; + logic [OutputMatrixSize / 2 -1:0] io_c_io_out_reg; + logic io_start_do; + logic io_data_in_valid; + logic io_data_out_valid; + + localparam int unsigned HalfC = InputMatrixSize*2; + localparam int unsigned HalfHalfCAddr = HalfC/2/8; + localparam int unsigned HalfCAddr = HalfC/8; + localparam int unsigned HalfHalfC = HalfC/2; + + // 2 cycle to write data out because output is 8 * 8 * 32bits = 2048. + // But TCDM has 16 ports only (2048 / (16 * 32) = 2). + logic read_tcdm; + logic write_tcdm_1; + logic write_tcdm_2; + logic read_tcdm_done; + logic write_tcdm_done; + logic write_tcdm_done_1; + logic write_tcdm_done_2; + logic tcdm_not_ready; + logic [SnaxTcdmPorts-1:0] snax_tcdm_rsp_i_p_valid; + logic [SnaxTcdmPorts-1:0] snax_tcdm_req_o_q_valid; + + // Gemm control states, we have 3 states currently. + // By default, the state is IDLE_GEMM + // When io_start_do assert, we send read data request and turn to COMP_GEMM + // When io_data_out_valid assert, we write back the least half of results to TCDM and turn to WRITE_GEMM + // Afterwards, we turn to IDLE_GEMM. 
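+  //
+  // The control flow above is driven through the CSRs declared earlier
+  // (CSR numbers are offsets from CsrAddrOFfset = 32'h3c0):
+  //   CSRs[0]: TCDM base address of input matrix A
+  //   CSRs[1]: TCDM base address of input matrix B
+  //   CSRs[2]: TCDM base address of output matrix C
+  //   CSRs[3]: start trigger (a valid CSR access at this offset asserts io_start_do)
+  //   CSRs[4]: status flag, reset to 1, cleared when the computation starts and
+  //            set back to 1 once the second write-back phase (write_tcdm_done_2) completes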
+ typedef enum logic [2:0] { + IDLE_GEMM, + COMP_GEMM, + WRITE_GEMM + } ctrl_states_t; + + ctrl_states_t cstate, nstate; + + // Write CSRs + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + for (int i=0; i < RegNum - 1; i++) begin + CSRs[i] <= 32'd0; + end + CSRs[4] <= 32'd1; + end else begin + if(write_csr == 1'b1 && io_start_do != 1'b1) begin + CSRs[csr_addr] <= snax_req_i.data_arga[31:0]; + end + else begin + if (io_start_do == 1'b1) begin + CSRs[4] <= 32'd0; + end + else if (write_tcdm_done_2 == 1'b1) begin + CSRs[4] <= 32'd1; + end + end + end + end + + // Read CSRs + always_comb begin + if (!rst_ni) begin + snax_resp_o.data = 0; + snax_resp_o.id = 0; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b0; + end else begin + if(read_csr) begin + snax_resp_o.data = {32'b0,CSRs[csr_addr]}; + snax_resp_o.id = snax_req_i.id; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b1; + end + else begin + snax_resp_o.data = 0; + snax_resp_o.id = 0; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b0; + end + end + end + + // Read or write control logic + always_comb begin + if (!rst_ni) begin + read_csr = 1'b0; + write_csr = 1'b0; + end + else if(snax_qvalid_i) begin + unique casez (snax_req_i.data_op) + CSRRS, CSRRSI, CSRRC, CSRRCI: begin + read_csr = 1'b1; + write_csr = 1'b0; + end + default: begin + write_csr = 1'b1; + read_csr = 1'b0; + end + endcase + end + else begin + read_csr = 1'b0; + write_csr = 1'b0; + end + end + + assign snax_qready_o = 1'b1; + assign csr_addr = snax_req_i.data_argb - CsrAddrOFfset; + + Gemm inst_gemm( + .clock ( clk_i ), // :9016:11 + .reset ( !rst_ni ), // :9017:11 + .io_data_in_valid ( io_data_in_valid ), // src/main/scala/gemm/gemm.scala:309:16 + .io_a_io_in ( io_a_io_in ), // src/main/scala/gemm/gemm.scala:309:16 + .io_b_io_in ( io_b_io_in ), // src/main/scala/gemm/gemm.scala:309:16 + .io_data_out_valid ( io_data_out_valid ), // src/main/scala/gemm/gemm.scala:309:16 + .io_c_io_out ( io_c_io_out ) // src/main/scala/gemm/gemm.scala:309:16 + ); + + // Holding output + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + io_c_io_out_reg <= 0; + end else begin + if (io_data_out_valid) begin + io_c_io_out_reg <= io_c_io_out[OutputMatrixSize - 1: OutputMatrixSize / 2]; + end + end + end + + // Changing states + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + cstate <= IDLE_GEMM; + end else begin + cstate <= nstate; + end + end + + // Next state changes + always_comb begin + case(cstate) + IDLE_GEMM: begin + if (io_start_do) begin + nstate = COMP_GEMM; + end else begin + nstate = IDLE_GEMM; + end + end + COMP_GEMM: begin + if (write_tcdm_done_1) begin + nstate = WRITE_GEMM; + end else begin + nstate = COMP_GEMM; + end + end + WRITE_GEMM: begin + if (write_tcdm_done_2) begin + nstate = IDLE_GEMM; + end else begin + nstate = WRITE_GEMM; + end + end + default: begin + nstate = IDLE_GEMM; + end + endcase + + end + + assign io_start_do = snax_qvalid_i & (csr_addr == 3) & snax_qready_o; + + // read data from TCDM and write data to TCDM + + always_comb begin + for (int i = 0; i < SnaxTcdmPorts / 2; i++) begin + if(!rst_ni) begin + snax_tcdm_req_o[i].q_valid = 1'b0; + snax_tcdm_req_o[i].q.addr = 17'b0; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 
2].q.addr = 17'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(read_tcdm) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[0] + i * 8; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[1] + i * 8; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(write_tcdm_1) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[2] + i * 8; + snax_tcdm_req_o[i].q.write = 1'b1; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = io_c_io_out[i * DataWidth +: DataWidth]; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[2] + i * 8 + HalfHalfCAddr; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = io_c_io_out[(i * DataWidth + HalfHalfC) +: DataWidth]; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(write_tcdm_2) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[2] + i * 8 + HalfCAddr; + snax_tcdm_req_o[i].q.write = 1'b1; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = io_c_io_out_reg[(i * DataWidth) +: DataWidth]; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[2] + i * 8 + HalfCAddr + HalfHalfCAddr; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = io_c_io_out_reg[(i * DataWidth + HalfHalfC) +: DataWidth]; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else begin + snax_tcdm_req_o[i].q_valid = 1'b0; + snax_tcdm_req_o[i].q.addr = 17'b0; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = 17'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 
2].q.user = '0; + end + end + end + + always_comb begin + if (!rst_ni) begin + io_a_io_in = 512'b0; + io_b_io_in = 512'b0; + end else begin + for (int i = 0; i < SnaxTcdmPorts / 2; i++) begin + if(io_data_in_valid) begin + io_a_io_in[i * DataWidth +: DataWidth] = snax_tcdm_rsp_i[i].p.data; + io_b_io_in[i * DataWidth +: DataWidth] = snax_tcdm_rsp_i[i + SnaxTcdmPorts / 2].p.data; + end + else begin + io_a_io_in[i * DataWidth +: DataWidth] = 0; + io_b_io_in[i * DataWidth +: DataWidth] = 0; + end + end + end + end + + always_comb begin + for (int i = 0; i < SnaxTcdmPorts; i++) begin + if(!rst_ni) begin + snax_tcdm_rsp_i_p_valid[i] = 1'b0; + snax_tcdm_req_o_q_valid[i] = 1'b0; + end + else begin + snax_tcdm_rsp_i_p_valid[i] = snax_tcdm_rsp_i[i].p_valid; + snax_tcdm_req_o_q_valid[i] = snax_tcdm_req_o[i].q_valid; + end + end + end + + assign tcdm_not_ready = ~io_data_in_valid; + assign io_data_in_valid = ((&snax_tcdm_rsp_i_p_valid) === 1'b1 && (cstate == COMP_GEMM))? 1'b1 : 1'b0; + assign read_tcdm = io_start_do; + assign write_tcdm_1 = io_data_out_valid; + assign write_tcdm_2 = cstate == WRITE_GEMM; + assign read_tcdm_done = io_data_in_valid; + assign write_tcdm_done_1 = (&snax_tcdm_req_o_q_valid) && cstate == COMP_GEMM; + assign write_tcdm_done_2 = (&snax_tcdm_req_o_q_valid) && cstate == WRITE_GEMM; + assign write_tcdm_done = write_tcdm_done_1 & write_tcdm_done_2; + +endmodule diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 5c9b442ba..909aad57f 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -96,9 +96,9 @@ module snitch_cluster /// FPU configuration. parameter fpnew_pkg::fpu_implementation_t FPUImplementation [NrCores] = '{default: fpnew_pkg::fpu_implementation_t'(0)}, - /// Enable SNAX per core - parameter bit [NrCores-1:0] SNAX = '0, - /// Number of SNAX TCDM ports, the default assumes HWPE MAC engine + /// Enable SNAX accelerators per core. Integer coded. 
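+  /// 0 selects no accelerator, 1 the SNAX MAC engine, and 2 the SNAX GEMM engine,
+  /// matching the snax_acc field in docs/schema/snitch_cluster.schema.json.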
+ parameter int unsigned SNAX [NrCores] = '{default: 0}, + /// Number of SNAX TCDM ports per core parameter int unsigned SnaxTcdmPorts [NrCores] = '{default: 0}, /// Physical Memory Attribute Configuration parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '0, @@ -266,7 +266,17 @@ module snitch_cluster return n; endfunction + function automatic int unsigned check_if_snax(int unsigned core_idx); + for (int i = 0; i < core_idx; i++) begin + if ( SNAX[i] > 0) begin + return 1; + end + end + return 0; + endfunction + // SNAX TCDM + localparam int unsigned SnaxSystem = check_if_snax(NrCores); localparam int unsigned TotalSnaxTcdmPorts = get_snax_tcdm_port_offs(NrCores); localparam int unsigned NrTCDMPortsCores = get_tcdm_port_offs(NrCores); @@ -778,7 +788,7 @@ module snitch_cluster end // generate TCDM for snax if any of the cores has SNAX enabled - if(|SNAX) begin: gen_yes_snax_tcdm_interconnect + if( SnaxSystem ) begin: gen_yes_snax_tcdm_interconnect snitch_tcdm_interconnect #( .NumInp (NumTCDMIn + TotalSnaxTcdmPorts), .NumOut (NrBanks), @@ -974,12 +984,15 @@ module snitch_cluster for (genvar i = 0; i < NrCores; i++) begin : gen_snax_acc + // Calculate exact count of SNAX TCDM ports localparam int unsigned LocalSnaxTcdmPorts = get_snax_tcdm_ports(i); localparam int unsigned LocalSnaxTcdmOffset = get_snax_tcdm_port_offs(i); tcdm_req_t [LocalSnaxTcdmPorts-1:0] hang_snax_tcdm_req; tcdm_rsp_t [LocalSnaxTcdmPorts-1:0] hang_snax_tcdm_rsp; + // Remap SNAX TCDM ports to prune cores that + // do not use SNAX TCDM ports for (genvar j = 0; j < LocalSnaxTcdmPorts; j++) begin : gen_snax_tcdm_map_per_core always_comb begin snax_tcdm_req[LocalSnaxTcdmOffset+j] = hang_snax_tcdm_req[j]; @@ -987,7 +1000,7 @@ module snitch_cluster end end - if(SNAX[i]) begin: gen_yes_mac + if(SNAX[i] == 1) begin: gen_snax_mac snax_mac # ( .DataWidth ( 32 ), @@ -1009,6 +1022,28 @@ module snitch_cluster .snax_tcdm_rsp_i ( hang_snax_tcdm_rsp ) ); + end else if (SNAX[i] == 2) begin: gen_snax_gemm + + snax_gemm # ( + .DataWidth ( NarrowDataWidth ), + .SnaxTcdmPorts ( LocalSnaxTcdmPorts ), + .acc_req_t ( acc_req_t ), + .acc_rsp_t ( acc_resp_t ), + .tcdm_req_t ( tcdm_req_t ), + .tcdm_rsp_t ( tcdm_rsp_t ) + ) i_snax_gemm ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .snax_req_i ( snax_req[i] ), + .snax_qvalid_i ( snax_qvalid[i] ), + .snax_qready_o ( snax_qready[i] ), + .snax_resp_o ( snax_resp[i] ), + .snax_pvalid_o ( snax_pvalid[i] ), + .snax_pready_i ( snax_pready[i] ), + .snax_tcdm_req_o ( hang_snax_tcdm_req ), + .snax_tcdm_rsp_i ( hang_snax_tcdm_rsp ) + ); + end else begin: gen_no_snax_acc // Tie these signal to low when no SNAX accelerator is present assign snax_qready[i] = '0; diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 6c1ff17cb..5a07b5b89 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -251,6 +251,7 @@ module ${cfg['name']}_wrapper ( localparam int unsigned NumSequencerInstr [${cfg['nr_cores']}] = '{${core_cfg('num_sequencer_instructions')}}; localparam int unsigned NumSsrs [${cfg['nr_cores']}] = '{${core_cfg('num_ssrs')}}; localparam int unsigned SsrMuxRespDepth [${cfg['nr_cores']}] = '{${core_cfg('ssr_mux_resp_depth')}}; + localparam int unsigned SNAX [${cfg['nr_cores']}] = '{${core_cfg('snax_acc')}}; localparam int unsigned SnaxTcdmPorts [${cfg['nr_cores']}] = '{${core_cfg('snax_tcdm_ports')}}; // Snitch cluster under test. 
@@ -296,7 +297,7 @@ module ${cfg['name']}_wrapper ( .Xdma (${core_cfg_flat('xdma')}), .Xssr (${core_cfg_flat('xssr')}), .Xfrep (${core_cfg_flat('xfrep')}), - .SNAX(${core_cfg_flat('snax')}), + .SNAX(SNAX), .SnaxTcdmPorts(SnaxTcdmPorts), .FPUImplementation (${cfg['pkg_name']}::FPUImplementation), .SnitchPMACfg (${cfg['pkg_name']}::SnitchPMACfg), diff --git a/target/snitch_cluster/cfg/snax-gemm.hjson b/target/snitch_cluster/cfg/snax-gemm.hjson new file mode 100644 index 000000000..96e6f23f4 --- /dev/null +++ b/target/snitch_cluster/cfg/snax-gemm.hjson @@ -0,0 +1,122 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_axi_req_fifo_depth: 3, + dma_req_fifo_depth: 3, + // Timing parameters + timing: { + lat_comp_fp32: 3, + lat_comp_fp64: 3, + lat_comp_fp16: 2, + lat_comp_fp16_alt: 2, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 1, + lat_sdotp: 2, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/snax_gemm_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. 
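+  // snax_gemm_core_template: a compute core with the SNAX GEMM engine attached
+  // (snax_acc: 2) and 16 dedicated accelerator TCDM ports.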
+ snax_gemm_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + snax_acc: 2, + snax_tcdm_ports: 16, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // Enable division/square root unit + // Xdiv_sqrt: true, + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} diff --git a/target/snitch_cluster/cfg/snax.hjson b/target/snitch_cluster/cfg/snax-mac.hjson similarity index 73% rename from target/snitch_cluster/cfg/snax.hjson rename to target/snitch_cluster/cfg/snax-mac.hjson index c4bbf2bc3..851d3b735 100644 --- a/target/snitch_cluster/cfg/snax.hjson +++ b/target/snitch_cluster/cfg/snax-mac.hjson @@ -86,7 +86,7 @@ xf8alt: true, xfdotp: true, xfvec: true, - snax: true, + snax_acc: 1, snax_tcdm_ports: 4, num_int_outstanding_loads: 1, num_int_outstanding_mem: 4, @@ -98,50 +98,6 @@ // Enable division/square root unit // Xdiv_sqrt: true, }, - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - snax: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // Enable division/square root unit - // Xdiv_sqrt: true, - }, - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - snax: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // Enable division/square root unit - // Xdiv_sqrt: true, - }, dma_core_template: { isa: "rv32imafd", // Xdiv_sqrt: true, diff --git a/target/snitch_cluster/sw/apps/snax-gemm-run.yaml b/target/snitch_cluster/sw/apps/snax-gemm-run.yaml new file mode 100644 index 000000000..93bd32d8b --- /dev/null +++ b/target/snitch_cluster/sw/apps/snax-gemm-run.yaml @@ -0,0 +1,17 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +runs: + - app: blas/axpy + cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf} + - app: blas/gemm + - app: dnn/batchnorm + - app: dnn/linear + - app: dnn/maxpool + - app: dnn/gemm +# dnn/gelu # seems like it stalls +# dnn/conv2d # fails with exit code 32 +# dnn/fusedconv # fails newly +# dnn/layernorm # throws illegal instruction on FDIV in simulation +# dnn/softmax # throws illegal instruction on FDIV in simulation diff --git a/target/snitch_cluster/sw/apps/snax-run.yaml b/target/snitch_cluster/sw/apps/snax-mac-run.yaml similarity index 100% rename from target/snitch_cluster/sw/apps/snax-run.yaml rename to target/snitch_cluster/sw/apps/snax-mac-run.yaml diff --git a/target/snitch_cluster/sw/tests/snax-run.yaml b/target/snitch_cluster/sw/tests/snax-gemm-run.yaml similarity index 100% rename from target/snitch_cluster/sw/tests/snax-run.yaml rename to target/snitch_cluster/sw/tests/snax-gemm-run.yaml diff --git a/target/snitch_cluster/sw/tests/snax-mac-run.yaml b/target/snitch_cluster/sw/tests/snax-mac-run.yaml new file mode 100644 index 000000000..880f5e3e2 --- /dev/null +++ b/target/snitch_cluster/sw/tests/snax-mac-run.yaml @@ -0,0 +1,71 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +runs: + - app: atomics + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x4 + - app: barrier + - app: data_mover + - app: dma_simple + #- app: event_unit # SNAX failes because this test needs more cores + - app: fence_i + - app: fp8_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp8_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 + - app: fp8alt_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8alt_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp8alt_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp8alt_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x12 + - app: fp16_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp16_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 + - app: fp16alt_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16alt_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16alt_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp16alt_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x16 + - app: fp32_comparison_scalar + - app: fp32_comparison_vector + - app: fp32_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 + - app: fp32_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 + - 
app: fp32_conversions_scalar + simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction + - app: fp64_conversions_scalar + simulators: [vsim, vcs, verilator] + # - app: interrupt + - app: interrupt-local + - app: multi_cluster + #- app: openmp_parallel # This requires multiple cores too + #- app: openmp_for_static_schedule # This requires multiple cores too + - app: openmp_double_buffering + - app: perf_cnt + - app: printf_simple + - app: printf_fmtint + - app: simple + #- app: team_global # This requires all 9 cores to be present + - app: tls + - app: varargs_1 + - app: varargs_2 + - app: zero_mem + - app: non_null_exitcode + exit_code: 14
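
Usage sketch (not part of the diff above): the new snax_gemm accelerator is programmed through five CSRs starting at CsrAddrOFfset = 0x3c0. The C fragment below illustrates the sequence a Snitch core would issue, assuming CSR accesses in this range are forwarded to the SNAX accelerator the same way the existing snax-mac flow does; the function name is hypothetical and only the CSR numbers and their meaning are taken from snax_gemm.sv.

#include <stdint.h>

// CSR numbers taken from snax_gemm.sv (CsrAddrOFfset = 0x3c0):
//   0x3c0: TCDM address of matrix A   (CSRs[0])
//   0x3c1: TCDM address of matrix B   (CSRs[1])
//   0x3c2: TCDM address of matrix C   (CSRs[2])
//   0x3c3: start trigger              (CSRs[3], asserts io_start_do)
//   0x3c4: status, 0 = busy, 1 = done (CSRs[4])
static inline void snax_gemm_run(uint32_t a_addr, uint32_t b_addr, uint32_t c_addr) {
    uint32_t done;
    asm volatile("csrw 0x3c0, %0" ::"r"(a_addr));
    asm volatile("csrw 0x3c1, %0" ::"r"(b_addr));
    asm volatile("csrw 0x3c2, %0" ::"r"(c_addr));
    asm volatile("csrw 0x3c3, %0" ::"r"(1));          // any write here starts the GEMM
    do {                                              // poll the status CSR until the
        asm volatile("csrr %0, 0x3c4" : "=r"(done));  // second write-back phase is done
    } while (done == 0);
}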