From eb67e8ac44890658047020574e6707a971d95bb9 Mon Sep 17 00:00:00 2001 From: Ryan Antonio Date: Fri, 29 Sep 2023 16:34:13 +0200 Subject: [PATCH] Add SNAX GEMM (#25) * bender: Add snax-gemm to bender * bender: Update bender with latest snax-gemm repo * hw: Add snax-gemm.sv * hw: Modify snitch_cluster and add snax-gemm instance * hw: Modify snitch cluster template * docs: Modify snitch cluster schema * lint: Exclude snax_gemm from license check * hw: Lint snax_gemm * cfg: Modify snax.hjson with new config * improve gemm * modify gemm.sv * hw: modify status CSR * cfg: Add snax-hjson configuration * tests: Renamed snax-run.yaml to snax-mac-run.yaml * cfg: Modify to use MAC generation * cfg: Rename snax.hjson to snax-mac.hjson * ci: Change build to SNAX-MAC runs * cfg: Modify snax-gemm.hjson * tests: Add snax-gemm-run.yaml * ci: Add the SNAX-GEMM in CI * ci: Fix lint indentation * cfg: Take out original Snitch compute core templates --------- Co-authored-by: xiaoling-yi --- .github/workflows/ci.yml | 32 +- .github/workflows/lint.yml | 1 + Bender.yml | 10 +- docs/schema/snitch_cluster.schema.json | 10 +- hw/snax_gemm/src/snax_gemm.sv | 361 ++++++++++++++++++ hw/snitch_cluster/src/snitch_cluster.sv | 45 ++- .../src/snitch_cluster_wrapper.sv.tpl | 3 +- target/snitch_cluster/cfg/snax-gemm.hjson | 122 ++++++ .../cfg/{snax.hjson => snax-mac.hjson} | 46 +-- .../snitch_cluster/sw/apps/snax-gemm-run.yaml | 17 + .../apps/{snax-run.yaml => snax-mac-run.yaml} | 0 .../{snax-run.yaml => snax-gemm-run.yaml} | 0 .../snitch_cluster/sw/tests/snax-mac-run.yaml | 71 ++++ 13 files changed, 656 insertions(+), 62 deletions(-) create mode 100644 hw/snax_gemm/src/snax_gemm.sv create mode 100644 target/snitch_cluster/cfg/snax-gemm.hjson rename target/snitch_cluster/cfg/{snax.hjson => snax-mac.hjson} (73%) create mode 100644 target/snitch_cluster/sw/apps/snax-gemm-run.yaml rename target/snitch_cluster/sw/apps/{snax-run.yaml => snax-mac-run.yaml} (100%) rename target/snitch_cluster/sw/tests/{snax-run.yaml => snax-gemm-run.yaml} (100%) create mode 100644 target/snitch_cluster/sw/tests/snax-mac-run.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db116c316..aa4a408e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,8 +50,8 @@ jobs: # Simulate SW on Snax Cluster w/ Verilator # ############################################ - sw-snax-cluster-vlt: - name: Simulate SW on SNAX Cluster w/ Verilator + sw-snax-mac-cluster-vlt: + name: Simulate SW on SNAX MAC Cluster w/ Verilator runs-on: ubuntu-22.04 container: image: ghcr.io/pulp-platform/snitch_cluster:main @@ -61,7 +61,7 @@ jobs: submodules: 'recursive' - name: Build Hardware run: | - make CFG_OVERRIDE=cfg/snax.hjson \ + make CFG_OVERRIDE=cfg/snax-mac.hjson \ -C target/snitch_cluster bin/snitch_cluster.vlt - name: Build Software run: | @@ -70,8 +70,30 @@ jobs: - name: Run Tests working-directory: target/snitch_cluster run: |- - ./run.py sw/tests/snax-run.yaml --simulator verilator - ./run.py sw/apps/snax-run.yaml --simulator verilator + ./run.py sw/tests/snax-mac-run.yaml --simulator verilator + ./run.py sw/apps/snax-mac-run.yaml --simulator verilator + + sw-snax-gemm-cluster-vlt: + name: Simulate SW on SNAX GEMM Cluster w/ Verilator + runs-on: ubuntu-22.04 + container: + image: ghcr.io/pulp-platform/snitch_cluster:main + steps: + - uses: actions/checkout@v2 + with: + submodules: 'recursive' + - name: Build Hardware + run: | + make CFG_OVERRIDE=cfg/snax-gemm.hjson \ + -C target/snitch_cluster bin/snitch_cluster.vlt + - name: 
Build Software + run: | + make -C target/snitch_cluster sw + - name: Run Tests + working-directory: target/snitch_cluster + run: |- + ./run.py sw/tests/snax-gemm-run.yaml --simulator verilator + ./run.py sw/apps/snax-gemm-run.yaml --simulator verilator ############################################ # Build SW on Snitch Cluster w/ Banshee # diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5a062ba60..68cb06e03 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -87,6 +87,7 @@ jobs: sw/math/Makefile hw/snitch/src/csr_snax_def.sv hw/snax_hwpe_mac/src/* + hw/snax_gemm/src/* target/snitch_cluster/sw/apps/snax-mac/* target/snitch_cluster/sw/apps/snax-mac-simple/* diff --git a/Bender.yml b/Bender.yml index d5f431a24..b9e93b664 100644 --- a/Bender.yml +++ b/Bender.yml @@ -17,6 +17,8 @@ package: - Nils Wistoff - Luca Bertaccini - Matheus Cavalcante + - Xiaoling Yi + - Ryan Antonio dependencies: axi: { git: https://github.com/pulp-platform/axi, version: 0.39.0 } @@ -27,6 +29,7 @@ dependencies: tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } hwpe-mac-engine: { git: https://github.com/KULeuven-MICAS/hwpe-mac-engine.git, rev: 5d3b4525b665169fc8321c8a811f3c83ad3c72e8 } + snax-gemm: { git: https://github.com/KULeuven-MICAS/snax-gemm.git, rev: 6b538447969cb0c00a4b749b38d68ae932c6aca1 } vendor_package: - name: musl @@ -184,13 +187,18 @@ sources: files: - hw/snitch/test/snitch_l0_tlb_tb.sv - # hwpe_mac + # snax mac - files: # Level 0 - hw/snax_hwpe_mac/src/snax_hwpe_ctrl.sv - hw/snax_hwpe_mac/src/snax_hwpe_to_reqrsp.sv - hw/snax_hwpe_mac/src/snax_mac.sv + # snax gemm + - files: + # Level 0 + - hw/snax_gemm/src/snax_gemm.sv + # snitch_vm - files: - hw/snitch_vm/src/snitch_ptw.sv diff --git a/docs/schema/snitch_cluster.schema.json b/docs/schema/snitch_cluster.schema.json index 5ea872ec6..e1ad5ed51 100644 --- a/docs/schema/snitch_cluster.schema.json +++ b/docs/schema/snitch_cluster.schema.json @@ -416,11 +416,11 @@ "description": "Enable Smallfloat vector extension (SIMD).", "default": false }, - "snax": { - "type": "boolean", - "title": "SNAX Extension", - "description": "Enable Snitch Accelerator eXtension (SNAX)", - "default": false + "snax_acc": { + "type": "number", + "title": "SNAX Accelerator Extension", + "description": "Enable specific Snitch Accelerator eXtension (SNAX). 1 for MAC engine and 2 for GEMM engine.", + "default": 0 }, "snax_tcdm_ports": { "type": "number", diff --git a/hw/snax_gemm/src/snax_gemm.sv b/hw/snax_gemm/src/snax_gemm.sv new file mode 100644 index 000000000..6e005cfd7 --- /dev/null +++ b/hw/snax_gemm/src/snax_gemm.sv @@ -0,0 +1,361 @@ +//-------------------------------------------------------------------- +// Copyright 2023 Katolieke Universiteit Leuven (KUL) +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Xiaoling Yi (xiaoling.yi@kuleuven.be) +//-------------------------------------------------------------------- + +// verilog_lint: waive-start line-length +// verilog_lint: waive-start no-trailing-spaces + +import riscv_instr::*; +import reqrsp_pkg::*; + +module snax_gemm # ( + parameter int unsigned DataWidth = 64, + parameter int unsigned SnaxTcdmPorts = 16, + parameter type acc_req_t = logic, + parameter type acc_rsp_t = logic, + parameter type tcdm_req_t = logic, + parameter type tcdm_rsp_t = logic +)( + input logic clk_i, + input logic rst_ni, + + input logic snax_qvalid_i, + output logic snax_qready_o, + input acc_req_t snax_req_i, + + output acc_rsp_t snax_resp_o, + output logic snax_pvalid_o, + input logic snax_pready_i, + + output tcdm_req_t [SnaxTcdmPorts-1:0] snax_tcdm_req_o, + input tcdm_rsp_t [SnaxTcdmPorts-1:0] snax_tcdm_rsp_i +); + + // Local parameters for input and output sizes + localparam int unsigned InputMatrixSize = DataWidth*SnaxTcdmPorts/2; + localparam int unsigned OutputMatrixSize = InputMatrixSize*4; // x4 because of multiplication and addition considerations + + // CSRs + localparam int unsigned RegNum = 5; + localparam int unsigned CsrAddrOFfset = 32'h3c0; + + logic [31:0] CSRs [RegNum]; + logic [31:0] csr_addr; + + logic write_csr; + logic read_csr; + + + // Gemm wires + logic [ InputMatrixSize-1:0] io_a_io_in; + logic [ InputMatrixSize-1:0] io_b_io_in; + logic [OutputMatrixSize-1:0] io_c_io_out; + logic [OutputMatrixSize / 2 -1:0] io_c_io_out_reg; + logic io_start_do; + logic io_data_in_valid; + logic io_data_out_valid; + + localparam int unsigned HalfC = InputMatrixSize*2; + localparam int unsigned HalfHalfCAddr = HalfC/2/8; + localparam int unsigned HalfCAddr = HalfC/8; + localparam int unsigned HalfHalfC = HalfC/2; + + // 2 cycle to write data out because output is 8 * 8 * 32bits = 2048. + // But TCDM has 16 ports only (2048 / (16 * 32) = 2). + logic read_tcdm; + logic write_tcdm_1; + logic write_tcdm_2; + logic read_tcdm_done; + logic write_tcdm_done; + logic write_tcdm_done_1; + logic write_tcdm_done_2; + logic tcdm_not_ready; + logic [SnaxTcdmPorts-1:0] snax_tcdm_rsp_i_p_valid; + logic [SnaxTcdmPorts-1:0] snax_tcdm_req_o_q_valid; + + // Gemm control states, we have 3 states currently. + // By default, the state is IDLE_GEMM + // When io_start_do assert, we send read data request and turn to COMP_GEMM + // When io_data_out_valid assert, we write back the least half of results to TCDM and turn to WRITE_GEMM + // Afterwards, we turn to IDLE_GEMM. 
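+  //
+  // The control flow above is driven through the CSRs declared earlier
+  // (CSR numbers are offsets from CsrAddrOFfset = 32'h3c0):
+  //   CSRs[0]: TCDM base address of input matrix A
+  //   CSRs[1]: TCDM base address of input matrix B
+  //   CSRs[2]: TCDM base address of output matrix C
+  //   CSRs[3]: start trigger (a valid CSR access at this offset asserts io_start_do)
+  //   CSRs[4]: status flag, reset to 1, cleared when the computation starts and
+  //            set back to 1 once the second write-back phase (write_tcdm_done_2) completes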
+ typedef enum logic [2:0] { + IDLE_GEMM, + COMP_GEMM, + WRITE_GEMM + } ctrl_states_t; + + ctrl_states_t cstate, nstate; + + // Write CSRs + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + for (int i=0; i < RegNum - 1; i++) begin + CSRs[i] <= 32'd0; + end + CSRs[4] <= 32'd1; + end else begin + if(write_csr == 1'b1 && io_start_do != 1'b1) begin + CSRs[csr_addr] <= snax_req_i.data_arga[31:0]; + end + else begin + if (io_start_do == 1'b1) begin + CSRs[4] <= 32'd0; + end + else if (write_tcdm_done_2 == 1'b1) begin + CSRs[4] <= 32'd1; + end + end + end + end + + // Read CSRs + always_comb begin + if (!rst_ni) begin + snax_resp_o.data = 0; + snax_resp_o.id = 0; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b0; + end else begin + if(read_csr) begin + snax_resp_o.data = {32'b0,CSRs[csr_addr]}; + snax_resp_o.id = snax_req_i.id; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b1; + end + else begin + snax_resp_o.data = 0; + snax_resp_o.id = 0; + snax_resp_o.error = 1'b0; + snax_pvalid_o = 1'b0; + end + end + end + + // Read or write control logic + always_comb begin + if (!rst_ni) begin + read_csr = 1'b0; + write_csr = 1'b0; + end + else if(snax_qvalid_i) begin + unique casez (snax_req_i.data_op) + CSRRS, CSRRSI, CSRRC, CSRRCI: begin + read_csr = 1'b1; + write_csr = 1'b0; + end + default: begin + write_csr = 1'b1; + read_csr = 1'b0; + end + endcase + end + else begin + read_csr = 1'b0; + write_csr = 1'b0; + end + end + + assign snax_qready_o = 1'b1; + assign csr_addr = snax_req_i.data_argb - CsrAddrOFfset; + + Gemm inst_gemm( + .clock ( clk_i ), // :9016:11 + .reset ( !rst_ni ), // :9017:11 + .io_data_in_valid ( io_data_in_valid ), // src/main/scala/gemm/gemm.scala:309:16 + .io_a_io_in ( io_a_io_in ), // src/main/scala/gemm/gemm.scala:309:16 + .io_b_io_in ( io_b_io_in ), // src/main/scala/gemm/gemm.scala:309:16 + .io_data_out_valid ( io_data_out_valid ), // src/main/scala/gemm/gemm.scala:309:16 + .io_c_io_out ( io_c_io_out ) // src/main/scala/gemm/gemm.scala:309:16 + ); + + // Holding output + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + io_c_io_out_reg <= 0; + end else begin + if (io_data_out_valid) begin + io_c_io_out_reg <= io_c_io_out[OutputMatrixSize - 1: OutputMatrixSize / 2]; + end + end + end + + // Changing states + always_ff @ (posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + cstate <= IDLE_GEMM; + end else begin + cstate <= nstate; + end + end + + // Next state changes + always_comb begin + case(cstate) + IDLE_GEMM: begin + if (io_start_do) begin + nstate = COMP_GEMM; + end else begin + nstate = IDLE_GEMM; + end + end + COMP_GEMM: begin + if (write_tcdm_done_1) begin + nstate = WRITE_GEMM; + end else begin + nstate = COMP_GEMM; + end + end + WRITE_GEMM: begin + if (write_tcdm_done_2) begin + nstate = IDLE_GEMM; + end else begin + nstate = WRITE_GEMM; + end + end + default: begin + nstate = IDLE_GEMM; + end + endcase + + end + + assign io_start_do = snax_qvalid_i & (csr_addr == 3) & snax_qready_o; + + // read data from TCDM and write data to TCDM + + always_comb begin + for (int i = 0; i < SnaxTcdmPorts / 2; i++) begin + if(!rst_ni) begin + snax_tcdm_req_o[i].q_valid = 1'b0; + snax_tcdm_req_o[i].q.addr = 17'b0; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 
2].q.addr = 17'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(read_tcdm) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[0] + i * 8; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[1] + i * 8; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(write_tcdm_1) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[2] + i * 8; + snax_tcdm_req_o[i].q.write = 1'b1; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = io_c_io_out[i * DataWidth +: DataWidth]; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[2] + i * 8 + HalfHalfCAddr; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = io_c_io_out[(i * DataWidth + HalfHalfC) +: DataWidth]; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else if(write_tcdm_2) begin + snax_tcdm_req_o[i].q_valid = 1'b1; + snax_tcdm_req_o[i].q.addr = CSRs[2] + i * 8 + HalfCAddr; + snax_tcdm_req_o[i].q.write = 1'b1; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = io_c_io_out_reg[(i * DataWidth) +: DataWidth]; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = CSRs[2] + i * 8 + HalfCAddr + HalfHalfCAddr; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b1; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = io_c_io_out_reg[(i * DataWidth + HalfHalfC) +: DataWidth]; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b1}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.user = '0; + end + else begin + snax_tcdm_req_o[i].q_valid = 1'b0; + snax_tcdm_req_o[i].q.addr = 17'b0; + snax_tcdm_req_o[i].q.write = 1'b0; + snax_tcdm_req_o[i].q.amo = AMONone; + snax_tcdm_req_o[i].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i].q.user = '0; + + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q_valid = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.addr = 17'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.write = 1'b0; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.amo = AMONone; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.data = {DataWidth{1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 2].q.strb = {(DataWidth / 8){1'b0}}; + snax_tcdm_req_o[i + SnaxTcdmPorts / 
2].q.user = '0; + end + end + end + + always_comb begin + if (!rst_ni) begin + io_a_io_in = 512'b0; + io_b_io_in = 512'b0; + end else begin + for (int i = 0; i < SnaxTcdmPorts / 2; i++) begin + if(io_data_in_valid) begin + io_a_io_in[i * DataWidth +: DataWidth] = snax_tcdm_rsp_i[i].p.data; + io_b_io_in[i * DataWidth +: DataWidth] = snax_tcdm_rsp_i[i + SnaxTcdmPorts / 2].p.data; + end + else begin + io_a_io_in[i * DataWidth +: DataWidth] = 0; + io_b_io_in[i * DataWidth +: DataWidth] = 0; + end + end + end + end + + always_comb begin + for (int i = 0; i < SnaxTcdmPorts; i++) begin + if(!rst_ni) begin + snax_tcdm_rsp_i_p_valid[i] = 1'b0; + snax_tcdm_req_o_q_valid[i] = 1'b0; + end + else begin + snax_tcdm_rsp_i_p_valid[i] = snax_tcdm_rsp_i[i].p_valid; + snax_tcdm_req_o_q_valid[i] = snax_tcdm_req_o[i].q_valid; + end + end + end + + assign tcdm_not_ready = ~io_data_in_valid; + assign io_data_in_valid = ((&snax_tcdm_rsp_i_p_valid) === 1'b1 && (cstate == COMP_GEMM))? 1'b1 : 1'b0; + assign read_tcdm = io_start_do; + assign write_tcdm_1 = io_data_out_valid; + assign write_tcdm_2 = cstate == WRITE_GEMM; + assign read_tcdm_done = io_data_in_valid; + assign write_tcdm_done_1 = (&snax_tcdm_req_o_q_valid) && cstate == COMP_GEMM; + assign write_tcdm_done_2 = (&snax_tcdm_req_o_q_valid) && cstate == WRITE_GEMM; + assign write_tcdm_done = write_tcdm_done_1 & write_tcdm_done_2; + +endmodule diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 5c9b442ba..909aad57f 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -96,9 +96,9 @@ module snitch_cluster /// FPU configuration. parameter fpnew_pkg::fpu_implementation_t FPUImplementation [NrCores] = '{default: fpnew_pkg::fpu_implementation_t'(0)}, - /// Enable SNAX per core - parameter bit [NrCores-1:0] SNAX = '0, - /// Number of SNAX TCDM ports, the default assumes HWPE MAC engine + /// Enable SNAX accelerators per core. Integer coded. 
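+  /// 0 selects no accelerator, 1 the SNAX MAC engine, and 2 the SNAX GEMM engine,
+  /// matching the snax_acc field in docs/schema/snitch_cluster.schema.json.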
+ parameter int unsigned SNAX [NrCores] = '{default: 0}, + /// Number of SNAX TCDM ports per core parameter int unsigned SnaxTcdmPorts [NrCores] = '{default: 0}, /// Physical Memory Attribute Configuration parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '0, @@ -266,7 +266,17 @@ module snitch_cluster return n; endfunction + function automatic int unsigned check_if_snax(int unsigned core_idx); + for (int i = 0; i < core_idx; i++) begin + if ( SNAX[i] > 0) begin + return 1; + end + end + return 0; + endfunction + // SNAX TCDM + localparam int unsigned SnaxSystem = check_if_snax(NrCores); localparam int unsigned TotalSnaxTcdmPorts = get_snax_tcdm_port_offs(NrCores); localparam int unsigned NrTCDMPortsCores = get_tcdm_port_offs(NrCores); @@ -778,7 +788,7 @@ module snitch_cluster end // generate TCDM for snax if any of the cores has SNAX enabled - if(|SNAX) begin: gen_yes_snax_tcdm_interconnect + if( SnaxSystem ) begin: gen_yes_snax_tcdm_interconnect snitch_tcdm_interconnect #( .NumInp (NumTCDMIn + TotalSnaxTcdmPorts), .NumOut (NrBanks), @@ -974,12 +984,15 @@ module snitch_cluster for (genvar i = 0; i < NrCores; i++) begin : gen_snax_acc + // Calculate exact count of SNAX TCDM ports localparam int unsigned LocalSnaxTcdmPorts = get_snax_tcdm_ports(i); localparam int unsigned LocalSnaxTcdmOffset = get_snax_tcdm_port_offs(i); tcdm_req_t [LocalSnaxTcdmPorts-1:0] hang_snax_tcdm_req; tcdm_rsp_t [LocalSnaxTcdmPorts-1:0] hang_snax_tcdm_rsp; + // Remap SNAX TCDM ports to prune cores that + // do not use SNAX TCDM ports for (genvar j = 0; j < LocalSnaxTcdmPorts; j++) begin : gen_snax_tcdm_map_per_core always_comb begin snax_tcdm_req[LocalSnaxTcdmOffset+j] = hang_snax_tcdm_req[j]; @@ -987,7 +1000,7 @@ module snitch_cluster end end - if(SNAX[i]) begin: gen_yes_mac + if(SNAX[i] == 1) begin: gen_snax_mac snax_mac # ( .DataWidth ( 32 ), @@ -1009,6 +1022,28 @@ module snitch_cluster .snax_tcdm_rsp_i ( hang_snax_tcdm_rsp ) ); + end else if (SNAX[i] == 2) begin: gen_snax_gemm + + snax_gemm # ( + .DataWidth ( NarrowDataWidth ), + .SnaxTcdmPorts ( LocalSnaxTcdmPorts ), + .acc_req_t ( acc_req_t ), + .acc_rsp_t ( acc_resp_t ), + .tcdm_req_t ( tcdm_req_t ), + .tcdm_rsp_t ( tcdm_rsp_t ) + ) i_snax_gemm ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .snax_req_i ( snax_req[i] ), + .snax_qvalid_i ( snax_qvalid[i] ), + .snax_qready_o ( snax_qready[i] ), + .snax_resp_o ( snax_resp[i] ), + .snax_pvalid_o ( snax_pvalid[i] ), + .snax_pready_i ( snax_pready[i] ), + .snax_tcdm_req_o ( hang_snax_tcdm_req ), + .snax_tcdm_rsp_i ( hang_snax_tcdm_rsp ) + ); + end else begin: gen_no_snax_acc // Tie these signal to low when no SNAX accelerator is present assign snax_qready[i] = '0; diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 6c1ff17cb..5a07b5b89 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -251,6 +251,7 @@ module ${cfg['name']}_wrapper ( localparam int unsigned NumSequencerInstr [${cfg['nr_cores']}] = '{${core_cfg('num_sequencer_instructions')}}; localparam int unsigned NumSsrs [${cfg['nr_cores']}] = '{${core_cfg('num_ssrs')}}; localparam int unsigned SsrMuxRespDepth [${cfg['nr_cores']}] = '{${core_cfg('ssr_mux_resp_depth')}}; + localparam int unsigned SNAX [${cfg['nr_cores']}] = '{${core_cfg('snax_acc')}}; localparam int unsigned SnaxTcdmPorts [${cfg['nr_cores']}] = '{${core_cfg('snax_tcdm_ports')}}; // Snitch cluster under test. 
@@ -296,7 +297,7 @@ module ${cfg['name']}_wrapper ( .Xdma (${core_cfg_flat('xdma')}), .Xssr (${core_cfg_flat('xssr')}), .Xfrep (${core_cfg_flat('xfrep')}), - .SNAX(${core_cfg_flat('snax')}), + .SNAX(SNAX), .SnaxTcdmPorts(SnaxTcdmPorts), .FPUImplementation (${cfg['pkg_name']}::FPUImplementation), .SnitchPMACfg (${cfg['pkg_name']}::SnitchPMACfg), diff --git a/target/snitch_cluster/cfg/snax-gemm.hjson b/target/snitch_cluster/cfg/snax-gemm.hjson new file mode 100644 index 000000000..96e6f23f4 --- /dev/null +++ b/target/snitch_cluster/cfg/snax-gemm.hjson @@ -0,0 +1,122 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. +{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + dma_data_width: 512, + dma_axi_req_fifo_depth: 3, + dma_req_fifo_depth: 3, + // Timing parameters + timing: { + lat_comp_fp32: 3, + lat_comp_fp64: 3, + lat_comp_fp16: 2, + lat_comp_fp16_alt: 2, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 1, + lat_sdotp: 2, + fpu_pipe_config: "BEFORE" + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/snax_gemm_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. 
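+  // snax_gemm_core_template: a compute core with the SNAX GEMM engine attached
+  // (snax_acc: 2) and 16 dedicated accelerator TCDM ports.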
+ snax_gemm_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + snax_acc: 2, + snax_tcdm_ports: 16, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // Enable division/square root unit + // Xdiv_sqrt: true, + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + # isa: "rv32ema", + xdma: true + xssr: false + xfrep: false + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} diff --git a/target/snitch_cluster/cfg/snax.hjson b/target/snitch_cluster/cfg/snax-mac.hjson similarity index 73% rename from target/snitch_cluster/cfg/snax.hjson rename to target/snitch_cluster/cfg/snax-mac.hjson index c4bbf2bc3..851d3b735 100644 --- a/target/snitch_cluster/cfg/snax.hjson +++ b/target/snitch_cluster/cfg/snax-mac.hjson @@ -86,7 +86,7 @@ xf8alt: true, xfdotp: true, xfvec: true, - snax: true, + snax_acc: 1, snax_tcdm_ports: 4, num_int_outstanding_loads: 1, num_int_outstanding_mem: 4, @@ -98,50 +98,6 @@ // Enable division/square root unit // Xdiv_sqrt: true, }, - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - snax: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // Enable division/square root unit - // Xdiv_sqrt: true, - }, - compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - snax: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // Enable division/square root unit - // Xdiv_sqrt: true, - }, dma_core_template: { isa: "rv32imafd", // Xdiv_sqrt: true, diff --git a/target/snitch_cluster/sw/apps/snax-gemm-run.yaml b/target/snitch_cluster/sw/apps/snax-gemm-run.yaml new file mode 100644 index 000000000..93bd32d8b --- /dev/null +++ b/target/snitch_cluster/sw/apps/snax-gemm-run.yaml @@ -0,0 +1,17 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +runs: + - app: blas/axpy + cmd: ../../sw/blas/axpy/verify.py {sim_bin} {elf} + - app: blas/gemm + - app: dnn/batchnorm + - app: dnn/linear + - app: dnn/maxpool + - app: dnn/gemm +# dnn/gelu # seems like it stalls +# dnn/conv2d # fails with exit code 32 +# dnn/fusedconv # fails newly +# dnn/layernorm # throws illegal instruction on FDIV in simulation +# dnn/softmax # throws illegal instruction on FDIV in simulation diff --git a/target/snitch_cluster/sw/apps/snax-run.yaml b/target/snitch_cluster/sw/apps/snax-mac-run.yaml similarity index 100% rename from target/snitch_cluster/sw/apps/snax-run.yaml rename to target/snitch_cluster/sw/apps/snax-mac-run.yaml diff --git a/target/snitch_cluster/sw/tests/snax-run.yaml b/target/snitch_cluster/sw/tests/snax-gemm-run.yaml similarity index 100% rename from target/snitch_cluster/sw/tests/snax-run.yaml rename to target/snitch_cluster/sw/tests/snax-gemm-run.yaml diff --git a/target/snitch_cluster/sw/tests/snax-mac-run.yaml b/target/snitch_cluster/sw/tests/snax-mac-run.yaml new file mode 100644 index 000000000..880f5e3e2 --- /dev/null +++ b/target/snitch_cluster/sw/tests/snax-mac-run.yaml @@ -0,0 +1,71 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +runs: + - app: atomics + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x4 + - app: barrier + - app: data_mover + - app: dma_simple + #- app: event_unit # SNAX failes because this test needs more cores + - app: fence_i + - app: fp8_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp8_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 + - app: fp8alt_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with segfault + - app: fp8alt_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp8alt_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp8alt_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x12 + - app: fp16_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp16_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 + - app: fp16alt_comparison_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16alt_comparison_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 + - app: fp16alt_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with JIT issue + - app: fp16alt_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x16 + - app: fp32_comparison_scalar + - app: fp32_comparison_vector + - app: fp32_computation_scalar + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 + - app: fp32_computation_vector + simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 + - 
app: fp32_conversions_scalar + simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction + - app: fp64_conversions_scalar + simulators: [vsim, vcs, verilator] + # - app: interrupt + - app: interrupt-local + - app: multi_cluster + #- app: openmp_parallel # This requires multiple cores too + #- app: openmp_for_static_schedule # This requires multiple cores too + - app: openmp_double_buffering + - app: perf_cnt + - app: printf_simple + - app: printf_fmtint + - app: simple + #- app: team_global # This requires all 9 cores to be present + - app: tls + - app: varargs_1 + - app: varargs_2 + - app: zero_mem + - app: non_null_exitcode + exit_code: 14
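
Usage sketch (not part of the diff above): the new snax_gemm accelerator is programmed through five CSRs starting at CsrAddrOFfset = 0x3c0. The C fragment below illustrates the sequence a Snitch core would issue, assuming CSR accesses in this range are forwarded to the SNAX accelerator the same way the existing snax-mac flow does; the function name is hypothetical and only the CSR numbers and their meaning are taken from snax_gemm.sv.

#include <stdint.h>

// CSR numbers taken from snax_gemm.sv (CsrAddrOFfset = 0x3c0):
//   0x3c0: TCDM address of matrix A   (CSRs[0])
//   0x3c1: TCDM address of matrix B   (CSRs[1])
//   0x3c2: TCDM address of matrix C   (CSRs[2])
//   0x3c3: start trigger              (CSRs[3], asserts io_start_do)
//   0x3c4: status, 0 = busy, 1 = done (CSRs[4])
static inline void snax_gemm_run(uint32_t a_addr, uint32_t b_addr, uint32_t c_addr) {
    uint32_t done;
    asm volatile("csrw 0x3c0, %0" ::"r"(a_addr));
    asm volatile("csrw 0x3c1, %0" ::"r"(b_addr));
    asm volatile("csrw 0x3c2, %0" ::"r"(c_addr));
    asm volatile("csrw 0x3c3, %0" ::"r"(1));          // any write here starts the GEMM
    do {                                              // poll the status CSR until the
        asm volatile("csrr %0, 0x3c4" : "=r"(done));  // second write-back phase is done
    } while (done == 0);
}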