From 9298e18feba03b97c2ce7a7424f0f94af554c94a Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Tue, 31 Oct 2023 18:20:58 +0100
Subject: [PATCH] dnn: Add FlashAttention-2 layer

---
 sw/blas/gemm/src/gemm.h | 117 +++---
 sw/dnn/flashattention_2/data/datagen.py | 175 ++++++++
 sw/dnn/flashattention_2/data/params.hjson | 11 +
 .../flashattention_2/src/flashattention_2.h | 384 ++++++++++++++++++
 sw/dnn/flashattention_2/src/main.c | 14 +
 sw/dnn/flashattention_2/verify.py | 95 +++++
 sw/dnn/layernorm/src/layernorm.h | 3 +-
 sw/dnn/requirements.txt | 2 +-
 sw/dnn/src/dnn.h | 1 +
 target/snitch_cluster/sw/apps/Makefile | 1 +
 .../sw/apps/dnn/flashattention_2/Makefile | 12 +
 .../sw/runtime/banshee/src/dump.h | 28 --
 target/snitch_cluster/sw/toolchain.mk | 2 +-
 13 files changed, 766 insertions(+), 79 deletions(-)
 create mode 100755 sw/dnn/flashattention_2/data/datagen.py
 create mode 100644 sw/dnn/flashattention_2/data/params.hjson
 create mode 100644 sw/dnn/flashattention_2/src/flashattention_2.h
 create mode 100644 sw/dnn/flashattention_2/src/main.c
 create mode 100755 sw/dnn/flashattention_2/verify.py
 create mode 100644 target/snitch_cluster/sw/apps/dnn/flashattention_2/Makefile
 delete mode 100644 target/snitch_cluster/sw/runtime/banshee/src/dump.h

diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h
index ea2c865636..3bd57aa79f 100644
--- a/sw/blas/gemm/src/gemm.h
+++ b/sw/blas/gemm/src/gemm.h
@@ -21,8 +21,18 @@ typedef __fp16 v4f16 __attribute__((vector_size(8)));
 typedef char v8f8 __attribute__((vector_size(8)));
 #endif
 
-dump_float(gemm, 8);
-dump_uint(index, 9);
+// Floating-point multiplications by zero cannot simply be optimized away,
+// since in some edge cases they do not yield zero:
+// - 0.0 * NaN = NaN
+// - 0.0 * INFINITY = NaN
+// Hence, to optimize this case we have to test for zero explicitly. When
+// `multiplier` is a compile-time constant, the test comes for free.
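+//
+// For instance, once a call like `multiply_opt(C[m * ldC + n], BETA)` is
+// inlined with a known BETA, the compiler can fold the zero test away, so
+// the check adds no runtime overhead.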
+static inline double multiply_opt(double multiplicand, double multiplier) { + if (multiplier) + return multiplicand * multiplier; + else + return 0; +} void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, uint32_t ldA, uint32_t ta, double* B, uint32_t ldB, @@ -30,10 +40,15 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, if (!ta && !tb) { for (uint32_t m = 0; m < M; m++) { for (uint32_t n = 0; n < N; n++) { - double c0 = BETA * C[m * ldC + n]; + double c0 = multiply_opt(C[m * ldC + n], BETA); for (uint32_t k = 0; k < K; k++) { // dump_index(k + m * ldA); // dump_gemm(A[k + m * ldA]); + + // if (snrt_cluster_core_idx() == 7) { + // printf("k = %d, m = %d, n = %d, ldA = %d, ldB = + // %d\n", k, m, n, ldA, ldB); + // } c0 += A[k + m * ldA] * B[k * ldB + n]; } C[m * ldC + n] = c0; @@ -42,7 +57,7 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, } else if (ta && !tb) { for (uint32_t m = 0; m < M; m++) { for (uint32_t n = 0; n < N; n++) { - register double c0 = BETA * C[m * ldC + n]; + double c0 = multiply_opt(C[m * ldC + n], BETA); for (uint32_t k = 0; k < K; k++) { c0 += A[k * M * ldA + m * ldA] * B[k * ldB + n]; } @@ -52,7 +67,7 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, } else if (!ta && tb) { for (uint32_t m = 0; m < M; m++) { for (uint32_t n = 0; n < N; n++) { - register double c0 = BETA * C[m * ldC + n]; + double c0 = multiply_opt(C[m * ldC + n], BETA); for (uint32_t k = 0; k < K; k++) { c0 += A[k + m * ldA] * B[k + n * ldB]; } @@ -62,7 +77,7 @@ void gemm_fp64_baseline(uint32_t M, uint32_t N, uint32_t K, double* A, } else { for (uint32_t m = 0; m < M; m++) { for (uint32_t n = 0; n < N; n++) { - register double c0 = BETA * C[m * ldC + n]; + double c0 = multiply_opt(C[m * ldC + n], BETA); for (uint32_t k = 0; k < K; k++) { c0 += A[k * M * ldA + m * ldA] * B[k + n * ldB]; } @@ -999,46 +1014,54 @@ void gemm(precision_t prec, uint32_t expand, uint32_t setup_ssr, uint32_t transa, uint32_t transb, uint32_t m, uint32_t n, uint32_t k, double alpha, void* a, uint32_t lda, void* b, uint32_t ldb, uint32_t beta, void* c, uint32_t ldc) { - const uint32_t compute_num = snrt_cluster_compute_core_num(); - const uint32_t compute_id = snrt_cluster_core_idx(); - - // Compute cores work not on contiguous blocks but on strided rows - uint32_t lda_strided = compute_num * lda; - uint32_t ldc_strided = compute_num * ldc; - - // Compute cores access A and C at offsets of one row from each other - uint32_t offsetA = compute_id * lda; - uint32_t offsetC = compute_id * ldc; - - // Compute fraction of C rows every core computes - uint32_t frac_m = m / compute_num; - - switch (prec) { - case FP64: - gemm_fp64_opt(frac_m, n, k, (double*)a + offsetA, lda_strided, - transa, (double*)b, ldb, transb, (double*)c + offsetC, - ldc_strided, &beta, setup_ssr); - break; - case FP32: - gemm_fp32_opt(frac_m, n, k, (float*)a + offsetA, lda_strided, - (float*)b, ldb, (float*)c + offsetC, ldc_strided, - &beta, setup_ssr); - break; - case FP16: - if (expand) { - gemm_fp16_ex_opt( - frac_m, n, k, (__fp16*)a + offsetA, lda_strided, (__fp16*)b, - ldb, (__fp16*)c + offsetC, ldc_strided, &beta, setup_ssr); - } else { - gemm_fp16_opt(frac_m, n, k, (__fp16*)a + offsetA, lda_strided, - (__fp16*)b, ldb, (__fp16*)c + offsetC, - ldc_strided, &beta, setup_ssr); - } - break; - case FP8: - gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda, (char*)b, - ldb, (char*)c + offsetC, ldc_strided, &beta, - setup_ssr); - break; + if (snrt_is_compute_core()) 
{ + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_cluster_core_idx(); + + // Compute cores work not on contiguous blocks but on strided rows + uint32_t lda_strided = compute_num * lda; + uint32_t ldc_strided = compute_num * ldc; + + // Compute cores access A and C at offsets of one row from each other + uint32_t offsetA = compute_id * lda; + uint32_t offsetC = compute_id * ldc; + + // Compute fraction of C rows every core computes + uint32_t frac_m = m / compute_num; + + switch (prec) { + case FP64: + // gemm_fp64_opt(frac_m, n, k, (double*)a + offsetA, + // lda_strided, + // transa, (double*)b, ldb, transb, (double*)c + + // offsetC, ldc_strided, &beta, setup_ssr); + gemm_fp64_baseline(frac_m, n, k, (double*)a + offsetA, + lda_strided, transa, (double*)b, ldb, transb, + (double*)c + offsetC, ldc_strided, beta); + break; + case FP32: + gemm_fp32_opt(frac_m, n, k, (float*)a + offsetA, lda_strided, + (float*)b, ldb, (float*)c + offsetC, ldc_strided, + &beta, setup_ssr); + break; + case FP16: + if (expand) { + gemm_fp16_ex_opt(frac_m, n, k, (__fp16*)a + offsetA, + lda_strided, (__fp16*)b, ldb, + (__fp16*)c + offsetC, ldc_strided, &beta, + setup_ssr); + } else { + gemm_fp16_opt(frac_m, n, k, (__fp16*)a + offsetA, + lda_strided, (__fp16*)b, ldb, + (__fp16*)c + offsetC, ldc_strided, &beta, + setup_ssr); + } + break; + case FP8: + gemm_fp8_ex_opt(frac_m, n, k, (char*)a + offsetA, lda, (char*)b, + ldb, (char*)c + offsetC, ldc_strided, &beta, + setup_ssr); + break; + } } } diff --git a/sw/dnn/flashattention_2/data/datagen.py b/sw/dnn/flashattention_2/data/datagen.py new file mode 100755 index 0000000000..8e90612df2 --- /dev/null +++ b/sw/dnn/flashattention_2/data/datagen.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Viviane Potocnik +# Luca Colagrande + +import argparse +import numpy as np +import pathlib +import hjson +import sys +import os +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, \ + format_struct_definition, format_array_definition, \ + format_array_declaration, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +PRECISION = { + 'FP64': '64', + 'FP32': '32', + 'FP16': '16', + 'FP8': '8' +} + + +def torch_golden_model(Q, K, V): + return torch.nn.functional.scaled_dot_product_attention(Q, K, V) + + +def exact_golden_model(Q, K, V, B_r, B_c): + # Convert torch tensors to numpy arrays + Q = Q.numpy() + K = K.numpy() + V = V.numpy() + # Get layer dimensions + N = Q.shape[0] + # Calculate tiling parameters + T_r = N // B_r + T_c = N // B_c + # Transpose K + K_t = np.transpose(K) + # Iterate tiles + O_tiles = [] + for i in range(T_r): + # Tile Q + start_row = i * B_r + end_row = start_row + B_r + Q_i = Q[start_row:end_row, :] + # Initialize l_i, m_i, O_i + m_i = np.full((B_r, 1), -np.inf) + for j in range(T_c): + # Tile K_t and V + start_col = j * B_c + end_col = start_col + B_c + K_t_j = K_t[:,start_col:end_col] + V_j = V[start_col:end_col, ] + # Compute O tile update + S_ij = np.matmul(Q_i, K_t_j) + m_i_prev = m_i + m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) + shifted_exp = np.exp(m_i_prev - m_i) + P_ij = np.exp(S_ij - m_i) + PxV = np.matmul(P_ij, V_j) + if j == 0: + l_i = np.sum(P_ij, 1, keepdims=True) + O_i = PxV + else: + l_i = (shifted_exp * l_i) + np.sum(P_ij, 1, keepdims=True) + diag = np.diag(shifted_exp[:, 0]) + diag_inv = np.linalg.inv(diag) + O_i = np.matmul(diag_inv, O_i) + O_i += PxV + # Finalize O tile + diag_l_i = np.diag(l_i[:, 0]) + diag_l_inv_i = np.linalg.inv(diag_l_i) + O_i = np.matmul(diag_l_inv_i, O_i) + O_tiles.append(O_i) + return np.concatenate(O_tiles, 0) + + +def emit_header(section, params): + N = params['N'] + d = params['d'] + B_r = params['B_r'] + B_c = params['B_c'] + prec = PRECISION[params['dtype']] + + # Verify layer parameters are valid + assert (N % B_r) == 0, 'N is not an integer multiple of B_r' + assert (N % B_c) == 0, 'N is not an integer multiple of B_c' + assert (B_r % 8) == 0, 'B_r must be an integer multiple of the number of cores in a cluster' + + torch_type = data_utils.floating_point_torch_type(prec) + + Q = torch.rand(N, d, requires_grad=False, dtype=torch_type) + K = torch.rand(N, d, requires_grad=False, dtype=torch_type) + V = torch.rand(N, d, requires_grad=False, dtype=torch_type) + + output = exact_golden_model(Q, K, V, B_r, B_c) + + # Layer implementation assumes K is in (d, N) layout + K = torch.transpose(K, 0, 1) + + ctype = data_utils.floating_point_ctype(prec) + + q_uid = 'Q' + k_uid = 'K' + v_uid = 'V' + o_uid = 'O' + + layer_cfg = { + **params, + 'Q': q_uid, + 'K': k_uid, + 'V': v_uid, + 'O': o_uid, + } + + data_str = [emit_license()] + data_str += [format_array_declaration(ctype, q_uid, Q.shape)] + data_str += [format_array_declaration(ctype, k_uid, K.shape)] + data_str += [format_array_declaration(ctype, v_uid, V.shape)] + data_str += [format_array_declaration(ctype, o_uid, output.shape)] + data_str += [format_struct_definition('flashattention_2_layer_t', 'layer', layer_cfg)] + data_str += [format_array_definition(ctype, q_uid, Q)] + data_str += [format_array_definition(ctype, k_uid, K)] + data_str += [format_array_definition(ctype, v_uid, V)] + result_def = format_array_definition(ctype, 'golden', output) + data_str += [format_ifdef_wrapper('BIST', result_def)] + data_str = '\n\n'.join(data_str) + + return data_str + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for layernorm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config 
file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(args.section, param)) + + +if __name__ == '__main__': + main() diff --git a/sw/dnn/flashattention_2/data/params.hjson b/sw/dnn/flashattention_2/data/params.hjson new file mode 100644 index 0000000000..6376dd4596 --- /dev/null +++ b/sw/dnn/flashattention_2/data/params.hjson @@ -0,0 +1,11 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +{ + N: 128 + d: 128 + B_r: 32 + B_c: 16 + dtype: FP64 +} \ No newline at end of file diff --git a/sw/dnn/flashattention_2/src/flashattention_2.h b/sw/dnn/flashattention_2/src/flashattention_2.h new file mode 100644 index 0000000000..f719225711 --- /dev/null +++ b/sw/dnn/flashattention_2/src/flashattention_2.h @@ -0,0 +1,384 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik +// Luca Colagrande + +#include "snrt.h" + +// #define ENABLE_PRINTS + +/** + * @struct flashattention_2_layer_t + * @brief This structure contains all parameters necessary + * for computing a FlashAttention-2 layer. Refer to + * "FlashAttention-2: Faster Attention with Better + * Parallelism and Work Partitioning" for more info + * @var flashattention_2_layer_t::N + * Sequence length in number of tokens + * @var flashattention_2_layer_t::d + * Head dimension + * @var flashattention_2_layer_t::Q + * Pointer to query tensor + * @var flashattention_2_layer_t::K + * Pointer to key tensor + * @var flashattention_2_layer_t::V + * Pointer to value tensor + * @var flashattention_2_layer_t::O + * Pointer to output tensor + */ +typedef struct { + uint32_t N; + uint32_t d; + uint32_t B_r; + uint32_t B_c; + void *Q; + void *K; + void *V; + void *O; + precision_t dtype; +} flashattention_2_layer_t; + +static inline void flashattention_2_layer(flashattention_2_layer_t layer) { + // alias layer parameters + uint32_t N = layer.N; + uint32_t d = layer.d; + uint32_t B_r = layer.B_r; + uint32_t B_c = layer.B_c; + double *Q_l3 = layer.Q; + double *K_l3 = layer.K; + double *V_l3 = layer.V; + double *O_l3 = layer.O; + + // alias system parameters + // TODO adapt these for Occamy + uint32_t compute_id = snrt_global_core_idx(); + uint32_t cluster_id = snrt_cluster_idx(); + uint32_t num_cores = snrt_cluster_compute_core_num(); + uint32_t num_clusters = snrt_cluster_num(); + + // compute the tiling parameters + uint32_t T_r = N / B_r; // number of row blocks + uint32_t T_c = N / B_c; // number of column blocks + + // compute the size of the matrices + uint32_t q_fa_size = B_r * d * sizeof(double); + uint32_t k_fa_size = B_c * d * sizeof(double); + uint32_t v_fa_size = B_c * d * sizeof(double); + uint32_t s_fa_size = B_r * B_c * sizeof(double); + uint32_t p_fa_size = B_r * B_c * sizeof(double); + uint32_t o_fa_size = B_r * d * sizeof(double); + uint32_t m_i_size = B_r * sizeof(double); + uint32_t m_i_prev_size = m_i_size; + uint32_t l_i_size = B_r * sizeof(double); + uint32_t shifted_exp_size = B_r * sizeof(double); + + // allocate memory in 
TCDM + void *tcdm_ptr = (double *)snrt_l1_next(); + double *Q_fa = tcdm_ptr; + tcdm_ptr += q_fa_size; + double *K_fa = tcdm_ptr; + tcdm_ptr += k_fa_size; + double *V_fa = tcdm_ptr; + tcdm_ptr += v_fa_size; + double *S_fa = tcdm_ptr; + tcdm_ptr += s_fa_size; + double *P_fa = tcdm_ptr; + tcdm_ptr += p_fa_size; + double *O_fa = tcdm_ptr; + tcdm_ptr += o_fa_size; + double *m_i = tcdm_ptr; + tcdm_ptr += m_i_size; + double *m_i_prev = tcdm_ptr; + tcdm_ptr += m_i_prev_size; + double *l_i = tcdm_ptr; + tcdm_ptr += l_i_size; + double shifted_exp; + double row_sum; + + float used_memory_kB = + (float)((uint64_t)tcdm_ptr - (uint64_t)snrt_l1_next()) / 1024.0f; + + DUMP(used_memory_kB); + + // Iterate row blocks of Q + uint32_t start_loop_outer = snrt_mcycle(); + for (int t_r = 0; t_r < T_r; t_r++) { + // DMA copy Q row block to TCDM + uint32_t start_dma = snrt_mcycle(); + if (snrt_is_dm_core()) { + uint32_t q_fa_offset = t_r * B_r * d; + + snrt_dma_txid_t txid_q_fa = + snrt_dma_start_2d(Q_fa, /* dst */ + Q_l3 + q_fa_offset, /* src */ + d * sizeof(double), /* size */ + d * sizeof(double), /* dst_stride */ + d * sizeof(double), /* src_stride */ + B_r); /* repetitions */ + + snrt_dma_wait_all(); + +#ifdef ENABLE_PRINTS + printf("Q_fa:\n"); + for (int i = 0; i < B_r * d; i++) { + DUMP((float)(Q_fa[i])); + } +#endif + } + uint32_t end_dma = snrt_mcycle(); + + snrt_cluster_hw_barrier(); + + // Initialize m_i, m_i_prev, l_i, row_sum + uint32_t rows_per_core = B_r / num_cores; + uint32_t start_row = rows_per_core * compute_id; + uint32_t end_row = start_row + rows_per_core; + if (snrt_is_compute_core()) { + for (int row_idx = start_row; row_idx < end_row; row_idx++) { + m_i[row_idx] = -INFINITY; + m_i_prev[row_idx] = -INFINITY; + l_i[row_idx] = 0.0f; + } + } + + snrt_cluster_hw_barrier(); + +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("m_i:\n"); + for (int i = 0; i < B_r; i++) { + DUMP((float)(m_i[i])); + } + printf("l_i:\n"); + for (int i = 0; i < B_r; i++) { + DUMP((float)(l_i[i])); + } + } +#endif + + snrt_cluster_hw_barrier(); + + // Iterate column blocks of K (corresponding to row blocks of V) + uint32_t start_loop_inner = snrt_mcycle(); + for (int t_c = 0; t_c < T_c; t_c++) { +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("(i, j) = (%d, %d)\n", t_r, t_c); + } +#endif + + snrt_cluster_hw_barrier(); + + // DMA copy K column block (d, B_c) and V row block (B_c, d) to + // TCDM. 
K is stored in (d, N) form in memory, V in (N, d) form + uint32_t k_fa_offset = t_c * B_c; + uint32_t v_fa_offset = t_c * B_c * d; + uint32_t start_dma = snrt_mcycle(); + if (!snrt_is_compute_core()) { + // K is in (d, N) format in main memory + snrt_dma_txid_t txid_k_fa = + snrt_dma_start_2d(K_fa, /* dst */ + K_l3 + k_fa_offset, /* src */ + B_c * sizeof(double), /* size */ + B_c * sizeof(double), /* dst_stride */ + N * sizeof(double), /* src_stride */ + d); /* repetitions */ + + snrt_dma_txid_t txid_v_fa = + snrt_dma_start_2d(V_fa, /* dst */ + V_l3 + v_fa_offset, /* src */ + d * sizeof(double), /* size */ + d * sizeof(double), /* dst_stride */ + d * sizeof(double), /* src_stride */ + B_c); /* repetitions */ + + snrt_dma_wait_all(); + +#ifdef ENABLE_PRINTS + printf("K_fa:\n"); + for (int i = 0; i < B_c * d; i++) { + DUMP((float)(K_fa[i])); + } + printf("V_fa:\n"); + for (int i = 0; i < B_c * d; i++) { + DUMP((float)(V_fa[i])); + } +#endif + } + uint32_t end_dma = snrt_mcycle(); + + snrt_cluster_hw_barrier(); + + // Calculate O tile from Q, K and V tiles + if (snrt_is_compute_core()) { + // Matrix multiplication between row block of Q and transposed + // column block of K to calculate a tile of S: S = Q * K^T. + // The S tile is of form (B_r, B_c) + uint32_t start_gemm = snrt_mcycle(); + gemm(FP64, 0, 0, 0, 0, B_r, B_c, d, 1, Q_fa, d, K_fa, B_c, 0, + S_fa, B_c); + uint32_t end_gemm = snrt_mcycle(); + + snrt_cluster_hw_barrier(); + +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("S_ij:\n"); + for (int i = 0; i < B_r * B_c; i++) { + DUMP((float)(S_fa[i])); + } + } +#endif + + // Iterate over the rows of the S row block, distributing + // the rows to the cores + for (int row_idx = start_row; row_idx < end_row; row_idx++) { + // Save m of current tile to rescale next tile + m_i_prev[row_idx] = m_i[row_idx]; + + // Initialize "local" row_sum to zero + row_sum = 0.0; + + // Iterate over all columns to calculate maximum for the + // current row + for (int col_idx = 0; col_idx < B_c; col_idx++) { + double val = S_fa[row_idx * B_c + col_idx]; + if (val > m_i[row_idx]) m_i[row_idx] = val; + } + + // Calculate P tile as the "local" softmax of S + for (int col_idx = 0; col_idx < B_c; col_idx++) { + P_fa[row_idx * B_c + col_idx] = + expf(S_fa[row_idx * B_c + col_idx] - m_i[row_idx]); + row_sum += P_fa[row_idx * B_c + col_idx]; + } + + // Calculate rescaling factor l + shifted_exp = expf(m_i_prev[row_idx] - m_i[row_idx]); + l_i[row_idx] = l_i[row_idx] * shifted_exp + row_sum; + + // If not in first t_c iteration, update + // O_ij = diag(shifted_exp)^(-1) * O_i(j-1) + if (t_c != 0) { + for (int col_idx = 0; col_idx < d; col_idx++) { + O_fa[row_idx * d + col_idx] /= shifted_exp; + } + } + } + + snrt_cluster_hw_barrier(); + +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("m_i:\n"); + for (int i = 0; i < B_r; i++) { + DUMP((float)(m_i[i])); + } + printf("P_ij:\n"); + for (int i = 0; i < B_r * B_c; i++) { + DUMP((float)(P_fa[i])); + } + printf("l_i:\n"); + for (int i = 0; i < B_r; i++) { + DUMP((float)(l_i[i])); + } + if (t_c != 0) { + printf("O_ij:\n"); + for (int i = 0; i < B_r * d; i++) { + DUMP((float)(O_fa[i])); + } + } + } +#endif + + snrt_cluster_hw_barrier(); + + // Calculate O tile (O_ij) of size (B_r, d). 
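+                // The beta argument of gemm (0.0 in the first t_c iteration,
+                // 1.0 afterwards) selects between overwriting O_fa and
+                // accumulating the new P_ij * V_j contribution into it, so
+                // O_fa stays un-normalized until the division by l_i after
+                // the T_c loop.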
+ // The P tile is of size (B_r, B_c) and V of size (B_c, d) + if (t_c == 0) { + // In first t_c iteration, initialize O_ij to P_ij * V_j + gemm(FP64, 0, 0, 0, 0, B_r, d, B_c, 1, P_fa, B_c, V_fa, d, + 0.0f, O_fa, d); + } else { + // In successive t_c iterations, O_ij += P_ij * V_j + gemm(FP64, 0, 0, 0, 0, B_r, d, B_c, 1, P_fa, B_c, V_fa, d, + 1.0f, O_fa, d); + } + + uint32_t end_stats = snrt_mcycle(); + + snrt_cluster_hw_barrier(); + +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("O_ij:\n"); + for (int i = 0; i < B_r * d; i++) { + DUMP((float)(O_fa[i])); + } + } +#endif + } else { + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + snrt_cluster_hw_barrier(); + } + } // end of T_c loop + + snrt_cluster_hw_barrier(); + + // Rescaling for last t_c iteration + // O_i = diag(l_i_Tc)^-1 * O_i + if (snrt_is_compute_core()) { + for (int row_idx = start_row; row_idx < end_row; row_idx++) { + for (int col_idx = 0; col_idx < d; col_idx++) { + O_fa[row_idx * d + col_idx] /= l_i[row_idx]; + } + } + } + + snrt_fpu_fence(); + + snrt_cluster_hw_barrier(); + +#ifdef ENABLE_PRINTS + if (snrt_cluster_core_idx() == 0) { + printf("O_ij:\n"); + for (int i = 0; i < B_r * d; i++) { + DUMP((float)(O_fa[i])); + } + } +#endif + + snrt_cluster_hw_barrier(); + + // Write back O row block (B_r, d) to DRAM + uint32_t start_dma_write_back = snrt_mcycle(); + if (snrt_is_dm_core()) { + uint32_t o_fa_offset = t_r * B_r * d; + snrt_dma_txid_t txid_o_fa = + snrt_dma_start_2d(O_l3 + o_fa_offset, /* dst */ + O_fa, /* src */ + d * sizeof(double), /* size */ + d * sizeof(double), /* dst_stride */ + d * sizeof(double), /* src_stride */ + B_r); /* repetitions */ + + snrt_dma_wait_all(); + +#ifdef ENABLE_PRINTS + printf("O_l3:\n"); + for (int i = 0; i < B_r * d; i++) { + DUMP((float)(O_l3[o_fa_offset + i])); + } +#endif + } + uint32_t end_dma_write_back = snrt_mcycle(); + + } // end of T_r loop + uint32_t end_loop_outer = snrt_mcycle(); + + snrt_cluster_hw_barrier(); +} diff --git a/sw/dnn/flashattention_2/src/main.c b/sw/dnn/flashattention_2/src/main.c new file mode 100644 index 0000000000..daadeede6b --- /dev/null +++ b/sw/dnn/flashattention_2/src/main.c @@ -0,0 +1,14 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "dnn.h" + +#include "data.h" + +int main() { + flashattention_2_layer(layer); + return 0; +} diff --git a/sw/dnn/flashattention_2/verify.py b/sw/dnn/flashattention_2/verify.py new file mode 100755 index 0000000000..2471a96d29 --- /dev/null +++ b/sw/dnn/flashattention_2/verify.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import sys +from pathlib import Path +import numpy as np +import torch +from data.datagen import exact_golden_model + +sys.path.append(str(Path(__file__).parent / '../../../util/sim/')) +import verification # noqa: E402 +from elf import Elf # noqa: E402 +import data_utils # noqa: E402 +from data_utils import bytes_to_float, bytes_to_struct # noqa: E402 + + +ERR_THRESHOLD = 1E-6 + +PRECISION_T = { + 8: '64', + 4: '32', + 2: '16', + 1: '8' +} + +NUMPY_T = { + '64': np.float64, + '32': np.float32, + '16': np.float16 +} + + +def main(): + # Run simulation and get outputs + args = verification.parse_args() + raw_results = verification.simulate(sim_bin=args.sim_bin, + snitch_bin=args.snitch_bin, + symbols_bin=args.symbols_bin, + log=args.log, + output_uids=['O']) + + # Extract input operands from ELF file + if args.symbols_bin: + elf = Elf(args.symbols_bin) + else: + elf = Elf(args.snitch_bin) + + layer_struct = { + 'N': 'I', + 'd': 'I', + 'B_r': 'I', + 'B_c': 'I', + 'Q': 'I', + 'K': 'I', + 'V': 'I', + 'O': 'I', + 'dtype': 'I' + } + layer = bytes_to_struct(elf.get_symbol_contents('layer'), layer_struct) + N = layer['N'] + d = layer['d'] + B_r = layer['B_r'] + B_c = layer['B_c'] + prec = PRECISION_T[layer['dtype']] + + Q = np.array(bytes_to_float(elf.get_symbol_contents('Q'), prec), dtype=NUMPY_T[prec]) + K = np.array(bytes_to_float(elf.get_symbol_contents('K'), prec), dtype=NUMPY_T[prec]) + V = np.array(bytes_to_float(elf.get_symbol_contents('V'), prec), dtype=NUMPY_T[prec]) + Q = torch.from_numpy(Q.reshape(N, d)) + V = torch.from_numpy(V.reshape(N, d)) + # Golden model expects key matrix in (N, d) form, while Snitch binary stores it in (d, N) + K = torch.from_numpy(K.reshape(d, N)) + K = torch.transpose(K, 0, 1) + + # Verify results + O_actual = np.array(bytes_to_float(raw_results['O'], prec), dtype=NUMPY_T[prec]) + O_golden = exact_golden_model(Q, K, V, B_r, B_c).flatten() + # O_golden = golden_model(Q, K, V).detach().numpy().flatten() + + relative_err = np.absolute((O_golden - O_actual) / O_golden) + fail = np.any(relative_err > ERR_THRESHOLD) + if (fail): + verification.dump_results_to_csv([O_golden, O_actual, relative_err], + Path.cwd() / 'flashattention_2_results.csv') + print('Maximum relative error:', np.max(relative_err)) + + return int(fail) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sw/dnn/layernorm/src/layernorm.h b/sw/dnn/layernorm/src/layernorm.h index e09771541a..9dff3cf795 100644 --- a/sw/dnn/layernorm/src/layernorm.h +++ b/sw/dnn/layernorm/src/layernorm.h @@ -4,10 +4,9 @@ #pragma once +#include "dnn.h" #include "math.h" #include "snrt.h" -// #include "printf.h" -#include "dnn.h" /** * @struct layernorm_layer_struct diff --git a/sw/dnn/requirements.txt b/sw/dnn/requirements.txt index 7524fee3a7..a05999073a 100644 --- a/sw/dnn/requirements.txt +++ b/sw/dnn/requirements.txt @@ -3,5 +3,5 @@ # SPDX-License-Identifier: Apache-2.0 numpy -torch>=1.8 +torch==2.1 hjson diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h index a38cc9c76e..ebe7e1b7e3 100644 --- a/sw/dnn/src/dnn.h +++ b/sw/dnn/src/dnn.h @@ -203,4 +203,5 @@ typedef struct network_single_cluster_t_ { #include "../linear/src/linear.h" #include "../maxpool/src/maxpool.h" #include "../softmax/src/softmax.h" +#include "../flashattention_2/src/flashattention_2.h" // #include "utils.h" diff --git a/target/snitch_cluster/sw/apps/Makefile b/target/snitch_cluster/sw/apps/Makefile index 1a8ce51ae2..596b37e4ff 100644 --- 
a/target/snitch_cluster/sw/apps/Makefile +++ b/target/snitch_cluster/sw/apps/Makefile @@ -18,6 +18,7 @@ SUBDIRS += dnn/layernorm SUBDIRS += dnn/linear SUBDIRS += dnn/maxpool SUBDIRS += dnn/softmax +SUBDIRS += dnn/flashattention_2 SUBDIRS += montecarlo/pi_estimation .PHONY: all clean $(SUBDIRS) diff --git a/target/snitch_cluster/sw/apps/dnn/flashattention_2/Makefile b/target/snitch_cluster/sw/apps/dnn/flashattention_2/Makefile new file mode 100644 index 0000000000..75585b2d65 --- /dev/null +++ b/target/snitch_cluster/sw/apps/dnn/flashattention_2/Makefile @@ -0,0 +1,12 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP ?= flashattention_2 + +include ../../../../../../sw/dnn/common.mk +include ../../common.mk + +$(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/runtime/banshee/src/dump.h b/target/snitch_cluster/sw/runtime/banshee/src/dump.h deleted file mode 100644 index 8f24cc1b92..0000000000 --- a/target/snitch_cluster/sw/runtime/banshee/src/dump.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Authors: Samuel Riedel, ETH Zurich -// Viviane Potocnik, ETH Zurich - -// Dump a value via CSR -// !!! Careful: This is only supported in simulation and an experimental -// feature. All writes to unimplemented CSR registers will be dumped by Snitch. -// This can be exploited to quickly print measurement values from all cores -// simultaneously without the hassle of printf. To specify multiple metrics, -// different CSRs can be used. The macro will define a function that will then -// always print via the same CSR. E.g., `dump(errors, 8)` will define a function -// with the following signature: `dump_errors(uint32_t val)`, which will print -// the given value via the 8th register. Alternatively, the `write_csr(reg, -// val)` macro can be used directly. - -#define dump_float(name, reg) \ - static __attribute__((always_inline)) inline void dump_##name(float val) { \ - asm volatile("csrw " #reg ", %0" ::"rK"(val)); \ - } - -#define dump_uint(name, reg) \ - static \ - __attribute__((always_inline)) inline void dump_##name(uint32_t val) { \ - asm volatile("csrw " #reg ", %0" ::"rK"(val)); \ - } \ No newline at end of file diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk index ca5f6a4820..d8e3aa6cb7 100644 --- a/target/snitch_cluster/sw/toolchain.mk +++ b/target/snitch_cluster/sw/toolchain.mk @@ -32,7 +32,7 @@ RISCV_CFLAGS += -menable-experimental-extensions RISCV_CFLAGS += -mabi=ilp32d RISCV_CFLAGS += -mcmodel=medany # RISCV_CFLAGS += -mno-fdiv # Not supported by Clang -RISCV_CFLAGS += -ffast-math +# RISCV_CFLAGS += -ffast-math RISCV_CFLAGS += -fno-builtin-printf RISCV_CFLAGS += -fno-builtin-sqrtf RISCV_CFLAGS += -fno-common