diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h
index 169e54d7b..758229d3b 100644
--- a/sw/snRuntime/src/dma.h
+++ b/sw/snRuntime/src/dma.h
@@ -205,3 +205,119 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) {
     snrt_dma_start_2d(ptr, ptr, 64, 64, 0, len / 64);
     snrt_dma_wait_all();
 }
+
+//================================================================================
+// Matrix tile functions
+//================================================================================
+
+/// Load a 2D tile of shape (tile_x1_size, tile_x0_size) from a 2D array
+/// of shape (full_x1_size, full_x0_size). The tile is selected by the
+/// (tile_x1_idx, tile_x0_idx) tuple. Every element in the source and
+/// destination arrays is prec bytes wide. Note that only full_x0_size is
+/// needed to compute the source offset and stride.
+inline snrt_dma_txid_t snrt_dma_load_2d_tile(void *dst, void *src,
+                                             size_t tile_x1_idx,
+                                             size_t tile_x0_idx,
+                                             size_t tile_x1_size,
+                                             size_t tile_x0_size,
+                                             size_t full_x0_size,
+                                             uint32_t prec) {
+    size_t src_offset = 0;
+    // Advance the src pointer in the x0 and x1 dimensions, then convert
+    // the element offset to a byte offset
+    src_offset += tile_x0_idx * tile_x0_size;
+    src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
+    src_offset *= prec;
+    // Initiate the transfer, copying one tile row per repetition
+    return snrt_dma_start_2d(dst,                  // dst
+                             src + src_offset,     // src
+                             tile_x0_size * prec,  // size
+                             tile_x0_size * prec,  // dst_stride
+                             full_x0_size * prec,  // src_stride
+                             tile_x1_size          // repeat
+    );
+}
+
+/// Store a 2D tile of shape (tile_x1_size, tile_x0_size) to a 2D array
+/// of shape (full_x1_size, full_x0_size). The tile is selected by the
+/// (tile_x1_idx, tile_x0_idx) tuple. Every element in the source and
+/// destination arrays is prec bytes wide. Note that only full_x0_size is
+/// needed to compute the destination offset and stride.
+inline snrt_dma_txid_t snrt_dma_store_2d_tile(void *dst, void *src,
+                                              size_t tile_x1_idx,
+                                              size_t tile_x0_idx,
+                                              size_t tile_x1_size,
+                                              size_t tile_x0_size,
+                                              size_t full_x0_size,
+                                              uint32_t prec) {
+    size_t dst_offset = 0;
+    // Advance the dst pointer in the x0 and x1 dimensions, then convert
+    // the element offset to a byte offset
+    dst_offset += tile_x0_idx * tile_x0_size;
+    dst_offset += tile_x1_idx * tile_x1_size * full_x0_size;
+    dst_offset *= prec;
+    // Initiate the transfer, copying one tile row per repetition
+    return snrt_dma_start_2d(dst + dst_offset,     // dst
+                             src,                  // src
+                             tile_x0_size * prec,  // size
+                             full_x0_size * prec,  // dst_stride
+                             tile_x0_size * prec,  // src_stride
+                             tile_x1_size          // repeat
+    );
+}
+
+//================================================================================
+// Reduction functions
+//================================================================================
+
+/// Perform a global sum reduction across all clusters, blocking. Assumes
+/// the dst and src buffers are at the same offset in the TCDM of every
+/// cluster. The reduced result is left in the dst buffer of cluster 0;
+/// the src buffers are clobbered with partial sums.
+inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
+                                      size_t len) {
+    // With a single cluster the reduction degenerates to a memcpy
+    if (snrt_cluster_num() == 1) {
+        if (!snrt_is_compute_core()) {
+            snrt_dma_start_1d(dst_buffer, src_buffer, len * sizeof(double));
+            snrt_dma_wait_all();
+        }
+        snrt_cluster_hw_barrier();
+    } else {
+        // Iterate over the levels of the binary reduction tree. The tree
+        // has ceil(log2(num_clusters)) levels, computed here in integer
+        // arithmetic to avoid a libm dependency.
+        uint32_t num_levels = 0;
+        while ((1u << num_levels) < snrt_cluster_num()) num_levels++;
+        for (uint32_t level = 0; level < num_levels; level++) {
+            // Determine whether the current cluster is an active cluster,
+            // i.e. a cluster that participates in the current level of
+            // the reduction tree. Every second cluster among the active
+            // ones is a sender. A receiver only has data to reduce if its
+            // paired sender, (1 << level) clusters above, actually exists.
+            uint32_t is_active = (snrt_cluster_idx() % (1 << level)) == 0;
+            uint32_t is_sender =
+                (snrt_cluster_idx() % (1 << (level + 1))) != 0;
+            uint32_t has_sender =
+                (snrt_cluster_idx() + (1 << level)) < snrt_cluster_num();
+
+            // If the cluster is a sender, it sends the data in its source
+            // buffer to the respective receiver's destination buffer. The
+            // receiver sits (1 << level) clusters below, i.e. at an
+            // address (1 << level) * SNRT_CLUSTER_OFFSET bytes lower.
+            if (is_active && is_sender) {
+                if (!snrt_is_compute_core()) {
+                    void *dst = (void *)dst_buffer -
+                                (1 << level) * SNRT_CLUSTER_OFFSET;
+                    snrt_dma_start_1d(dst, src_buffer, len * sizeof(double));
+                    snrt_dma_wait_all();
+                }
+            }
+
+            // Synchronize senders and receivers
+            snrt_global_barrier();
+
+            // Every active cluster which is not a sender reduces the
+            // received data with its own contribution. The partial sum is
+            // written back to the source buffer, where the next level's
+            // send expects it, and to the destination buffer, where the
+            // caller expects the final result on cluster 0.
+            if (is_active && !is_sender && has_sender) {
+                // Computation is parallelized over the compute cores
+                if (snrt_is_compute_core()) {
+                    uint32_t items_per_core =
+                        len / snrt_cluster_compute_core_num();
+                    uint32_t core_offset =
+                        snrt_cluster_core_idx() * items_per_core;
+                    for (uint32_t i = 0; i < items_per_core; i++) {
+                        uint32_t abs_i = core_offset + i;
+                        double sum = dst_buffer[abs_i] + src_buffer[abs_i];
+                        src_buffer[abs_i] = sum;
+                        dst_buffer[abs_i] = sum;
+                    }
+                }
+            }
+
+            // Synchronize compute and DM cores before the next tree level
+            snrt_cluster_hw_barrier();
+        }
+    }
+}
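For illustration (not part of the change): a minimal usage sketch for the tile helpers above. The matrix dimensions, the tile indices, and the allocation via `snrt_l1_next()` are assumptions made for the example's sake.

```c
#include "snrt.h"

#define FULL_X0 128  // columns of the full matrix (illustrative)
#define TILE_X1 32   // tile rows (illustrative)
#define TILE_X0 32   // tile columns (illustrative)

// Stream tile (1, 2) of a row-major double matrix through TCDM
void process_tile(double *matrix) {
    double *tile = (double *)snrt_l1_next();
    if (snrt_is_dm_core()) {
        snrt_dma_load_2d_tile(tile, matrix, 1, 2, TILE_X1, TILE_X0, FULL_X0,
                              sizeof(double));
        snrt_dma_wait_all();
    }
    snrt_cluster_hw_barrier();
    // ... compute cores operate on the 32x32 tile in TCDM here ...
    snrt_cluster_hw_barrier();
    if (snrt_is_dm_core()) {
        snrt_dma_store_2d_tile(matrix, tile, 1, 2, TILE_X1, TILE_X0, FULL_X0,
                               sizeof(double));
        snrt_dma_wait_all();
    }
}
```

With these numbers the load moves 32 rows of 32 doubles each: a row of tile_x0_size * prec = 256 bytes is read every full_x0_size * prec = 1024 bytes in the source and written back to back in TCDM.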
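Likewise, a usage sketch for `snrt_global_reduction_dma`; the buffer length and the allocation scheme are assumptions. Every cluster fills a source buffer with partial results, then all cores of all clusters enter the reduction together.

```c
#include "snrt.h"

#define LEN 256  // elements per cluster (illustrative)

void sum_across_clusters(void) {
    // Carve both buffers from the TCDM base so they sit at the same
    // offset in every cluster, as the reduction requires
    double *dst = (double *)snrt_l1_next();
    double *src = dst + LEN;

    // Each compute core writes its share of the local partial results
    if (snrt_is_compute_core()) {
        uint32_t chunk = LEN / snrt_cluster_compute_core_num();
        uint32_t base = snrt_cluster_core_idx() * chunk;
        for (uint32_t i = 0; i < chunk; i++) src[base + i] = 1.0;
    }
    snrt_cluster_hw_barrier();

    // After the call, dst on cluster 0 holds the element-wise sum of the
    // src buffers of all clusters (each element == snrt_cluster_num())
    snrt_global_reduction_dma(dst, src, LEN);
}
```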
diff --git a/sw/snRuntime/src/dump.h b/sw/snRuntime/src/dump.h
index 1d65395b5..f105cf1af 100644
--- a/sw/snRuntime/src/dump.h
+++ b/sw/snRuntime/src/dump.h
@@ -23,3 +23,10 @@
 }
 
 #define DUMP(val) ({ asm volatile("csrw 0x7C3, %0" ::"rK"(val)); })
+
+// Define an always-inlined dump_<name>() function which writes a value of
+// the given type to the given CSR, from where it can be recovered in the
+// simulation trace
+#define NAMED_DUMP(type, name, reg)                                           \
+    static __attribute__((always_inline)) inline void dump_##name(type val) { \
+        asm volatile("csrw " #reg ", %0" ::"rK"(val));                        \
+    }
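For illustration (not part of the change): `NAMED_DUMP` instantiates a typed logging helper per CSR. The CSR address 0x7C4, the name `idx`, and the include paths are placeholders for this sketch.

```c
#include "dump.h"
#include "snrt.h"

// Defines dump_idx(uint32_t): a single csrw to the (illustrative) CSR 0x7C4
NAMED_DUMP(uint32_t, idx, 0x7C4)

int main() {
    dump_idx(snrt_cluster_core_idx());  // value shows up in the trace
    return 0;
}
```

Compared to the fixed DUMP macro, which always targets CSR 0x7C3, this lets several independent values be told apart in the trace by their CSR address.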