sw: Add synthetic tests for CAQ races and CAQ-FREP interaction
paulsc96 committed Feb 14, 2024
1 parent 275ac25 commit 2534b15
Showing 3 changed files with 219 additions and 0 deletions.
99 changes: 99 additions & 0 deletions sw/tests/caq.c
@@ -0,0 +1,99 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include <snrt.h>

#define NUM_WORKERS 8

int main() {
    uint32_t core_id = snrt_cluster_core_idx();

    // Only use one cluster
    if (snrt_cluster_idx() != 0 || core_id >= NUM_WORKERS) {
        snrt_cluster_hw_barrier();
        snrt_cluster_hw_barrier();
        return 0;
    }

    // Allocate and initialize common return for all cores
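    // (At the end, each worker atomically subtracts its 4-bit check nibble
    // from this value, so a final value of 0 means all checks passed on all
    // NUM_WORKERS cores.)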
    volatile uint32_t *ret = (volatile uint32_t *)snrt_l1_next();
    if (core_id == 0) {
        *ret = NUM_WORKERS * 0b1111;
        asm volatile("fence" ::: "memory");
    }
    snrt_cluster_hw_barrier();

    // Allocate 8 doubles on the stack to work on; 4 inputs and 4 outputs
    volatile double work[8] = {3.4232857249561 + 0.565 * core_id,  // in0
                               2.3164242512938 + 0.565 * core_id,  // in1
                               8.3332613559798 + 0.565 * core_id,  // in2
                               5.6413213082822 + 0.565 * core_id,  // in3
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0};

    // Test integer-FP load-store races
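    // Shorthand in the comments below: FS/FL = float store/load, IS/IL =
    // integer store/load; WB marks stores writing results back for checking.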
    asm volatile(
        // Preload ft0 with in0
        "fld ft0, 0 (%[b]) \n"
        // Preload ft1 with in1
        "fld ft1, (1*8) (%[b]) \n"
        // Preload {t1, t0} with in2
        "lw t0, (2*8) (%[b]) \n"
        "lw t1, (2*8+4) (%[b]) \n"
        // Preload {t3, t2} with in3
        "lw t2, (3*8) (%[b]) \n"
        "lw t3, (3*8+4) (%[b]) \n"
        // Preload work[4] with in2 (x guard)
        "sw t0, (4*8) (%[b]) \n"
        "sw t1, (4*8+4) (%[b]) \n"
        // Preload work[5] with in3 (x guard)
        "sw t2, (5*8) (%[b]) \n"
        "sw t3, (5*8+4) (%[b]) \n"

        // FS -> IL race: {t1, t0} should contain in0 at end, *not* in2
        "fsd ft0, (4*8) (%[b]) \n"
        "lw t0, (4*8) (%[b]) \n"
        "lw t1, (4*8+4) (%[b]) \n"
        // FS -> IS race: work[4] should contain in0 at end, *not* in1 or in2
        "fsd ft1, (4*8) (%[b]) \n"
        "sw t0, (4*8) (%[b]) \n"
        "sw t1, (4*8+4) (%[b]) \n"
        // FL -> IS race: ft2 should contain in0 at end, *not* in3
        "fld ft2, (4*8) (%[b]) \n"
        "sw t2, (5*8) (%[b]) \n"
        "sw t3, (5*8+4) (%[b]) \n"
        // WB: work[5] should contain in0 at end, *not* in1, in2 or in3
        "fsd ft2, (5*8) (%[b]) \n"
        // FL -> Atomic race: AMOs modify memory!
        "fld ft2, (3*8) (%[b]) \n"
        "fsd ft2, (6*8) (%[b]) \n"
        "addi t0, %[b], (6*8) \n"
        "addi t1, zero, 0xF \n"
        // WB: work[7] should be in3 (unmutated) and work[6] in3 with mant.+0xF
        "fld ft2, (6*8) (%[b]) \n"
        "amoadd.w zero, t1, (t0) \n"
        "fsd ft2, (7*8) (%[b]) \n" ::[b] "r"(work)
        : "t0", "t1", "t2", "t3", "ft0", "ft1", "ft2", "memory");

    // Replicate AMO magic
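    // (The amoadd.w above added 0xF to the low word of in3's memory image,
    // i.e. the mantissa LSBs of a little-endian double; recompute that
    // expected value here.)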
    volatile double tmp = work[3];
    volatile uint32_t *tmp_lo = (volatile uint32_t *)(void *)&tmp;
    *tmp_lo += 0xF;

    // Verify contents of output fields
    volatile uint32_t o0c = (work[4] == work[0]);
    volatile uint32_t o1c = (work[5] == work[0]);
    volatile uint32_t o2c = (work[6] == tmp);
    volatile uint32_t o3c = (work[7] == work[3]);

    // Compose, atomically add output nibble
    volatile uint32_t ret_loc =
        ((o3c & 1) << 3) | ((o2c & 1) << 2) | ((o1c & 1) << 1) | (o0c & 1);
    __atomic_fetch_add(ret, -ret_loc, __ATOMIC_RELAXED);

    // Let us see if all cores arrive here
    snrt_cluster_hw_barrier();
    return (core_id == 0 ? *ret : 0);
}
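
For reference, the "replicate AMO magic" step can also be written without the pointer cast. Below is a minimal, hypothetical sketch (not part of the commit) of the same expected-value computation, assuming little-endian IEEE-754 doubles as on RISC-V:

#include <stdint.h>
#include <string.h>

// Hypothetical helper: computes the value amoadd.w should leave in work[6],
// i.e. in3 with 0xF added to the least-significant word of its memory image
// (carry-outs past bit 31 are dropped, as with a 32-bit AMO).
static double amo_expected(double in3) {
    uint64_t bits;
    memcpy(&bits, &in3, sizeof(bits));           // type-pun via memcpy
    uint32_t lo = (uint32_t)bits + 0xF;          // what amoadd.w does to the LSW
    bits = (bits & 0xFFFFFFFF00000000ULL) | lo;  // splice the mutated LSW back
    memcpy(&in3, &bits, sizeof(in3));
    return in3;
}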
117 changes: 117 additions & 0 deletions sw/tests/caq_frep.c
@@ -0,0 +1,117 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include <snrt.h>

#define NUM_WORKERS 8

int main() {
    uint32_t core_id = snrt_cluster_core_idx();

    // Only use one cluster
    if (snrt_cluster_idx() != 0 || core_id >= NUM_WORKERS) {
        snrt_cluster_hw_barrier();
        snrt_cluster_hw_barrier();
        return 0;
    }

    // Allocate and initialize common return for all cores
    volatile uint32_t *ret = (volatile uint32_t *)snrt_l1_next();
    if (core_id == 0) {
        *ret = NUM_WORKERS * 0b1111;
        asm volatile("fence" ::: "memory");
    }
    snrt_cluster_hw_barrier();

    // Allocate 9 doubles on the stack to work on; 4 inputs and 5 outputs
    volatile double work[9] = {3.4232857249561 + 0.565 * core_id,  // in0
                               2.3164242512938 + 0.565 * core_id,  // in1
                               8.3332613559798 + 0.565 * core_id,  // in2
                               5.6413213082822 + 0.565 * core_id,  // in3
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0};

    // Test integer-FP load-store races using FREP
    asm volatile(
        // Preload t0-2 with zero
        "mv t0, zero \n"
        "mv t1, zero \n"
        "mv t2, zero \n"
        // Preload ft0-7 with in0-3 and in3-0 (reversed)
        "fld ft0, (0*8) (%[b]) \n"
        "fld ft1, (1*8) (%[b]) \n"
        "fld ft2, (2*8) (%[b]) \n"
        "fld ft3, (3*8) (%[b]) \n"
        "fld ft4, (3*8) (%[b]) \n"
        "fld ft5, (2*8) (%[b]) \n"
        "fld ft6, (1*8) (%[b]) \n"
        "fld ft7, (0*8) (%[b]) \n"
        // Fill buffer with float stores and issue integer loads at the same
        // time. The integer loads should *not* overtake the first
        // (non-repeated) stores. The repeated stores overwriting the
        // to-be-loaded data *should* be overtaken and *not* block the
        // (non-repeated) integer loads. In the end, the integer regs should
        // contain the LSWs of in0-2 and work[4:7] should contain in3-0
        // (reverse order).
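        // (FREP operands, for reference: c4 = 4 repeats the following 4 FP
        // instructions 5 times; stagger bound 7 with mask 0b0100 bumps the
        // staggered register index each iteration, so the final pass stores
        // ft4-ft7, i.e. in3-0.)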
"frep.o %[c4], 4, 7, 0b0100 \n"
"fsd ft0, (4*8) (%[b]) \n"
"lw t0, (4*8) (%[b]) \n"
"fsd ft1, (5*8) (%[b]) \n"
"lw t1, (5*8) (%[b]) \n"
"fsd ft2, (6*8) (%[b]) \n"
"lw t2, (6*8) (%[b]) \n"
"fsd ft3, (7*8) (%[b]) \n"
// Synchronize to wait for FREP to conclude
"fmv.x.w t4, ft3 \n"
"mv zero, t4 \n"
        // We check the contents of t0-2 by overwriting the LSWs of work[7:5].
        // This should not change work[7:5] unless t0-2 are wrong.
        "sw t0, (7*8) (%[b]) \n"
        "sw t1, (6*8) (%[b]) \n"
        "sw t2, (5*8) (%[b]) \n"
        // Quick nonverifying check with a single-instruction FREP.I; make
        // sure in the trace that this does not stall despite its differing
        // (staggered) targets.
        "frep.i %[c100], 1, 3, 0b001 \n"
"fsd ft1, (8*8) (%[b]) \n"
"lw t0, (8*8) (%[b]) \n"
// We check FREP.I by repeatedly loading (for a sufficiently long time)
// work[8], which we write in3 to using the integer core. The needed
// instructions for the latter are issued *after* the FREP, but should
// overtake repeated loads (at least the last), leading to a repeated
// store of in3 in work[8] previously containing in1. We Finally mutate
// work[8] with itself to ensure our float store blocks the int load.
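        // (Note: only the FP instructions below, fld and fsd, are repeated
        // by the sequencer; the interleaved integer loads and stores execute
        // exactly once on the integer core.)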
"frep.i %[c100], 2, 3, 0b0 \n"
"fld ft0, (8*8) (%[b]) \n"
"lw t0, (3*8) (%[b]) \n"
"lw t1, (3*8+4) (%[b]) \n"
"sw t0, (8*8) (%[b]) \n"
"sw t1, (8*8+4) (%[b]) \n"
"fsd ft0, (8*8) (%[b]) \n"
// Load LSW of just-stored work[8] into t0 to get in3, not in0.
"lw t0, (8*8) (%[b]) \n"
// Synchronize
"fmv.x.w t4, ft0 \n"
"mv zero, t4 \n"
// Store t0 back to LSW of work[8] which should not mutate it.
"sw t0, (8*8) (%[b]) \n" ::[b] "r"(work),
[ c4 ] "r"(4), [ c100 ] "r"(100)
: "t0", "t1", "t2", "t3", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5",
"ft6", "ft7", "memory");

    // Verify contents of output fields
    volatile uint32_t o0c = (work[7] == work[0]);
    volatile uint32_t o1c = (work[6] == work[1]);
    volatile uint32_t o2c = (work[5] == work[2]);
    volatile uint32_t o3c = (work[4] == work[3] && work[8] == work[3]);

    // Compose, atomically add output nibble
    volatile uint32_t ret_loc =
        ((o3c & 1) << 3) | ((o2c & 1) << 2) | ((o1c & 1) << 1) | (o0c & 1);
    __atomic_fetch_add(ret, -ret_loc, __ATOMIC_RELAXED);

    // Let us see if all cores arrive here
    snrt_cluster_hw_barrier();
    return (core_id == 0 ? *ret : 0);
}
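
The fmv.x.w/mv pair above is the tests' idiom for synchronizing the integer core with the FP subsystem. A standalone, hypothetical sketch of the same idiom (helper name ours, not part of the commit; assumes Snitch's in-order FP sequencer and scoreboarded integer core):

// Hypothetical helper: reading an FP register into an integer register
// stalls the integer core until the FP sequencer has processed everything
// issued before it, including any outstanding FREP bodies.
static inline void wait_for_fpu(void) {
    asm volatile(
        "fmv.x.w t4, ft3 \n"  // passes through the FP sequencer in order
        "mv zero, t4 \n"      // integer core stalls here until t4 arrives
        ::: "t4", "memory");
}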
3 changes: 3 additions & 0 deletions target/snitch_cluster/sw/run.yaml
@@ -69,6 +69,9 @@ runs:
  - elf: tests/build/zero_mem.elf
  - elf: tests/build/non_null_exitcode.elf
    retcode: 126
  - elf: tests/build/caq.elf
  - elf: tests/build/caq_frep.elf
    simulators: [vsim, vcs, verilator]  # banshee does not model FREP timing
  - elf: apps/blas/axpy/build/axpy.elf
    cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"]
  - elf: apps/blas/gemm/build/gemm.elf
