sw: Sharpen CAQ tests, ensure full failure on pre-CAQ HW
paulsc96 authored and colluca committed Feb 28, 2024
1 parent db3e00c commit 8b26e42
Showing 2 changed files with 94 additions and 23 deletions.
41 changes: 34 additions & 7 deletions sw/tests/caq.c
@@ -1,10 +1,23 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Without CAQ, all checks in all cores should fail (return 8*0b1111 == 120).
// With CAQ, all checks in all cores should pass (return 8*0b0000 == 0).
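// Each of the 8 worker cores contributes a 4-bit pass mask that is
// atomically subtracted from the initial value (see the composition and
// atomic add at the end of main).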

#include <snrt.h>

#define NUM_WORKERS 8

// To prevent X reads on non-CAQ-proofed systems, we need a sync
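// (The fmv.x.w below creates a data dependency from the FPU to the integer
// core, the dummy mv consumes the result, and the fence orders outstanding
// memory accesses; this mirrors the runtime's usual FPU-fence idiom.)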
inline void fp_sync() {
asm volatile(
"fmv.x.w t0, ft3 \n"
"mv zero, t0 \n"
"fence \n" ::
: "t0", "memory");
}

int main() {
uint32_t core_id = snrt_cluster_core_idx();

@@ -33,6 +46,9 @@ int main() {
-1.0,
-1.0};

// Ensure FP data is written even without CAQ (prevents X loads)
fp_sync();

// Test integer-FP load-store races
asm volatile(
// Preload ft0 with in0
@@ -71,22 +87,33 @@ int main() {
"fsd ft2, (6*8) (%[b]) \n"
"addi t0, %[b], (6*8) \n"
"addi t1, zero, 0xF \n"
// WB: work[7] should be in3 (unmutated) and work[6] in3 with mant.+0xF
"fld ft2, (6*8) (%[b]) \n"
"amoadd.w zero, t1, (t0) \n"
"fsd ft2, (7*8) (%[b]) \n" ::[b] "r"(work)
// Stall-spam sequencer: ensures fld happens *after* atomic without CAQ
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
// WB: work[7] should be in1 (unmutated) and work[6] in1 with mant.+0xF
"fsd ft1, (6*8) (%[b]) \n"
"amoadd.w t2, t1, (t0) \n"
"fsd ft1, (7*8) (%[b]) \n"
// Sync before AMO writeback to prevent race with fsd without CAQ
"fmv.x.w t0, ft3 \n"
"mv zero, t0 \n"
"sw t2, (7*8) (%[b]) \n" ::[b] "r"(work)
: "t0", "t1", "t2", "t3", "ft0", "ft1", "ft2", "memory");

// Replicate AMO magic
volatile double tmp = work[3];
// Replicate AMO magic (with necessary syncs)
volatile double tmp = work[1];
fp_sync();
volatile uint32_t *tmp_lo = (volatile uint32_t *)(void *)&tmp;
*tmp_lo += 0xF;
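// (RISC-V is little-endian, so tmp_lo aliases the mantissa LSBs of tmp and
// the += 0xF mirrors the amoadd.w on the low word of work[6] above.)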
fp_sync();

// Verify contents of output fields
volatile uint32_t o0c = (work[4] == work[0]);
volatile uint32_t o1c = (work[5] == work[0]);
volatile uint32_t o2c = (work[6] == tmp);
volatile uint32_t o3c = (work[7] == work[3]);
volatile uint32_t o3c = (work[7] == work[1]);

// Compose, atomically add output nibble
volatile uint32_t ret_loc =
76 changes: 60 additions & 16 deletions sw/tests/caq_frep.c
@@ -1,10 +1,23 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Without CAQ, all checks in all cores should fail (return 8*0b11111 == 248).
// With CAQ, all checks in all cores should pass (return 8*0b00000 == 0).

#include <snrt.h>

#define NUM_WORKERS 8

// To prevent X reads on non-CAQ-proofed systems, we need a sync
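// (Same core/FPU handshake as fp_sync in caq.c: fmv.x.w stalls the integer
// core until the FPU has caught up, and the fence orders memory accesses.)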
inline void fp_sync() {
asm volatile(
"fmv.x.w t0, ft3 \n"
"mv zero, t0 \n"
"fence \n" ::
: "t0", "memory");
}

int main() {
uint32_t core_id = snrt_cluster_core_idx();

@@ -18,7 +31,7 @@ int main() {
// Allocate and initialize common return for all cores
volatile uint32_t *ret = (volatile uint32_t *)snrt_l1_next();
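// (snrt_l1_next() points into cluster-shared L1/TCDM, so all worker cores
// see and update the same return word.)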
if (core_id == 0) {
*ret = NUM_WORKERS * 0b1111;
*ret = NUM_WORKERS * 0b11111;
asm volatile("fence" ::: "memory");
}
snrt_cluster_hw_barrier();
@@ -34,6 +47,9 @@ int main() {
-1.0,
-1.0};

// Ensure FP data is written even without CAQ (prevents X loads)
fp_sync();

// Test integer-FP load-store races using FREP
asm volatile(
// Preload t0-2 with zero
@@ -49,6 +65,11 @@
"fld ft5, (2*8) (%[b]) \n"
"fld ft6, (1*8) (%[b]) \n"
"fld ft7, (0*8) (%[b]) \n"
// Stall-spam sequencer: ensures fsd's happen *after* lw's without CAQ
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
// Fill buffer with float stores and issue integer loads at the same
// time. The integer loads should *not* overtake the first (non-repeated)
// stores. The repeated stores overwriting the to-be-loaded data
@@ -64,8 +85,11 @@
"lw t2, (6*8) (%[b]) \n"
"fsd ft3, (7*8) (%[b]) \n"
// Synchronize to wait for FREP to conclude
"fmv.x.w t4, ft3 \n"
"mv zero, t4 \n"
"fmv.x.w t3, ft3 \n"
"mv zero, t3 \n"
// Preload t3-t4 with in1
"lw t3, (1*8) (%[b]) \n"
"lw t4, (1*8+4) (%[b]) \n"
// We check the contents of t0-2 by overwriting the LSWs of work[7:5].
// This should not change work[7:5] unless t0-2 are wrong.
"sw t0, (7*8) (%[b]) \n"
@@ -74,41 +98,61 @@
// Quick non-verifying check with a single-instruction FREP.I.
// Make sure in trace this does not stall with different targets
"frep.i %[c100], 1, 3, 0b001 \n"
"fsd ft1, (8*8) (%[b]) \n"
"lw t0, (8*8) (%[b]) \n"
"fsd ft1, (4*8) (%[b]) \n"
"lw t0, (4*8) (%[b]) \n"
// Stall-spam sequencer to ensure FPSS is behind on execution
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
"fmadd.d ft0, ft2, ft3, ft0 \n"
// We check FREP.I by repeatedly loading (for a sufficiently long time)
// work[8], which we write in3 to using the integer core. The needed
// work[4], which we write in3 to using the integer core. The needed
// instructions for the latter are issued *after* the FREP, but should
// overtake repeated loads (at least the last), leading to a repeated
// store of in3 in work[8], previously containing in1. We finally mutate
// work[8] with itself to ensure our float store blocks the int load.
"fsd ft1, (8*8) (%[b]) \n"
"frep.i %[c100], 2, 3, 0b0 \n"
"fld ft0, (8*8) (%[b]) \n"
"fld ft0, (4*8) (%[b]) \n"
"lw t0, (3*8) (%[b]) \n"
"lw t1, (3*8+4) (%[b]) \n"
"sw t0, (8*8) (%[b]) \n"
"sw t1, (8*8+4) (%[b]) \n"
"sw t0, (4*8) (%[b]) \n"
"sw t1, (4*8+4) (%[b]) \n"
"fsd ft0, (8*8) (%[b]) \n"
// Try to spoil later store of in1 in work[4] if core skips FREP.
"fsd ft3, (4*8) (%[b]) \n"
// Load LSW of just-stored work[8] into t0 to get in3, not in0.
"lw t0, (8*8) (%[b]) \n"
// Store to work[4] to observe possible reorder of next step.
// If this goes wrong, work[4] will contain in3 in the end, not in1.
"fsd ft3, (8*8) (%[b]) \n"
// Store in1 to work[4] with integer core. If we have no CAQ,
// we skip past the FREP and this results in incorrect work[8].
// We store only the LSW to ensure this happens after the prior fsd.
"sw t3, (4*8) (%[b]) \n"
"sw t4, (4*8+4) (%[b]) \n"
// Synchronize
"fmv.x.w t4, ft0 \n"
"mv zero, t4 \n"
"fmv.x.w t3, ft0 \n"
"mv zero, t3 \n"
// Store t0 back to LSW of work[8] which should not mutate it.
"sw t0, (8*8) (%[b]) \n" ::[b] "r"(work),
[ c4 ] "r"(4), [ c100 ] "r"(100)
: "t0", "t1", "t2", "t3", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5",
"ft6", "ft7", "memory");
: "t0", "t1", "t2", "t3", "t4", "ft0", "ft1", "ft2", "ft3", "ft4",
"ft5", "ft6", "ft7", "memory");

// Ensure integer stores are written
fp_sync();

// Verify contents of output fields
volatile uint32_t o0c = (work[7] == work[0]);
volatile uint32_t o1c = (work[6] == work[1]);
volatile uint32_t o2c = (work[5] == work[2]);
volatile uint32_t o3c = (work[4] == work[3] || work[8] != work[3]);
volatile uint32_t o3c = (work[4] == work[1]);
volatile uint32_t o4c = (work[8] == work[3]);

// Compose, atomically add output nibble
volatile uint32_t ret_loc =
((o3c & 1) << 3) | ((o2c & 1) << 2) | ((o1c & 1) << 1) | (o1c & 1);
volatile uint32_t ret_loc = ((o4c & 1) << 4) | ((o3c & 1) << 3) |
((o2c & 1) << 2) | ((o1c & 1) << 1) | (o0c & 1);
__atomic_fetch_add(ret, -ret_loc, __ATOMIC_RELAXED);
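// Each core subtracts its pass mask from the initial 8*0b11111, so the
// final value is 0 exactly when every check passed on every core.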

// Let us see if all cores arrive here
