Skip to content

Commit

Permalink
sw: Add synthetic tests for CAQ races and CAQ-FREP interaction
Browse files Browse the repository at this point in the history
  • Loading branch information
paulsc96 committed Feb 14, 2024
1 parent 275ac25 commit 41dc987
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 0 deletions.
99 changes: 99 additions & 0 deletions sw/tests/caq.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include <snrt.h>

#define NUM_WORKERS 8

/**
 * Synthetic test for races between integer and floating-point loads, stores
 * and atomics that target the same memory locations.
 *
 * The first NUM_WORKERS cores of cluster 0 each run the test on a private
 * stack buffer and then subtract a 4-bit result nibble (one bit per check,
 * 0b1111 when all checks pass) from a shared counter that core 0 initialized
 * to NUM_WORKERS * 0b1111.
 *
 * @return On core 0 of cluster 0: the remaining counter value, which is 0 iff
 *         every worker passed all four checks. On all other cores: 0.
 */
int main() {
    uint32_t core_id = snrt_cluster_core_idx();

    // Only use one cluster. Non-participating cores still have to join both
    // barriers that the workers execute below.
    if (snrt_cluster_idx() != 0 || core_id >= NUM_WORKERS) {
        snrt_cluster_hw_barrier();
        snrt_cluster_hw_barrier();
        return 0;
    }

    // Allocate and initialize common return for all cores
    volatile uint32_t *ret = (volatile uint32_t *)snrt_l1_next();
    if (core_id == 0) {
        *ret = NUM_WORKERS * 0b1111;
        asm volatile("fence" ::: "memory");
    }
    snrt_cluster_hw_barrier();

    // Allocate 8 doubles to work on on the stack; 4 inputs and 4 outputs.
    // Inputs are offset by the core index so that each core uses own values.
    volatile double work[8] = {3.4232857249561 + 0.565 * core_id,  // in0
                               2.3164242512938 + 0.565 * core_id,  // in1
                               8.3332613559798 + 0.565 * core_id,  // in2
                               5.6413213082822 + 0.565 * core_id,  // in3
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0};

    // Test integer-FP load-store races
    asm volatile(
        // Preload ft0 with in0
        "fld ft0, 0 (%[b]) \n"
        // Preload ft1 with in1
        "fld ft1, (1*8) (%[b]) \n"
        // Preload {t1, t0} with in2
        "lw t0, (2*8) (%[b]) \n"
        "lw t1, (2*8+4) (%[b]) \n"
        // Preload {t3, t2} with in3
        "lw t2, (3*8) (%[b]) \n"
        "lw t3, (3*8+4) (%[b]) \n"
        // Preload work[4] with in2 (x guard)
        "sw t0, (4*8) (%[b]) \n"
        "sw t1, (4*8+4) (%[b]) \n"
        // Preload work[5] with in3 (x guard)
        "sw t2, (5*8) (%[b]) \n"
        "sw t3, (5*8+4) (%[b]) \n"

        // FS -> IL race: {t1, t0} should contain in0 at end, *not* in2
        "fsd ft0, (4*8) (%[b]) \n"
        "lw t0, (4*8) (%[b]) \n"
        "lw t1, (4*8+4) (%[b]) \n"
        // FS -> IS race: work[4] should contain in0 at end, *not* in1 or in2
        "fsd ft1, (4*8) (%[b]) \n"
        "sw t0, (4*8) (%[b]) \n"
        "sw t1, (4*8+4) (%[b]) \n"
        // FL -> IS race: ft2 should contain in0 at end, *not* in3
        "fld ft2, (4*8) (%[b]) \n"
        "sw t2, (5*8) (%[b]) \n"
        "sw t3, (5*8+4) (%[b]) \n"
        // WB: work[5] should contain in0 at end, *not* in1, in2 or in3
        "fsd ft2, (5*8) (%[b]) \n"
        // FL -> Atomic race: AMOs modify memory!
        "fld ft2, (3*8) (%[b]) \n"
        "fsd ft2, (6*8) (%[b]) \n"
        "addi t0, %[b], (6*8) \n"
        "addi t1, zero, 0xF \n"
        // WB: work[7] should be in3 (unmutated) and work[6] in3 with mant.+0xF
        "fld ft2, (6*8) (%[b]) \n"
        "amoadd.w zero, t1, (t0) \n"
        "fsd ft2, (7*8) (%[b]) \n" ::[b] "r"(work)
        : "t0", "t1", "t2", "t3", "ft0", "ft1", "ft2", "memory");

    // Replicate AMO magic: add 0xF to the first 32-bit word of a copy of in3,
    // mirroring what the amoadd.w above does to the first word of work[6].
    volatile double tmp = work[3];
    volatile uint32_t *tmp_lo = (volatile uint32_t *)(void *)&tmp;
    *tmp_lo += 0xF;

    // Verify contents of output fields
    volatile uint32_t o0c = (work[4] == work[0]);
    volatile uint32_t o1c = (work[5] == work[0]);
    volatile uint32_t o2c = (work[6] == tmp);
    volatile uint32_t o3c = (work[7] == work[3]);

    // Compose, atomically add output nibble. Bit i carries check oic; bit 0
    // must use o0c so that the work[4] check contributes to the result.
    volatile uint32_t ret_loc =
        ((o3c & 1) << 3) | ((o2c & 1) << 2) | ((o1c & 1) << 1) | (o0c & 1);
    __atomic_fetch_add(ret, -ret_loc, __ATOMIC_RELAXED);

    // Let us see if all cores arrive here
    snrt_cluster_hw_barrier();
    return (core_id == 0 ? *ret : 0);
}
117 changes: 117 additions & 0 deletions sw/tests/caq_frep.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright 2020 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include <snrt.h>

#define NUM_WORKERS 8

/**
 * Synthetic test for races between integer loads/stores and floating-point
 * loads/stores that are issued from within FREP (frep.o / frep.i) loops.
 *
 * The first NUM_WORKERS cores of cluster 0 each run the test on a private
 * stack buffer and then subtract a 4-bit result nibble (one bit per check,
 * 0b1111 when all checks pass) from a shared counter that core 0 initialized
 * to NUM_WORKERS * 0b1111.
 *
 * @return On core 0 of cluster 0: the remaining counter value, which is 0 iff
 *         every worker passed all four checks. On all other cores: 0.
 */
int main() {
    uint32_t core_id = snrt_cluster_core_idx();

    // Only use one cluster. Non-participating cores still have to join both
    // barriers that the workers execute below.
    if (snrt_cluster_idx() != 0 || core_id >= NUM_WORKERS) {
        snrt_cluster_hw_barrier();
        snrt_cluster_hw_barrier();
        return 0;
    }

    // Allocate and initialize common return for all cores
    volatile uint32_t *ret = (volatile uint32_t *)snrt_l1_next();
    if (core_id == 0) {
        *ret = NUM_WORKERS * 0b1111;
        asm volatile("fence" ::: "memory");
    }
    snrt_cluster_hw_barrier();

    // Allocate 9 doubles to work on on the stack; 4 inputs and 5 outputs.
    // Inputs are offset by the core index so that each core uses own values.
    volatile double work[9] = {3.4232857249561 + 0.565 * core_id,  // in0
                               2.3164242512938 + 0.565 * core_id,  // in1
                               8.3332613559798 + 0.565 * core_id,  // in2
                               5.6413213082822 + 0.565 * core_id,  // in3
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0,
                               -1.0};

    // Test integer-FP load-store races using FREP
    asm volatile(
        // Preload t0-2 with zero
        "mv t0, zero \n"
        "mv t1, zero \n"
        "mv t2, zero \n"
        // Preload ft0-7 with in0-3 and in3-0 (reversed)
        "fld ft0, (0*8) (%[b]) \n"
        "fld ft1, (1*8) (%[b]) \n"
        "fld ft2, (2*8) (%[b]) \n"
        "fld ft3, (3*8) (%[b]) \n"
        "fld ft4, (3*8) (%[b]) \n"
        "fld ft5, (2*8) (%[b]) \n"
        "fld ft6, (1*8) (%[b]) \n"
        "fld ft7, (0*8) (%[b]) \n"
        // Fill buffer with float stores and issue integer loads at the same
        // time. The integer loads should *not* overtake the first (nonrepeat)
        // stores. The repeated stores overwriting the to-be-loaded data
        // *should* be overtaken and *not* block the (nonrepeated) integer
        // loads. In the end, the integer regs should contain the LSWs of
        // in0-2 and work[4:7] should contain in3-0 (reverse order).
        "frep.o %[c4], 4, 7, 0b0100 \n"
        "fsd ft0, (4*8) (%[b]) \n"
        "lw t0, (4*8) (%[b]) \n"
        "fsd ft1, (5*8) (%[b]) \n"
        "lw t1, (5*8) (%[b]) \n"
        "fsd ft2, (6*8) (%[b]) \n"
        "lw t2, (6*8) (%[b]) \n"
        "fsd ft3, (7*8) (%[b]) \n"
        // Synchronize to wait for FREP to conclude
        "fmv.x.w t4, ft3 \n"
        "mv zero, t4 \n"
        // We check the contents of t0-2 by overwriting the LSWs of work[7:5].
        // This should not change work[7:5] unless t0-2 are wrong.
        "sw t0, (7*8) (%[b]) \n"
        "sw t1, (6*8) (%[b]) \n"
        "sw t2, (5*8) (%[b]) \n"
        // Quick nonverifying check with a single-instruction FREP.I.
        // Make sure in trace this does not stall with different targets
        "frep.i %[c100], 1, 3, 0b001 \n"
        "fsd ft1, (8*8) (%[b]) \n"
        "lw t0, (8*8) (%[b]) \n"
        // We check FREP.I by repeatedly loading (for a sufficiently long time)
        // work[8], which we write in3 to using the integer core. The needed
        // instructions for the latter are issued *after* the FREP, but should
        // overtake repeated loads (at least the last), leading to a repeated
        // store of in3 in work[8] previously containing in1. We finally mutate
        // work[8] with itself to ensure our float store blocks the int load.
        "frep.i %[c100], 2, 3, 0b0 \n"
        "fld ft0, (8*8) (%[b]) \n"
        "lw t0, (3*8) (%[b]) \n"
        "lw t1, (3*8+4) (%[b]) \n"
        "sw t0, (8*8) (%[b]) \n"
        "sw t1, (8*8+4) (%[b]) \n"
        "fsd ft0, (8*8) (%[b]) \n"
        // Load LSW of just-stored work[8] into t0 to get in3, not in0.
        "lw t0, (8*8) (%[b]) \n"
        // Synchronize
        "fmv.x.w t4, ft0 \n"
        "mv zero, t4 \n"
        // Store t0 back to LSW of work[8] which should not mutate it.
        "sw t0, (8*8) (%[b]) \n" ::[b] "r"(work),
        [ c4 ] "r"(4), [ c100 ] "r"(100)
        // t4 is written by the fmv.x.w synchronization instructions above and
        // therefore has to be declared as clobbered as well.
        : "t0", "t1", "t2", "t3", "t4", "ft0", "ft1", "ft2", "ft3", "ft4",
          "ft5", "ft6", "ft7", "memory");

    // Verify contents of output fields
    volatile uint32_t o0c = (work[7] == work[0]);
    volatile uint32_t o1c = (work[6] == work[1]);
    volatile uint32_t o2c = (work[5] == work[2]);
    // NOTE(review): this check also passes whenever work[8] != in3, regardless
    // of work[4]; confirm that `||` with `!=` is the intended condition.
    volatile uint32_t o3c = (work[4] == work[3] || work[8] != work[3]);

    // Compose, atomically add output nibble. Bit i carries check oic; bit 0
    // must use o0c so that the work[7] check contributes to the result.
    volatile uint32_t ret_loc =
        ((o3c & 1) << 3) | ((o2c & 1) << 2) | ((o1c & 1) << 1) | (o0c & 1);
    __atomic_fetch_add(ret, -ret_loc, __ATOMIC_RELAXED);

    // Let us see if all cores arrive here
    snrt_cluster_hw_barrier();
    return (core_id == 0 ? *ret : 0);
}
3 changes: 3 additions & 0 deletions target/snitch_cluster/sw/run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ runs:
- elf: tests/build/zero_mem.elf
- elf: tests/build/non_null_exitcode.elf
retcode: 126
- elf: tests/build/caq.elf
- elf: tests/build/caq_frep.elf
simulators: [vsim, vcs, verilator] # banshee does not precisely simulate FREP timing
- elf: apps/blas/axpy/build/axpy.elf
cmd: [../../../sw/blas/axpy/verify.py, "${sim_bin}", "${elf}"]
- elf: apps/blas/gemm/build/gemm.elf
Expand Down

0 comments on commit 41dc987

Please sign in to comment.