From 01dec6fb6f1fb735faede453f32aeef5364d6230 Mon Sep 17 00:00:00 2001
From: Stephen Street <stephen@redrocketcomputing.com>
Date: Mon, 8 Jul 2024 16:38:06 -0700
Subject: [PATCH] Add C11 standard atomic support (#1645)

* Add runtime support for stdatomics

* Fix lock calculation and enable atomic_flag support
---
 src/rp2_common/CMakeLists.txt              |   1 +
 src/rp2_common/pico_atomic/CMakeLists.txt  |  11 +
 .../pico_atomic/include/stdatomic.h        |  27 ++
 src/rp2_common/pico_atomic/pico_atomic.c   | 345 ++++++++++++++++++
 src/rp2_common/pico_runtime/CMakeLists.txt |   3 +
 5 files changed, 387 insertions(+)
 create mode 100644 src/rp2_common/pico_atomic/CMakeLists.txt
 create mode 100644 src/rp2_common/pico_atomic/include/stdatomic.h
 create mode 100644 src/rp2_common/pico_atomic/pico_atomic.c

diff --git a/src/rp2_common/CMakeLists.txt b/src/rp2_common/CMakeLists.txt
index 82f56f8c5..19c52c146 100644
--- a/src/rp2_common/CMakeLists.txt
+++ b/src/rp2_common/CMakeLists.txt
@@ -51,6 +51,7 @@ if (NOT PICO_BARE_METAL)
     pico_add_subdirectory(pico_malloc)
     pico_add_subdirectory(pico_printf)
     pico_add_subdirectory(pico_rand)
+    pico_add_subdirectory(pico_atomic)
 
     pico_add_subdirectory(pico_stdio)
     pico_add_subdirectory(pico_stdio_semihosting)
diff --git a/src/rp2_common/pico_atomic/CMakeLists.txt b/src/rp2_common/pico_atomic/CMakeLists.txt
new file mode 100644
index 000000000..5670430eb
--- /dev/null
+++ b/src/rp2_common/pico_atomic/CMakeLists.txt
@@ -0,0 +1,11 @@
+if (NOT TARGET pico_atomic)
+    pico_add_library(pico_atomic)
+
+    target_sources(pico_atomic INTERFACE
+            ${CMAKE_CURRENT_LIST_DIR}/pico_atomic.c
+    )
+
+    target_include_directories(pico_atomic_headers INTERFACE ${CMAKE_CURRENT_LIST_DIR}/include)
+
+    target_link_libraries(pico_atomic INTERFACE pico_sync)
+endif()
diff --git a/src/rp2_common/pico_atomic/include/stdatomic.h b/src/rp2_common/pico_atomic/include/stdatomic.h
new file mode 100644
index 000000000..071286f0b
--- /dev/null
+++ b/src/rp2_common/pico_atomic/include/stdatomic.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ * Copyright (c) 2024 Stephen Street (stephen@redrocketcomputing.com).
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef __STDATOMIC_H
+#define __STDATOMIC_H
+
+#include_next <stdatomic.h>
+
+#undef atomic_flag_test_and_set
+#undef atomic_flag_test_and_set_explicit
+#undef atomic_flag_clear
+#undef atomic_flag_clear_explicit
+
+extern _Bool __atomic_test_and_set_m0(volatile void *mem, int model);
+extern void __atomic_clear_m0(volatile void *mem, int model);
+
+#define atomic_flag_test_and_set(PTR) __atomic_test_and_set_m0((PTR), __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(PTR, MO) __atomic_test_and_set_m0((PTR), (MO))
+
+#define atomic_flag_clear(PTR) __atomic_clear_m0((PTR), __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(PTR, MO) __atomic_clear_m0((PTR), (MO))
+
+#endif
diff --git a/src/rp2_common/pico_atomic/pico_atomic.c b/src/rp2_common/pico_atomic/pico_atomic.c
new file mode 100644
index 000000000..8dd6b9685
--- /dev/null
+++ b/src/rp2_common/pico_atomic/pico_atomic.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ * Copyright (c) 2024 Stephen Street (stephen@redrocketcomputing.com).
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "hardware/address_mapped.h"
+#include "hardware/regs/watchdog.h"
+#include "hardware/sync.h"
+
+#include "pico/config.h"
+
+#ifndef __optimize
+#define __optimize __attribute__((optimize("-Os")))
+#endif
+
+/* Must be powers of 2 */
+#define ATOMIC_STRIPE 4UL
+#define ATOMIC_LOCKS 16UL
+#define ATOMIC_LOCK_WIDTH 2UL
+#define ATOMIC_LOCK_IDX_Pos ((sizeof(unsigned long) * 8) - (__builtin_clz(ATOMIC_STRIPE - 1)))
+#define ATOMIC_LOCK_IDX_Msk (ATOMIC_LOCKS - 1UL)
+#define ATOMIC_LOCK_REG ((io_rw_32 *)(WATCHDOG_BASE + WATCHDOG_SCRATCH3_OFFSET))
+
+static __used __attribute__((section(".preinit_array.00030"))) void __atomic_init(void) {
+    *ATOMIC_LOCK_REG = 0;
+}
+
+/*
+    To eliminate interference with existing hardware spinlock usage and to reduce multicore contention
+    on unique atomic variables, we use one of the watchdog scratch registers (WATCHDOG_SCRATCH3) to
+    implement 16 two-bit multicore locks, via a variation of Dekker's algorithm
+    (see https://en.wikipedia.org/wiki/Dekker%27s_algorithm). The lock is selected as a
+    function of the variable address and the stripe width, which hashes variable
+    addresses to lock numbers.
+*/
+static __optimize uint32_t __atomic_lock(volatile void *mem) {
+    const uint32_t core = get_core_num();
+    const uint32_t lock_idx = (((uintptr_t)mem) >> ATOMIC_LOCK_IDX_Pos) & ATOMIC_LOCK_IDX_Msk;
+    const uint32_t lock_pos = lock_idx * ATOMIC_LOCK_WIDTH;
+    const uint32_t lock_mask = ((1UL << ATOMIC_LOCK_WIDTH) - 1) << lock_pos;
+    const uint32_t locked_mask = 1UL << (lock_pos + core);
+
+    uint32_t state = save_and_disable_interrupts();
+    while (true) {
+
+        /* First set the bit */
+        hw_set_bits(ATOMIC_LOCK_REG, locked_mask);
+        __dmb();
+
+        /* Did we get the lock? */
+        if ((*ATOMIC_LOCK_REG & lock_mask) == locked_mask)
+            break;
+
+        /* Nope, clear our side */
+        __dmb();
+        hw_clear_bits(ATOMIC_LOCK_REG, locked_mask);
+
+        /* Need to break any ties if the cores are in lock step, is this really required?
*/ + for (uint32_t i = core * 2; i > 0; --i) + asm volatile ("nop"); + } + + return state; +} + +static __optimize void __atomic_unlock(volatile void *mem, uint32_t state) { + const uint32_t lock_idx = (((uintptr_t)mem) >> ATOMIC_LOCK_IDX_Pos) & ATOMIC_LOCK_IDX_Msk; + const uint32_t lock_pos = lock_idx * ATOMIC_LOCK_WIDTH; + const uint32_t locked_mask = 1UL << (lock_pos + get_core_num()); + + __dmb(); + hw_clear_bits(ATOMIC_LOCK_REG, locked_mask); + restore_interrupts(state); +} + +__optimize uint8_t __atomic_fetch_add_1(volatile void *mem, uint8_t val, __unused int model) { + volatile uint8_t *ptr = mem; + uint8_t state = __atomic_lock(mem); + uint8_t result = *ptr; + *ptr += val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint8_t __atomic_fetch_sub_1(volatile void *mem, uint8_t val, __unused int model) { + volatile uint8_t *ptr = mem; + uint8_t state = __atomic_lock(mem); + uint8_t result = *ptr; + *ptr -= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint8_t __atomic_fetch_and_1(volatile void *mem, uint8_t val, __unused int model) { + volatile uint8_t *ptr = mem; + uint8_t state = __atomic_lock(mem); + uint8_t result = *ptr; + *ptr &= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint8_t __atomic_fetch_or_1(volatile void *mem, uint8_t val, __unused int model) { + volatile uint8_t *ptr = mem; + uint8_t state = __atomic_lock(mem); + uint8_t result = *ptr; + *ptr |= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint8_t __atomic_exchange_1(volatile void *mem, uint8_t val, __unused int model) { + volatile uint8_t *ptr = mem; + uint8_t state = __atomic_lock(mem); + uint8_t result = *ptr; + *ptr = val; + __atomic_unlock(mem, state); + return result; +} + +__optimize bool __atomic_compare_exchange_1(volatile void *mem, void *expected, uint8_t desired, __unused bool weak, __unused int success, __unused int failure) { + bool result = false; + volatile uint8_t *ptr = mem; + uint8_t *e_ptr = expected; + uint8_t state = __atomic_lock(mem); + if (*ptr == *e_ptr) { + *ptr = desired; + result = true; + } else + *e_ptr = *ptr; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint16_t __atomic_fetch_add_2(volatile void *mem, uint16_t val, __unused int model) { + volatile uint16_t *ptr = mem; + uint16_t state = __atomic_lock(mem); + uint16_t result = *ptr; + *ptr += val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint16_t __atomic_fetch_sub_2(volatile void *mem, uint16_t val, __unused int model) { + volatile uint16_t *ptr = mem; + uint16_t state = __atomic_lock(mem); + uint16_t result = *ptr; + *ptr -= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint16_t __atomic_fetch_and_2(volatile void *mem, uint16_t val, __unused int model) { + volatile uint16_t *ptr = mem; + uint16_t state = __atomic_lock(mem); + uint16_t result = *ptr; + *ptr &= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint16_t __atomic_fetch_or_2(volatile void *mem, uint16_t val, __unused int model) { + volatile uint16_t *ptr = mem; + uint16_t state = __atomic_lock(mem); + uint16_t result = *ptr; + *ptr |= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint16_t __atomic_exchange_2(volatile void *mem, uint16_t val, __unused int model) { + volatile uint16_t *ptr = mem; + uint16_t state = __atomic_lock(mem); + uint16_t result = *ptr; + *ptr = val; + __atomic_unlock(mem, state); + return result; +} + +__optimize bool 
__atomic_compare_exchange_2(volatile void *mem, void *expected, uint16_t desired, __unused bool weak, __unused int success, __unused int failure) { + bool result = false; + volatile uint16_t *ptr = mem; + uint16_t *e_ptr = expected; + uint16_t state = __atomic_lock(mem); + if (*ptr == *e_ptr) { + *ptr = desired; + result = true; + } else + *e_ptr = *ptr; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint32_t __atomic_fetch_add_4(volatile void *mem, uint32_t val, __unused int model) { + volatile uint32_t *ptr = mem; + uint32_t state = __atomic_lock(mem); + uint32_t result = *ptr; + *ptr += val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint32_t __atomic_fetch_sub_4(volatile void *mem, uint32_t val, __unused int model) { + volatile uint32_t *ptr = mem; + uint32_t state = __atomic_lock(mem); + uint32_t result = *ptr; + *ptr -= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint32_t __atomic_fetch_and_4(volatile void *mem, uint32_t val, __unused int model) { + volatile uint32_t *ptr = mem; + uint32_t state = __atomic_lock(mem); + uint32_t result = *ptr; + *ptr &= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint32_t __atomic_fetch_or_4(volatile void *mem, uint32_t val, __unused int model) { + volatile uint32_t *ptr = mem; + uint32_t state = __atomic_lock(mem); + uint32_t result = *ptr; + *ptr |= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint32_t __atomic_exchange_4(volatile void *mem, uint32_t val, __unused int model) { + volatile uint32_t *ptr = mem; + uint32_t state = __atomic_lock(mem); + uint32_t result = *ptr; + *ptr = val; + __atomic_unlock(mem, state); + return result; +} + +__optimize bool __atomic_compare_exchange_4(volatile void *mem, void *expected, uint32_t desired, __unused bool weak, __unused int success, __unused int failure) { + bool result = false; + volatile uint32_t *ptr = mem; + uint32_t *e_ptr = expected; + uint32_t state = __atomic_lock(mem); + if (*ptr == *e_ptr) { + *ptr = desired; + result = true; + } else + *e_ptr = *ptr; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint64_t __atomic_fetch_add_8(volatile void *mem, uint64_t val, __unused int model) { + volatile uint64_t *ptr = mem; + uint64_t state = __atomic_lock(mem); + uint64_t result = *ptr; + *ptr += val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint64_t __atomic_fetch_sub_8(volatile void *mem, uint64_t val, __unused int model) { + volatile uint64_t *ptr = mem; + uint64_t state = __atomic_lock(mem); + uint64_t result = *ptr; + *ptr -= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint64_t __atomic_fetch_and_8(volatile void *mem, uint64_t val, __unused int model) { + volatile uint64_t *ptr = mem; + uint64_t state = __atomic_lock(mem); + uint64_t result = *ptr; + *ptr &= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint64_t __atomic_fetch_or_8(volatile void *mem, uint64_t val, __unused int model) { + volatile uint64_t *ptr = mem; + uint64_t state = __atomic_lock(mem); + uint64_t result = *ptr; + *ptr |= val; + __atomic_unlock(mem, state); + return result; +} + +__optimize uint64_t __atomic_exchange_8(volatile void *mem, uint64_t val, __unused int model) { + volatile uint64_t *ptr = mem; + uint64_t state = __atomic_lock(mem); + uint64_t result = *ptr; + *ptr = val; + __atomic_unlock(mem, state); + return result; +} + +__optimize bool __atomic_compare_exchange_8(volatile void *mem, void *expected, uint64_t 
desired, __unused bool weak, __unused int success, __unused int failure) {
+    bool result = false;
+    volatile uint64_t *ptr = mem;
+    uint64_t *e_ptr = expected;
+    uint64_t state = __atomic_lock(mem);
+    if (*ptr == *e_ptr) {
+        *ptr = desired;
+        result = true;
+    } else
+        *e_ptr = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_load_8(volatile void *mem, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize void __atomic_store_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    *ptr = val;
+    __atomic_unlock(mem, state);
+}
+
+__optimize bool __atomic_test_and_set_m0(volatile void *mem, __unused int model) {
+    volatile bool *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    volatile bool result = *ptr;
+    *ptr = true;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize void __atomic_clear_m0(volatile void *mem, __unused int model) {
+    volatile bool *ptr = mem;
+    *ptr = false;
+    __dmb();
+}
diff --git a/src/rp2_common/pico_runtime/CMakeLists.txt b/src/rp2_common/pico_runtime/CMakeLists.txt
index 9879ba1dc..791bf8fe2 100644
--- a/src/rp2_common/pico_runtime/CMakeLists.txt
+++ b/src/rp2_common/pico_runtime/CMakeLists.txt
@@ -35,6 +35,9 @@ endif()
 if (TARGET pico_mem_ops)
     pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_mem_ops)
 endif()
+if (TARGET pico_atomic)
+    pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_atomic)
+endif()
 if (TARGET pico_standard_link)
     pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_standard_link)
 endif()
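
For reference, a minimal usage sketch (not part of the patch) of the code paths this support provides. It assumes a typical pico-sdk application that links pico_stdlib and pico_multicore, with pico_atomic pulled in implicitly through pico_runtime as in the CMake change above; the loop counts and names (bump, per_core_hits, etc.) are illustrative only. On Cortex-M0+ GCC lowers the 64-bit operations to out-of-line __atomic_*_8 calls, while the atomic_flag macros from the wrapper stdatomic.h route to __atomic_test_and_set_m0/__atomic_clear_m0.

/* Usage sketch: two cores hammer a 64-bit counter and a flag-guarded array. */
#include <stdatomic.h>
#include <stdio.h>

#include "pico/stdlib.h"
#include "pico/multicore.h"

static atomic_ullong counter;                 /* 8-byte: fetch_add lowers to __atomic_fetch_add_8() */
static atomic_flag lock = ATOMIC_FLAG_INIT;   /* maps to __atomic_test_and_set_m0()/__atomic_clear_m0() */
static atomic_bool core1_done;                /* 1-byte loads/stores are natively atomic on M0+ */
static unsigned per_core_hits[2];             /* plain data, protected by 'lock' */

static void bump(void) {
    atomic_fetch_add(&counter, 1);            /* multicore-safe 64-bit increment */
    while (atomic_flag_test_and_set(&lock))   /* use the flag as a tiny spinlock */
        tight_loop_contents();
    per_core_hits[get_core_num()]++;          /* critical section guarded by the flag */
    atomic_flag_clear(&lock);                 /* release */
}

static void core1_entry(void) {
    for (int i = 0; i < 10000; ++i)
        bump();
    atomic_store(&core1_done, true);
}

int main(void) {
    stdio_init_all();
    multicore_launch_core1(core1_entry);
    for (int i = 0; i < 10000; ++i)
        bump();
    while (!atomic_load(&core1_done))         /* wait for core 1 to finish */
        tight_loop_contents();
    printf("counter=%llu core0=%u core1=%u\n",
           (unsigned long long)atomic_load(&counter), per_core_hits[0], per_core_hits[1]);
    return 0;
}

With both cores running bump(), counter should read 20000 and per_core_hits should sum to 20000; without this library the 64-bit __atomic_*_8 calls would typically be unresolved references at link time on RP2040.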