From c326a3b388e85f2235898d6736aa657f0ae6c075 Mon Sep 17 00:00:00 2001 From: Enrico Zelioli Date: Mon, 2 Dec 2024 16:28:39 +0100 Subject: [PATCH] Add SMP support --- hw/bootrom/cheshire_bootrom.S | 125 ++++++++++++++++++++++++---------- sw/include/smp.h | 60 +++++----------- sw/include/util.h | 34 +++++++++ sw/lib/crt0.S | 115 +++++++++++++++++++------------ sw/lib/smp.c | 51 ++++++++++++++ sw/link/common.ldh | 4 ++ sw/tests/smp_hello.c | 64 +++++++++++++++++ 7 files changed, 334 insertions(+), 119 deletions(-) create mode 100644 sw/lib/smp.c create mode 100644 sw/tests/smp_hello.c diff --git a/hw/bootrom/cheshire_bootrom.S b/hw/bootrom/cheshire_bootrom.S index 22bbeb9f0..c04f938f8 100644 --- a/hw/bootrom/cheshire_bootrom.S +++ b/hw/bootrom/cheshire_bootrom.S @@ -5,10 +5,15 @@ // Nicole Narr // Christopher Reinwardt // Paul Scheffler +// Enrico Zelioli -// TODO: Avoid hardcoding in addresses and offsets +#include +#include -#include "smp.h" +// The hart that non-SMP tests should run on +#ifndef NONSMP_HART +#define NONSMP_HART 0 +#endif .section .text._start @@ -47,7 +52,11 @@ _start: li x31, 0 // Pause SMP harts - smp_pause(t0, t1) + li t1, 0x8 + csrw mie, t1 + li t0, NONSMP_HART + csrr t1, mhartid + bne t0, t1, _wait_for_ipi // Init stack and global pointer with safe, linked values la sp, __stack_pointer$ @@ -57,56 +66,101 @@ _start: .option pop // If LLC present: Wait for end of BIST, then extend stack and set to all SPM - la t0, __base_regs - lw t0, 80(t0) // regs.HW_FEATURES - andi t0, t0, 2 // regs.HW_FEATURES.llc + la t0, __base_regs + lw t0, CHESHIRE_HW_FEATURES_REG_OFFSET(t0) + andi t0, t0, 2 // HW_FEATURES.llc beqz t0, _prom_check_run - la t0, __base_llc + la t0, __base_llc _wait_llc_bist: - lw t1, 72(t0) // llc.BIST_STATUS_DONE_BIT + lw t1, AXI_LLC_BIST_STATUS_REG_OFFSET(t0) // Check BIST status done bit beqz t1, _wait_llc_bist - li t1, -1 - sw t1, 0(t0) // llc.CFG_SPM_LOW - sw t1, 4(t0) // llc.CFG_SPM_HIGH - li t1, 1 - sw t1, 16(t0) // llc.CFG_COMMIT + 
li t1, -1 + sw t1, AXI_LLC_CFG_SPM_LOW_REG_OFFSET(t0) + sw t1, AXI_LLC_CFG_SPM_HIGH_REG_OFFSET(t0) + li t1, 1 + sw t1, AXI_LLC_COMMIT_CFG_REG_OFFSET(t0) // Correct stack to start at end of SPM - la t0, __base_regs - la sp, __base_spm - lw t0, 84(t0) // regs.LLC_SIZE - add sp, sp, t0 + la t0, __base_regs + la sp, __base_spm + lw t0, CHESHIRE_LLC_SIZE_REG_OFFSET(t0) + add sp, sp, t0 addi sp, sp, -8 // Enter Platform ROM if present. _prom_check_run: // Note that we have internal access to SPM here *if and only if* there is an LLC. la t0, __base_regs - lw t0, 72(t0) // regs.PLATFORM_ROM + lw t0, CHESHIRE_PLATFORM_ROM_REG_OFFSET(t0) beqz t0, _boot jalr t0 +// Move to next stage of booting +// 1. Write the address of next stage boot loader in Cheshire's scratch registers +// 2. Resume execution of all other harts .global boot_next_stage boot_next_stage: - // Non-SMP hart: Write boot address into global scratch registers - la t0, __base_regs - sw a0, 16(t0) // regs.SCRATCH[4] + + // Non-SMP hart: write boot address into global scratch registers + la t0, __base_regs + sw a0, CHESHIRE_SCRATCH_4_REG_OFFSET(t0) srli a0, a0, 32 - sw a0, 20(t0) // regs.SCRATCH[5] + sw a0, CHESHIRE_SCRATCH_5_REG_OFFSET(t0) fence - // Resume SMP harts - smp_resume(t0, t1, t2) + + // Resume SMP harts: set CLINT IPI registers + // NOTE: this will cause CLINT to send IPIs to all cores, therefore also the + // non-smp hart will receive one. The following instructions make sure that + // all harts will wait until the IPI is received (WFI with global ie disabled), + // then clear the IPI in the CLINT and wait until all other harts are done with it. 
+ la t0, __base_clint + la t2, __base_regs + lw t2, CHESHIRE_NUM_INT_HARTS_REG_OFFSET(t2) + slli t2, t2, 2 + add t2, t0, t2 // t2 = CLINT_BASE + (n_harts * 4) +1: + li t1, 1 + sw t1, 0(t0) + addi t0, t0, 4 + blt t0, t2, 1b + +// Stall hart until IPI is raised +_wait_for_ipi: + + // Wait until this hart receives IPI + wfi + csrr t1, mip + andi t1, t1, 0x8 + beqz t1, _wait_for_ipi + + // Clear CLINT IPI register for this hart + la t0, __base_clint + csrr t1, mhartid + slli t1, t1, 2 + add t1, t1, t0 + sw zero, 0(t1) // *(CLINT_BASE + hart_id * 4) = 0 + + la t2, __base_regs + lw t2, CHESHIRE_NUM_INT_HARTS_REG_OFFSET(t2) + slli t2, t2, 2 + add t2, t0, t2 // t2 = CLINT_BASE + (n_harts * 4) + + // Wait until *all* CLINT IPI registers are cleared +1: + lw t1, 0(t0) + bnez t1, 1b + addi t0, t0, 4 + blt t0, t2, 1b + + // Jump to next stage // Load boot address from global scratch registers - la t0, __base_regs - lwu t1, 20(t0) // regs.SCRATCH[5] + la t0, __base_regs + lwu t1, CHESHIRE_SCRATCH_5_REG_OFFSET(t0) slli t1, t1, 32 - lwu t0, 16(t0) // regs.SCRATCH[4] - or t0, t0, t1 - // Store hartid to a0 - csrr a0, mhartid - // Jump to boot address - jalr ra, 0(t0) - // We should never get here - ret + lwu t0, CHESHIRE_SCRATCH_4_REG_OFFSET(t0) + or t0, t0, t1 + csrr a0, mhartid // Store hartid to a0 + jalr ra, 0(t0) // Jump to boot address + ret // We should never get here // Reset regs, full fence, then jump to main _boot: @@ -120,9 +174,10 @@ _boot: .global _exit _exit: // Save the return value to scratch register 2 and wait forever + // Set bit 0 to signal that the execution is done. slli a0, a0, 1 ori a0, a0, 1 la t0, __base_regs - sw a0, 8(t0) // regs.SCRATCH[2] + sw a0, CHESHIRE_SCRATCH_2_REG_OFFSET(t0) 1: wfi j 1b diff --git a/sw/include/smp.h b/sw/include/smp.h index d13d87579..ea77d8c4e 100644 --- a/sw/include/smp.h +++ b/sw/include/smp.h @@ -1,49 +1,25 @@ -// Copyright 2023 ETH Zurich and University of Bologna. +// Copyright 2022 ETH Zurich and University of Bologna. 
// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +// +// Emanuele Parisi +// Enrico Zelioli #pragma once -// The hart that non-SMP tests should run on -#ifndef NONSMP_HART -#define NONSMP_HART 0 -#endif +#include +#include -// Let non-SMP hart continue and all other harts jump (and loop) in smp_resume -#define smp_pause(reg1, reg2) \ - li reg2, 0x8; \ - csrw mie, reg2; \ - li reg1, NONSMP_HART; \ - csrr reg2, mhartid; \ - bne reg1, reg2, 2f +#include "util.h" +#include "regs/cheshire.h" +#include "params.h" -#define smp_resume(reg1, reg2, reg3) \ - la reg1, __base_clint; \ - la reg3, __base_regs; \ - lw reg3, 76(reg3); /* regs.NUM_INT_HARTS */ \ - slli reg3, reg3, 2; \ - add reg3, reg1, reg3; \ - 1:; \ - li reg2, 1; \ - sw reg2, 0(reg1); \ - addi reg1, reg1, 4; \ - blt reg1, reg3, 1b; \ - 2:; \ - wfi; \ - csrr reg2, mip; \ - andi reg2, reg2, 0x8; \ - beqz reg2, 2b; \ - la reg1, __base_clint; \ - csrr reg2, mhartid; \ - slli reg2, reg2, 2; \ - add reg2, reg2, reg1; \ - sw zero, 0(reg2); \ - la reg3, __base_regs; \ - lw reg3, 76(reg3); /* regs.NUM_INT_HARTS */ \ - slli reg3, reg3, 2; \ - add reg3, reg1, reg3; \ - 3:; \ - lw reg2, 0(reg1); \ - bnez reg2, 3b; \ - addi reg1, reg1, 4; \ - blt reg1, reg3, 3b +/* + * Resume execution in all harts. + * Send an IPI to all harts except for hart 0. + */ +void smp_resume(void); + +void smp_barrier_init(); +void smp_barrier_up(uint64_t n_processes); +void smp_barrier_down(); diff --git a/sw/include/util.h b/sw/include/util.h index 07159945d..621636281 100644 --- a/sw/include/util.h +++ b/sw/include/util.h @@ -33,6 +33,10 @@ static inline void wfi() { asm volatile("wfi" ::: "memory"); } +static inline void nop() { + asm volatile("nop" ::: "memory"); +} + // Enables or disables M-mode timer interrupts. 
static inline void set_mtie(int enable) { if (enable) @@ -41,6 +45,29 @@ static inline void set_mtie(int enable) { asm volatile("csrc mie, %0" ::"r"(128) : "memory"); } +// Enables or disables M-mode software interrupts. +static inline void set_msie(int enable) { + if (enable) + asm volatile("csrs mie, %0" ::"r"(8) : "memory"); + else + asm volatile("csrc mie, %0" ::"r"(8) : "memory"); +} + +// Enables or disables M-mode software interrupts pending bit. +static inline void set_msip(int enable) { + if (enable) + asm volatile("csrs mip, %0" ::"r"(8) : "memory"); + else + asm volatile("csrc mip, %0" ::"r"(8) : "memory"); +} + +// Get M-mode software interrupts pending bit. +static inline uint64_t get_msip() { + uint64_t msip; + asm volatile("csrr %0, mip" : "=r"(msip)::"memory"); + return (msip & 0x8) >> 3; +} + // Enables or disables M-mode global interrupts. static inline void set_mie(int enable) { if (enable) @@ -49,6 +76,13 @@ static inline void set_mie(int enable) { asm volatile("csrci mstatus, 8" ::: "memory"); } +// Get hart id +static inline uint64_t get_mhartid() { + uint64_t mhartid; + asm volatile("csrr %0, mhartid" : "=r"(mhartid)::"memory"); + return mhartid; +} + // Get cycle count since reset static inline uint64_t get_mcycle() { uint64_t mcycle; diff --git a/sw/lib/crt0.S b/sw/lib/crt0.S index ebf372ada..7bf8a870b 100644 --- a/sw/lib/crt0.S +++ b/sw/lib/crt0.S @@ -5,6 +5,8 @@ // Nicole Narr // Christopher Reinwardt // Paul Scheffler +// Emanuele Parisi +// Enrico Zelioli .section .text._start @@ -14,28 +16,29 @@ _start: // Globally disable Machine and Supervisor interrupts csrrc x0, mstatus, 10 - // Park SMP harts - csrr t0, mhartid - beqz t0, 2f -1: - wfi - j 1b -2: - // Init stack and global pointer iff linked as nonzero - mv t1, sp - la t0, __stack_pointer$ - beqz t0, 1f - mv sp, t0 -1: .option push +_init_gp: + // Init global pointer iff linked as nonzero + .option push .option norelax la t0, __global_pointer$ - beqz t0, 1f + beqz t0, _init_sp mv 
gp, t0 -1: .option pop - + .option pop + +_init_sp: + // Init stack pointer iff linked as nonzero + mv t0, sp + la t1, __stack_pointer$ + beqz t1, _init_context + la t2, __stack_size$ + csrr t3, mhartid + mul t3, t3, t2 + sub sp, t1, t3 + +_init_context: // Store existing stack, global, return pointers on new stack addi sp, sp, -24 - sd t1, 0(sp) + sd t0, 0(sp) sd gp, 8(sp) sd ra, 16(sp) @@ -43,31 +46,6 @@ _start: la t0, _trap_handler_wrap csrrw x0, mtvec, t0 - // Zero the .bss section - la t0, __bss_start // t0 = bss start address - la t1, __bss_end // t1 = bss end address - sub t2, t1, t0 // t2 = #bytes to zero - li a0, 0 - -_zero_bss_loop: - addi t4, t2, -32 - blez t2, _fp_init // t2 <= 0? => No bss to zero - blt t4, x0, _zero_bss_rem // t4 < 0? => Less than 4 words left - sd a0, 0(t0) - sd a0, 8(t0) - sd a0, 16(t0) - sd a0, 24(t0) - addi t2, t2, -32 - addi t0, t0, 32 - bgt t2, x0, _zero_bss_loop // Still more to go - j _fp_init - -_zero_bss_rem: - sb a0, 0(t0) - addi t2, t2, -1 - addi t0, t0, 1 - bgt t2, x0, _zero_bss_rem - _fp_init: // Set FS state to "Initial", enabling FP instructions li t1, 1 @@ -111,6 +89,40 @@ _fp_init: // Set FS state to "Clean" csrrc x0, mstatus, t1 +// Pause all harts except for hart 0 until a IPI is received. +// On wake-up every core resumes execution from the beginning of main(). +_smp_pause: + // Pause harts with hart ID != 0 + csrr t0, mhartid + bnez t0, _wait_for_ipi + +_zero_bss_init: + // Zero the .bss section + la t0, __bss_start // t0 = bss start address + la t1, __bss_end // t1 = bss end address + sub t2, t1, t0 // t2 = #bytes to zero + li a0, 0 + +_zero_bss_loop: + addi t4, t2, -32 + blez t2, _entry // t2 <= 0? => No bss to zero + blt t4, x0, _zero_bss_rem // t4 < 0? 
=> Less than 4 words left + sd a0, 0(t0) + sd a0, 8(t0) + sd a0, 16(t0) + sd a0, 24(t0) + addi t2, t2, -32 + addi t0, t0, 32 + bgt t2, x0, _zero_bss_loop // Still more to go + j _entry + +_zero_bss_rem: + sb a0, 0(t0) + addi t2, t2, -1 + addi t0, t0, 1 + bgt t2, x0, _zero_bss_rem + +_entry: // Full fence, then jump to main fence call main @@ -130,6 +142,25 @@ _exit: // Hand over to whatever called us, passing return ret +_wait_for_ipi: + csrs mie, 0x8 // Enable M-mode software interrupts +1: + wfi + csrr t0, mip + andi t0, t0, 0x8 + beqz t0, 1b + + // Received IPI -> clear MIP and CLINT IPI register + csrc mip, 0x8 + la t0, __base_clint + csrr t1, mhartid + slli t1, t1, 2 + add t1, t1, t0 + sw zero, 0(t1) // *(CLINT_BASE + hart_id * 4) = 0 + + // Resume execution of non-smp harts at beginning of main + j _entry + // This wraps the C trap handler to save the (integer-only) caller-save // registers and perform a proper machine-mode exception return. .align 4 diff --git a/sw/lib/smp.c b/sw/lib/smp.c new file mode 100644 index 000000000..a1986fa3f --- /dev/null +++ b/sw/lib/smp.c @@ -0,0 +1,51 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Emanuele Parisi +// Enrico Zelioli + +#include "smp.h" + +void smp_resume(void) { + uint32_t num_harts = *reg32(&__base_regs, CHESHIRE_NUM_INT_HARTS_REG_OFFSET); + // Flush cache and wake-up all sleeping cores + fence(); + for (uint32_t i=1; i +// Christopher Reinwardt +// Emanuele Parisi +// Enrico Zelioli +// +// Simple SMP Hello World. 
+
+#include "regs/cheshire.h"
+#include "dif/clint.h"
+#include "dif/uart.h"
+#include "params.h"
+#include "util.h"
+#include "smp.h"
+#include "printf.h"
+
+// Lock word guarding the UART: 0 = free, nonzero = taken.
+// Explicitly placed in the .data section.
+uint32_t __attribute__((section(".data"))) semaphore = 0x0;
+
+// Acquire the lock: atomically swap 1 into `semaphore` (acquire ordering) and
+// retry for as long as the value read back is nonzero, i.e. the lock was taken.
+void semaphore_wait(void) {
+    asm volatile (
+        " li t0, 1 \n"
+        "1: \n"
+        " amoswap.w.aq t0, t0, (%0) \n"
+        " bnez t0, 1b \n"
+        // t0 is overwritten by the sequence, so it must be declared as a clobber;
+        // "memory" keeps the compiler from hoisting guarded accesses above the acquire.
+        ::"r"(&semaphore) : "t0", "memory"
+    );
+}
+
+// Release the lock: atomically swap 0 into `semaphore` (release ordering).
+void semaphore_post(void) {
+    asm volatile (
+        " amoswap.w.rl zero, zero, (%0) \n"
+        // "memory" keeps the compiler from sinking guarded accesses below the release.
+        ::"r"(&semaphore) : "memory"
+    );
+}
+
+// Every hart that reaches main() prints one line over the UART.
+// Hart 0 first initializes the UART and the SMP barrier, then calls smp_resume();
+// the print itself is serialized with semaphore_wait()/semaphore_post().
+// Returns 0.
+int main(void) {
+
+    uint64_t hart_id = get_mhartid();
+    uint32_t num_harts = *reg32(&__base_regs, CHESHIRE_NUM_INT_HARTS_REG_OFFSET);
+
+    if (hart_id == 0) {
+        uint32_t rtc_freq = *reg32(&__base_regs, CHESHIRE_RTC_FREQ_REG_OFFSET);
+        uint64_t reset_freq = clint_get_core_freq(rtc_freq, 2500);
+        uart_init(&__base_uart, reset_freq, __BOOT_BAUDRATE);
+        smp_barrier_init();
+        smp_resume();
+    }
+
+    smp_barrier_up(num_harts);
+
+    for (uint64_t i=0; i<1; i++) {
+        semaphore_wait();
+        // %d expects an int: hart_id is 64-bit and num_harts is unsigned, so convert explicitly.
+        printf("Core %d/%d up\n", (int)hart_id, (int)num_harts);
+        uart_write_flush(&__base_uart);
+        semaphore_post();
+    }
+
+    smp_barrier_down();
+
+    return 0;
+}