From b0ca8747b0378fa5b9d7afddf22adccb1c90f3a7 Mon Sep 17 00:00:00 2001
From: Kevin J Bowers
Date: Thu, 19 Dec 2024 16:10:20 -0600
Subject: [PATCH] Stronger fd_tpool compiler memory fencing guarantees

As per the discussion in issue #3710, gave the fd_tpool API stronger
compiler memory fencing guarantees and updated the FD_SPIN_PAUSE
documentation to be explicit about its current (lack of) fencing
guarantees.

In the process, this fixes a latent bug caused by a missing compiler
memory fence on targets without FD_HAS_X86.  The missing fence could
cause the compiler to "optimize" the tpool worker task dispatch outer
loop into an infinite loop for worker threads in tpools containing 2 or
more tiles within the same process on targets without FD_HAS_X86.

With this, in the vast majority of circumstances (e.g. not writing low
level concurrency tests/benchmarks), normal code (i.e. single-threaded
non-conflicting) should "just work" without use of volatile,
FD_VOLATILE, FD_COMPILER_MFENCE, FD_ATOMIC, etc when thread
parallelized via fd_tpool / FD_FOR_ALL / FD_MAP_REDUCE (and the unit
test was updated accordingly).
---
 src/util/fd_util_base.h     |  6 ++-
 src/util/tpool/fd_tpool.cxx | 74 +++++++++++++++++++++++--------------
 src/util/tpool/fd_tpool.h   | 73 +++++++++++++++++++++++-------------
 src/util/tpool/test_tpool.c | 73 ++++++++++++++----------------------
 4 files changed, 125 insertions(+), 101 deletions(-)

diff --git a/src/util/fd_util_base.h b/src/util/fd_util_base.h
index 4450b974a6..7fec7e4409 100644
--- a/src/util/fd_util_base.h
+++ b/src/util/fd_util_base.h
@@ -742,7 +742,11 @@ fd_type_pun_const( void const * p ) {
    the other logical cores sharing the same underlying physical core
    for a few clocks without yielding it to the operating system
    scheduler.  Typically useful for shared memory spin polling loops,
    especially if
-   hyperthreading is in use. */
+   hyperthreading is in use.  IMPORTANT SAFETY TIP!  This might act as a
+   FD_COMPILER_MFENCE on some combinations of toolchains and targets
+   (e.g. gcc documents that __builtin_ia32_pause also does a compiler
+   memory fence) but this should not be relied upon for portable code
+   (consider making this a compiler memory fence on all platforms?) */
 
 #if FD_HAS_X86
 #define FD_SPIN_PAUSE() __builtin_ia32_pause()
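
To make the FD_SPIN_PAUSE caveat above concrete: a portable spin poll
should carry its own compiler fences rather than lean on FD_SPIN_PAUSE
for them.  A minimal sketch of the intended idiom (illustrative only,
not part of this patch; wait_for_flag is a hypothetical helper and
fd_util.h is assumed to provide the macros):

    #include "fd_util.h"

    /* Spin until *flag becomes non-zero.  The fence pair forces the
       compiler to do a fresh load of *flag every iteration; the
       FD_SPIN_PAUSE is purely a CPU scheduling hint and, per the
       documentation above, may or may not fence. */

    static inline void
    wait_for_flag( int const * flag ) {
      for(;;) {
        FD_COMPILER_MFENCE();
        int observed = *flag;
        FD_COMPILER_MFENCE();
        if( observed ) break;
        FD_SPIN_PAUSE();
      }
    }
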
diff --git a/src/util/tpool/fd_tpool.cxx b/src/util/tpool/fd_tpool.cxx
index 117fc16182..036dac84ee 100644
--- a/src/util/tpool/fd_tpool.cxx
+++ b/src/util/tpool/fd_tpool.cxx
@@ -25,8 +25,9 @@ fd_tpool_private_worker( int argc,
   ulong scratch_sz = cfg->scratch_sz;
 
   fd_tpool_private_worker_t worker[1];
 
+  FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_BOOT;
+  worker->state = FD_TPOOL_WORKER_STATE_BOOT;
   FD_COMPILER_MFENCE();
 
   worker->tile_idx = (uint)tile_idx;
@@ -36,16 +37,19 @@ fd_tpool_private_worker( int argc,
 
   if( scratch_sz ) fd_scratch_attach( scratch, fd_tpool_private_scratch_frame, scratch_sz, FD_TPOOL_WORKER_SCRATCH_DEPTH );
 
   FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_IDLE;
-  FD_COMPILER_MFENCE();
-  FD_VOLATILE( fd_tpool_private_worker( tpool )[ worker_idx ] ) = worker;
+  worker->state = FD_TPOOL_WORKER_STATE_IDLE;
   FD_COMPILER_MFENCE();
+  fd_tpool_private_worker( tpool )[ worker_idx ] = worker;
+
   for(;;) {
 
     /* We are IDLE ... see what we should do next */
 
-    int state = FD_VOLATILE_CONST( worker->state );
+    FD_COMPILER_MFENCE();
+    int state = worker->state;
+    FD_COMPILER_MFENCE();
+
     if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_EXEC ) ) {
       if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) break;
       FD_SPIN_PAUSE();
@@ -54,15 +58,17 @@ fd_tpool_private_worker( int argc,
 
     /* We are EXEC ... do the task and then transition to IDLE */
 
-    fd_tpool_task_t task = FD_VOLATILE_CONST( worker->task );
+    fd_tpool_task_t task = worker->task;
 
-    void * task_tpool  = FD_VOLATILE_CONST( worker->task_tpool  );
-    ulong  task_t0     = FD_VOLATILE_CONST( worker->task_t0     ); ulong task_t1     = FD_VOLATILE_CONST( worker->task_t1     );
-    void * task_args   = FD_VOLATILE_CONST( worker->task_args   );
-    void * task_reduce = FD_VOLATILE_CONST( worker->task_reduce ); ulong task_stride = FD_VOLATILE_CONST( worker->task_stride );
-    ulong  task_l0     = FD_VOLATILE_CONST( worker->task_l0     ); ulong task_l1     = FD_VOLATILE_CONST( worker->task_l1     );
-    ulong  task_m0     = FD_VOLATILE_CONST( worker->task_m0     ); ulong task_m1     = FD_VOLATILE_CONST( worker->task_m1     );
-    ulong  task_n0     = FD_VOLATILE_CONST( worker->task_n0     ); ulong task_n1     = FD_VOLATILE_CONST( worker->task_n1     );
+    void * task_tpool  = worker->task_tpool;
+    ulong  task_t0     = worker->task_t0;     ulong task_t1     = worker->task_t1;
+    void * task_args   = worker->task_args;
+    void * task_reduce = worker->task_reduce; ulong task_stride = worker->task_stride;
+    ulong  task_l0     = worker->task_l0;     ulong task_l1     = worker->task_l1;
+    ulong  task_m0     = worker->task_m0;     ulong task_m1     = worker->task_m1;
+    ulong  task_n0     = worker->task_n0;     ulong task_n1     = worker->task_n1;
+
+    FD_COMPILER_MFENCE();
 
     try {
       task( task_tpool,task_t0,task_t1, task_args, task_reduce,task_stride, task_l0,task_l1, task_m0,task_m1, task_n0,task_n1 );
@@ -71,8 +77,7 @@ fd_tpool_private_worker( int argc,
     }
 
     FD_COMPILER_MFENCE();
-    FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_IDLE;
-    FD_COMPILER_MFENCE();
+    worker->state = FD_TPOOL_WORKER_STATE_IDLE;
   }
 
   /* state is HALT, clean up and then reset back to BOOT */
@@ -80,8 +85,9 @@ fd_tpool_private_worker( int argc,
   if( scratch_sz ) fd_scratch_detach( NULL );
 
   FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_BOOT;
+  worker->state = FD_TPOOL_WORKER_STATE_BOOT;
   FD_COMPILER_MFENCE();
+
   return 0;
 }
 
@@ -101,6 +107,8 @@
 fd_tpool_t *
 fd_tpool_init( void * mem,
                ulong  worker_max ) {
+  FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( !mem ) ) {
     FD_LOG_WARNING(( "NULL mem" ));
     return NULL;
@@ -120,8 +128,9 @@ fd_tpool_init( void * mem,
   fd_memset( mem, 0, footprint );
 
   fd_tpool_private_worker_t * worker0 = (fd_tpool_private_worker_t *)mem;
 
+  FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker0->state ) = FD_TPOOL_WORKER_STATE_EXEC;
+  worker0->state = FD_TPOOL_WORKER_STATE_EXEC;
   FD_COMPILER_MFENCE();
 
   fd_tpool_t * tpool = (fd_tpool_t *)(worker0+1);
@@ -129,7 +138,7 @@ fd_tpool_init( void * mem,
   tpool->worker_cnt = 1UL;
 
   FD_COMPILER_MFENCE();
-  FD_VOLATILE( fd_tpool_private_worker( tpool )[0] ) = worker0;
+  fd_tpool_private_worker( tpool )[0] = worker0;
   FD_COMPILER_MFENCE();
 
   return tpool;
@@ -138,16 +147,19 @@ fd_tpool_init( void * mem,
 
 void *
 fd_tpool_fini( fd_tpool_t * tpool ) {
+  FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( !tpool ) ) {
     FD_LOG_WARNING(( "NULL tpool" ));
     return NULL;
   }
 
-  while( fd_tpool_worker_cnt( tpool )>1UL )
+  while( fd_tpool_worker_cnt( tpool )>1UL ) {
     if( FD_UNLIKELY( !fd_tpool_worker_pop( tpool ) ) ) {
       FD_LOG_WARNING(( "fd_tpool_worker_pop failed" ));
       return NULL;
     }
+  }
 
   return (void *)fd_tpool_private_worker0( tpool );
 }
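
The latent bug closed above is worth spelling out: with no fence and no
volatile, the compiler may assume worker->state cannot change between
iterations, hoist the load out of the loop, and turn the dispatch loop
into an infinite loop.  A hypothetical reduction of the miscompile
(sketch, not firedancer code; assumes fd_util.h for the macros):

    static int state;   /* written by a different thread */

    /* Unfenced: the compiler may legally compile this as
       "if( !state ) for(;;) FD_SPIN_PAUSE();" since, as far as it
       knows, nothing in the loop body can change state. */

    void
    broken_wait( void ) {
      while( !state ) FD_SPIN_PAUSE();
    }

    /* Fenced: every iteration is forced to do a fresh load of state. */

    void
    fenced_wait( void ) {
      for(;;) {
        FD_COMPILER_MFENCE();
        int s = state;
        FD_COMPILER_MFENCE();
        if( s ) break;
        FD_SPIN_PAUSE();
      }
    }
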
@@ -158,6 +170,8 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
                       void * scratch,
                       ulong  scratch_sz ) {
+  FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( !tpool ) ) {
     FD_LOG_WARNING(( "NULL tpool" ));
     return NULL;
@@ -215,7 +229,7 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
   char ** argv = (char **)fd_type_pun( cfg );
 
   FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker[ worker_cnt ] ) = NULL;
+  worker[ worker_cnt ] = NULL;
   FD_COMPILER_MFENCE();
 
   if( FD_UNLIKELY( !fd_tile_exec_new( tile_idx, fd_tpool_private_worker, argc, argv ) ) ) {
@@ -232,6 +246,8 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
 
 fd_tpool_t *
 fd_tpool_worker_pop( fd_tpool_t * tpool ) {
+  FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( !tpool ) ) {
     FD_LOG_WARNING(( "NULL tpool" ));
     return NULL;
@@ -248,16 +264,17 @@ fd_tpool_worker_pop( fd_tpool_t * tpool ) {
   int volatile * vstate = (int volatile *)&(worker->state);
   int state;
 
-  /* Testing for IDLE isn't strictly necessary given requirements
-     to use this but can help catch user errors.  Likewise,
-     FD_ATOMIC_CAS isn't strictly necessary given correct operation but
-     can more robustly catch such errors. */
+  /* Testing for IDLE isn't strictly necessary given requirements to use
+     this but can help catch user errors.  Likewise, FD_ATOMIC_CAS isn't
+     strictly necessary given correct operation but can more robustly
+     catch such errors. */
 
 # if FD_HAS_ATOMIC
 
   FD_COMPILER_MFENCE();
   state = FD_ATOMIC_CAS( vstate, FD_TPOOL_WORKER_STATE_IDLE, FD_TPOOL_WORKER_STATE_HALT );
   FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) {
     FD_LOG_WARNING(( "worker to pop is not idle (%i-%s)", state, fd_tpool_worker_state_cstr( state ) ));
     return NULL;
@@ -268,10 +285,12 @@ fd_tpool_worker_pop( fd_tpool_t * tpool ) {
   FD_COMPILER_MFENCE();
   state = *vstate;
   FD_COMPILER_MFENCE();
+
   if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) {
     FD_LOG_WARNING(( "worker to pop is not idle (%i-%s)", state, fd_tpool_worker_state_cstr( state ) ));
     return NULL;
   }
+
   FD_COMPILER_MFENCE();
   *vstate = FD_TPOOL_WORKER_STATE_HALT;
   FD_COMPILER_MFENCE();
@@ -333,9 +352,7 @@ FD_TPOOL_EXEC_ALL_IMPL_FTR
 
 #if FD_HAS_ATOMIC
 FD_TPOOL_EXEC_ALL_IMPL_HDR(taskq)
   ulong * l_next = (ulong *)_tpool;
-  FD_COMPILER_MFENCE();
-  void * tpool = (void *)FD_VOLATILE_CONST( l_next[1] );
-  FD_COMPILER_MFENCE();
+  void * tpool = (void *)l_next[1];
   for(;;) {
 
     /* Note that we use an ATOMIC_CAS here instead of an
@@ -345,8 +362,9 @@
        not overflow. */
 
     FD_COMPILER_MFENCE();
-    ulong m0 = FD_VOLATILE_CONST( *l_next );
+    ulong m0 = *l_next;
     FD_COMPILER_MFENCE();
+
     if( FD_UNLIKELY( m0>=l1 ) ) break;
     ulong m1 = m0+1UL;
     if( FD_UNLIKELY( FD_ATOMIC_CAS( l_next, m0, m1 )!=m0 ) ) {
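
The taskq dispatcher above boils down to threads racing to claim
consecutive indices with a compare-and-swap.  A standalone sketch of
that idiom (claim_next is a hypothetical helper, not this patch's API;
assumes FD_HAS_ATOMIC and fd_util.h):

    /* Atomically claim the next unclaimed index in [*l_next,l1).
       Returns l1 when the range is exhausted.  The fenced read observes
       a recent cursor value; the CAS makes the claim itself atomic (a
       loser just retries against the updated cursor). */

    static ulong
    claim_next( ulong * l_next, ulong l1 ) {
      for(;;) {
        FD_COMPILER_MFENCE();
        ulong m0 = *l_next;
        FD_COMPILER_MFENCE();
        if( m0>=l1 ) return l1;
        if( FD_LIKELY( FD_ATOMIC_CAS( l_next, m0, m0+1UL )==m0 ) ) return m0;
        FD_SPIN_PAUSE();
      }
    }
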
diff --git a/src/util/tpool/fd_tpool.h b/src/util/tpool/fd_tpool.h
index 68e677f321..e2919c61b7 100644
--- a/src/util/tpool/fd_tpool.h
+++ b/src/util/tpool/fd_tpool.h
@@ -646,7 +648,8 @@ FD_FN_CONST ulong fd_tpool_footprint( ulong worker_max );
    operation and can also flexibly participates in the tpool in bulk
    operations.  This uses init/fini semantics instead of
    new/join/leave/delete semantics because thread pools aren't
-   meaningfully sharable between processes / thread groups. */
+   meaningfully sharable between processes / thread groups.  This
+   function acts as a compiler memory fence. */
 
 fd_tpool_t *
 fd_tpool_init( void * mem,
@@ -658,7 +659,8 @@ fd_tpool_init( void * mem,
    "worker 0" thread) and no other operations on the tpool should be
    in progress when this is called or done after this is called.  Returns
    the memory region used by the tpool on success (this is not a simple
-   cast of mem) and NULL on failure (logs details). */
+   cast of mem) and NULL on failure (logs details).  This function acts
+   as a compiler memory fence. */
 
 void *
 fd_tpool_fini( fd_tpool_t * tpool );
@@ -692,7 +694,8 @@ fd_tpool_fini( fd_tpool_t * tpool );
    scratch region specified, etc).
 
    No other operations on tpool should be in process when this is called
-   or started while this is running. */
+   or started while this is running.  This function acts as a compiler
+   memory fence. */
 
 fd_tpool_t *
 fd_tpool_worker_push( fd_tpool_t * tpool,
@@ -710,7 +713,8 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
    tpool, bad tile_idx, tile was not idle, bad scratch region, etc).
 
    No other operations on the tpool should be in process when this is
-   called or started while this is running. */
+   called or started while this is running.  This function acts as a
+   compiler memory fence. */
 
 fd_tpool_t *
 fd_tpool_worker_pop( fd_tpool_t * tpool );
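
In practice, the strengthened contracts documented above mean a caller
needs no volatile plumbing around tpool lifecycle calls.  An
illustrative usage sketch (hypothetical mem region and tile index,
error handling elided; not part of this patch):

    fd_tpool_t * tpool = fd_tpool_init( mem, 2UL );  /* fences */
    fd_tpool_worker_push( tpool, 1UL, NULL, 0UL );   /* fences; tile 1, no scratch */
    /* ... plain (non-volatile) data written here is visible to tasks
       dispatched to the worker, and vice versa ... */
    fd_tpool_worker_pop( tpool );                    /* fences */
    fd_tpool_fini( tpool );                          /* fences */
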
@@ -748,12 +752,17 @@ fd_tpool_worker_scratch_sz( fd_tpool_t const * tpool,
    no input argument checking.  Specifically, assumes tpool is valid
    and worker_idx is in [0,worker_cnt).  Return value will be a
    FD_TPOOL_WORKER_STATE value (and, in correct usage, either IDLE or
-   EXEC).  worker 0 is special.  The state here will always be EXEC. */
+   EXEC).  worker 0 is special.  The state here will always be EXEC.
+   This function acts as a compiler memory fence. */
 
 static inline int
 fd_tpool_worker_state( fd_tpool_t const * tpool,
                        ulong              worker_idx ) {
-  return FD_VOLATILE_CONST( fd_tpool_private_worker( tpool )[ worker_idx ]->state );
+  int * _state = &fd_tpool_private_worker( tpool )[ worker_idx ]->state;
+  FD_COMPILER_MFENCE();
+  int state = *_state;
+  FD_COMPILER_MFENCE();
+  return state;
 }
 
 /* fd_tpool_exec calls
@@ -775,7 +784,8 @@ fd_tpool_worker_state( fd_tpool_t const * tpool,
    assumes tpool is valid, worker_idx in (0,worker_cnt) (yes, open on
    both ends), caller is not tpool thread worker_idx, task is valid.
    worker_idx 0 is special and is considered to always be in the EXEC
-   state so we cannot call exec on it. */
+   state so we cannot call exec on it.  This function acts as a compiler
+   memory fence. */
 
 static inline void
 fd_tpool_exec( fd_tpool_t * tpool, ulong worker_idx,
@@ -788,17 +798,16 @@ fd_tpool_exec( fd_tpool_t * tpool, ulong worker_idx,
                ulong task_m0, ulong task_m1,
                ulong task_n0, ulong task_n1 ) {
   fd_tpool_private_worker_t * worker = fd_tpool_private_worker( tpool )[ worker_idx ];
+  worker->task        = task;
+  worker->task_tpool  = task_tpool;
+  worker->task_t0     = task_t0;     worker->task_t1     = task_t1;
+  worker->task_args   = task_args;
+  worker->task_reduce = task_reduce; worker->task_stride = task_stride;
+  worker->task_l0     = task_l0;     worker->task_l1     = task_l1;
+  worker->task_m0     = task_m0;     worker->task_m1     = task_m1;
+  worker->task_n0     = task_n0;     worker->task_n1     = task_n1;
   FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker->task        ) = task;
-  FD_VOLATILE( worker->task_tpool  ) = task_tpool;
-  FD_VOLATILE( worker->task_t0     ) = task_t0;     FD_VOLATILE( worker->task_t1     ) = task_t1;
-  FD_VOLATILE( worker->task_args   ) = task_args;
-  FD_VOLATILE( worker->task_reduce ) = task_reduce; FD_VOLATILE( worker->task_stride ) = task_stride;
-  FD_VOLATILE( worker->task_l0     ) = task_l0;     FD_VOLATILE( worker->task_l1     ) = task_l1;
-  FD_VOLATILE( worker->task_m0     ) = task_m0;     FD_VOLATILE( worker->task_m1     ) = task_m1;
-  FD_VOLATILE( worker->task_n0     ) = task_n0;     FD_VOLATILE( worker->task_n1     ) = task_n1;
-  FD_COMPILER_MFENCE();
-  FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_EXEC;
+  worker->state = FD_TPOOL_WORKER_STATE_EXEC;
   FD_COMPILER_MFENCE();
 }
@@ -807,15 +816,19 @@ fd_tpool_exec( fd_tpool_t * tpool, ulong worker_idx,
    input argument checking.  Specifically, assumes tpool is valid,
    worker_idx in (0,worker_cnt) (yes, open on both ends) and caller is
    not tpool thread worker_idx.  worker_idx 0 is considered to always be
-   in an exec state so we cannot call wait on it. */
+   in an exec state so we cannot call wait on it.  On return, the caller
+   atomically observed worker_idx was not in the EXEC state at some
+   point between when this was called and when this returned.  This
+   function acts as a compiler memory fence. */
 
 static inline void
 fd_tpool_wait( fd_tpool_t const * tpool,
                ulong              worker_idx ) {
-  int volatile * vstate = (int volatile *)&(fd_tpool_private_worker( tpool )[ worker_idx ]->state);
-  int state;
+  int * _state = &fd_tpool_private_worker( tpool )[ worker_idx ]->state;
   for(;;) {
-    state = *vstate;
+    FD_COMPILER_MFENCE();
+    int state = *_state;
+    FD_COMPILER_MFENCE();
     if( FD_LIKELY( state!=FD_TPOOL_WORKER_STATE_EXEC ) ) break;
     FD_SPIN_PAUSE();
   }
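
Putting exec/wait together, the headline guarantee is that plain C data
handed to a task "just works" across the dispatch/wait pair.  A sketch
with a hypothetical task (illustrative only; single-threaded
non-conflicting access, per the commit message):

    static void
    sum_task( void * tpool, ulong t0, ulong t1,
              void * args,  void * reduce, ulong stride,
              ulong  l0, ulong l1, ulong m0, ulong m1,
              ulong  n0, ulong n1 ) {
      (void)tpool; (void)t0; (void)t1; (void)reduce; (void)stride;
      (void)l0; (void)l1; (void)m1; (void)n0; (void)n1;
      ulong * out = (ulong *)args;
      *out = m0 + 42UL;   /* plain store, no FD_VOLATILE */
    }

    /* Caller, with tpool worker 1 idle: */
    ulong out = 0UL;      /* plain memory */
    fd_tpool_exec( tpool, 1UL, sum_task, NULL, 0UL,0UL, &out, NULL,0UL,
                   0UL,0UL, 1UL,0UL, 0UL,0UL );
    fd_tpool_wait( tpool, 1UL );   /* out==43UL is visible here, no fence needed by caller */
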
@@ -884,7 +897,8 @@ fd_tpool_wait( fd_tpool_t const * tpool,
    As this is used in high performance contexts, this does no input
    argument checking.  Specifically, it assumes tpool is valid, task is
-   valid, 0<=t0
 
   if( tile_cnt>1L ) {
     tpool = fd_tpool_init( tpool_mem, 2UL ); FD_TEST( tpool );
 
     fd_tile_exec_delete( fd_tile_exec_new( 1UL, tile_self_push_main, 0, (char **)tpool ), NULL );
 
-    FD_COMPILER_MFENCE();
-    FD_VOLATILE( spin_done ) = 0;
-    FD_COMPILER_MFENCE();
-    fd_tile_exec_t * exec = fd_tile_exec_new( 1UL, tile_spin_main, 0, (char **)fd_type_pun( &spin_done ) ); /* self push */
+    char * spin_done = (char *)0UL;
+    FD_COMPILER_MFENCE(); /* FIXME: give fd_tile_exec_new stronger fencing semantics */
+    fd_tile_exec_t * exec = fd_tile_exec_new( 1UL, tile_spin_main, 0, &spin_done );
     FD_TEST( exec );
 
     FD_TEST( !fd_tpool_worker_push( tpool, 1UL, NULL, 0UL ) ); /* Already in use */
 
-    FD_COMPILER_MFENCE();
-    FD_VOLATILE( spin_done ) = 1;
-    FD_COMPILER_MFENCE();
+    spin_done = (char *)1UL;
 
     fd_tile_exec_delete( exec, NULL );
 
     FD_TEST( fd_tpool_fini( tpool )==(void *)tpool_mem );
@@ -457,15 +446,11 @@ main( int argc,
   if( tile_cnt>1L ) {
     ulong worker_idx = tile_cnt-1UL;
 
-    FD_COMPILER_MFENCE();
-    FD_VOLATILE( spin_done ) = 0;
-    FD_COMPILER_MFENCE();
+    ulong spin_done = 0UL;
     fd_tpool_exec( tpool,worker_idx, worker_spin, &spin_done, 1UL,2UL,(void *)3UL,(void *)4UL,5UL,6UL,7UL,8UL,9UL,10UL,11UL );
     FD_TEST( fd_tpool_worker_state( tpool, worker_idx )==FD_TPOOL_WORKER_STATE_EXEC );
 
     FD_TEST( !fd_tpool_worker_pop( tpool ) ); /* already in use */
 
-    FD_COMPILER_MFENCE();
-    FD_VOLATILE( spin_done ) = 1;
-    FD_COMPILER_MFENCE();
+    spin_done = 1UL;
     fd_tpool_wait( tpool,worker_idx );
     FD_TEST( fd_tpool_worker_state( tpool, worker_idx )==FD_TPOOL_WORKER_STATE_IDLE );
   }
@@ -646,14 +631,12 @@ main( int argc,
   FD_LOG_NOTICE(( "Testing FD_FOR_ALL" ));
 
   for( ulong rem=100000UL; rem; rem-- ) {
-    FD_COMPILER_MFENCE();
     test_t0 = fd_rng_ulong_roll( rng, tile_cnt );
     test_t1 = fd_rng_ulong_roll( rng, tile_cnt ); fd_swap_if( test_t1