Stronger fd_tpool compiler memory fencing guarantees
As per the discussion in issue #3710, gave the fd_tpool API stronger
compiler memory fencing guarantees and updated the FD_SPIN_PAUSE
documentation to be explicit about its current (lack of) fencing
guarantees.

In the process, this fixes a latent bug caused by a missing compiler
memory fence on targets without FD_HAS_X86.  The missing fence could
cause the compiler to "optimize" the tpool worker task dispatch outer
loop into an infinite loop for worker threads in tpools containing 2 or
more tiles within the same process on targets without FD_HAS_X86.

With this, in the vast majority of circumstances (e.g. not writing
low-level concurrency tests/benchmarks), normal code (i.e.
single-threaded non-conflicting) should "just work" without use of
volatile, FD_VOLATILE, FD_COMPILER_MFENCE, FD_ATOMIC, etc. when thread
parallelized via fd_tpool / FD_FOR_ALL / FD_MAP_REDUCE (and the unit
test was updated accordingly).
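
To make the new contract concrete, a minimal sketch of the intended
usage pattern follows.  This is illustrative only and not part of the
commit: it assumes the fd_tpool_exec / fd_tpool_wait dispatch helpers
and the 12-argument task signature visible in the fd_tpool.cxx diff
below; my_task, out and the argument values are made-up names.

  /* A plain task: ordinary stores into an ordinary array, with no
     volatile / FD_VOLATILE / FD_COMPILER_MFENCE in user code. */

  static void
  my_task( void * tpool,  ulong t0, ulong t1,
           void * args,   void * reduce, ulong stride,
           ulong  l0,     ulong  l1,
           ulong  m0,     ulong  m1,
           ulong  n0,     ulong  n1 ) {
    (void)tpool; (void)reduce; (void)stride;
    (void)l0; (void)l1; (void)m0; (void)m1; (void)n0; (void)n1;
    double * out = (double *)args;
    for( ulong i=t0; i<t1; i++ ) out[ i ] = (double)i;
  }

  /* In the caller, assuming worker 1 has already been pushed: */

  static double out[ 1024 ];

  fd_tpool_exec( tpool, 1UL, my_task, NULL, 0UL, 1024UL, out,
                 NULL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL );
  fd_tpool_wait( tpool, 1UL );  /* plain reads of out[] now see the task's writes */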
kbowers-jump committed Dec 19, 2024
1 parent 61ba5f1 commit b0ca874
Showing 4 changed files with 125 additions and 101 deletions.
6 changes: 5 additions & 1 deletion src/util/fd_util_base.h
@@ -742,7 +742,11 @@ fd_type_pun_const( void const * p ) {
the other logical cores sharing the same underlying physical core for
a few clocks without yielding it to the operating system scheduler.
Typically useful for shared memory spin polling loops, especially if
hyperthreading is in use. */
hyperthreading is in use. IMPORTANT SAFETY TIP! This might act as a
FD_COMPILER_MFENCE on some combinations of toolchains and targets
(e.g. gcc documents that __builtin_ia32_pause also does a compiler
memory fence) but this should not be relied upon for portable code
(consider making this a compiler memory fence on all platforms?) */

#if FD_HAS_X86
#define FD_SPIN_PAUSE() __builtin_ia32_pause()
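
For callers outside x86-specific code, the practical upshot of the note
above is to pair FD_SPIN_PAUSE with explicit compiler fences rather
than rely on any fencing side effect.  The snippet below is an
illustrative sketch, not part of this commit; poll_flag and flag are
hypothetical names, and it mirrors the fencing pattern used in the
fd_tpool.cxx worker loop below.

  /* Portable spin poll: the explicit FD_COMPILER_MFENCE pair forces
     *flag to be reloaded from memory on each iteration; FD_SPIN_PAUSE
     is used only as a backoff hint and is not assumed to fence. */

  static inline void
  poll_flag( int const * flag ) {
    for(;;) {
      FD_COMPILER_MFENCE();
      int observed = *flag;
      FD_COMPILER_MFENCE();
      if( FD_LIKELY( observed ) ) break;
      FD_SPIN_PAUSE();
    }
  }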
74 changes: 46 additions & 28 deletions src/util/tpool/fd_tpool.cxx
@@ -25,8 +25,9 @@ fd_tpool_private_worker( int argc,
ulong scratch_sz = cfg->scratch_sz;

fd_tpool_private_worker_t worker[1];

FD_COMPILER_MFENCE();
FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_BOOT;
worker->state = FD_TPOOL_WORKER_STATE_BOOT;
FD_COMPILER_MFENCE();

worker->tile_idx = (uint)tile_idx;
@@ -36,16 +37,19 @@ fd_tpool_private_worker( int argc,
if( scratch_sz ) fd_scratch_attach( scratch, fd_tpool_private_scratch_frame, scratch_sz, FD_TPOOL_WORKER_SCRATCH_DEPTH );

FD_COMPILER_MFENCE();
FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_IDLE;
FD_COMPILER_MFENCE();
FD_VOLATILE( fd_tpool_private_worker( tpool )[ worker_idx ] ) = worker;
worker->state = FD_TPOOL_WORKER_STATE_IDLE;
FD_COMPILER_MFENCE();

fd_tpool_private_worker( tpool )[ worker_idx ] = worker;

for(;;) {

/* We are IDLE ... see what we should do next */

int state = FD_VOLATILE_CONST( worker->state );
FD_COMPILER_MFENCE();
int state = worker->state;
FD_COMPILER_MFENCE();

if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_EXEC ) ) {
if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) break;
FD_SPIN_PAUSE();
@@ -54,15 +58,17 @@ fd_tpool_private_worker( int argc,

/* We are EXEC ... do the task and then transition to IDLE */

fd_tpool_task_t task = FD_VOLATILE_CONST( worker->task );
fd_tpool_task_t task = worker->task;

void * task_tpool = FD_VOLATILE_CONST( worker->task_tpool );
ulong task_t0 = FD_VOLATILE_CONST( worker->task_t0 ); ulong task_t1 = FD_VOLATILE_CONST( worker->task_t1 );
void * task_args = FD_VOLATILE_CONST( worker->task_args );
void * task_reduce = FD_VOLATILE_CONST( worker->task_reduce ); ulong task_stride = FD_VOLATILE_CONST( worker->task_stride );
ulong task_l0 = FD_VOLATILE_CONST( worker->task_l0 ); ulong task_l1 = FD_VOLATILE_CONST( worker->task_l1 );
ulong task_m0 = FD_VOLATILE_CONST( worker->task_m0 ); ulong task_m1 = FD_VOLATILE_CONST( worker->task_m1 );
ulong task_n0 = FD_VOLATILE_CONST( worker->task_n0 ); ulong task_n1 = FD_VOLATILE_CONST( worker->task_n1 );
void * task_tpool = worker->task_tpool;
ulong task_t0 = worker->task_t0; ulong task_t1 = worker->task_t1;
void * task_args = worker->task_args;
void * task_reduce = worker->task_reduce; ulong task_stride = worker->task_stride;
ulong task_l0 = worker->task_l0; ulong task_l1 = worker->task_l1;
ulong task_m0 = worker->task_m0; ulong task_m1 = worker->task_m1;
ulong task_n0 = worker->task_n0; ulong task_n1 = worker->task_n1;

FD_COMPILER_MFENCE();

try {
task( task_tpool,task_t0,task_t1, task_args, task_reduce,task_stride, task_l0,task_l1, task_m0,task_m1, task_n0,task_n1 );
@@ -71,17 +77,17 @@ fd_tpool_private_worker( int argc,
}

FD_COMPILER_MFENCE();
FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_IDLE;
FD_COMPILER_MFENCE();
worker->state = FD_TPOOL_WORKER_STATE_IDLE;
}

/* state is HALT, clean up and then reset back to BOOT */

if( scratch_sz ) fd_scratch_detach( NULL );

FD_COMPILER_MFENCE();
FD_VOLATILE( worker->state ) = FD_TPOOL_WORKER_STATE_BOOT;
worker->state = FD_TPOOL_WORKER_STATE_BOOT;
FD_COMPILER_MFENCE();

return 0;
}

@@ -101,6 +107,8 @@ fd_tpool_t *
fd_tpool_init( void * mem,
ulong worker_max ) {

FD_COMPILER_MFENCE();

if( FD_UNLIKELY( !mem ) ) {
FD_LOG_WARNING(( "NULL mem" ));
return NULL;
@@ -120,16 +128,17 @@ fd_tpool_init( void * mem,
fd_memset( mem, 0, footprint );

fd_tpool_private_worker_t * worker0 = (fd_tpool_private_worker_t *)mem;

FD_COMPILER_MFENCE();
FD_VOLATILE( worker0->state ) = FD_TPOOL_WORKER_STATE_EXEC;
worker0->state = FD_TPOOL_WORKER_STATE_EXEC;
FD_COMPILER_MFENCE();

fd_tpool_t * tpool = (fd_tpool_t *)(worker0+1);
tpool->worker_max = worker_max;
tpool->worker_cnt = 1UL;

FD_COMPILER_MFENCE();
FD_VOLATILE( fd_tpool_private_worker( tpool )[0] ) = worker0;
fd_tpool_private_worker( tpool )[0] = worker0;
FD_COMPILER_MFENCE();

return tpool;
@@ -138,16 +147,19 @@ fd_tpool_init( void * mem,
void *
fd_tpool_fini( fd_tpool_t * tpool ) {

FD_COMPILER_MFENCE();

if( FD_UNLIKELY( !tpool ) ) {
FD_LOG_WARNING(( "NULL tpool" ));
return NULL;
}

while( fd_tpool_worker_cnt( tpool )>1UL )
while( fd_tpool_worker_cnt( tpool )>1UL ) {
if( FD_UNLIKELY( !fd_tpool_worker_pop( tpool ) ) ) {
FD_LOG_WARNING(( "fd_tpool_worker_pop failed" ));
return NULL;
}
}

return (void *)fd_tpool_private_worker0( tpool );
}
@@ -158,6 +170,8 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
void * scratch,
ulong scratch_sz ) {

FD_COMPILER_MFENCE();

if( FD_UNLIKELY( !tpool ) ) {
FD_LOG_WARNING(( "NULL tpool" ));
return NULL;
@@ -215,7 +229,7 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
char ** argv = (char **)fd_type_pun( cfg );

FD_COMPILER_MFENCE();
FD_VOLATILE( worker[ worker_cnt ] ) = NULL;
worker[ worker_cnt ] = NULL;
FD_COMPILER_MFENCE();

if( FD_UNLIKELY( !fd_tile_exec_new( tile_idx, fd_tpool_private_worker, argc, argv ) ) ) {
@@ -232,6 +246,8 @@ fd_tpool_worker_push( fd_tpool_t * tpool,
fd_tpool_t *
fd_tpool_worker_pop( fd_tpool_t * tpool ) {

FD_COMPILER_MFENCE();

if( FD_UNLIKELY( !tpool ) ) {
FD_LOG_WARNING(( "NULL tpool" ));
return NULL;
@@ -248,16 +264,17 @@ fd_tpool_worker_pop( fd_tpool_t * tpool ) {
int volatile * vstate = (int volatile *)&(worker->state);
int state;

/* Testing for IDLE isn't strictly necessary given requirements
to use this but can help catch user errors. Likewise,
FD_ATOMIC_CAS isn't strictly necessary given correct operation but
can more robustly catch such errors. */
/* Testing for IDLE isn't strictly necessary given requirements to use
this but can help catch user errors. Likewise, FD_ATOMIC_CAS isn't
strictly necessary given correct operation but can more robustly
catch such errors. */

# if FD_HAS_ATOMIC

FD_COMPILER_MFENCE();
state = FD_ATOMIC_CAS( vstate, FD_TPOOL_WORKER_STATE_IDLE, FD_TPOOL_WORKER_STATE_HALT );
FD_COMPILER_MFENCE();

if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) {
FD_LOG_WARNING(( "worker to pop is not idle (%i-%s)", state, fd_tpool_worker_state_cstr( state ) ));
return NULL;
@@ -268,10 +285,12 @@ fd_tpool_worker_pop( fd_tpool_t * tpool ) {
FD_COMPILER_MFENCE();
state = *vstate;
FD_COMPILER_MFENCE();

if( FD_UNLIKELY( state!=FD_TPOOL_WORKER_STATE_IDLE ) ) {
FD_LOG_WARNING(( "worker to pop is not idle (%i-%s)", state, fd_tpool_worker_state_cstr( state ) ));
return NULL;
}

FD_COMPILER_MFENCE();
*vstate = FD_TPOOL_WORKER_STATE_HALT;
FD_COMPILER_MFENCE();
@@ -333,9 +352,7 @@ FD_TPOOL_EXEC_ALL_IMPL_FTR
#if FD_HAS_ATOMIC
FD_TPOOL_EXEC_ALL_IMPL_HDR(taskq)
ulong * l_next = (ulong *)_tpool;
FD_COMPILER_MFENCE();
void * tpool = (void *)FD_VOLATILE_CONST( l_next[1] );
FD_COMPILER_MFENCE();
void * tpool = (void *)l_next[1];
for(;;) {

/* Note that we use an ATOMIC_CAS here instead of an
@@ -345,8 +362,9 @@ FD_TPOOL_EXEC_ALL_IMPL_HDR(taskq)
not overflow. */

FD_COMPILER_MFENCE();
ulong m0 = FD_VOLATILE_CONST( *l_next );
ulong m0 = *l_next;
FD_COMPILER_MFENCE();

if( FD_UNLIKELY( m0>=l1 ) ) break;
ulong m1 = m0+1UL;
if( FD_UNLIKELY( FD_ATOMIC_CAS( l_next, m0, m1 )!=m0 ) ) {
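
As an aside on the taskq hunk above: the visible parts of its comment
indicate an ATOMIC_CAS is used (rather than an unconditional atomic
increment) so that the shared index l_next can never be pushed past l1.
The comparison below is an illustrative sketch, not part of the commit;
claim_cas / claim_fetch_add are hypothetical helpers, compiler fencing
is omitted for brevity, and the exact FD_ATOMIC_* signatures are
assumed.

  /* CAS-based claim: a slot is published only if l_next still holds
     the observed value m0 and m0<l1, so l_next never advances past
     l1. */

  static ulong
  claim_cas( ulong * l_next, ulong l1 ) {
    for(;;) {
      ulong m0 = *l_next;
      if( m0>=l1 ) return l1;                          /* no work left */
      if( FD_ATOMIC_CAS( l_next, m0, m0+1UL )==m0 ) return m0;
      FD_SPIN_PAUSE();                                 /* lost the race, retry */
    }
  }

  /* Fetch-and-add claim: every caller increments unconditionally, so
     with many callers l_next can be pushed arbitrarily far beyond l1
     (and a counter near ULONG_MAX could even wrap). */

  static ulong
  claim_fetch_add( ulong * l_next, ulong l1 ) {
    ulong m0 = FD_ATOMIC_FETCH_AND_ADD( l_next, 1UL );
    return ( m0<l1 ) ? m0 : l1;                        /* may overshoot l1 */
  }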