// Patch overview (free-form preamble placed before the first diff header).
//
// NOTE(review): the line breaks of this patch have been collapsed, so each
// physical line below holds many original diff lines. Restore the newline
// structure before attempting to apply it.
//
// 1. frame/include/blis.h
//    Moves the Threading definitions includes (bli_thread.h,
//    bli_thread_range.h, bli_thread_range_slab_rr.h, bli_thread_range_tlb.h,
//    bli_pthread.h) and the Constant definitions include (bli_extern_defs.h)
//    from before the BLIS architecture/kernel definitions section to after
//    bli_kernel_macro_defs.h, immediately ahead of the base operation
//    prototypes.
//
// 2. frame/thread/bli_thrcomm.c
//    Adds an ifndef BLIS_TREE_BARRIER guard that opens before the __sync_*
//    fallback macros and closes at the end of the hunk located in
//    bli_thrcomm_barrier_atomic. The four fallback macros are reflowed onto
//    single lines; their expansions are unchanged.
//
// 3. frame/thread/bli_thrcomm.h (struct barrier_s)
//    count changes from int to dim_t, signal changes from volatile int to
//    gint_t, and the second member named padding2 is renamed to padding3.
//
// 4. frame/thread/bli_thrcomm_openmp.c
//    - bli_thrcomm_barrier_openmp now returns early when comm is NULL or
//      comm->n_threads is 1.
//    - Adds __sync_* fallbacks for __atomic_load_n, __atomic_sub_fetch and
//      __atomic_fetch_xor when __ATOMIC_RELAXED is undefined. The memory
//      order macros are defined empty and the constraint argument is dropped
//      by each fallback expansion. Fallbacks for __atomic_add_fetch and
//      __atomic_fetch_add are present only as commented-out lines.
//    - bli_thrcomm_tree_barrier replaces the omp atomic capture of
//      barack->count-- with __atomic_sub_fetch. The old code compared the
//      captured pre-decrement value against 1; the new code compares the
//      value returned by __atomic_sub_fetch against 0.
//    - The signal flip changes from logical negation to __atomic_fetch_xor
//      with 1 (release), and waiting threads spin on an acquire
//      __atomic_load_n instead of reading through a volatile int pointer.
//
diff --git a/frame/include/blis.h b/frame/include/blis.h index d87018d009..6292f47452 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -80,21 +80,6 @@ extern "C" { #include "bli_pragma_macro_defs.h" -// -- Threading definitions -- - -#include "bli_thread.h" -#include "bli_thread_range.h" -#include "bli_thread_range_slab_rr.h" -#include "bli_thread_range_tlb.h" - -#include "bli_pthread.h" - - -// -- Constant definitions -- - -#include "bli_extern_defs.h" - - // -- BLIS architecture/kernel definitions -- #include "bli_pre_ker_params.h" @@ -116,6 +101,21 @@ extern "C" { #include "bli_kernel_macro_defs.h" +// -- Threading definitions -- + +#include "bli_thread.h" +#include "bli_thread_range.h" +#include "bli_thread_range_slab_rr.h" +#include "bli_thread_range_tlb.h" + +#include "bli_pthread.h" + + +// -- Constant definitions -- + +#include "bli_extern_defs.h" + + // -- Base operation prototypes -- #include "bli_init.h" diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index e9f9d9dc70..79618f1a83 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -206,6 +206,8 @@ void* bli_thrcomm_bcast return object; } +#ifndef BLIS_TREE_BARRIER + // Use __sync_* builtins (assumed available) if __atomic_* ones are not present. 
#ifndef __ATOMIC_RELAXED @@ -214,14 +216,10 @@ void* bli_thrcomm_bcast #define __ATOMIC_RELEASE #define __ATOMIC_ACQ_REL -#define __atomic_load_n(ptr, constraint) \ - __sync_fetch_and_add(ptr, 0) -#define __atomic_add_fetch(ptr, value, constraint) \ - __sync_add_and_fetch(ptr, value) -#define __atomic_fetch_add(ptr, value, constraint) \ - __sync_fetch_and_add(ptr, value) -#define __atomic_fetch_xor(ptr, value, constraint) \ - __sync_fetch_and_xor(ptr, value) +#define __atomic_load_n( ptr, constraint ) __sync_fetch_and_add( ptr, 0 ) +#define __atomic_add_fetch( ptr, value, constraint ) __sync_add_and_fetch( ptr, value ) +#define __atomic_fetch_add( ptr, value, constraint ) __sync_fetch_and_add( ptr, value ) +#define __atomic_fetch_xor( ptr, value, constraint ) __sync_fetch_and_xor( ptr, value ) #endif @@ -269,3 +267,5 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) } } +#endif + diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 04cb23a38a..436b057116 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -51,17 +51,17 @@ struct barrier_s // the fields above and fields below. char padding1[ BLIS_CACHE_LINE_SIZE ]; - int count; + dim_t count; // We insert a cache line of padding here to eliminate false sharing between // the fields above and fields below. char padding2[ BLIS_CACHE_LINE_SIZE ]; - volatile int signal; + gint_t signal; // We insert a cache line of padding here to eliminate false sharing between // this struct and the next one. 
- char padding2[ BLIS_CACHE_LINE_SIZE ]; + char padding3[ BLIS_CACHE_LINE_SIZE ]; }; typedef struct barrier_s barrier_t; #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index a42dabe180..487832cd17 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -114,6 +114,10 @@ void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ) void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm ) { + // Return early if the comm is NULL or if there is only one + // thread participating. + if ( comm == NULL || comm->n_threads == 1 ) return; + bli_thrcomm_tree_barrier( comm->barriers[t_id] ); } @@ -176,27 +180,42 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) return; } +// Use __sync_* builtins (assumed available) if __atomic_* ones are not present. +#ifndef __ATOMIC_RELAXED + +#define __ATOMIC_RELAXED +#define __ATOMIC_ACQUIRE +#define __ATOMIC_RELEASE +#define __ATOMIC_ACQ_REL + +//#define __atomic_add_fetch( ptr, value, constraint ) __sync_add_and_fetch( ptr, value ) +//#define __atomic_fetch_add( ptr, value, constraint ) __sync_fetch_and_add( ptr, value ) + +#define __atomic_load_n( ptr, constraint ) __sync_fetch_and_add( ptr, 0 ) +#define __atomic_sub_fetch( ptr, value, constraint ) __sync_sub_and_fetch( ptr, value ) +#define __atomic_fetch_xor( ptr, value, constraint ) __sync_fetch_and_xor( ptr, value ) + +#endif + void bli_thrcomm_tree_barrier( barrier_t* barack ) { - int my_signal = barack->signal; - int my_count; + gint_t my_signal = __atomic_load_n( &barack->signal, __ATOMIC_RELAXED ); - _Pragma( "omp atomic capture" ) - my_count = barack->count--; + dim_t my_count = + __atomic_sub_fetch( &barack->count, 1, __ATOMIC_ACQ_REL ); - if ( my_count == 1 ) + if ( my_count == 0 ) { if ( barack->dad != NULL ) { bli_thrcomm_tree_barrier( barack->dad ); } barack->count = barack->arity; - barack->signal = !barack->signal; + __atomic_fetch_xor( &barack->signal, 1, __ATOMIC_RELEASE ); } else { - 
volatile int* listener = &barack->signal; - while ( *listener == my_signal ) {} + while ( __atomic_load_n( &barack->signal, __ATOMIC_ACQUIRE ) == my_signal ) {} } }