diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index d7e743edd..60192e8af 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -533,8 +533,10 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method nrn_pragma_acc(parallel loop present( nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) - copy(net_send_buf_count) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + copy(net_send_buf_count) if (nt->compute_gpu) + async(nt->streams[nt->stream_id])) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) + if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -569,9 +571,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method #ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id])) + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) + async(nt->streams[nt->stream_id])) nrn_pragma_acc(wait async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]) depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on nrn_pragma_omp(taskwait) } diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 066ca15bb..a39458f25 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -63,7 +63,10 @@ void nrnmpi_v_transfer() { } nrn_pragma_acc(update 
host(src_gather [0:n_src_gather]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on } // copy gathered source values to outsrc_buf_ diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index e1ee3fd39..c9a93bcd9 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -600,14 +600,18 @@ void solve_interleaved2(int ith) { defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif - nrn_pragma_acc(parallel loop gang vector vector_length( - warpsize) present(nt [0:1], - strides [0:nstride], - ncycles [0:nwarp], - stridedispl [0:nwarp + 1], - rootbegin [0:nwarp + 1], - nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->streams[nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait) + nrn_pragma_acc(parallel loop gang vector vector_length(warpsize) + present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) + async(nt->streams[nt->stream_id])) + // clang-format off + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) + depend(inout: nt->streams[nt->stream_id]) nowait) + // clang-format on for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 1d6ffdfcb..ad71f4c7a 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -317,7 +317,10 @@ void 
nrncore2nrn_send_values(NrnThread* nth) { double* gather_i = tr->gather[i]; nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) - nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait) + // clang-format off + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu) + depend(inout: nth->streams[nth->stream_id]) nowait) + // clang-format on } nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) for (int i = 0; i < tr->n_trajec; ++i) { @@ -341,7 +344,8 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) + async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); @@ -377,7 +381,8 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id])) + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) + async(nth->streams[nth->stream_id])) nrn_pragma_acc(wait async(nth->streams[nth->stream_id])) nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp index 56f8d3af8..44189191d 100644 --- a/coreneuron/sim/multicore.hpp +++ b/coreneuron/sim/multicore.hpp @@ -130,10 +130,11 @@ struct NrnThread: public MemoryManaged { NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */ - int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ - int 
compute_gpu = 0; /* define whether to compute with gpus */ - int stream_id = 0; /* define where the kernel will be launched on GPU stream */ - std::vector<int> streams; /* vector of stream ids needed for async execution of OpenMP in multiple streams */ + int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */ + int compute_gpu = 0; /* define whether to compute with gpus */ + int stream_id = 0; /* define where the kernel will be launched on GPU stream */ + std::vector<int> streams; /* vector of stream ids needed for async execution of OpenMP in + multiple streams */ int _net_send_buffer_size = 0; int _net_send_buffer_cnt = 0; int* _net_send_buffer = nullptr; diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 7f6f1d3af..42de967d0 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -152,8 +152,10 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; - nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id])) - nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) + async(_nt->streams[_nt->stream_id])) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) + depend(inout: _nt->streams[_nt->stream_id]) nowait) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; }