Skip to content
This repository has been archived by the owner on Mar 20, 2023. It is now read-only.

Commit

Permalink
fixing race condition in cell permute 2 : performance optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
Christos Kotsalos committed Aug 9, 2022
1 parent 12c7f57 commit f827cf4
Showing 1 changed file with 12 additions and 33 deletions.
45 changes: 12 additions & 33 deletions coreneuron/permute/cellorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,26 +478,23 @@ static void bksub_interleaved(NrnThread* nt,
}

// icore ranges [0:warpsize) ; stride[ncycle]
nrn_pragma_acc(routine vector)
static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
int icycle = ncycle - 1;
int istride = stride[icycle];
int i = lastnode - istride + icore;
//#ifndef CORENEURON_ENABLE_GPU
int ii = i;
//#endif

// execute until all tree depths are executed
bool has_subtrees_to_compute = true;

// clang-format off
nrn_pragma_acc(loop seq)
for (; has_subtrees_to_compute; ) { // ncycle loop
//#ifndef CORENEURON_ENABLE_GPU
// serial test, gpu does this in parallel
nrn_pragma_acc(loop)
nrn_pragma_acc(loop vector)
for (int icore = 0; icore < warpsize; ++icore) {
int i = ii + icore;
//#endif
if (icore < istride) { // most efficient if istride equal warpsize
// what is the index
int ip = GPU_PARENT(i);
Expand All @@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
nrn_pragma_omp(atomic update)
GPU_RHS(ip) -= p * GPU_RHS(i);
}
//#ifndef CORENEURON_ENABLE_GPU
}
//#endif
// if finished with all tree depths then ready to break
// (note that break is not allowed in OpenACC)
if (icycle == 0) {
Expand All @@ -521,52 +516,41 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
--icycle;
istride = stride[icycle];
i -= istride;
//#ifndef CORENEURON_ENABLE_GPU
ii -= istride;
//#endif
}
// clang-format on
}

// icore ranges [0:warpsize) ; stride[ncycle]
nrn_pragma_acc(routine vector)
static void bksub_interleaved2(NrnThread* nt,
int root,
int lastroot,
int icore,
int ncycle,
int* stride,
int firstnode) {
//#ifndef CORENEURON_ENABLE_GPU
nrn_pragma_acc(loop seq)
for (int i = root; i < lastroot; i += 1) {
//#else
// nrn_pragma_acc(loop seq)
// for (int i = root; i < lastroot; i += warpsize) {
//#endif
GPU_RHS(i) /= GPU_D(i); // the root
}

int i = firstnode + icore;
//#ifndef CORENEURON_ENABLE_GPU
int ii = i;
//#endif
nrn_pragma_acc(loop seq)
for (int icycle = 0; icycle < ncycle; ++icycle) {
int istride = stride[icycle];
//#ifndef CORENEURON_ENABLE_GPU
nrn_pragma_acc(loop)
// serial test, gpu does this in parallel
nrn_pragma_acc(loop vector)
for (int icore = 0; icore < warpsize; ++icore) {
int i = ii + icore;
//#endif
if (icore < istride) {
int ip = GPU_PARENT(i);
GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
GPU_RHS(i) /= GPU_D(i);
}
i += istride;
//#ifndef CORENEURON_ENABLE_GPU
}
ii += istride;
//#endif
}
}

Expand Down Expand Up @@ -602,15 +586,14 @@ void solve_interleaved2(int ith) {
defined(_OPENACC)
int nstride = stridedispl[nwarp];
#endif
nrn_pragma_acc(parallel loop gang vector vector_length(
warpsize) present(nt [0:1],
nrn_pragma_acc(parallel loop gang present(nt [0:1],
strides [0:nstride],
ncycles [0:nwarp],
stridedispl [0:nwarp + 1],
rootbegin [0:nwarp + 1],
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
for (int icore = 0; icore < ncore; ++icore) {
for (int icore = 0; icore < ncore; icore += warpsize) {
int iwarp = icore / warpsize; // figure out the >> value
int ic = icore & (warpsize - 1); // figure out the & mask
int ncycle = ncycles[iwarp];
Expand All @@ -619,14 +602,10 @@ void solve_interleaved2(int ith) {
int lastroot = rootbegin[iwarp + 1];
int firstnode = nodebegin[iwarp];
int lastnode = nodebegin[iwarp + 1];
//#ifndef CORENEURON_ENABLE_GPU
if (ic == 0) { // serial test mode. triang and bksub do all cores in warp
//#endif
triang_interleaved2(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
//#ifndef CORENEURON_ENABLE_GPU
} // serial test mode
//#endif

// triang and bksub do all cores in warp
triang_interleaved2(nt, ic, ncycle, stride, lastnode);
bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
}
nrn_pragma_acc(wait(nt->stream_id))
#ifdef _OPENACC
Expand Down

0 comments on commit f827cf4

Please sign in to comment.