diff --git a/benchmarks/coarse.py b/benchmarks/coarse.py
index 76848376..6adfc52c 100755
--- a/benchmarks/coarse.py
+++ b/benchmarks/coarse.py
@@ -49,7 +49,10 @@
 )
 
 # Warmup
+#dst2 = g.copy(dst)
+#src2 = g.copy(src)
 for n in range(5):
+    #co.mat([dst, dst2], [src, src2])
     co.mat(dst, src)
 
 # Time cgpt
diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
index 99c7a030..a9a87413 100755
--- a/benchmarks/stencil.py
+++ b/benchmarks/stencil.py
@@ -9,142 +9,146 @@
 evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
 nevec = [tuple([-x for x in y]) for y in evec]
-
+
 for precision in [g.single, g.double]:
-    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
-    N = g.default.get_int("--N", 1000)
+    for fast_osites in [0,1]:
+        grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+        N = g.default.get_int("--N", 1000)
 
-    g.message(
+        g.message(
         f"""
 Local Stencil Benchmark with
     fdimensions  : {grid.fdimensions}
     precision    : {precision.__name__}
+    fast_osites  : {fast_osites}
 """
-    )
-
-    # plaquette
-    U = g.qcd.gauge.random(grid, rng, scale=0.5)
-    _U = [1, 2, 3, 4]
-    _X = 0
-    _Xp = [1, 2, 3, 4]
-    V = g.mcolor(grid)
-    rng.element(V)
-    # U = g.qcd.gauge.transformed(U, V)
-    code = []
-    for mu in range(4):
-        for nu in range(0, mu):
-            code.append(
-                {
-                    "target": 0,
-                    "accumulate": -1 if (mu == 1 and nu == 0) else 0,
-                    "weight": 1.0,
-                    "factor": [
-                        (_U[mu], _X, 0),
-                        (_U[nu], _Xp[mu], 0),
-                        (_U[mu], _Xp[nu], 1),
-                        (_U[nu], _X, 1),
-                    ],
-                }
-            )
-    st = g.stencil.matrix(
-        U[0],
-        [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
-        code,
-    )
-    # test plaquette
-    P = g.lattice(U[0])
-    st(P, *U)
-    pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
-    assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
-
-    # Flops
-    gauge_otype = U[0].otype
-    Nc = gauge_otype.shape[0]
-    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
-    flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
-    flops = flops_per_site * P.grid.gsites * N
-    nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+        )
 
-    # Warmup
-    for n in range(5):
+        # plaquette
+        U = g.qcd.gauge.random(grid, rng, scale=0.5)
+        _U = [1, 2, 3, 4]
+        _X = 0
+        _Xp = [1, 2, 3, 4]
+        V = g.mcolor(grid)
+        rng.element(V)
+        # U = g.qcd.gauge.transformed(U, V)
+        code = []
+        for mu in range(4):
+            for nu in range(0, mu):
+                code.append(
+                    {
+                        "target": 0,
+                        "accumulate": -1 if (mu == 1 and nu == 0) else 0,
+                        "weight": 1.0,
+                        "factor": [
+                            (_U[mu], _X, 0),
+                            (_U[nu], _Xp[mu], 0),
+                            (_U[mu], _Xp[nu], 1),
+                            (_U[nu], _X, 1),
+                        ],
+                    }
+                )
+        st = g.stencil.matrix(
+            U[0],
+            [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
+            code,
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        # test plaquette
+        P = g.lattice(U[0])
         st(P, *U)
+        pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
+        assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
+
+        # Flops
+        gauge_otype = U[0].otype
+        Nc = gauge_otype.shape[0]
+        flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
+        flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
+        flops = flops_per_site * P.grid.gsites * N
+        nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+
+        # Warmup
+        for n in range(5):
+            st(P, *U)
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(P, *U)
-    t1 = g.time()
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(P, *U)
+        t1 = g.time()
 
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
-        f"""
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
+            f"""
 {N} applications of plaquette stencil
     Time to complete            : {t1-t0:.2f} s
     Total performance           : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
 
-    # covariant laplacian stencil
-    src = g.vspincolor(grid)
-    rng.cnormal(src)
-    UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
-    _U = [0,1,2,3]
-    _UdagShift = [4,5,6,7]
-    _X = 0
-    _Xp = [1,2,3,4]
-    _Xm = [5,6,7,8]
-    code = [(0,1,_X,-1,-8.0,[])]
-    for mu in range(4):
-        code.append((0,1,_Xp[mu], 0, 1.0,[(_U[mu], _X, 0)]))
-        code.append((0,1,_Xm[mu], 0, 1.0,[(_UdagShift[mu], _X, 0)]))
-        # can switch last line to next one
-        #code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
-    st = g.stencil.matrix_vector(
-        U[0],
-        src,
-        [(0, 0, 0, 0)] + evec + nevec,
-        code
-    )
-    # test laplace
-    dst = g.lattice(src)
-    st(U + UdagShift, [dst,src])
-
-    lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1]*4), [0,1,2,3])
-    dst2 = lap(src)
-    eps2 = g.norm2(dst - dst2) / g.norm2(dst)
-    assert eps2 ** 0.5 < precision.eps * 100
-
-    # Flops
-    gauge_otype = U[0].otype
-    Nc = gauge_otype.shape[0]
-    Ns = 4
-    flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
-    flops_per_vector_add = Ns * Nc * 2
-    flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
-    flops = flops_per_site * src.grid.gsites * N
-    nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
-
-    # Warmup
-    for n in range(5):
+        # covariant laplacian stencil
+        src = g.vspincolor(grid)
+        rng.cnormal(src)
+        UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
+        _U = [0,1,2,3]
+        _UdagShift = [4,5,6,7]
+        _X = 0
+        _Xp = [1,2,3,4]
+        _Xm = [5,6,7,8]
+        code = [(0,1,_X,-1,-8.0,[])]
+        for mu in range(4):
+            code.append((0,1,_Xp[mu], 0, 1.0,[(_U[mu], _X, 0)]))
+            code.append((0,1,_Xm[mu], 0, 1.0,[(_UdagShift[mu], _X, 0)]))
+            # can switch last line to next one
+            #code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
+        st = g.stencil.matrix_vector(
+            U[0],
+            src,
+            [(0, 0, 0, 0)] + evec + nevec,
+            code
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        # test laplace
+        dst = g.lattice(src)
         st(U + UdagShift, [dst,src])
+
+        lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1]*4), [0,1,2,3])
+        dst2 = lap(src)
+        eps2 = g.norm2(dst - dst2) / g.norm2(dst)
+        assert eps2 ** 0.5 < precision.eps * 100
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(U + UdagShift, [dst,src])
-    t1 = g.time()
-
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
+        # Flops
+        gauge_otype = U[0].otype
+        Nc = gauge_otype.shape[0]
+        Ns = 4
+        flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
+        flops_per_vector_add = Ns * Nc * 2
+        flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+        flops = flops_per_site * src.grid.gsites * N
+        nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
+
+        # Warmup
+        for n in range(5):
+            st(U + UdagShift, [dst,src])
+
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(U + UdagShift, [dst,src])
+        t1 = g.time()
+
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
         f"""
 {N} applications of laplace stencil
     Time to complete            : {t1-t0:.2f} s
    Total performance           : {GFlopsPerSec:.2f} GFlops/s
    Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
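
A note on the flop accounting used by all three benchmarks touched here: the formulas assume 6 real flops per complex multiply and 2 per complex add, so an Nc x Nc complex matrix product costs Nc**3 multiplies plus Nc**2 * (Nc - 1) adds. A minimal sketch of that bookkeeping (illustration only, not part of the patch):

    # 6 flops per complex multiply, 2 per complex add; an Nc x Nc matrix
    # product is Nc**2 dot products of length Nc.
    Nc = 3
    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
    assert flops_per_matrix_multiply == 198  # SU(3) link matrices
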
diff --git a/benchmarks/stencil_coarse_operator.py b/benchmarks/stencil_coarse_operator.py
index 891ac061..a4eda658 100755
--- a/benchmarks/stencil_coarse_operator.py
+++ b/benchmarks/stencil_coarse_operator.py
@@ -9,88 +9,96 @@
 evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
 nevec = [tuple([-x for x in y]) for y in evec]
-
-for precision in [g.single, g.double]:
-    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
-    N = g.default.get_int("--N", 1000)
-    g.message(
-        f"""
+
+for precision in [g.single, g.double]:
+    for fast_osites in [0,1]:
+        grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+        N = g.default.get_int("--N", 1000)
+        n_rhs = g.default.get_int("--n_rhs", 12)
+        g.message(
+            f"""
 Local Stencil Benchmark with
     fdimensions  : {grid.fdimensions}
+    n_rhs        : {n_rhs}
     precision    : {precision.__name__}
+    fast_osites  : {fast_osites}
 """
-    )
-
-    # coarse grid stencil
-    nbasis = g.default.get_int("--nbasis", 12)
-    vcoarse = g.vcomplex(grid, nbasis)
-    g.message(f"Virtual blocks: {len(vcoarse.v_obj)}")
-    nbasis_blocks = len(vcoarse.v_obj)
-    mcoarse = [g.mcomplex(grid, nbasis) for _ in range(9)]
-    rng.cnormal(mcoarse)
-    rng.cnormal(vcoarse)
+        )
+
+        # coarse grid stencil
+        nbasis = g.default.get_int("--nbasis", 12)
+        vcoarse = [g.vcomplex(grid, nbasis) for _ in range(n_rhs)]
+        g.message(f"Virtual blocks: {len(vcoarse[0].v_obj)}")
+        nbasis_blocks = len(vcoarse[0].v_obj)
+        mcoarse = [g.mcomplex(grid, nbasis) for _ in range(9)]
+        rng.cnormal(mcoarse)
+        rng.cnormal(vcoarse)
 
-    # reference solve
-    vref = g.lattice(vcoarse)
-    vref = g(mcoarse[0] * vcoarse)
-    for mu in range(4):
-        vref += mcoarse[1 + 2*mu] * g.cshift(vcoarse, mu, 1)
-        vref += mcoarse[2 + 2*mu] * g.cshift(vcoarse, mu, -1)
-    # now coarse grid dirac operator
-    _X = 0
-    _Xp = [1,2,3,4]
-    _Xm = [5,6,7,8]
-    _ID = 0
-    _M = [1,3,5,7]
-    _Mdag = [2,4,6,8]
-    code = []
-    for iblock in range(nbasis_blocks):
-        # todo block matrix multiply here
-        for jblock in range(nbasis_blocks):
-            matrix_index = nbasis_blocks * jblock + iblock
-            code.append((iblock,nbasis_blocks + jblock,_X,-1 if jblock == 0 else iblock,1.0,[(matrix_index,_X,0)]))
+        # reference solve
+        vref = [g.lattice(v) for v in vcoarse]
+        for i in range(n_rhs):
+            vref[i] = g(mcoarse[0] * vcoarse[i])
             for mu in range(4):
-                code.append((iblock,nbasis_blocks + jblock,_Xp[mu], iblock, 1.0, [(nbasis_blocks**2 * _M[mu] + matrix_index, _X, 0)]))
-                code.append((iblock,nbasis_blocks + jblock,_Xm[mu], iblock, 1.0, [(nbasis_blocks**2 * _Mdag[mu] + matrix_index, _X, 0)]))
-    st = g.stencil.matrix_vector(
-        mcoarse[0],
-        vcoarse,
-        [(0, 0, 0, 0)] + evec + nevec,
-        code, len(code) // nbasis_blocks
-    )
-    vdst = g.lattice(vcoarse)
-    st(mcoarse, [vdst,vcoarse])
-    eps2 = g.norm2(vdst - vref) / g.norm2(vref)
-    # g.message(f"Test: {eps2}")
-    assert eps2 ** 0.5 < precision.eps * 100
-
-    # Flops
-    flops_per_matrix_vector_multiply = nbasis * (nbasis * 6 + (nbasis - 1) * 2)
-    flops_per_vector_add = nbasis * 2
-    flops_per_site = 9 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
-    flops = flops_per_site * vcoarse.grid.gsites * N
-    nbytes = (9 * nbasis ** 2 * 2 + 2 * nbasis * 2) * precision.nbytes * vcoarse.grid.gsites * N
+                vref[i] += mcoarse[1 + 2*mu] * g.cshift(vcoarse[i], mu, 1)
+                vref[i] += mcoarse[2 + 2*mu] * g.cshift(vcoarse[i], mu, -1)
+        # now coarse grid dirac operator
+        _X = 0
+        _Xp = [1,2,3,4]
+        _Xm = [5,6,7,8]
+        _ID = 0
+        _M = [1,3,5,7]
+        _Mdag = [2,4,6,8]
+        code = []
+        for i in range(n_rhs):
+            ioff = nbasis_blocks * i
+            for iblock in range(nbasis_blocks):
+                # todo block matrix multiply here
+                for jblock in range(nbasis_blocks):
+                    matrix_index = nbasis_blocks * jblock + iblock
+                    code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_X,-1 if jblock == 0 else iblock + ioff,1.0,[(matrix_index,_X,0)]))
+                    for mu in range(4):
+                        code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_Xp[mu], iblock + ioff, 1.0, [(nbasis_blocks**2 * _M[mu] + matrix_index, _X, 0)]))
+                        code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_Xm[mu], iblock + ioff, 1.0, [(nbasis_blocks**2 * _Mdag[mu] + matrix_index, _X, 0)]))
+        st = g.stencil.matrix_vector(
+            mcoarse[0],
+            vcoarse[0],
+            [(0, 0, 0, 0)] + evec + nevec,
+            code, len(code) // nbasis_blocks // n_rhs
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        vdst = [g.lattice(v) for v in vcoarse]
+        st(mcoarse, vdst + vcoarse)
+        for i in range(n_rhs):
+            eps2 = g.norm2(vdst[i] - vref[i]) / g.norm2(vref[i])
+            # g.message(f"Test {i}: {eps2}")
+            assert eps2 ** 0.5 < precision.eps * 100
 
-    # Warmup
-    for n in range(5):
-        st(mcoarse, [vdst,vcoarse])
+        # Flops
+        flops_per_matrix_vector_multiply = nbasis * (nbasis * 6 + (nbasis - 1) * 2)
+        flops_per_vector_add = nbasis * 2
+        flops_per_site = 9 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+        flops = flops_per_site * vcoarse[0].grid.gsites * N * n_rhs
+        nbytes = (9 * nbasis ** 2 * 2 + 2 * nbasis * 2) * precision.nbytes * vcoarse[0].grid.gsites * N * n_rhs
+
+        # Warmup
+        for n in range(5):
+            st(mcoarse, vdst + vcoarse)
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(mcoarse, [vdst,vcoarse])
-    t1 = g.time()
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(mcoarse, vdst + vcoarse)
+        t1 = g.time()
 
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
-        f"""
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
+            f"""
 {N} applications of {nbasis} x {nbasis}, 9-point operator
     Time to complete            : {t1-t0:.2f} s
     Total performance           : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc
index 3d1a60dd..0ce3d872 100644
--- a/lib/cgpt/lib/stencil.cc
+++ b/lib/cgpt/lib/stencil.cc
@@ -64,7 +64,8 @@
 EXPORT(stencil_matrix_execute,{
 
     void* _stencil;
     PyObject* _fields;
-    if (!PyArg_ParseTuple(args, "lO", &_stencil, &_fields)) {
+    long fast_osites;
+    if (!PyArg_ParseTuple(args, "lOl", &_stencil, &_fields, &fast_osites)) {
      return NULL;
     }
 
@@ -73,7 +74,7 @@
     std::vector<cgpt_Lattice_base*> __fields;
     cgpt_basis_fill(__fields,_fields);
 
-    stencil->execute(__fields);
+    stencil->execute(__fields, fast_osites);
 
     return PyLong_FromLong(0);
   });
@@ -84,7 +85,8 @@
 EXPORT(stencil_matrix_vector_execute,{
 
    void* _stencil;
     PyObject* _matrix_fields;
     PyObject* _vector_fields;
-    if (!PyArg_ParseTuple(args, "lOO", &_stencil, &_matrix_fields, &_vector_fields)) {
+    long fast_osites;
+    if (!PyArg_ParseTuple(args, "lOOl", &_stencil, &_matrix_fields, &_vector_fields, &fast_osites)) {
      return NULL;
     }
 
@@ -94,7 +96,7 @@
     cgpt_basis_fill(__matrix_fields,_matrix_fields);
     cgpt_basis_fill(__vector_fields,_vector_fields);
 
-    stencil->execute(__matrix_fields, __vector_fields);
+    stencil->execute(__matrix_fields, __vector_fields, fast_osites);
 
     return PyLong_FromLong(0);
   });
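
The rewritten coarse-operator benchmark above packs all right-hand sides into a single stencil call: st(mcoarse, vdst + vcoarse) passes the destination fields first and the source fields second, and the generated code offsets every field index by ioff = nbasis_blocks * i. A sketch of the resulting slot layout, with hypothetical sizes chosen for illustration (the benchmark defaults to n_rhs = 12):

    # Slot layout implied by the code generation above: destinations occupy
    # slots [0, n_rhs * nbasis_blocks), sources follow directly after.
    n_rhs, nbasis_blocks = 2, 3

    def dst_index(i, iblock):
        # target slot for right-hand side i, virtual block iblock
        return iblock + nbasis_blocks * i

    def src_index(i, jblock):
        # source slot for right-hand side i, virtual block jblock
        return nbasis_blocks * n_rhs + jblock + nbasis_blocks * i

    assert dst_index(1, 2) == 5   # last destination slot
    assert src_index(0, 0) == 6   # first source slot
    assert src_index(1, 2) == 11  # last source slot
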
diff --git a/lib/cgpt/lib/stencil/common.h b/lib/cgpt/lib/stencil/common.h
index 58dc8c53..6cd454c3 100644
--- a/lib/cgpt/lib/stencil/common.h
+++ b/lib/cgpt/lib/stencil/common.h
@@ -17,6 +17,16 @@
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 
+// indexing
+#define MAP_INDEXING(ss, oblock)   \
+  if (_fast_osites) {              \
+    oblock = ss_block / osites;    \
+    ss = ss_block % osites;        \
+  } else {                         \
+    ss = ss_block / _npb;          \
+    oblock = ss_block % _npb;      \
+  }
+
 // cartesian stencil fetch
 #define fetch_cs(sidx, obj, point, site, view, do_adj, tag) {   \
     if (point == -1) {                                          \
@@ -29,8 +39,8 @@
     } else {                                                    \
       obj = coalescedRead(buf ## tag[sidx][SE->_offset]);       \
     }                                                           \
-    acceleratorSynchronise();                                   \
   }                                                             \
+  acceleratorSynchronise();                                     \
   if (do_adj)                                                   \
     obj = adj(obj);                                             \
   }
diff --git a/lib/cgpt/lib/stencil/matrix.h b/lib/cgpt/lib/stencil/matrix.h
index 708f6a8a..2f4729c1 100644
--- a/lib/cgpt/lib/stencil/matrix.h
+++ b/lib/cgpt/lib/stencil/matrix.h
@@ -42,7 +42,7 @@ struct cgpt_stencil_matrix_code_t {
 class cgpt_stencil_matrix_base {
  public:
   virtual ~cgpt_stencil_matrix_base() {};
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields) = 0;
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) = 0;
 };
 
 template<typename T>
@@ -125,7 +125,7 @@ class cgpt_stencil_matrix : public cgpt_stencil_matrix_base {
     }
   }
 
-  virtual void execute(PVector<Lattice<T>> &fields) {
+  virtual void execute(PVector<Lattice<T>> &fields, int fast_osites) {
 
     VECTOR_VIEW_OPEN(fields,fields_v,AcceleratorWrite);
 
@@ -139,14 +139,19 @@
     int _npb = n_code_parallel_blocks;
     int _npbs = n_code_parallel_block_size;
 
+    uint64_t osites = fields[0].Grid()->oSites();
+
+    int _fast_osites = fast_osites;
+
     if (local) {
 
       auto sview = general_local_stencil->View();
 
-      accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
+      accelerator_for(ss_block,osites * _npb,T::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -179,8 +184,9 @@
       // now loop
       accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -214,10 +220,10 @@
     VECTOR_VIEW_CLOSE(fields_v);
   }
 
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields) {
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields, int fast_osites) {
     PVector<Lattice<T>> fields;
     cgpt_basis_fill(fields,__fields);
-    execute(fields);
+    execute(fields, fast_osites);
   }
 };
diff --git a/lib/cgpt/lib/stencil/matrix_vector.h b/lib/cgpt/lib/stencil/matrix_vector.h
index 70b174ed..d73f8277 100644
--- a/lib/cgpt/lib/stencil/matrix_vector.h
+++ b/lib/cgpt/lib/stencil/matrix_vector.h
@@ -45,7 +45,7 @@ struct cgpt_stencil_matrix_vector_code_t {
 class cgpt_stencil_matrix_vector_base {
  public:
   virtual ~cgpt_stencil_matrix_vector_base() {};
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& matrix_fields, const std::vector<cgpt_Lattice_base*>& vector_fields) = 0;
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& matrix_fields, const std::vector<cgpt_Lattice_base*>& vector_fields, int fast_osites) = 0;
 };
 
 template<typename M, typename V>
@@ -146,7 +146,7 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     }
   }
 
-  virtual void execute(PVector<Lattice<M>> &fields_matrix, PVector<Lattice<V>> &fields_vector) {
+  virtual void execute(PVector<Lattice<M>> &fields_matrix, PVector<Lattice<V>> &fields_vector, int fast_osites) {
 
     VECTOR_VIEW_OPEN(fields_matrix,fields_m_v,AcceleratorRead);
     VECTOR_VIEW_OPEN(fields_vector,fields_v_v,AcceleratorWrite);
 
@@ -162,13 +162,18 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     int _npb = n_code_parallel_blocks;
     int _npbs = n_code_parallel_block_size;
 
+    uint64_t osites = fields_matrix[0].Grid()->oSites();
+
+    int _fast_osites = fast_osites;
+
     if (local) {
 
       auto sview = general_local_stencil->View();
 
-      accelerator_for(ss_block,fields_matrix[0].Grid()->oSites() * _npb,M::Nsimd(),{
+      accelerator_for(ss_block,osites * _npb,M::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -196,11 +201,12 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(M, _matrix);
     CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(V, _vector);
-    
-    accelerator_for(ss_block,fields_matrix[0].Grid()->oSites() * _npb,M::Nsimd(),{
+
+    accelerator_for(ss_block,osites * _npb,M::Nsimd(),{
 
-        auto ss = ss_block / _npb;
-        auto oblock = ss_block % _npb;
+        uint64_t ss, oblock;
+
+        MAP_INDEXING(ss, oblock);
 
         for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -235,12 +241,12 @@
     VECTOR_VIEW_CLOSE(fields_v_v);
   }
 
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& __matrix_fields, const std::vector<cgpt_Lattice_base*>& __vector_fields) {
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& __matrix_fields, const std::vector<cgpt_Lattice_base*>& __vector_fields, int fast_osites) {
     PVector<Lattice<M>> matrix_fields;
     PVector<Lattice<V>> vector_fields;
     cgpt_basis_fill(matrix_fields,__matrix_fields);
     cgpt_basis_fill(vector_fields,__vector_fields);
-    execute(matrix_fields, vector_fields);
+    execute(matrix_fields, vector_fields, fast_osites);
   }
 };
diff --git a/lib/gpt/core/local_stencil/matrix.py b/lib/gpt/core/local_stencil/matrix.py
index 87fc3b5b..e95dbac4 100644
--- a/lib/gpt/core/local_stencil/matrix.py
+++ b/lib/gpt/core/local_stencil/matrix.py
@@ -36,12 +36,16 @@ def __init__(self, lat, points, code, code_parallel_block_size=None, local=1):
         self.obj = cgpt.stencil_matrix_create(
             lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local
         )
+        self.fast_osites = 1
 
     def __call__(self, *fields):
-        cgpt.stencil_matrix_execute(self.obj, list(fields))
+        cgpt.stencil_matrix_execute(self.obj, list(fields), self.fast_osites)
 
     def __del__(self):
         cgpt.stencil_matrix_delete(self.obj)
 
     def data_access_hints(self, *hints):
         pass
+
+    def memory_access_pattern(self, fast_osites):
+        self.fast_osites = fast_osites
diff --git a/lib/gpt/core/local_stencil/matrix_vector.py b/lib/gpt/core/local_stencil/matrix_vector.py
index b266a85a..9227052f 100644
--- a/lib/gpt/core/local_stencil/matrix_vector.py
+++ b/lib/gpt/core/local_stencil/matrix_vector.py
@@ -49,12 +49,16 @@ def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_siz
             code_parallel_block_size,
             local
         )
+        self.fast_osites = 1
 
     def __call__(self, matrix_fields, vector_fields):
-        cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields)
+        cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields, self.fast_osites)
 
     def __del__(self):
         cgpt.stencil_matrix_vector_delete(self.obj)
 
     def data_access_hints(self, *hints):
         pass
+
+    def memory_access_pattern(self, fast_osites):
+        self.fast_osites = fast_osites
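
The MAP_INDEXING macro introduced in common.h decides which index varies fastest as the flat kernel index ss_block increases: with _fast_osites set, neighbouring iterations touch neighbouring sites within the same parallel code block, which should be the friendlier pattern for coalesced GPU accesses; otherwise they cycle through the code blocks of a single site. A pure-Python model of the mapping (illustration only):

    # Model of MAP_INDEXING: ss is the outer-site index in [0, osites),
    # oblock the parallel code-block index in [0, npb).
    def map_indexing(ss_block, osites, npb, fast_osites):
        if fast_osites:
            oblock, ss = divmod(ss_block, osites)  # site index runs fastest
        else:
            ss, oblock = divmod(ss_block, npb)  # block index runs fastest
        return ss, oblock

    assert map_indexing(1, osites=64, npb=4, fast_osites=1) == (1, 0)
    assert map_indexing(1, osites=64, npb=4, fast_osites=0) == (0, 1)

From Python the ordering is selected per stencil object through the memory_access_pattern method added above; both wrappers default to fast_osites = 1.
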
diff --git a/lib/gpt/core/object_type/container.py b/lib/gpt/core/object_type/container.py
index 8a7555df..61f9640d 100644
--- a/lib/gpt/core/object_type/container.py
+++ b/lib/gpt/core/object_type/container.py
@@ -30,8 +30,11 @@
 
 ###
 # Helper
+max_fundamental_singlet_size = gpt.default.get_int("--max_fundamental_singlet_size", 100000)
 def decompose(n, ns, rank):
     for x in reversed(sorted(ns)):
+        if x > max_fundamental_singlet_size:
+            continue
         if n % x == 0:
             return [x] * ((n // x) ** rank)
     raise Exception("Cannot decompose %d in available fundamentals %s" % (n, ns))
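
The new guard lets decompose skip fundamental sizes that divide n but exceed the --max_fundamental_singlet_size command-line limit. A standalone behavior sketch with hypothetical inputs:

    # Guarded decompose as above; max_size mirrors max_fundamental_singlet_size.
    def decompose(n, ns, rank, max_size=100000):
        for x in reversed(sorted(ns)):
            if x > max_size:
                continue  # skip fundamentals above the limit even if they divide n
            if n % x == 0:
                return [x] * ((n // x) ** rank)
        raise Exception("Cannot decompose %d in available fundamentals %s" % (n, ns))

    assert decompose(8, [2, 4], 1) == [4, 4]  # largest admissible divisor wins
    assert decompose(8, [2, 4], 1, max_size=3) == [2, 2, 2, 2]  # 4 is skipped
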