diff --git a/benchmarks/communication.py b/benchmarks/communication.py
index 0c066214..37424b13 100755
--- a/benchmarks/communication.py
+++ b/benchmarks/communication.py
@@ -62,9 +62,11 @@
 for l in range(nwarm):
     plan(lat, lat)

+g.barrier()
 t0 = g.time()
 for l in range(n):
     plan(lat, lat)

+g.barrier()
 t1 = g.time()
 msg = (
diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
index 452d3395..03b33b36 100755
--- a/benchmarks/stencil.py
+++ b/benchmarks/stencil.py
@@ -11,139 +11,135 @@
 nevec = [tuple([-x for x in y]) for y in evec]

 for precision in [g.single, g.double]:
-    for fast_osites in [0, 1]:
-        grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
-        N = g.default.get_int("--N", 1000)
+    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+    N = g.default.get_int("--N", 1000)

-        g.message(
-            f"""
+    g.message(
+        f"""
 Local Stencil Benchmark with
     fdimensions : {grid.fdimensions}
     precision   : {precision.__name__}
-    fast_osites : {fast_osites}
 """
     )

-        # plaquette
-        U = g.qcd.gauge.random(grid, rng, scale=0.5)
-        _U = [1, 2, 3, 4]
-        _X = 0
-        _Xp = [1, 2, 3, 4]
-        V = g.mcolor(grid)
-        rng.element(V)
-        # U = g.qcd.gauge.transformed(U, V)
-        code = []
-        for mu in range(4):
-            for nu in range(0, mu):
-                code.append(
-                    {
-                        "target": 0,
-                        "accumulate": -1 if (mu == 1 and nu == 0) else 0,
-                        "weight": 1.0,
-                        "factor": [
-                            (_U[mu], _X, 0),
-                            (_U[nu], _Xp[mu], 0),
-                            (_U[mu], _Xp[nu], 1),
-                            (_U[nu], _X, 1),
-                        ],
-                    }
-                )
-        st = g.stencil.matrix(
-            U[0],
-            [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
-            code,
-        )
-        st.memory_access_pattern(fast_osites=fast_osites)
-        # test plaquette
-        P = g.lattice(U[0])
+    # plaquette
+    U = g.qcd.gauge.random(grid, rng, scale=0.5)
+    _U = [1, 2, 3, 4]
+    _X = 0
+    _Xp = [1, 2, 3, 4]
+    V = g.mcolor(grid)
+    rng.element(V)
+    # U = g.qcd.gauge.transformed(U, V)
+    code = []
+    for mu in range(4):
+        for nu in range(0, mu):
+            code.append(
+                {
+                    "target": 0,
+                    "accumulate": -1 if (mu == 1 and nu == 0) else 0,
+                    "weight": 1.0,
+                    "factor": [
+                        (_U[mu], _X, 0),
+                        (_U[nu], _Xp[mu], 0),
+                        (_U[mu], _Xp[nu], 1),
+                        (_U[nu], _X, 1),
+                    ],
+                }
+            )
+    st = g.stencil.matrix(
+        U[0],
+        [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
+        code,
+    )
+    # test plaquette
+    P = g.lattice(U[0])
+    st(P, *U)
+    pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
+    assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
+
+    # Flops
+    gauge_otype = U[0].otype
+    Nc = gauge_otype.shape[0]
+    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
+    flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
+    flops = flops_per_site * P.grid.gsites * N
+    nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+
+    # Warmup
+    for n in range(5):
         st(P, *U)
-        pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
-        assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
-
-        # Flops
-        gauge_otype = U[0].otype
-        Nc = gauge_otype.shape[0]
-        flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
-        flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
-        flops = flops_per_site * P.grid.gsites * N
-        nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
-
-        # Warmup
-        for n in range(5):
-            st(P, *U)
-
-        # Time
-        t0 = g.time()
-        for n in range(N):
-            st(P, *U)
-        t1 = g.time()
-
-        # Report
-        GFlopsPerSec = flops / (t1 - t0) / 1e9
-        GBPerSec = nbytes / (t1 - t0) / 1e9
-        g.message(
-            f"""
+
+    # Time
+    t0 = g.time()
+    for n in range(N):
+        st(P, *U)
+    t1 = g.time()
+
+    # Report
+    GFlopsPerSec = flops / (t1 - t0) / 1e9
+    GBPerSec = nbytes / (t1 - t0) / 1e9
+    g.message(
+        f"""
 {N} applications of plaquette stencil
     Time to complete : {t1-t0:.2f} s
     Total performance : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
-        )
-
-        # covariant laplacian stencil
-        src = g.vspincolor(grid)
-        rng.cnormal(src)
-        UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
-        _U = [0, 1, 2, 3]
-        _UdagShift = [4, 5, 6, 7]
-        _X = 0
-        _Xp = [1, 2, 3, 4]
-        _Xm = [5, 6, 7, 8]
-        code = [(0, 1, _X, -1, -8.0, [])]
-        for mu in range(4):
-            code.append((0, 1, _Xp[mu], 0, 1.0, [(_U[mu], _X, 0)]))
-            code.append((0, 1, _Xm[mu], 0, 1.0, [(_UdagShift[mu], _X, 0)]))
-            # can switch last line to next one
-            # code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
-        st = g.stencil.matrix_vector(U[0], src, [(0, 0, 0, 0)] + evec + nevec, code)
-        st.memory_access_pattern(fast_osites=fast_osites)
-        # test laplace
-        dst = g.lattice(src)
+    )
+
+    # covariant laplacian stencil
+    src = g.vspincolor(grid)
+    rng.cnormal(src)
+    UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
+    _U = [0, 1, 2, 3]
+    _UdagShift = [4, 5, 6, 7]
+    _X = 0
+    _Xp = [1, 2, 3, 4]
+    _Xm = [5, 6, 7, 8]
+    code = [(0, 1, _X, -1, -8.0, [])]
+    for mu in range(4):
+        code.append((0, 1, _Xp[mu], 0, 1.0, [(_U[mu], _X, 0)]))
+        code.append((0, 1, _Xm[mu], 0, 1.0, [(_UdagShift[mu], _X, 0)]))
+        # can switch last line to next one
+        # code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
+    st = g.stencil.matrix_vector(U[0], src, [(0, 0, 0, 0)] + evec + nevec, code)
+    # test laplace
+    dst = g.lattice(src)
+    st(U + UdagShift, [dst, src])
+
+    lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1] * 4), [0, 1, 2, 3])
+    dst2 = lap(src)
+    eps2 = g.norm2(dst - dst2) / g.norm2(dst)
+    assert eps2**0.5 < precision.eps * 100
+
+    # Flops
+    gauge_otype = U[0].otype
+    Nc = gauge_otype.shape[0]
+    Ns = 4
+    flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
+    flops_per_vector_add = Ns * Nc * 2
+    flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+    flops = flops_per_site * src.grid.gsites * N
+    nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
+
+    # Warmup
+    for n in range(5):
         st(U + UdagShift, [dst, src])
-        lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1] * 4), [0, 1, 2, 3])
-        dst2 = lap(src)
-        eps2 = g.norm2(dst - dst2) / g.norm2(dst)
-        assert eps2**0.5 < precision.eps * 100
-
-        # Flops
-        gauge_otype = U[0].otype
-        Nc = gauge_otype.shape[0]
-        Ns = 4
-        flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
-        flops_per_vector_add = Ns * Nc * 2
-        flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
-        flops = flops_per_site * src.grid.gsites * N
-        nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
-
-        # Warmup
-        for n in range(5):
-            st(U + UdagShift, [dst, src])
-
-        # Time
-        t0 = g.time()
-        for n in range(N):
-            st(U + UdagShift, [dst, src])
-        t1 = g.time()
+    # Time
+    t0 = g.time()
+    for n in range(N):
+        st(U + UdagShift, [dst, src])
+    t1 = g.time()

-        # Report
-        GFlopsPerSec = flops / (t1 - t0) / 1e9
-        GBPerSec = nbytes / (t1 - t0) / 1e9
-        g.message(
-            f"""
+    # Report
+    GFlopsPerSec = flops / (t1 - t0) / 1e9
+    GBPerSec = nbytes / (t1 - t0) / 1e9
+    g.message(
+        f"""
 {N} applications of laplace stencil
     Time to complete : {t1-t0:.2f} s
     Total performance : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
-        )
+    )
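For reference, the per-site cost model reported by the stencil benchmark above can be evaluated directly. The following is a standalone sketch in plain Python (not part of the patch), assuming Nc = 3 as for SU(3) and that precision.nbytes is the size of one real number (8 for g.double):

    # evaluate the flop/byte formulas from benchmarks/stencil.py for Nc = 3, double precision
    Nc, Ns, bytes_per_real = 3, 4, 8

    # plaquette stencil
    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2          # 198
    flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3                # 7128
    bytes_per_site = (5 * Nc * Nc * 2) * bytes_per_real                   # 720

    # covariant laplacian stencil
    flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)  # 264
    flops_per_vector_add = Ns * Nc * 2                                    # 24
    flops_per_site_laplace = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add  # 2304
    bytes_per_site_laplace = (8 * Nc * Nc * 2 + Nc * Ns * 2) * bytes_per_real                 # 1344

    print(flops_per_site, bytes_per_site)                  # 7128 720
    print(flops_per_site_laplace, bytes_per_site_laplace)  # 2304 1344

The reported rates then follow exactly as in the # Report blocks above: GFlops/s = flops_per_site * gsites * N / (t1 - t0) / 1e9, and analogously for GB/s from the byte count.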
diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py
index 9a59565b..cd49ad5b 100755
--- a/benchmarks/stencil_tensor.py
+++ b/benchmarks/stencil_tensor.py
@@ -27,27 +27,18 @@
 g.message("m3 = m1 * m2")
 g.message(g.norm2(m3 - m3ref))

-for osites_per_instruction in [4, 16, 32, 128, 256]:  # [1,8,16,32,64]:
-    for osites_per_cache_block in [
-        4096,
-        2**15,
-        grid.gsites,
-    ]:  # [2**11, 2**13, 2**15, grid.gsites]:
-        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
-
-        g.message(osites_per_instruction, osites_per_cache_block)
-        t = g.timer("d")
-        t("expr")
-        for i in range(300):
-            g.eval(m3, m1 * m2)
-        t("stencil")
-        for i in range(300):
-            ein(m3, m1, m2)
-        t()
-        g.message(t)
-        eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
-        assert eps2 < 1e-25
-        g.message(eps2)
+t = g.timer("d")
+t("expr")
+for i in range(300):
+    g.eval(m3, m1 * m2)
+t("stencil")
+for i in range(300):
+    ein(m3, m1, m2)
+t()
+g.message(t)
+eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
+assert eps2 < 1e-25
+g.message(eps2)

 # next
@@ -100,23 +91,18 @@
 g.message("m4 = m1 * m2 * m3")
 g.message(g.norm2(m4 - m4ref))

-for osites_per_instruction in [16, 32, 64, 256]:
-    for osites_per_cache_block in [16 * 16 * 16, 16 * 16 * 16 * 32, grid.gsites]:
-        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
-
-        g.message(osites_per_instruction, osites_per_cache_block)
-        t = g.timer("d")
-        t("expr")
-        for i in range(300):
-            g.eval(m4, m1 * m2 * m2)
-        t("stencil")
-        for i in range(300):
-            ein(m4, tmp, m1, m2, m3)
-        t()
-        g.message(t)
-        eps2 = g.norm2(m4 - m4ref) / g.norm2(m4)
-        assert eps2 < 1e-25
-        g.message(eps2)
+t = g.timer("d")
+t("expr")
+for i in range(300):
+    g.eval(m4, m1 * m2 * m2)
+t("stencil")
+for i in range(300):
+    ein(m4, tmp, m1, m2, m3)
+t()
+g.message(t)
+eps2 = g.norm2(m4 - m4ref) / g.norm2(m4)
+assert eps2 < 1e-25
+g.message(eps2)

 g.message("Diquark")
@@ -159,18 +145,13 @@
 g.message(g.norm2(R - R2) / g.norm2(R))
 #
 # D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
-for osites_per_instruction in [1, 2, 4, 16, 32, 64, 256]:
-    for osites_per_cache_block in [grid.gsites]:
-        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
-
-        g.message(osites_per_instruction, osites_per_cache_block)
-        t = g.timer("d")
-        t("diquark")
-        for i in range(30):
-            g.qcd.baryon.diquark(Q1, Q2)
-        t("stencil")
-        for i in range(30):
-            ein(R, Q1, Q2)
-        t()
-        g.message(t)
-        g.message(g.norm2(R - R2) / g.norm2(R))
+t = g.timer("d")
+t("diquark")
+for i in range(30):
+    g.qcd.baryon.diquark(Q1, Q2)
+t("stencil")
+for i in range(30):
+    ein(R, Q1, Q2)
+t()
+g.message(t)
+g.message(g.norm2(R - R2) / g.norm2(R))
diff --git a/lib/cgpt/lib/expression/mul_implementation.h b/lib/cgpt/lib/expression/mul_implementation.h
index f4c2d69d..dbcd3a66 100644
--- a/lib/cgpt/lib/expression/mul_implementation.h
+++ b/lib/cgpt/lib/expression/mul_implementation.h
@@ -208,7 +208,7 @@ void cgpt_unary(const Lattice * & pl, const Lattice & l, int unary) {
     pl = &l;
     break;
   case BIT_TRANS|BIT_CONJ:
-    pl = new Lattice( adj(l) );
+    pl = new Lattice( transpose(conjugate(l)) ); // temporary fix
     break;
   case BIT_TRANS:
     pl = new Lattice( transpose(l) );
diff --git a/lib/cgpt/lib/instantiate/expression_mul_double_iVSpin4Color8.cc b/lib/cgpt/lib/instantiate/expression_mul_double_iVSpin4Color8.cc
index d285de20..df3fb441 100755
--- a/lib/cgpt/lib/instantiate/expression_mul_double_iVSpin4Color8.cc
+++ b/lib/cgpt/lib/instantiate/expression_mul_double_iVSpin4Color8.cc
@@ -47,7 +47,7 @@
 template<> cgpt_Lattice_base* cgpt_lattice_mul(cgpt_Lattice_base* dst, bool ac, int unary_a, Lattice< iVSpin4Color8 >& la,int unary_b, cgpt_Lattice_base* b, int unary_expr, ComplexD coef) {
   typedef vComplexD vtype;
   _COMPATIBLE_(iSinglet);
-  _OUTER_PRODUCT_(iVSpin4Color8);
+  //_OUTER_PRODUCT_(iVSpin4Color8);
   _INNER_PRODUCT_(iVSpin4Color8);
   ERR("Not implemented");
 }
diff --git a/lib/cgpt/lib/instantiate/expression_mul_single_iVSpin4Color8.cc b/lib/cgpt/lib/instantiate/expression_mul_single_iVSpin4Color8.cc
index 8dbcf1b0..6444667a 100755
--- a/lib/cgpt/lib/instantiate/expression_mul_single_iVSpin4Color8.cc
+++ b/lib/cgpt/lib/instantiate/expression_mul_single_iVSpin4Color8.cc
@@ -47,7 +47,7 @@
 template<> cgpt_Lattice_base* cgpt_lattice_mul(cgpt_Lattice_base* dst, bool ac, int unary_a, Lattice< iVSpin4Color8 >& la,int unary_b, cgpt_Lattice_base* b, int unary_expr, ComplexD coef) {
   typedef vComplexF vtype;
   _COMPATIBLE_(iSinglet);
-  _OUTER_PRODUCT_(iVSpin4Color8);
+  //_OUTER_PRODUCT_(iVSpin4Color8);
   _INNER_PRODUCT_(iVSpin4Color8);
   ERR("Not implemented");
 }
diff --git a/lib/gpt/core/log.py b/lib/gpt/core/log.py
index 29f6761e..397c7e69 100644
--- a/lib/gpt/core/log.py
+++ b/lib/gpt/core/log.py
@@ -16,13 +16,23 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 #
-import gpt, sys
+import gpt, sys, os
+from inspect import getframeinfo, stack

+verbose = gpt.default.is_verbose("message_context")

 def message(*a, force_output=False):
     # conversion to string can be an mpi process (i.e. for lattice),
     # so need to do it on all ranks
     s = " ".join([str(x) for x in a])
+    if verbose:
+        slines = []
+        for st in stack()[1:]:
+            caller = getframeinfo(st[0])
+            slines.append(f"{caller.filename}:{caller.lineno}")
+        cpath = os.path.commonpath(slines)
+        cs = ";".join([ x[len(cpath)+1:] for x in slines ])
+        s = f"[{cpath}|{cs}]\n{s}"
     if gpt.rank() == 0 or force_output:
         lines = s.split("\n")
         if len(lines) > 0:
diff --git a/tests/manual/mpi.py b/tests/manual/mpi.py
index fd4bd5d4..fcefa734 100755
--- a/tests/manual/mpi.py
+++ b/tests/manual/mpi.py
@@ -89,10 +89,11 @@ def D_DWF(dst, src, U, b, c, mass, M5):
 pc = g.qcd.fermion.preconditioner
 inv = g.algorithms.inverter
 g.default.set_verbose("cg_convergence")
+g.default.set_verbose("mixed_precision_performance")
 cg_e_inner = inv.cg({"eps": 1e-4, "eps_abs": exact_prec * 0.3, "maxiter": 40000, "miniter": 50})
 cg_e = inv.defect_correcting(
-    #inv.mixed_precision(inv.preconditioned(pc.eo2_ne(), cg_e_inner), g.single, g.double),
-    inv.preconditioned(pc.eo2_ne(), cg_e_inner),
+    inv.mixed_precision(inv.preconditioned(pc.eo2_ne(), cg_e_inner), g.single, g.double),
+    # inv.preconditioned(pc.eo2_ne(), cg_e_inner),
     eps=exact_prec,
     maxiter=100,
 )
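A minimal usage sketch for the message_context option added to lib/gpt/core/log.py above. The patch reads the flag once at import time via gpt.default.is_verbose("message_context"), so the sketch assumes it is enabled from the command line with gpt's --verbose option; only g.default.set_verbose and g.default.is_verbose appear in the patch itself, so the command-line form is an assumption:

    # hypothetical run:  python my_script.py --verbose message_context
    import gpt as g

    g.message("starting benchmark")
    # With message_context enabled, the output gains the call-site prefix built in
    # log.py above, e.g. "[<common path>|my_script.py:4]" on one line followed by
    # "starting benchmark" on the next; without the flag the behavior is unchanged.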