# Patch overview: four file sections follow.
#   1. benchmarks/stencil.py          - new file, local stencil benchmark
#   2. lib/gpt/core/copy_plan.py      - one log message extended
#   3. lib/gpt/core/stencil/matrix.py - optional per-phase timing in __call__
#   4. lib/gpt/qcd/fermion/register.py - long lambda assignments re-wrapped only
#
# NOTE(review): this patch was re-split into lines from a whitespace-collapsed
# copy. Line breaks agree with every hunk header count, but the indentation,
# the blank context lines and the spacing inside the multi-line f-strings are
# inferred - confirm them against the target files before applying.
# NOTE(review): these '#' lines sit outside every hunk; confirm that your patch
# tool skips text between file sections.
#
# Section 1: new file with mode 100755. For g.single and g.double it builds a
# random gauge field, assembles a g.stencil.matrix from the six (mu, nu < mu)
# four-factor products, asserts that the result agrees with
# g.qcd.gauge.plaquette(U), runs 5 warmup applications, then times N
# applications (--N, default 1000) and reports GFlops/s and GB/s.
diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
new file mode 100755
index 00000000..c5e581df
--- /dev/null
+++ b/benchmarks/stencil.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+#
+# Authors: Christoph Lehner 2023
+#
+import gpt as g
+
+g.default.set_verbose("random", False)
+rng = g.random("benchmark", "vectorized_ranlux24_24_64")
+
+for precision in [g.single, g.double]:
+    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+    N = g.default.get_int("--N", 1000)
+
+    g.message(
+        f"""
+    Local Stencil Benchmark with
+    fdimensions : {grid.fdimensions}
+    precision : {precision.__name__}
+"""
+    )
+
+    U = g.qcd.gauge.random(grid, rng, scale=0.5)
+    _U = [1, 2, 3, 4]
+    _X = 0
+    _Xp = [1, 2, 3, 4]
+    V = g.mcolor(grid)
+    rng.element(V)
+    # U = g.qcd.gauge.transformed(U, V)
+    code = []
+    for mu in range(4):
+        for nu in range(0, mu):
+            code.append(
+                {
+                    "target": 0,
+                    "accumulate": -1 if (mu == 1 and nu == 0) else 0,
+                    "weight": 1.0,
+                    "factor": [
+                        (_U[mu], _X, 0),
+                        (_U[nu], _Xp[mu], 0),
+                        (_U[mu], _Xp[nu], 1),
+                        (_U[nu], _X, 1),
+                    ],
+                }
+            )
+    st = g.stencil.matrix(
+        U[0], [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)], [0], [1,2,3,4], code
+    )
+    # test plaquette
+    P = g.lattice(U[0])
+    st(P, *U)
+    pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
+    assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
+
+    # Flops
+    gauge_otype = U[0].otype
+    Nc = gauge_otype.shape[0]
+    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
+    flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
+    flops = flops_per_site * P.grid.gsites * N
+    nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+
+    # Warmup
+    for n in range(5):
+        st(P, *U)
+
+    # Time
+    t0 = g.time()
+    for n in range(N):
+        st(P, *U)
+    t1 = g.time()
+
+    # Report
+    GFlopsPerSec = flops / (t1 - t0) / 1e9
+    GBPerSec = nbytes / (t1 - t0) / 1e9
+    g.message(
+        f"""{N} applications of plaquette stencil
+    Time to complete : {t1-t0:.2f} s
+    Total performance : {GFlopsPerSec:.2f} GFlops/s
+    Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
+    )
# Section 2: the "copy_plan: create" message additionally prints
# self.communication_buffer_location.__name__; nothing else in the hunk changes.
diff --git a/lib/gpt/core/copy_plan.py b/lib/gpt/core/copy_plan.py
index 9a9d8140..5a4adc1b 100644
--- a/lib/gpt/core/copy_plan.py
+++ b/lib/gpt/core/copy_plan.py
@@ -147,7 +147,7 @@ def __call__(self, local_only=False, skip_optimize=False, use_communication_buff
             gpt.message(t_cgpt)
 
             gpt.message(
-                f"copy_plan: create: {t1-t0} s (local_only = {local_only}, skip_optimize = {skip_optimize}, use_communication_buffers = {use_communication_buffers})"
+                f"copy_plan: create: {t1-t0} s (local_only = {local_only}, skip_optimize = {skip_optimize}, use_communication_buffers = {use_communication_buffers}, communication_buffer_location = {self.communication_buffer_location.__name__})"
             )
 
         return copy_plan_executer(
# Section 3: __init__ stores g.default.is_verbose("stencil_performance") as
# self.verbose_performance. When it is set, __call__ creates a
# g.timer("stencil.matrix"), marks the phases "create fields", "local stencil"
# and "extract", and finally passes the timer to g.message. The timer `t` is
# created and used only under that same flag.
diff --git a/lib/gpt/core/stencil/matrix.py b/lib/gpt/core/stencil/matrix.py
index 96dbaec9..60f1aa7f 100644
--- a/lib/gpt/core/stencil/matrix.py
+++ b/lib/gpt/core/stencil/matrix.py
@@ -40,8 +40,12 @@ def __init__(self, lat, points, write_fields, read_fields, code, code_parallel_b
         )
         self.write_fields = write_fields
         self.read_fields = read_fields
+        self.verbose_performance = g.default.is_verbose("stencil_performance")
 
     def __call__(self, *fields):
+        if self.verbose_performance:
+            t = g.timer("stencil.matrix")
+            t("create fields")
         padded_fields = []
         padded_field = None
         for i in range(len(fields)):
@@ -54,6 +58,13 @@ def __call__(self, *fields):
         for i in range(len(fields)):
             if padded_fields[i] is None:
                 padded_fields[i] = g.lattice(padded_field)
+        if self.verbose_performance:
+            t("local stencil")
         self.local_stencil(*padded_fields)
+        if self.verbose_performance:
+            t("extract")
         for i in self.write_fields:
             self.padding.extract(fields[i], padded_fields[i])
+        if self.verbose_performance:
+            t()
+            g.message(t)
# Section 4: formatting only. Each changed assignment keeps the same lambda
# parameters, the same op.apply_* call and the same operator code (2012-2015,
# 5001, 7004-7006); the call arguments are merely wrapped onto their own line.
diff --git a/lib/gpt/qcd/fermion/register.py b/lib/gpt/qcd/fermion/register.py
index 93d42a0a..8f065af2 100644
--- a/lib/gpt/qcd/fermion/register.py
+++ b/lib/gpt/qcd/fermion/register.py
@@ -11,15 +11,25 @@ def register(reg, op):
     reg.Mdiag = lambda dst, src: op.apply_unary_operator(2009, dst, src)
     reg.Dminus = lambda dst, src: op.apply_unary_operator(2010, dst, src)
     reg.DminusDag = lambda dst, src: op.apply_unary_operator(2011, dst, src)
-    reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2012, dst, src)
-    reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(2013, dst, src)
-    reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(2014, dst, src)
-    reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2015, dst, src)
+    reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
+        2012, dst, src
+    )
+    reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(
+        2013, dst, src
+    )
+    reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(
+        2014, dst, src
+    )
+    reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
+        2015, dst, src
+    )
     reg.Dhop = lambda dst, src: op.apply_unary_operator(3001, dst, src)
     reg.DhopDag = lambda dst, src: op.apply_unary_operator(4001, dst, src)
     reg.DhopEO = lambda dst, src: op.apply_unary_operator(3002, dst, src)
     reg.DhopEODag = lambda dst, src: op.apply_unary_operator(4002, dst, src)
-    reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(5001, dst, src, dir, disp)
+    reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(
+        5001, dst, src, dir, disp
+    )
     reg.MDeriv = lambda mat, dst, src: op.apply_deriv_operator(6001, mat, dst, src)
     reg.MDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7001, mat, dst, src)
     reg.MoeDeriv = lambda mat, dst, src: op.apply_deriv_operator(6002, mat, dst, src)
@@ -27,8 +37,14 @@ def register(reg, op):
     reg.MeoDeriv = lambda mat, dst, src: op.apply_deriv_operator(6003, mat, dst, src)
     reg.MeoDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7003, mat, dst, src)
     reg.DhopDeriv = lambda mat, dst, src: op.apply_deriv_operator(6004, mat, dst, src)
-    reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7004, mat, dst, src)
+    reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(
+        7004, mat, dst, src
+    )
     reg.DhopDerivEO = lambda mat, dst, src: op.apply_deriv_operator(6005, mat, dst, src)
-    reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(7005, mat, dst, src)
+    reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(
+        7005, mat, dst, src
+    )
     reg.DhopDerivOE = lambda mat, dst, src: op.apply_deriv_operator(6006, mat, dst, src)
-    reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(7006, mat, dst, src)
+    reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(
+        7006, mat, dst, src
+    )