Skip to content

Commit

Permalink
coarse grid operator benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Oct 8, 2023
1 parent 0aa1bd7 commit 2ea2508
Showing 1 changed file with 81 additions and 4 deletions.
85 changes: 81 additions & 4 deletions benchmarks/stencil.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,96 @@
g.default.set_verbose("random", False)
rng = g.random("benchmark", "vectorized_ranlux24_24_64")

evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
nevec = [tuple([-x for x in y]) for y in evec]

for precision in [g.single, g.double]:
grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
N = g.default.get_int("--N", 1000)

g.message(
f"""
Local Stencil Benchmark with
fdimensions : {grid.fdimensions}
precision : {precision.__name__}
"""
)

# coarse grid stencil
nbasis = g.default.get_int("--nbasis", 12)
vcoarse = g.vcomplex(grid, nbasis)
g.message(f"Virtual blocks: {len(vcoarse.v_obj)}")
nbasis_blocks = len(vcoarse.v_obj)
mcoarse = [g.mcomplex(grid, nbasis) for _ in range(9)]
rng.cnormal(mcoarse)
rng.cnormal(vcoarse)

# reference solve
vref = g.lattice(vcoarse)
vref = g(mcoarse[0] * vcoarse)
for mu in range(4):
vref += mcoarse[1 + 2*mu] * g.cshift(vcoarse, mu, 1)
vref += mcoarse[2 + 2*mu] * g.cshift(vcoarse, mu, -1)
# now coarse grid dirac operator
_X = 0
_Xp = [1,2,3,4]
_Xm = [5,6,7,8]
_ID = 0
_M = [1,3,5,7]
_Mdag = [2,4,6,8]
code = []
for iblock in range(nbasis_blocks):
# todo block matrix multiply here
for jblock in range(nbasis_blocks):
matrix_index = nbasis_blocks * jblock + iblock
code.append((iblock,nbasis_blocks + jblock,_X,-1 if jblock == 0 else iblock,1.0,[(matrix_index,_X,0)]))
for mu in range(4):
code.append((iblock,nbasis_blocks + jblock,_Xp[mu], iblock, 1.0, [(nbasis_blocks**2 * _M[mu] + matrix_index, _X, 0)]))
code.append((iblock,nbasis_blocks + jblock,_Xm[mu], iblock, 1.0, [(nbasis_blocks**2 * _Mdag[mu] + matrix_index, _X, 0)]))
st = g.stencil.matrix_vector(
mcoarse[0],
vcoarse,
[(0, 0, 0, 0)] + evec + nevec,
code, len(code) // nbasis_blocks
)
vdst = g.lattice(vcoarse)
st(mcoarse, [vdst,vcoarse])
eps2 = g.norm2(vdst - vref) / g.norm2(vref)
# g.message(f"Test: {eps2}")
assert eps2 ** 0.5 < precision.eps * 100

# Flops
flops_per_matrix_vector_multiply = nbasis * (nbasis * 6 + (nbasis - 1) * 2)
flops_per_vector_add = nbasis * 2
flops_per_site = 9 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
flops = flops_per_site * vcoarse.grid.gsites * N
nbytes = (9 * nbasis ** 2 * 2 + 2 * nbasis * 2) * precision.nbytes * vcoarse.grid.gsites * N

# Warmup
for n in range(5):
st(mcoarse, [vdst,vcoarse])

# Time
t0 = g.time()
for n in range(N):
st(mcoarse, [vdst,vcoarse])
t1 = g.time()

# Report
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""
{N} applications of {nbasis} x {nbasis}, 9-point operator
Time to complete : {t1-t0:.2f} s
Total performance : {GFlopsPerSec:.2f} GFlops/s
Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
)


# plaquette
U = g.qcd.gauge.random(grid, rng, scale=0.5)
_U = [1, 2, 3, 4]
_X = 0
Expand Down Expand Up @@ -75,7 +153,8 @@
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""{N} applications of plaquette stencil
f"""
{N} applications of plaquette stencil
Time to complete : {t1-t0:.2f} s
Total performance : {GFlopsPerSec:.2f} GFlops/s
Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
Expand All @@ -84,8 +163,6 @@
# covariant laplacian stencil
src = g.vspincolor(grid)
rng.cnormal(src)
evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
nevec = [tuple([-x for x in y]) for y in evec]
UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
_U = [0,1,2,3]
_UdagShift = [4,5,6,7]
Expand Down Expand Up @@ -117,7 +194,7 @@
gauge_otype = U[0].otype
Nc = gauge_otype.shape[0]
Ns = 4
flops_per_matrix_vector_multiply = Ns * (Nc**2 * 6 + (Nc - 1) * Nc * 2)
flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
flops_per_vector_add = Ns * Nc * 2
flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
flops = flops_per_site * src.grid.gsites * N
Expand Down

0 comments on commit 2ea2508

Please sign in to comment.