Skip to content

Commit

Permalink
better logging and rocm fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Sep 20, 2024
1 parent 75d7332 commit b9e0f86
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 174 deletions.
2 changes: 2 additions & 0 deletions benchmarks/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,11 @@
for l in range(nwarm):
plan(lat, lat)

g.barrier()
t0 = g.time()
for l in range(n):
plan(lat, lat)
g.barrier()
t1 = g.time()

msg = (
Expand Down
226 changes: 111 additions & 115 deletions benchmarks/stencil.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,139 +11,135 @@
nevec = [tuple([-x for x in y]) for y in evec]

for precision in [g.single, g.double]:
for fast_osites in [0, 1]:
grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
N = g.default.get_int("--N", 1000)
grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
N = g.default.get_int("--N", 1000)

g.message(
f"""
g.message(
f"""
Local Stencil Benchmark with
fdimensions : {grid.fdimensions}
precision : {precision.__name__}
fast_osites : {fast_osites}
"""
)

# plaquette
U = g.qcd.gauge.random(grid, rng, scale=0.5)
_U = [1, 2, 3, 4]
_X = 0
_Xp = [1, 2, 3, 4]
V = g.mcolor(grid)
rng.element(V)
# U = g.qcd.gauge.transformed(U, V)
code = []
for mu in range(4):
for nu in range(0, mu):
code.append(
{
"target": 0,
"accumulate": -1 if (mu == 1 and nu == 0) else 0,
"weight": 1.0,
"factor": [
(_U[mu], _X, 0),
(_U[nu], _Xp[mu], 0),
(_U[mu], _Xp[nu], 1),
(_U[nu], _X, 1),
],
}
)
st = g.stencil.matrix(
U[0],
[(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
code,
)
st.memory_access_pattern(fast_osites=fast_osites)
# test plaquette
P = g.lattice(U[0])
# plaquette
U = g.qcd.gauge.random(grid, rng, scale=0.5)
_U = [1, 2, 3, 4]
_X = 0
_Xp = [1, 2, 3, 4]
V = g.mcolor(grid)
rng.element(V)
# U = g.qcd.gauge.transformed(U, V)
code = []
for mu in range(4):
for nu in range(0, mu):
code.append(
{
"target": 0,
"accumulate": -1 if (mu == 1 and nu == 0) else 0,
"weight": 1.0,
"factor": [
(_U[mu], _X, 0),
(_U[nu], _Xp[mu], 0),
(_U[mu], _Xp[nu], 1),
(_U[nu], _X, 1),
],
}
)
st = g.stencil.matrix(
U[0],
[(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
code,
)
# test plaquette
P = g.lattice(U[0])
st(P, *U)
pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100

# Flops
gauge_otype = U[0].otype
Nc = gauge_otype.shape[0]
flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
flops = flops_per_site * P.grid.gsites * N
nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N

# Warmup
for n in range(5):
st(P, *U)
pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100

# Flops
gauge_otype = U[0].otype
Nc = gauge_otype.shape[0]
flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
flops = flops_per_site * P.grid.gsites * N
nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N

# Warmup
for n in range(5):
st(P, *U)

# Time
t0 = g.time()
for n in range(N):
st(P, *U)
t1 = g.time()

# Report
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""

# Time
t0 = g.time()
for n in range(N):
st(P, *U)
t1 = g.time()

# Report
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""
{N} applications of plaquette stencil
Time to complete : {t1-t0:.2f} s
Total performance : {GFlopsPerSec:.2f} GFlops/s
Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
)

# covariant laplacian stencil
src = g.vspincolor(grid)
rng.cnormal(src)
UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
_U = [0, 1, 2, 3]
_UdagShift = [4, 5, 6, 7]
_X = 0
_Xp = [1, 2, 3, 4]
_Xm = [5, 6, 7, 8]
code = [(0, 1, _X, -1, -8.0, [])]
for mu in range(4):
code.append((0, 1, _Xp[mu], 0, 1.0, [(_U[mu], _X, 0)]))
code.append((0, 1, _Xm[mu], 0, 1.0, [(_UdagShift[mu], _X, 0)]))
# can switch last line to next one
# code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
st = g.stencil.matrix_vector(U[0], src, [(0, 0, 0, 0)] + evec + nevec, code)
st.memory_access_pattern(fast_osites=fast_osites)
# test laplace
dst = g.lattice(src)
)

# covariant laplacian stencil
src = g.vspincolor(grid)
rng.cnormal(src)
UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
_U = [0, 1, 2, 3]
_UdagShift = [4, 5, 6, 7]
_X = 0
_Xp = [1, 2, 3, 4]
_Xm = [5, 6, 7, 8]
code = [(0, 1, _X, -1, -8.0, [])]
for mu in range(4):
code.append((0, 1, _Xp[mu], 0, 1.0, [(_U[mu], _X, 0)]))
code.append((0, 1, _Xm[mu], 0, 1.0, [(_UdagShift[mu], _X, 0)]))
# can switch last line to next one
# code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
st = g.stencil.matrix_vector(U[0], src, [(0, 0, 0, 0)] + evec + nevec, code)
# test laplace
dst = g.lattice(src)
st(U + UdagShift, [dst, src])

lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1] * 4), [0, 1, 2, 3])
dst2 = lap(src)
eps2 = g.norm2(dst - dst2) / g.norm2(dst)
assert eps2**0.5 < precision.eps * 100

# Flops
gauge_otype = U[0].otype
Nc = gauge_otype.shape[0]
Ns = 4
flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
flops_per_vector_add = Ns * Nc * 2
flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
flops = flops_per_site * src.grid.gsites * N
nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N

# Warmup
for n in range(5):
st(U + UdagShift, [dst, src])

lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1] * 4), [0, 1, 2, 3])
dst2 = lap(src)
eps2 = g.norm2(dst - dst2) / g.norm2(dst)
assert eps2**0.5 < precision.eps * 100

# Flops
gauge_otype = U[0].otype
Nc = gauge_otype.shape[0]
Ns = 4
flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
flops_per_vector_add = Ns * Nc * 2
flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
flops = flops_per_site * src.grid.gsites * N
nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N

# Warmup
for n in range(5):
st(U + UdagShift, [dst, src])

# Time
t0 = g.time()
for n in range(N):
st(U + UdagShift, [dst, src])
t1 = g.time()
# Time
t0 = g.time()
for n in range(N):
st(U + UdagShift, [dst, src])
t1 = g.time()

# Report
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""
# Report
GFlopsPerSec = flops / (t1 - t0) / 1e9
GBPerSec = nbytes / (t1 - t0) / 1e9
g.message(
f"""
{N} applications of laplace stencil
Time to complete : {t1-t0:.2f} s
Total performance : {GFlopsPerSec:.2f} GFlops/s
Effective memory bandwidth : {GBPerSec:.2f} GB/s"""
)
)
87 changes: 34 additions & 53 deletions benchmarks/stencil_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,18 @@
g.message("m3 = m1 * m2")
g.message(g.norm2(m3 - m3ref))

for osites_per_instruction in [4, 16, 32, 128, 256]: # [1,8,16,32,64]:
for osites_per_cache_block in [
4096,
2**15,
grid.gsites,
]: # [2**11, 2**13, 2**15, grid.gsites]:
ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)

g.message(osites_per_instruction, osites_per_cache_block)
t = g.timer("d")
t("expr")
for i in range(300):
g.eval(m3, m1 * m2)
t("stencil")
for i in range(300):
ein(m3, m1, m2)
t()
g.message(t)
eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
assert eps2 < 1e-25
g.message(eps2)
t = g.timer("d")
t("expr")
for i in range(300):
g.eval(m3, m1 * m2)
t("stencil")
for i in range(300):
ein(m3, m1, m2)
t()
g.message(t)
eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
assert eps2 < 1e-25
g.message(eps2)


# next
Expand Down Expand Up @@ -100,23 +91,18 @@
g.message("m4 = m1 * m2 * m3")
g.message(g.norm2(m4 - m4ref))

for osites_per_instruction in [16, 32, 64, 256]:
for osites_per_cache_block in [16 * 16 * 16, 16 * 16 * 16 * 32, grid.gsites]:
ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)

g.message(osites_per_instruction, osites_per_cache_block)
t = g.timer("d")
t("expr")
for i in range(300):
g.eval(m4, m1 * m2 * m2)
t("stencil")
for i in range(300):
ein(m4, tmp, m1, m2, m3)
t()
g.message(t)
eps2 = g.norm2(m4 - m4ref) / g.norm2(m4)
assert eps2 < 1e-25
g.message(eps2)
# Compare the expression-based evaluation with the compiled stencil `ein`
# for the triple product announced above ("m4 = m1 * m2 * m3"), then check
# the stencil output against m4ref.
# NOTE(review): m4ref is defined outside this excerpt; presumably it holds
# the reference value of m1 * m2 * m3 -- confirm.
t = g.timer("d")
t("expr")
for i in range(300):
    # Evaluate the same product the stencil computes (m1 * m2 * m3) so the
    # "expr" and "stencil" timings are directly comparable.
    g.eval(m4, m1 * m2 * m3)
t("stencil")
for i in range(300):
    ein(m4, tmp, m1, m2, m3)
t()
g.message(t)
# m4 was last written by the stencil loop, so this validates the stencil.
eps2 = g.norm2(m4 - m4ref) / g.norm2(m4)
assert eps2 < 1e-25
g.message(eps2)


g.message("Diquark")
Expand Down Expand Up @@ -159,18 +145,13 @@
g.message(g.norm2(R - R2) / g.norm2(R))
#
# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
for osites_per_instruction in [1, 2, 4, 16, 32, 64, 256]:
for osites_per_cache_block in [grid.gsites]:
ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)

g.message(osites_per_instruction, osites_per_cache_block)
t = g.timer("d")
t("diquark")
for i in range(30):
g.qcd.baryon.diquark(Q1, Q2)
t("stencil")
for i in range(30):
ein(R, Q1, Q2)
t()
g.message(t)
g.message(g.norm2(R - R2) / g.norm2(R))
t = g.timer("d")
t("diquark")
for i in range(30):
g.qcd.baryon.diquark(Q1, Q2)
t("stencil")
for i in range(30):
ein(R, Q1, Q2)
t()
g.message(t)
g.message(g.norm2(R - R2) / g.norm2(R))
2 changes: 1 addition & 1 deletion lib/cgpt/lib/expression/mul_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ void cgpt_unary(const Lattice<T> * & pl, const Lattice<T> & l, int unary) {
pl = &l;
break;
case BIT_TRANS|BIT_CONJ:
pl = new Lattice<T>( adj(l) );
pl = new Lattice<T>( transpose(conjugate(l)) ); // temporary fix
break;
case BIT_TRANS:
pl = new Lattice<T>( transpose(l) );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ template<>
cgpt_Lattice_base* cgpt_lattice_mul(cgpt_Lattice_base* dst, bool ac, int unary_a, Lattice< iVSpin4Color8<vComplexD> >& la,int unary_b, cgpt_Lattice_base* b, int unary_expr, ComplexD coef) {
typedef vComplexD vtype;
_COMPATIBLE_(iSinglet);
_OUTER_PRODUCT_(iVSpin4Color8);
//_OUTER_PRODUCT_(iVSpin4Color8);
_INNER_PRODUCT_(iVSpin4Color8);
ERR("Not implemented");
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ template<>
cgpt_Lattice_base* cgpt_lattice_mul(cgpt_Lattice_base* dst, bool ac, int unary_a, Lattice< iVSpin4Color8<vComplexF> >& la,int unary_b, cgpt_Lattice_base* b, int unary_expr, ComplexD coef) {
typedef vComplexF vtype;
_COMPATIBLE_(iSinglet);
_OUTER_PRODUCT_(iVSpin4Color8);
//_OUTER_PRODUCT_(iVSpin4Color8);
_INNER_PRODUCT_(iVSpin4Color8);
ERR("Not implemented");
}
Loading

0 comments on commit b9e0f86

Please sign in to comment.