Commit 830cc51: a100 version acceptable

lehner committed Oct 24, 2023 (1 parent: 0e4b077)

Showing 14 changed files with 209 additions and 629 deletions.
74 changes: 38 additions & 36 deletions benchmarks/stencil_tensor.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import gpt as g
 #grid = g.grid([64,64,64,64], g.double)
-#grid = g.grid([32,32,32,32], g.double)
+grid = g.grid([32,32,32,32], g.double)
 #grid = g.grid([32,16,16,16], g.double)
-grid = g.grid([16,16,16,32], g.double)
+#grid = g.grid([16,16,16,32], g.double)
+#grid = g.grid([2*4,4*3,3*4,3*3*4], g.double)
 m1 = g.mcolor(grid)
 m2 = g.mcolor(grid)
 m3 = g.mcolor(grid)
@@ -21,28 +22,30 @@
                 (0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)])
             )
 
-ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code))# // 9
-
-#ein.memory_access_pattern(fast_osites=-3)
+segments = [(3, 9)]
+ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments)
 
 ein(m3,m1,m2)
 g.message(g.norm2(m3 - m3ref))
 
 
-for block_size in [1,4,8,-1,-4,-8,-16,-32,-64]:
-    ein.memory_access_pattern(fast_osites=block_size)
+for osites_per_instruction in [1,4,8,16,32,64]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
 
-    g.message(block_size)
-    t=g.timer("d")
-    t("expr")
-    for i in range(300):
-        g.eval(m3,m1*m2)
-    t("stencil")
-    for i in range(300):
-        ein(m3,m1,m2)
-    t()
-    g.message(t)
-    g.message(g.norm2(m3 - m3ref))
+        g.message(osites_per_instruction, osites_per_cache_block)
+        t=g.timer("d")
+        t("expr")
+        for i in range(300):
+            g.eval(m3,m1*m2)
+        t("stencil")
+        for i in range(300):
+            ein(m3,m1,m2)
+        t()
+        g.message(t)
+        eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
+        assert eps2 < 1e-25
+        g.message(eps2)
 
 
 # D_{a2,a1} = epsilon_{a1,b1,c1}*epsilon_{a2,b2,c2}*spin_transpose(Q1_{b1,b2})*Q2_{c1,c2}
@@ -73,7 +76,8 @@
             )
 
 g.message(len(code))
-ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code)
+segments = [(len(code) // 16, 16)]
+ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments)
 
 R = g.mspincolor(grid)
 R[:] = 0
@@ -84,20 +88,18 @@
 g.message(g.norm2(R - R2) / g.norm2(R))
 #
 # D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
 
 
-for block_size in [1,4,8,-1,-4,-8,-16,-32]:
-    ein.memory_access_pattern(fast_osites=block_size)
-
-    g.message(block_size)
-
-    t=g.timer("d")
-    t("diquark")
-    for i in range(30):
-        g.qcd.baryon.diquark(Q1,Q2)
-    t("stencil")
-    for i in range(30):
-        ein(R, Q1, Q2)
-    t()
-    g.message(t)
-
+for osites_per_instruction in [1,4,8,16,32,64]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
+
+        g.message(osites_per_instruction, osites_per_cache_block)
+        t=g.timer("d")
+        t("diquark")
+        for i in range(30):
+            g.qcd.baryon.diquark(Q1,Q2)
+        t("stencil")
+        for i in range(30):
+            ein(R, Q1, Q2)
+        t()
+        g.message(t)
+        g.message(g.norm2(R - R2) / g.norm2(R))
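Read together, the hunks above show the shape of the new API: the fourth argument of g.stencil.tensor is now a list of segment tuples rather than a code-parallel block size, and memory_access_pattern takes the two parameters swept by the benchmark loops. Below is a minimal sketch of the color-multiply case assembled from this diff; the (number_of_blocks, block_length) reading of the segment tuples is an assumption inferred from the two call sites (3 * 9 = 27 = len(code) here, (len(code) // 16) * 16 below), not from documentation, and the rng setup fills in lines that are collapsed in this view.

    import gpt as g

    grid = g.grid([16, 16, 16, 32], g.double)
    m1, m2, m3 = g.mcolor(grid), g.mcolor(grid), g.mcolor(grid)
    rng = g.random("example")      # assumed setup; the diff's rng lines are collapsed
    rng.cnormal([m1, m2])
    m3ref = g.eval(m1 * m2)

    # 27 instructions implementing dst[i,j] = sum_l m1[i,l] * m2[l,j] in color space
    code = []
    ti = g.stencil.tensor_instructions
    for i in range(3):
        for j in range(3):
            dst = 3 * i + j
            for l in range(3):
                # first term initializes (mov), the remaining two accumulate (inc)
                code.append((0, dst, ti.mov if l == 0 else ti.inc, 1.0,
                             [(1, 0, 3 * i + l), (2, 0, 3 * l + j)]))

    segments = [(3, 9)]            # assumed meaning: 3 blocks of 9 instructions each
    ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments)

    ein.memory_access_pattern(16, 4096 * 4)  # (osites_per_instruction, osites_per_cache_block)
    ein(m3, m1, m2)
    assert g.norm2(m3 - m3ref) / g.norm2(m3) < 1e-25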
172 changes: 0 additions & 172 deletions lib/cgpt/lib/benchmarks.cc
@@ -19,185 +19,13 @@
#include "lib.h"
#include "benchmarks.h"

/*
E X P O R T(test_legacy_omega,{
void* _dst, *_src;
long type, mu;
if (!PyArg_ParseTuple(args, "llll", &_dst, &_src, &type, &mu)) {
return NULL;
}
cgpt_Lattice_base* dst = (cgpt_Lattice_base*)_dst;
cgpt_Lattice_base* src = (cgpt_Lattice_base*)_src;
auto& src_l = compatible<iMSpin4Color3<vComplexD>>(src)->l;
auto& dst_l = compatible<iMSpin4<vComplexD>>(dst)->l;
omega(dst_l, src_l, type, mu);
return PyLong_FromLong(0);
});
*/

template<typename vobj_a, typename vobj_b>
void mk_binary_mul_ll(const micro_kernel_arg_t & arg, size_t i0, size_t i1, size_t n_subblock) {
typedef decltype(vobj_a()*vobj_b()) vobj_c;
typedef typename vobj_c::scalar_object sobj_c;

micro_kernel_view(vobj_a, a_p, 0);
micro_kernel_view(vobj_b, b_p, 1);
micro_kernel_view(vobj_c, c_p, 2);

micro_kernel_for(idx, i1-i0, sizeof(vobj_c)/sizeof(sobj_c), n_subblock, {
coalescedWrite(a_p[idx], coalescedRead(b_p[idx]) * coalescedRead(c_p[idx]));
});
}


template<typename Lat>
void micro_kernels(int lat) {

Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);

typedef typename Lat::vector_object vobj;
typedef typename Lat::scalar_object sobj;
Lat a(&Grid), b(&Grid), c(&Grid), d(&Grid);

std::cout << GridLogMessage << lat << "^4" << std::endl;

GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
random(pRNG,a); random(pRNG,b);

int Nwarm = 10;
int N = 500;
double gb, t0, t1, t2, t3, t4, t5, t0b, t1b;
mk_timer t_et, t_eti;
std::map<std::string, mk_timer> t_mk;
std::vector<micro_kernel_blocking_t> blockings = {
#ifdef GRID_HAS_ACCELERATOR
{ 8*1024, 1 },
{ 32*1024, 1 },
{ 256*1024, 1 },
#else
{ 512, 8 },
{ 512, 16 },
{ 512, 32 },
{ 512, 64 },
{ 256, 8 },
{ 256, 16 },
{ 256, 32 },
{ 128, 8 },
{ 128, 16 }
#endif
};

gb = 4.0 * 3.0 * sizeof(sobj) * Grid._fsites / 1e9;
for (int i=0;i<Nwarm+N;i++) {
t0 = cgpt_time();
c = a*b;
d = a*c;
c = a*b;
d = a*c;
t1 = cgpt_time();
if (i>=Nwarm)
t_et.add(t1-t0);
}

Lat d_copy = a*a*b;

for (int i=0;i<Nwarm+N;i++) {
t0 = cgpt_time();
d = a*a*b;
d = a*a*b;
t1 = cgpt_time();
if (i>=Nwarm)
t_eti.add(t1-t0);
}

d = Zero();

t2 = cgpt_time();
std::vector<micro_kernel_t> expression;
micro_kernel_arg_t views_c_a_b, views_d_a_c;

views_c_a_b.add(c, AcceleratorWriteDiscard, false);
views_c_a_b.add(a, AcceleratorRead);
views_c_a_b.add(b, AcceleratorRead);

views_d_a_c.add(d, AcceleratorWriteDiscard);
views_d_a_c.add(a, AcceleratorRead);
views_d_a_c.add(c, AcceleratorRead, false);

// TODO: internal index size
expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_c_a_b });
expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_d_a_c });
expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_c_a_b });
expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_d_a_c });

t3 = cgpt_time();

for (auto b : blockings) {
mk_timer t;
for (int i=0;i<Nwarm+N;i++) {
t0 = cgpt_time();
eval_micro_kernels(expression, b);
t1 = cgpt_time();
if (i>=Nwarm)
t.add(t1-t0);
}
char buf[256];
sprintf(buf,"MK %d-%d",b.block_size,b.subblock_size);
t_mk[buf] = t;
}
t5 = cgpt_time();

views_c_a_b.release();
views_d_a_c.release();

d -= d_copy;
double err2 = norm2(d);

t_et.print ("GridET separate", gb);
t_eti.print("GridET joint ", gb);
for (auto t : t_mk)
t.second.print (t.first, gb);

}

template<typename Lat>
void mk_bench_mul() {
micro_kernels<Lat>(4);
micro_kernels<Lat>(6);
micro_kernels<Lat>(8);
micro_kernels<Lat>(10);
micro_kernels<Lat>(12);
micro_kernels<Lat>(16);
#ifdef GRID_HAS_ACCELERATOR
micro_kernels<Lat>(24);
micro_kernels<Lat>(32);
micro_kernels<Lat>(48);
#endif
}

EXPORT(benchmarks,{

//mask();
//half();
//benchmarks(8);
//benchmarks(16);
//benchmarks(32);
std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ComplexD" << std::endl << std::endl;
mk_bench_mul<LatticeComplexD>();

std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ColourD" << std::endl << std::endl;
mk_bench_mul<LatticeColourMatrixD>();

//std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking SpinColourD" << std::endl << std::endl;
//mk_bench_mul<LatticeSpinColourMatrixD>();

return PyLong_FromLong(0);
});
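For context, the deleted file above was the standalone micro-kernel benchmark: it timed the chained products c = a*b; d = a*c against the fused expression d = a*a*b over a table of blockings, warming up for 10 iterations and averaging 500. After this commit, that role falls to the Python stencil benchmark. A rough Python-level equivalent of the deleted measurement, sketched with gpt calls that appear elsewhere in this commit (grid size and iteration counts are illustrative, not a drop-in replacement):

    import gpt as g

    grid = g.grid([16, 16, 16, 32], g.double)
    a, b = g.mcolor(grid), g.mcolor(grid)
    rng = g.random("bench")
    rng.cnormal([a, b])

    t = g.timer("micro_kernels")
    t("separate")                  # two dependent multiplies ("GridET separate")
    for _ in range(500):
        c = g.eval(a * b)
        d = g.eval(a * c)
    t("joint")                     # fused expression ("GridET joint")
    for _ in range(500):
        d = g.eval(a * a * b)
    t()
    g.message(t)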
2 changes: 1 addition & 1 deletion lib/cgpt/lib/lattice/base.h
@@ -73,7 +73,7 @@ class cgpt_Lattice_base {
   virtual GridBase* get_grid() = 0;
   virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
   virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
-  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
+  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) = 0;
 
 };
 
4 changes: 2 additions & 2 deletions lib/cgpt/lib/lattice/implementation.h
@@ -295,7 +295,7 @@ class cgpt_Lattice : public cgpt_Lattice_base {
     return cgpt_stencil_matrix_vector_create<T>(matrix, grid, shifts, code, code_parallel_block_size, local);
   }
 
-  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) {
-    return cgpt_stencil_tensor_create<T>(grid, shifts, code, code_parallel_block_size, local);
+  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) {
+    return cgpt_stencil_tensor_create<T>(grid, shifts, code, segments, local);
   }
 };
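Both Python call sites in this commit build the segments argument so that the product of the two tuple entries equals len(code): [(3, 9)] for the 27-instruction color multiply and [(len(code) // 16, 16)] for the diquark code. Assuming that invariant is what the new PyObject* segments parameter expects (inferred from the call sites only, not from documentation), a small hypothetical helper makes it explicit:

    def make_segments(code, block_length):
        # hypothetical helper, not part of gpt: tile the instruction list into
        # equal blocks and return [(number_of_blocks, block_length)]
        assert len(code) % block_length == 0, "segments must tile the instruction list"
        return [(len(code) // block_length, block_length)]

    # matching the two call sites above:
    #   make_segments(code, 9)   -> [(3, 9)] when len(code) == 27
    #   make_segments(code, 16)  -> [(len(code) // 16, 16)]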
1 change: 0 additions & 1 deletion lib/cgpt/lib/lib.h
@@ -37,7 +37,6 @@
#include "foundation.h"
#include "reduce.h"
#include "sort.h"
#include "micro_kernel.h"
#include "convert.h"
#include "checksums.h"
#include "parameters.h"
43 changes: 0 additions & 43 deletions lib/cgpt/lib/micro_kernel.cc

This file was deleted.

21 changes: 0 additions & 21 deletions lib/cgpt/lib/micro_kernel.h

This file was deleted.
