diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
index c5e581df..dbed0941 100755
--- a/benchmarks/stencil.py
+++ b/benchmarks/stencil.py
@@ -43,7 +43,11 @@
     }
 )
 st = g.stencil.matrix(
-    U[0], [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)], [0], [1,2,3,4], code
+    U[0],
+    [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
+    [0],
+    [1, 2, 3, 4],
+    code,
 )
 # test plaquette
 P = g.lattice(U[0])
diff --git a/lib/cgpt/lib/lattice/base.h b/lib/cgpt/lib/lattice/base.h
index 761a3d15..1bb32720 100644
--- a/lib/cgpt/lib/lattice/base.h
+++ b/lib/cgpt/lib/lattice/base.h
@@ -64,8 +64,8 @@ class cgpt_Lattice_base {
   virtual void invert_matrix(std::vector& matrix_inv, std::vector& matrix, long n_virtual) = 0;
   virtual void determinant(cgpt_Lattice_base* det, std::vector& matrix, long n_virtual) = 0; // this determines type of matrix[0]
   virtual GridBase* get_grid() = 0;
-  virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size) = 0;
-  virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size) = 0;
+  virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
+  virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
 };
diff --git a/lib/cgpt/lib/lattice/implementation.h b/lib/cgpt/lib/lattice/implementation.h
index b309fc5a..de1e6ddf 100644
--- a/lib/cgpt/lib/lattice/implementation.h
+++ b/lib/cgpt/lib/lattice/implementation.h
@@ -273,12 +273,12 @@ class cgpt_Lattice : public cgpt_Lattice_base {
     return l.Grid();
   }
 
-  virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size) {
-    return cgpt_stencil_matrix_create(grid, shifts, code, code_parallel_block_size);
+  virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) {
+    return cgpt_stencil_matrix_create(grid, shifts, code, code_parallel_block_size, local);
   }
 
-  virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size) {
-    return cgpt_stencil_matrix_vector_create(matrix, grid, shifts, code, code_parallel_block_size);
+  virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) {
+    return cgpt_stencil_matrix_vector_create(matrix, grid, shifts, code, code_parallel_block_size, local);
   }
 };
diff --git a/lib/cgpt/lib/lib.h b/lib/cgpt/lib/lib.h
index e0f5bcaa..8278d5d0 100644
--- a/lib/cgpt/lib/lib.h
+++ b/lib/cgpt/lib/lib.h
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc
index 524f778a..3d1a60dd 100644
--- a/lib/cgpt/lib/stencil.cc
+++ b/lib/cgpt/lib/stencil.cc
@@ -24,8 +24,9 @@ EXPORT(stencil_matrix_create,{
   void* _lattice;
   PyObject* _shifts, * _code;
   long _code_parallel_block_size;
-  if (!PyArg_ParseTuple(args, "llOOl", &_lattice, &_grid, &_shifts, &_code,
-                        &_code_parallel_block_size)) {
+  long _local;
+  if (!PyArg_ParseTuple(args, "llOOll", &_lattice, &_grid, &_shifts, &_code,
"llOOll", &_lattice, &_grid, &_shifts, &_code, + &_code_parallel_block_size, &_local)) { return NULL; } @@ -33,7 +34,8 @@ EXPORT(stencil_matrix_create,{ cgpt_Lattice_base* lattice = (cgpt_Lattice_base*)_lattice; return PyLong_FromVoidPtr(lattice->stencil_matrix(grid, _shifts, _code, - _code_parallel_block_size)); + _code_parallel_block_size, + _local)); }); EXPORT(stencil_matrix_vector_create,{ @@ -43,8 +45,9 @@ EXPORT(stencil_matrix_vector_create,{ void* _lattice_vector; PyObject* _shifts, * _code; long _code_parallel_block_size; - if (!PyArg_ParseTuple(args, "lllOOl", &_lattice_matrix, &_lattice_vector, &_grid, &_shifts, &_code, - &_code_parallel_block_size)) { + long _local; + if (!PyArg_ParseTuple(args, "lllOOll", &_lattice_matrix, &_lattice_vector, &_grid, &_shifts, &_code, + &_code_parallel_block_size, &_local)) { return NULL; } @@ -53,7 +56,8 @@ EXPORT(stencil_matrix_vector_create,{ cgpt_Lattice_base* lattice_vector = (cgpt_Lattice_base*)_lattice_vector; return PyLong_FromVoidPtr(lattice_vector->stencil_matrix_vector(lattice_matrix, grid, _shifts, _code, - _code_parallel_block_size)); + _code_parallel_block_size, + _local)); }); EXPORT(stencil_matrix_execute,{ diff --git a/lib/cgpt/lib/stencil/common.h b/lib/cgpt/lib/stencil/common.h index b3189f48..88f62ed4 100644 --- a/lib/cgpt/lib/stencil/common.h +++ b/lib/cgpt/lib/stencil/common.h @@ -17,6 +17,26 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +// cartesian stencil fetch +#define fetch_cs(sidx, obj, point, site, view, do_adj) { \ + if (point == -1) { \ + obj = coalescedRead(view[site]); \ + } else { \ + int ptype; \ + auto SE = sview[sidx].GetEntry(ptype, point, ss); \ + if (SE->_is_local ) { \ + int perm = SE->_permute; \ + obj = coalescedReadPermute(view[SE->_offset],ptype,perm,lane); \ + } else { \ + obj = coalescedRead(buf[sidx][SE->_offset],lane); \ + } \ + acceleratorSynchronise(); \ + } \ + if (do_adj) \ + obj = adj(obj); \ + } + +// general local stencil fetch #ifndef GRID_HAS_ACCELERATOR // cpu fetch version @@ -91,3 +111,80 @@ template struct matrixFromTypeAtLevel2 struct matrixFromTypeAtLevel2,0,N2> { typedef iScalar::type> type; }; template struct matrixFromTypeAtLevel2,0,N2> { typedef iMatrix::type,M> type; }; template struct matrixFromTypeAtLevel2,0,N2> { typedef iMatrix::type,M> type; }; + +// cartesian +static void cgpt_shift_to_cartesian_dir_disp(const Coordinate& s, int& dir, int& disp) { + dir = -1; + for (int i=0;i +class CartesianStencilManager { +public: + GridBase* grid; + const std::vector& shifts; + std::map > field_points; + std::vector stencils; + Vector stencil_map; + std::map > field_point_map; + + CartesianStencilManager(GridBase* _grid, const std::vector& _shifts) : grid(_grid), shifts(_shifts) { + } + + bool is_trivial(int point) { + for (auto s : shifts[point]) + if (s != 0) + return false; + return true; + } + + void register_point(int index, int point) { + // register non-trivial points + if (is_trivial(point)) + return; + + field_points[index].insert(point); + + if ((int)stencil_map.size() < index + 1) + stencil_map.resize(index + 1, -1); + } + + void create_stencils() { + // all fields in field_points now need a stencil + for (auto & fp : field_points) { + + int index = fp.first; + + std::vector dirs, disps; + for (auto p : fp.second) { + int dir, disp; + cgpt_shift_to_cartesian_dir_disp(shifts[p], dir, disp); + ASSERT(dir != -1); + + field_point_map[index][p] = (int)dirs.size(); + + dirs.push_back(dir); + disps.push_back(disp); + + stencil_map[index] = 
+        stencils.push_back(CartesianStencil_t(grid,dirs.size(),Even,dirs,disps,SimpleStencilParams()));
+      }
+    }
+  }
+
+  int map_point(int index, int point) {
+    if (is_trivial(point))
+      return -1;
+    return field_point_map[index][point];
+  }
+
+};
diff --git a/lib/cgpt/lib/stencil/matrix.h b/lib/cgpt/lib/stencil/matrix.h
index 8fbc3253..10d27963 100644
--- a/lib/cgpt/lib/stencil/matrix.h
+++ b/lib/cgpt/lib/stencil/matrix.h
@@ -49,16 +49,28 @@ template class cgpt_stencil_matrix : public cgpt_stencil_matrix_base {
  public:
-  GeneralLocalStencil stencil;
+  typedef CartesianStencil CartesianStencil_t;
+  typedef CartesianStencilView CartesianStencilView_t;
+
   Vector code;
   Vector factors;
+
   int n_code_parallel_block_size, n_code_parallel_blocks;
+  int local;
+
+  // local == true
+  GeneralLocalStencil* general_local_stencil;
+
+  // local == false
+  SimpleCompressor* compressor;
+  CartesianStencilManager* sm;
 
   cgpt_stencil_matrix(GridBase* grid, const std::vector& shifts, const std::vector& _code,
-                      int _n_code_parallel_block_size) :
-    stencil(grid,shifts), code(_code.size()),
+                      int _n_code_parallel_block_size,
+                      int _local) :
+    code(_code.size()), local(_local),
     n_code_parallel_block_size(_n_code_parallel_block_size) {
 
     ASSERT(_code.size() % n_code_parallel_block_size == 0);
@@ -80,9 +92,37 @@ class cgpt_stencil_matrix : public cgpt_stencil_matrix_base {
       memcpy(code[i].factor, &_code[i].factor[0], sizeof(cgpt_stencil_matrix_factor_t) * code[i].size);
       nfactors += code[i].size;
     }
+
+    if (local) {
+      general_local_stencil = new GeneralLocalStencil(grid,shifts);
+    } else {
+
+      sm = new CartesianStencilManager(grid, shifts);
+
+      // for all factors that require a non-trivial shift, create a separate stencil object
+      for (int i=0;iregister_point(factors[i].index, factors[i].point);
+      }
+
+      sm->create_stencils();
+
+      for (int i=0;imap_point(factors[i].index, factors[i].point);
+      }
+
+      compressor = new SimpleCompressor();
+    }
+
+
   }
 
   virtual ~cgpt_stencil_matrix() {
+    if (local) {
+      delete general_local_stencil;
+    } else {
+      delete compressor;
+      delete sm;
+    }
   }
 
   virtual void execute(PVector> &fields) {
@@ -99,36 +139,98 @@
     int _npb = n_code_parallel_blocks;
     int _npbs = n_code_parallel_block_size;
 
-    auto sview = stencil.View();
-
-    accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
-
-        auto ss = ss_block / _npb;
-        auto oblock = ss_block % _npb;
-
-        for (int iblock=0;iblock<_npbs;iblock++) {
-
-          int i = oblock * _npbs + iblock;
-
-          obj_t t;
-
-          const auto _f0 = &p_code[i].factor[0];
-          fetch(t, _f0->point, ss, fields_v[_f0->index], _f0->adj);
-
-          for (int j=1;jpoint, ss, fields_v[_f->index], _f->adj);
-          t = t * f;
+    if (local) {
+
+      auto sview = general_local_stencil->View();
+
+      accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
+
+          auto ss = ss_block / _npb;
+          auto oblock = ss_block % _npb;
+
+          for (int iblock=0;iblock<_npbs;iblock++) {
+
+            int i = oblock * _npbs + iblock;
+
+            obj_t t;
+
+            const auto _f0 = &p_code[i].factor[0];
+            fetch(t, _f0->point, ss, fields_v[_f0->index], _f0->adj);
+
+            for (int j=1;jpoint, ss, fields_v[_f->index], _f->adj);
+            t = t * f;
+            }
+
+            obj_t r = p_code[i].weight * t;
+            if (p_code[i].accumulate != -1)
+              r += coalescedRead(fields_v[p_code[i].accumulate][ss]);
+            coalescedWrite(fields_v[p_code[i].target][ss], r);
          }
-
-          obj_t r = p_code[i].weight * t;
-          if (p_code[i].accumulate != -1)
-            r += coalescedRead(fields_v[p_code[i].accumulate][ss]);
-          coalescedWrite(fields_v[p_code[i].target][ss], r);
+
+        });
+
+    } else {
+
+      Vector _buf;
+      Vector _sview;
+
+      int* stencil_map = &sm->stencil_map[0];
+      for (int i=0;i<(int)sm->stencil_map.size();i++) {
+        int s = stencil_map[i];
+        if (s != -1) {
+          sm->stencils[s].HaloExchange(fields[i], *compressor);
        }
-
-      });
+      }
+
+      for (int i=0;i<(int)sm->stencils.size();i++) {
+        _buf.push_back(sm->stencils[i].CommBuf());
+        _sview.push_back(sm->stencils[i].View(AcceleratorRead));
+      }
+
+      obj_t** buf = &_buf[0];
+      CartesianStencilView_t* sview = &_sview[0];
+
+      // now loop
+      accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
+
+          const int lane=acceleratorSIMTlane(T::Nsimd());
+
+          auto ss = ss_block / _npb;
+          auto oblock = ss_block % _npb;
+
+          for (int iblock=0;iblock<_npbs;iblock++) {
+
+            int i = oblock * _npbs + iblock;
+
+            obj_t t;
+
+            const auto _f0 = &p_code[i].factor[0];
+            fetch_cs(stencil_map[_f0->index], t, _f0->point, ss, fields_v[_f0->index], _f0->adj);
+
+            for (int j=1;jindex], f, _f->point, ss, fields_v[_f->index], _f->adj);
+              t = t * f;
+            }
+
+            obj_t r = p_code[i].weight * t;
+            if (p_code[i].accumulate != -1)
+              r += coalescedRead(fields_v[p_code[i].accumulate][ss]);
+            coalescedWrite(fields_v[p_code[i].target][ss], r);
+          }
+
+        });
+
+      // and cleanup
+
+      for (auto & sv : _sview)
+        sv.ViewClose();
+
+    }
 
     VECTOR_VIEW_CLOSE(fields_v);
   }
@@ -161,20 +263,22 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_matrix_code_t& out) {
 
 // not implemented message
 template NotEnableIf,cgpt_stencil_matrix_base*>
-cgpt_stencil_matrix_create(GridBase* grid, PyObject* _shifts, PyObject* _code, long code_parallel_block_size) {
+cgpt_stencil_matrix_create(GridBase* grid, PyObject* _shifts,
+                           PyObject* _code, long code_parallel_block_size, long local) {
   ERR("cgpt_stencil_matrix not implemented for type %s",typeid(T).name());
 }
 
 // implemented for endomorphisms
 template EnableIf,cgpt_stencil_matrix_base*>
-cgpt_stencil_matrix_create(GridBase* grid, PyObject* _shifts, PyObject* _code, long code_parallel_block_size) {
+cgpt_stencil_matrix_create(GridBase* grid, PyObject* _shifts,
+                           PyObject* _code, long code_parallel_block_size, long local) {
 
   std::vector shifts;
   cgpt_convert(_shifts,shifts);
 
   std::vector code;
   cgpt_convert(_code,code);
-
-  return new cgpt_stencil_matrix(grid,shifts,code,code_parallel_block_size);
+
+  return new cgpt_stencil_matrix(grid,shifts,code,code_parallel_block_size, local);
 }
diff --git a/lib/cgpt/lib/stencil/matrix_vector.h b/lib/cgpt/lib/stencil/matrix_vector.h
index d52b4295..b2506439 100644
--- a/lib/cgpt/lib/stencil/matrix_vector.h
+++ b/lib/cgpt/lib/stencil/matrix_vector.h
@@ -171,7 +171,7 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_matrix_vector_code_t& out) {
 template cgpt_stencil_matrix_vector_base*
 cgpt_stencil_matrix_vector_create(cgpt_Lattice_base* __matrix, GridBase* grid, PyObject* _shifts, PyObject* _code,
-                                  long code_parallel_block_size) {
+                                  long code_parallel_block_size, long local) {
 
   std::vector shifts;
   cgpt_convert(_shifts,shifts);
 
@@ -179,6 +179,9 @@ cgpt_stencil_matrix_vector_create(cgpt_Lattice_base* __matrix, GridBase* grid, P
   std::vector code;
   cgpt_convert(_code,code);
 
+  // for now only local is implemented
+  ASSERT(local);
+
   // test __matrix type against matrix in spin space,
   // color space spin+color space, and singlet space
   if (is_compatible::type>(__matrix)) {
diff --git a/lib/gpt/core/local_stencil/matrix.py b/lib/gpt/core/local_stencil/matrix.py
index bbb8d3af..a4ef73bf 100644
--- a/lib/gpt/core/local_stencil/matrix.py
+++ b/lib/gpt/core/local_stencil/matrix.py
@@ -27,14 +27,14 @@ def parse(c):
 
 
 class matrix:
-    def __init__(self, lat, points, code, code_parallel_block_size=None):
+    def __init__(self, lat, points, code, code_parallel_block_size=None, local=1):
         self.points = points
         self.code = [parse(c) for c in code]
         self.code_parallel_block_size = code_parallel_block_size
         if code_parallel_block_size is None:
             code_parallel_block_size = len(code)
         self.obj = cgpt.stencil_matrix_create(
-            lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size
+            lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local
         )
 
     def __call__(self, *fields):
diff --git a/lib/gpt/core/local_stencil/matrix_vector.py b/lib/gpt/core/local_stencil/matrix_vector.py
index 411ea72d..c03b9b7e 100644
--- a/lib/gpt/core/local_stencil/matrix_vector.py
+++ b/lib/gpt/core/local_stencil/matrix_vector.py
@@ -47,6 +47,7 @@ def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_siz
             points,
             self.code,
             code_parallel_block_size,
+            1
         )
 
     def __call__(self, matrix_fields, vector_fields):
diff --git a/lib/gpt/core/stencil/matrix.py b/lib/gpt/core/stencil/matrix.py
index 60f1aa7f..0f3c4ed2 100644
--- a/lib/gpt/core/stencil/matrix.py
+++ b/lib/gpt/core/stencil/matrix.py
@@ -19,13 +19,7 @@
 import gpt as g
 
 
-# TODO: explore this thoroughly
-# - SIMD mask in lat.grid restrictions? dimensions plus margin needs to play nice with simd.
-# - overlap comms and compute; need padding.start_communicate and padding.wait_communicate
-    and a list of margin and inner points
-# - SIMD in multi-rhs ? maybe add --simd_mask flag to command line ?
-# - should do margins automatically seems best
-class matrix:
+class matrix_padded:
     def __init__(self, lat, points, write_fields, read_fields, code, code_parallel_block_size=None):
         margin = [0] * lat.grid.nd
         for p in points:
@@ -68,3 +62,13 @@ def __call__(self, *fields):
         if self.verbose_performance:
             t()
             g.message(t)
+
+
+def matrix(lat, points, write_fields, read_fields, code, code_parallel_block_size=None):
+    # check if all points are cartesian
+    for p in points:
+        if len([s for s in p if s != 0]) > 1:
+            return matrix_padded(
+                lat, points, write_fields, read_fields, code, code_parallel_block_size
+            )
+    return g.local_stencil.matrix(lat, points, code, code_parallel_block_size, local=0)
diff --git a/lib/gpt/qcd/fermion/register.py b/lib/gpt/qcd/fermion/register.py
index 8f065af2..93d42a0a 100644
--- a/lib/gpt/qcd/fermion/register.py
+++ b/lib/gpt/qcd/fermion/register.py
@@ -11,25 +11,15 @@ def register(reg, op):
     reg.Mdiag = lambda dst, src: op.apply_unary_operator(2009, dst, src)
     reg.Dminus = lambda dst, src: op.apply_unary_operator(2010, dst, src)
     reg.DminusDag = lambda dst, src: op.apply_unary_operator(2011, dst, src)
-    reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
-        2012, dst, src
-    )
-    reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(
-        2013, dst, src
-    )
-    reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(
-        2014, dst, src
-    )
-    reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(
-        2015, dst, src
-    )
+    reg.ImportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2012, dst, src)
+    reg.ImportUnphysicalFermion = lambda dst, src: op.apply_unary_operator(2013, dst, src)
+    reg.ExportPhysicalFermionSolution = lambda dst, src: op.apply_unary_operator(2014, dst, src)
+    reg.ExportPhysicalFermionSource = lambda dst, src: op.apply_unary_operator(2015, dst, src)
     reg.Dhop = lambda dst, src: op.apply_unary_operator(3001, dst, src)
     reg.DhopDag = lambda dst, src: op.apply_unary_operator(4001, dst, src)
     reg.DhopEO = lambda dst, src: op.apply_unary_operator(3002, dst, src)
     reg.DhopEODag = lambda dst, src: op.apply_unary_operator(4002, dst, src)
-    reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(
-        5001, dst, src, dir, disp
-    )
+    reg.Mdir = lambda dst, src, dir, disp: op.apply_dirdisp_operator(5001, dst, src, dir, disp)
     reg.MDeriv = lambda mat, dst, src: op.apply_deriv_operator(6001, mat, dst, src)
     reg.MDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7001, mat, dst, src)
     reg.MoeDeriv = lambda mat, dst, src: op.apply_deriv_operator(6002, mat, dst, src)
@@ -37,14 +27,8 @@ def register(reg, op):
     reg.MeoDeriv = lambda mat, dst, src: op.apply_deriv_operator(6003, mat, dst, src)
     reg.MeoDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7003, mat, dst, src)
     reg.DhopDeriv = lambda mat, dst, src: op.apply_deriv_operator(6004, mat, dst, src)
-    reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(
-        7004, mat, dst, src
-    )
+    reg.DhopDerivDag = lambda mat, dst, src: op.apply_deriv_operator(7004, mat, dst, src)
    reg.DhopDerivEO = lambda mat, dst, src: op.apply_deriv_operator(6005, mat, dst, src)
-    reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(
-        7005, mat, dst, src
-    )
+    reg.DhopDerivEODag = lambda mat, dst, src: op.apply_deriv_operator(7005, mat, dst, src)
     reg.DhopDerivOE = lambda mat, dst, src: op.apply_deriv_operator(6006, mat, dst, src)
-    reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(
-        7006, mat, dst, src
-    )
+    reg.DhopDerivOEDag = lambda mat, dst, src: op.apply_deriv_operator(7006, mat, dst, src)
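Usage note (not part of the patch): the new dispatch in lib/gpt/core/stencil/matrix.py routes a stencil to the cartesian halo-exchange path, g.local_stencil.matrix(..., local=0), only when every point shifts along at most one direction, and otherwise falls back to the padded implementation matrix_padded. The sketch below merely mirrors that selection rule for illustration; the helper name uses_cartesian_path and the example point sets are assumptions, not part of the library.

def uses_cartesian_path(points):
    # mirrors the check in the new matrix() dispatch: a point is "cartesian" if it
    # shifts in at most one direction; any diagonal point forces matrix_padded
    return all(len([s for s in p if s != 0]) <= 1 for p in points)


# nearest-neighbour points as used in benchmarks/stencil.py -> cartesian path
assert uses_cartesian_path(
    [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
)
# a diagonal shift -> padded path
assert not uses_cartesian_path([(1, 1, 0, 0)])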