From 830cc51f06594ce9da12407d50cd3b1919e107c4 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 24 Oct 2023 09:47:20 +0200 Subject: [PATCH] a100 version acceptable --- benchmarks/stencil_tensor.py | 74 +++--- lib/cgpt/lib/benchmarks.cc | 172 ------------- lib/cgpt/lib/lattice/base.h | 2 +- lib/cgpt/lib/lattice/implementation.h | 4 +- lib/cgpt/lib/lib.h | 1 - lib/cgpt/lib/micro_kernel.cc | 43 ---- lib/cgpt/lib/micro_kernel.h | 21 -- lib/cgpt/lib/micro_kernel/action.h | 32 --- lib/cgpt/lib/micro_kernel/argument.h | 60 ----- lib/cgpt/lib/micro_kernel/macros.h | 42 ---- lib/cgpt/lib/stencil.cc | 23 +- lib/cgpt/lib/stencil/tensor.h | 340 +++++++++++--------------- lib/gpt/core/local_stencil/tensor.py | 20 +- lib/gpt/core/stencil/tensor.py | 4 +- 14 files changed, 209 insertions(+), 629 deletions(-) delete mode 100644 lib/cgpt/lib/micro_kernel.cc delete mode 100644 lib/cgpt/lib/micro_kernel.h delete mode 100644 lib/cgpt/lib/micro_kernel/action.h delete mode 100644 lib/cgpt/lib/micro_kernel/argument.h delete mode 100644 lib/cgpt/lib/micro_kernel/macros.h diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py index 194c9504e..d80b672bd 100755 --- a/benchmarks/stencil_tensor.py +++ b/benchmarks/stencil_tensor.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 import gpt as g #grid = g.grid([64,64,64,64], g.double) -#grid = g.grid([32,32,32,32], g.double) +grid = g.grid([32,32,32,32], g.double) #grid = g.grid([32,16,16,16], g.double) -grid = g.grid([16,16,16,32], g.double) +#grid = g.grid([16,16,16,32], g.double) +#grid = g.grid([2*4,4*3,3*4,3*3*4], g.double) m1 = g.mcolor(grid) m2 = g.mcolor(grid) m3 = g.mcolor(grid) @@ -21,28 +22,30 @@ (0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)]) ) -ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code))# // 9 - -#ein.memory_access_pattern(fast_osites=-3) +segments = [(3, 9)] +ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments) ein(m3,m1,m2) g.message(g.norm2(m3 - m3ref)) -for block_size in [1,4,8,-1,-4,-8,-16,-32,-64]: - ein.memory_access_pattern(fast_osites=block_size) +for osites_per_instruction in [1,4,8,16,32,64]: + for osites_per_cache_block in [2048*4, 4096*4, 8192*4]: + ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block) - g.message(block_size) - t=g.timer("d") - t("expr") - for i in range(300): - g.eval(m3,m1*m2) - t("stencil") - for i in range(300): - ein(m3,m1,m2) - t() - g.message(t) - g.message(g.norm2(m3 - m3ref)) + g.message(osites_per_instruction, osites_per_cache_block) + t=g.timer("d") + t("expr") + for i in range(300): + g.eval(m3,m1*m2) + t("stencil") + for i in range(300): + ein(m3,m1,m2) + t() + g.message(t) + eps2 = g.norm2(m3 - m3ref) / g.norm2(m3) + assert eps2 < 1e-25 + g.message(eps2) # D_{a2,a1} = epsilon_{a1,b1,c1}*epsilon_{a2,b2,c2}*spin_transpose(Q1_{b1,b2})*Q2_{c1,c2} @@ -73,7 +76,8 @@ ) g.message(len(code)) -ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code) +segments = [(len(code) // 16, 16)] +ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments) R = g.mspincolor(grid) R[:] = 0 @@ -84,20 +88,18 @@ g.message(g.norm2(R - R2) / g.norm2(R)) # # D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]]) - - -for block_size in [1,4,8,-1,-4,-8,-16,-32]: - ein.memory_access_pattern(fast_osites=block_size) - - g.message(block_size) - - t=g.timer("d") - t("diquark") - for i in range(30): - g.qcd.baryon.diquark(Q1,Q2) - t("stencil") - for i in range(30): - ein(R, Q1, Q2) - t() - g.message(t) - +for osites_per_instruction in [1,4,8,16,32,64]: + for osites_per_cache_block in [2048*4, 4096*4, 8192*4]: + ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block) + + g.message(osites_per_instruction, osites_per_cache_block) + t=g.timer("d") + t("diquark") + for i in range(30): + g.qcd.baryon.diquark(Q1,Q2) + t("stencil") + for i in range(30): + ein(R, Q1, Q2) + t() + g.message(t) + g.message(g.norm2(R - R2) / g.norm2(R)) diff --git a/lib/cgpt/lib/benchmarks.cc b/lib/cgpt/lib/benchmarks.cc index ed8208e61..d63d05676 100644 --- a/lib/cgpt/lib/benchmarks.cc +++ b/lib/cgpt/lib/benchmarks.cc @@ -19,170 +19,6 @@ #include "lib.h" #include "benchmarks.h" -/* -E X P O R T(test_legacy_omega,{ - - void* _dst, *_src; - long type, mu; - if (!PyArg_ParseTuple(args, "llll", &_dst, &_src, &type, &mu)) { - return NULL; - } - - cgpt_Lattice_base* dst = (cgpt_Lattice_base*)_dst; - cgpt_Lattice_base* src = (cgpt_Lattice_base*)_src; - - auto& src_l = compatible>(src)->l; - auto& dst_l = compatible>(dst)->l; - - omega(dst_l, src_l, type, mu); - - return PyLong_FromLong(0); - }); -*/ - -template -void mk_binary_mul_ll(const micro_kernel_arg_t & arg, size_t i0, size_t i1, size_t n_subblock) { - typedef decltype(vobj_a()*vobj_b()) vobj_c; - typedef typename vobj_c::scalar_object sobj_c; - - micro_kernel_view(vobj_a, a_p, 0); - micro_kernel_view(vobj_b, b_p, 1); - micro_kernel_view(vobj_c, c_p, 2); - - micro_kernel_for(idx, i1-i0, sizeof(vobj_c)/sizeof(sobj_c), n_subblock, { - coalescedWrite(a_p[idx], coalescedRead(b_p[idx]) * coalescedRead(c_p[idx])); - }); -} - - -template -void micro_kernels(int lat) { - - Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); - Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); - GridCartesian Grid(latt_size,simd_layout,mpi_layout); - - typedef typename Lat::vector_object vobj; - typedef typename Lat::scalar_object sobj; - Lat a(&Grid), b(&Grid), c(&Grid), d(&Grid); - - std::cout << GridLogMessage << lat << "^4" << std::endl; - - GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); - random(pRNG,a); random(pRNG,b); - - int Nwarm = 10; - int N = 500; - double gb, t0, t1, t2, t3, t4, t5, t0b, t1b; - mk_timer t_et, t_eti; - std::map t_mk; - std::vector blockings = { -#ifdef GRID_HAS_ACCELERATOR - { 8*1024, 1 }, - { 32*1024, 1 }, - { 256*1024, 1 }, -#else - { 512, 8 }, - { 512, 16 }, - { 512, 32 }, - { 512, 64 }, - { 256, 8 }, - { 256, 16 }, - { 256, 32 }, - { 128, 8 }, - { 128, 16 } -#endif - }; - - gb = 4.0 * 3.0 * sizeof(sobj) * Grid._fsites / 1e9; - for (int i=0;i=Nwarm) - t_et.add(t1-t0); - } - - Lat d_copy = a*a*b; - - for (int i=0;i=Nwarm) - t_eti.add(t1-t0); - } - - d = Zero(); - - t2 = cgpt_time(); - std::vector expression; - micro_kernel_arg_t views_c_a_b, views_d_a_c; - - views_c_a_b.add(c, AcceleratorWriteDiscard, false); - views_c_a_b.add(a, AcceleratorRead); - views_c_a_b.add(b, AcceleratorRead); - - views_d_a_c.add(d, AcceleratorWriteDiscard); - views_d_a_c.add(a, AcceleratorRead); - views_d_a_c.add(c, AcceleratorRead, false); - - // TODO: internal index size - expression.push_back({ mk_binary_mul_ll, views_c_a_b }); - expression.push_back({ mk_binary_mul_ll, views_d_a_c }); - expression.push_back({ mk_binary_mul_ll, views_c_a_b }); - expression.push_back({ mk_binary_mul_ll, views_d_a_c }); - - t3 = cgpt_time(); - - for (auto b : blockings) { - mk_timer t; - for (int i=0;i=Nwarm) - t.add(t1-t0); - } - char buf[256]; - sprintf(buf,"MK %d-%d",b.block_size,b.subblock_size); - t_mk[buf] = t; - } - t5 = cgpt_time(); - - views_c_a_b.release(); - views_d_a_c.release(); - - d -= d_copy; - double err2 = norm2(d); - - t_et.print ("GridET separate", gb); - t_eti.print("GridET joint ", gb); - for (auto t : t_mk) - t.second.print (t.first, gb); - -} - -template -void mk_bench_mul() { - micro_kernels(4); - micro_kernels(6); - micro_kernels(8); - micro_kernels(10); - micro_kernels(12); - micro_kernels(16); -#ifdef GRID_HAS_ACCELERATOR - micro_kernels(24); - micro_kernels(32); - micro_kernels(48); -#endif -} - EXPORT(benchmarks,{ //mask(); @@ -190,14 +26,6 @@ EXPORT(benchmarks,{ //benchmarks(8); //benchmarks(16); //benchmarks(32); - std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ComplexD" << std::endl << std::endl; - mk_bench_mul(); - - std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ColourD" << std::endl << std::endl; - mk_bench_mul(); - - //std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking SpinColourD" << std::endl << std::endl; - //mk_bench_mul(); return PyLong_FromLong(0); }); diff --git a/lib/cgpt/lib/lattice/base.h b/lib/cgpt/lib/lattice/base.h index 69fc7a7e3..fa0642d5a 100644 --- a/lib/cgpt/lib/lattice/base.h +++ b/lib/cgpt/lib/lattice/base.h @@ -73,7 +73,7 @@ class cgpt_Lattice_base { virtual GridBase* get_grid() = 0; virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0; virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0; - virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0; + virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) = 0; }; diff --git a/lib/cgpt/lib/lattice/implementation.h b/lib/cgpt/lib/lattice/implementation.h index 83eeda516..439de5ae4 100644 --- a/lib/cgpt/lib/lattice/implementation.h +++ b/lib/cgpt/lib/lattice/implementation.h @@ -295,7 +295,7 @@ class cgpt_Lattice : public cgpt_Lattice_base { return cgpt_stencil_matrix_vector_create(matrix, grid, shifts, code, code_parallel_block_size, local); } - virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) { - return cgpt_stencil_tensor_create(grid, shifts, code, code_parallel_block_size, local); + virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) { + return cgpt_stencil_tensor_create(grid, shifts, code, segments, local); } }; diff --git a/lib/cgpt/lib/lib.h b/lib/cgpt/lib/lib.h index 856691baa..25516e3bb 100644 --- a/lib/cgpt/lib/lib.h +++ b/lib/cgpt/lib/lib.h @@ -37,7 +37,6 @@ #include "foundation.h" #include "reduce.h" #include "sort.h" -#include "micro_kernel.h" #include "convert.h" #include "checksums.h" #include "parameters.h" diff --git a/lib/cgpt/lib/micro_kernel.cc b/lib/cgpt/lib/micro_kernel.cc deleted file mode 100644 index 3a95745a5..000000000 --- a/lib/cgpt/lib/micro_kernel.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* - GPT - Grid Python Toolkit - Copyright (C) 2020 Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -*/ -#include "lib.h" - -void eval_micro_kernels(const std::vector & kernels, const micro_kernel_blocking_t & blocking) { - - size_t n = kernels.size(); - - size_t o_sites = kernels[0].arg.o_sites; - size_t block_size = blocking.block_size; - size_t subblock_size = blocking.subblock_size; - - micro_kernel_region({ - - for (size_t j=0;j<(o_sites + block_size - 1)/block_size;j++) { - - for (size_t i=0;i & kernels, const micro_kernel_blocking_t & blocking); diff --git a/lib/cgpt/lib/micro_kernel/argument.h b/lib/cgpt/lib/micro_kernel/argument.h deleted file mode 100644 index 29e3fdd02..000000000 --- a/lib/cgpt/lib/micro_kernel/argument.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - GPT - Grid Python Toolkit - Copyright (C) 2020 Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -*/ - -class ViewContainerBase { -public: - virtual ~ViewContainerBase() {}; -}; - -template -class ViewContainer : public ViewContainerBase { -public: - View v; - - ViewContainer(View &_v) : v(_v) {}; - virtual ~ViewContainer() { v.ViewClose(); } -}; - -struct micro_kernel_arg_t { - struct tuple_t { - ViewContainerBase* view; - bool persistent; - }; - - std::vector views; - size_t o_sites; - - template - void add(Lattice& l, ViewMode mode, bool persistent = true) { - size_t _o_sites = l.Grid()->oSites(); - if (views.size() == 0) { - o_sites = _o_sites; - } else { - ASSERT(o_sites == _o_sites); - } - auto l_v = l.View(mode); - views.push_back({ new ViewContainer(l_v), persistent }); - } - - void release() { - for (auto x : views) - delete x.view; - } - -}; diff --git a/lib/cgpt/lib/micro_kernel/macros.h b/lib/cgpt/lib/micro_kernel/macros.h deleted file mode 100644 index 71074edf3..000000000 --- a/lib/cgpt/lib/micro_kernel/macros.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - GPT - Grid Python Toolkit - Copyright (C) 2020 Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -*/ -#ifndef GRID_HAS_ACCELERATOR - -#define micro_kernel_for(idx, n_idx, nsimd, nsubblock, ...) { \ - int n_thread = thread_num(); \ - int n_threads = thread_max(); \ - for (size_t ib=n_subblock*n_thread;ib>*)arg.views[idx].view)->v; \ - auto ptr = &ptr ## _v[arg.views[idx].persistent ? i0 : 0]; - diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc index 63db24402..d58646d89 100644 --- a/lib/cgpt/lib/stencil.cc +++ b/lib/cgpt/lib/stencil.cc @@ -64,11 +64,11 @@ EXPORT(stencil_tensor_create,{ void* _grid; void* _lattice; - PyObject* _shifts, * _code; + PyObject* _shifts, * _code, * _segments; long _code_parallel_block_size; long _local; - if (!PyArg_ParseTuple(args, "llOOll", &_lattice, &_grid, &_shifts, &_code, - &_code_parallel_block_size, &_local)) { + if (!PyArg_ParseTuple(args, "llOOOl", &_lattice, &_grid, &_shifts, &_code, + &_segments, &_local)) { return NULL; } @@ -76,8 +76,7 @@ EXPORT(stencil_tensor_create,{ cgpt_Lattice_base* lattice = (cgpt_Lattice_base*)_lattice; return PyLong_FromVoidPtr(lattice->stencil_tensor(grid, _shifts, _code, - _code_parallel_block_size, - _local)); + _segments, _local)); }); EXPORT(stencil_matrix_execute,{ @@ -125,8 +124,11 @@ EXPORT(stencil_tensor_execute,{ void* _stencil; PyObject* _fields; - long fast_osites; - if (!PyArg_ParseTuple(args, "lOl", &_stencil, &_fields, &fast_osites)) { + long osites_per_instruction; + long osites_per_cache_block; + if (!PyArg_ParseTuple(args, "lOll", &_stencil, &_fields, + &osites_per_instruction, + &osites_per_cache_block)) { return NULL; } @@ -135,7 +137,12 @@ EXPORT(stencil_tensor_execute,{ std::vector __fields; cgpt_basis_fill(__fields,_fields); - stencil->execute(__fields, fast_osites); + cgpt_stencil_tensor_execute_params_t params = + { + osites_per_instruction, + osites_per_cache_block + }; + stencil->execute(__fields, params); return PyLong_FromLong(0); }); diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h index 369d5d4c0..c0d8a3160 100644 --- a/lib/cgpt/lib/stencil/tensor.h +++ b/lib/cgpt/lib/stencil/tensor.h @@ -18,6 +18,16 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +struct cgpt_stencil_tensor_code_segment_t { + int block_size; + int number_of_blocks; +}; + +struct cgpt_stencil_tensor_execute_params_t { + int osites_per_instruction; + int osites_per_cache_block; +}; + struct cgpt_stencil_tensor_factor_t { uint16_t index; // index of field int16_t point; // index of shift @@ -44,7 +54,8 @@ struct cgpt_stencil_tensor_code_t { class cgpt_stencil_tensor_base { public: virtual ~cgpt_stencil_tensor_base() {}; - virtual void execute(const std::vector& fields, int fast_osites) = 0; + virtual void execute(const std::vector& fields, + const cgpt_stencil_tensor_execute_params_t& params) = 0; }; template @@ -56,8 +67,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { Vector code; Vector factors; - - int n_code_parallel_block_size, n_code_parallel_blocks; + + std::vector segments; int local; // local == true @@ -70,13 +81,15 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { cgpt_stencil_tensor(GridBase* grid, const std::vector& shifts, const std::vector& _code, - int _n_code_parallel_block_size, + const std::vector& _segments, int _local) : - code(_code.size()), local(_local), - n_code_parallel_block_size(_n_code_parallel_block_size) { + code(_code.size()), local(_local), segments(_segments) { - ASSERT(_code.size() % n_code_parallel_block_size == 0); - n_code_parallel_blocks = (int)_code.size() / n_code_parallel_block_size; + // test + size_t code_expected_size = 0; + for (auto & s : segments) + code_expected_size += s.block_size * s.number_of_blocks; + ASSERT(_code.size() == code_expected_size); // total number of factors int nfactors = 0; @@ -138,8 +151,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { stencils should return different options for current hardware for performance (including max _npb) */ - template - void block_execute(const std::vector& fields, int fast_osites) { + template + void block_execute(const std::vector& fields, int osites_per_cache_block) { #ifndef GRID_HAS_ACCELERATOR typedef typename T::vector_type element_t; @@ -158,14 +171,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { int nd = fields[0]->get_grid()->Nd(); - int _npb = n_code_parallel_blocks; - int _npbs = n_code_parallel_block_size; - uint64_t osites = fields[0]->get_grid()->oSites(); - uint64_t osite_blocks = osites; - int _fast_osites; - if (local) { ERR("Not implemented yet"); @@ -173,7 +180,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { } else { //CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,); - + #define TC_MOV 0 #define TC_INC 1 #define TC_MOV_NEG 2 @@ -183,208 +190,131 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { #define TC_MOV_NEG_CC 6 #define TC_DEC_CC 7 #define TC_MUL 8 - - /* -#ifdef GRID_HAS_ACCELERATOR - - +#define ID(a) a +#define CONJ(a) adj(a) + +#define EXECUTE(KB, NN) \ + switch (_p->instruction) \ + { \ + case TC_INC: KB(+=,ID,NN); break; \ + case TC_MOV: KB(=,ID,NN); break; \ + case TC_DEC: KB(-=,ID,NN); break; \ + case TC_MOV_NEG: KB(=-,ID,NN); break; \ + case TC_INC_CC: KB(+=,CONJ,NN); break; \ + case TC_MOV_CC: KB(=,CONJ,NN); break; \ + case TC_DEC_CC: KB(-=,CONJ,NN); break; \ + case TC_MOV_NEG_CC: KB(=-,CONJ,NN); break; \ + case TC_MUL: \ + { \ + auto w = ((coeff_t)_p->weight); \ + for (int ff=0;ffindex] * NSIMD; \ + element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * NN * ss + _f1->element * NSIMD + lane]; \ + for (int ff=0;ffweight) -#define KERNEL(composition, mod_first) \ - for (int ff=0;ff < BLOCK_SIZE;ff++) \ - coalescedWriteElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], \ - composition(coalescedReadElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], _p->element), \ - mod_first(coalescedReadElement(fields_v[_f0->index][BLOCK_SIZE * ss + ff], _f0->element)), \ - coalescedReadElement(fields_v[_f1->index][BLOCK_SIZE * ss + ff], _f1->element)), \ - _p->element); + + ASSERT(osites_per_cache_block % osites_per_instruction == 0); + uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block; + for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) { + uint64_t osites0 = min(ocache_block * osites_per_cache_block, osites); + uint64_t osites1 = min(osites0 + osites_per_cache_block, osites); - osite_blocks = osites / BLOCK_SIZE; + uint64_t osites_in_cache_block = osites1 - osites0; + + uint64_t oblocks = osites_in_cache_block / osites_per_instruction; + uint64_t oblock0 = osites0 / osites_per_instruction; - for (int iter=0;iter<((osites % BLOCK_SIZE == 0) ? 1 : 2);iter++) { + uint64_t osites_extra_start = oblocks * osites_per_instruction; + uint64_t osites_extra = osites_in_cache_block - osites_extra_start; - uint64_t osite_offset = (iter == 0) ? 0 : osite_blocks * BLOCK_SIZE; - if (iter == 1) { - BLOCK_SIZE = 1; - osite_blocks = osites - osite_offset; - } - - accelerator_forNB(ss_block,osite_blocks * _npb,T::Nsimd(),{ - - uint64_t ss, oblock; - - if (_fast_osites) { - oblock = ss_block / osite_blocks; - ss = osite_offset + ss_block % osite_blocks; - } else { - ss = osite_offset + ss_block / _npb; - oblock = ss_block % _npb; - } - - for (int iblock=0;iblock<_npbs;iblock++) { + //std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl; + + int coffset = 0; + for (auto & segment : segments) { + int _npb = segment.number_of_blocks; + int _npbs = segment.block_size; + + accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), { - int i = oblock * _npbs + iblock; + uint64_t ss = ss_block / _npb + oblock0; + uint64_t cc = ss_block % _npb; - const auto _p = &p_code[i]; - const auto _f0 = &_p->factor[0]; - const auto _f1 = &_p->factor[1]; + for (int ic=0;ic<_npbs;ic++) { + + const auto _p = &p_code[coffset + cc * _npbs + ic]; + const auto _f0 = &_p->factor[0]; + const auto _f1 = &_p->factor[1]; + + int aNN = nelements[_f0->index] * NSIMD; + int cNN = nelements[_p->target] * NSIMD; - switch (_p->instruction) { - case TC_INC: - KERNEL(_INC,_ID); - break; - case TC_MOV: - KERNEL(_MOV,_ID); - break; - case TC_DEC: - KERNEL(_DEC,_ID); - break; - case TC_MOV_NEG: - KERNEL(_MOV_NEG,_ID); - break; - case TC_INC_CC: - KERNEL(_INC,_CONJ); - break; - case TC_MOV_CC: - KERNEL(_MOV,_CONJ); - break; - case TC_DEC_CC: - KERNEL(_DEC,_CONJ); - break; - case TC_MOV_NEG_CC: - KERNEL(_MOV_NEG,_CONJ); - break; - case TC_MUL: - KERNEL(_MUL,_ID); - break; + int lane = acceleratorSIMTlane(T::Nsimd()); + element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * osites_per_instruction * ss + _f0->element * NSIMD + lane]; + element_t* __restrict__ e_c = &fields_v[_p->target][cNN * osites_per_instruction * ss + _p->element * NSIMD + lane]; + + EXECUTE(KERNEL_BIN, osites_per_instruction); } - } - - }); + + }); + + if (osites_extra) { + accelerator_forNB(ss_block, osites_extra * _npb, T::Nsimd(), { + + uint64_t ss = ss_block / _npb + osites0 + osites_extra_start; + uint64_t cc = ss_block % _npb; + + for (int ic=0;ic<_npbs;ic++) { + + const auto _p = &p_code[coffset + cc * _npbs + ic]; + const auto _f0 = &_p->factor[0]; + const auto _f1 = &_p->factor[1]; + + int aNN = nelements[_f0->index] * NSIMD; + int cNN = nelements[_p->target] * NSIMD; + + int lane = acceleratorSIMTlane(T::Nsimd()); + element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * ss + _f0->element * NSIMD + lane]; + element_t* __restrict__ e_c = &fields_v[_p->target][cNN * ss + _p->element * NSIMD + lane]; + + EXECUTE(KERNEL_BIN, 1); + } + }); + } + + coffset += _npb * _npbs; + } } accelerator_barrier(); - - -#else - */ - - // CPU version - ASSERT(osites % BLOCK_SIZE == 0); - osites /= BLOCK_SIZE; - - int _fast_osites = fast_osites; - - accelerator_for(ss_block,osites * _npb,T::Nsimd(),{ - - uint64_t ss, oblock; - - MAP_INDEXING(ss, oblock); - - for (int iblock=0;iblock<_npbs;iblock++) { - - int i = oblock * _npbs + iblock; - - const auto _p = &p_code[i]; - const auto _f0 = &_p->factor[0]; - const auto _f1 = &_p->factor[1]; - - int aNN = nelements[_f0->index] * NSIMD; - int bNN = nelements[_f1->index] * NSIMD; - int cNN = nelements[_p->target] * NSIMD; - - int lane = acceleratorSIMTlane(T::Nsimd()); - element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * BLOCK_SIZE * ss + _f0->element * NSIMD + lane]; - element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * BLOCK_SIZE * ss + _f1->element * NSIMD + lane]; - element_t* __restrict__ e_c = &fields_v[_p->target][cNN * BLOCK_SIZE * ss + _p->element * NSIMD + lane]; - -#define TC_MOV 0 -#define TC_INC 1 -#define TC_MOV_NEG 2 -#define TC_DEC 3 -#define TC_MOV_CC 4 -#define TC_INC_CC 5 -#define TC_MOV_NEG_CC 6 -#define TC_DEC_CC 7 -#define TC_MUL 8 - -#define ID(a) a -#define CONJ(a) adj(a) -#define KERNEL(signature, functor) \ - for (int ff=0;ffinstruction) { - case TC_INC: - KERNEL(+=,ID); - break; - case TC_MOV: - KERNEL(=,ID); - break; - case TC_DEC: - KERNEL(-=,ID); - break; - case TC_MOV_NEG: - KERNEL(=-,ID); - break; - case TC_INC_CC: - KERNEL(+=,CONJ); - break; - case TC_MOV_CC: - KERNEL(=,CONJ); - break; - case TC_DEC_CC: - KERNEL(-=,CONJ); - break; - case TC_MOV_NEG_CC: - KERNEL(=-,CONJ); - break; - case TC_MUL: - for (int ff=0;ffweight); - break; - } - } - - }); - - - //#endif // and cleanup //CGPT_CARTESIAN_STENCIL_CLEANUP(T,); - } VECTOR_ELEMENT_VIEW_CLOSE(fields); } - virtual void execute(const std::vector& fields, int kernel_param) { + virtual void execute(const std::vector& fields, + const cgpt_stencil_tensor_execute_params_t& params) { - int _BLOCK_SIZE, _fast_osites; - if (kernel_param > 0) { - _BLOCK_SIZE = kernel_param; - _fast_osites = 1; - } else { - _BLOCK_SIZE = -kernel_param; - _fast_osites = 0; - } - - switch (_BLOCK_SIZE) { - case 1: block_execute<1>(fields, _fast_osites); break; - case 2: block_execute<2>(fields, _fast_osites); break; - case 4: block_execute<4>(fields, _fast_osites); break; - case 8: block_execute<8>(fields, _fast_osites); break; - case 16: block_execute<16>(fields, _fast_osites); break; - case 32: block_execute<32>(fields, _fast_osites); break; - case 64: block_execute<64>(fields, _fast_osites); break; - default: ERR("BLOCK_SIZE = %d not implemented", _BLOCK_SIZE); + switch (params.osites_per_instruction) { + case 1: block_execute<1>(fields, params.osites_per_cache_block); break; + case 2: block_execute<2>(fields, params.osites_per_cache_block); break; + case 4: block_execute<4>(fields, params.osites_per_cache_block); break; + case 8: block_execute<8>(fields, params.osites_per_cache_block); break; + case 16: block_execute<16>(fields, params.osites_per_cache_block); break; + case 32: block_execute<32>(fields, params.osites_per_cache_block); break; + case 64: block_execute<64>(fields, params.osites_per_cache_block); break; + default: ERR("params.osites_per_instruction = %d not implemented", params.osites_per_cache_block); } } @@ -398,6 +328,13 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_factor_t& out) { cgpt_convert(PyTuple_GetItem(in, 2), out.element); } +static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_segment_t& out) { + ASSERT(PyTuple_Check(in)); + ASSERT(PyTuple_Size(in) == 2); + cgpt_convert(PyTuple_GetItem(in, 0), out.block_size); + cgpt_convert(PyTuple_GetItem(in, 1), out.number_of_blocks); +} + static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) { ASSERT(PyDict_Check(in)); @@ -411,7 +348,7 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) { template cgpt_stencil_tensor_base* cgpt_stencil_tensor_create(GridBase* grid, PyObject* _shifts, - PyObject* _code, long code_parallel_block_size, + PyObject* _code, PyObject* _segments, long local) { std::vector shifts; @@ -420,5 +357,8 @@ cgpt_stencil_tensor_base* cgpt_stencil_tensor_create(GridBase* grid, PyObject* _ std::vector code; cgpt_convert(_code,code); - return new cgpt_stencil_tensor(grid,shifts,code,code_parallel_block_size, local); + std::vector segments; + cgpt_convert(_segments,segments); + + return new cgpt_stencil_tensor(grid,shifts,code, segments, local); } diff --git a/lib/gpt/core/local_stencil/tensor.py b/lib/gpt/core/local_stencil/tensor.py index 4d267136a..43700183d 100644 --- a/lib/gpt/core/local_stencil/tensor.py +++ b/lib/gpt/core/local_stencil/tensor.py @@ -33,19 +33,20 @@ def parse(c): class tensor: - def __init__(self, lat, points, code, code_parallel_block_size=None, local=1): + def __init__(self, lat, points, code, segments, local=1): self.points = points self.code = [parse(c) for c in code] - self.code_parallel_block_size = code_parallel_block_size - if code_parallel_block_size is None: - code_parallel_block_size = len(code) + self.segments = segments self.obj = cgpt.stencil_tensor_create( - lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local + lat.v_obj[0], lat.grid.obj, points, self.code, self.segments, local ) - self.fast_osites = 1 + self.osites_per_instruction = 4 + self.osites_per_cache_block = 4096 def __call__(self, *fields): - cgpt.stencil_tensor_execute(self.obj, list(fields), self.fast_osites) + cgpt.stencil_tensor_execute(self.obj, list(fields), + self.osites_per_instruction, + self.osites_per_cache_block) def __del__(self): cgpt.stencil_tensor_delete(self.obj) @@ -53,5 +54,6 @@ def __del__(self): def data_access_hints(self, *hints): pass - def memory_access_pattern(self, fast_osites): - self.fast_osites = fast_osites + def memory_access_pattern(self, osites_per_instruction, osites_per_cache_block): + self.osites_per_instruction = osites_per_instruction + self.osites_per_cache_block = osites_per_cache_block diff --git a/lib/gpt/core/stencil/tensor.py b/lib/gpt/core/stencil/tensor.py index 5d24fd3f2..d2a379b9d 100644 --- a/lib/gpt/core/stencil/tensor.py +++ b/lib/gpt/core/stencil/tensor.py @@ -19,9 +19,9 @@ import gpt as g -def tensor(lat, points, code, code_parallel_block_size=None): +def tensor(lat, points, code, segments): # check if all points are cartesian for p in points: if len([s for s in p if s != 0]) > 1: raise Exception("Only cartesian version currently implemented") - return g.local_stencil.tensor(lat, points, code, code_parallel_block_size, local=0) + return g.local_stencil.tensor(lat, points, code, segments, local=0)