Skip to content

Commit

Permalink
towards general stencil architecture
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Oct 23, 2023
1 parent 6d24604 commit 31d05fb
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 36 deletions.
11 changes: 6 additions & 5 deletions benchmarks/stencil_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import gpt as g
#grid = g.grid([64,64,64,64], g.double)
#grid = g.grid([32,32,32,32], g.double)
grid = g.grid([32,16,16,16], g.double)
#grid = g.grid([32,16,16,16], g.double)
grid = g.grid([16,16,16,32], g.double)
m1 = g.mcolor(grid)
m2 = g.mcolor(grid)
m3 = g.mcolor(grid)
Expand All @@ -20,15 +21,15 @@
(0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)])
)

ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code) // 9)
ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code))# // 9

ein.memory_access_pattern(fast_osites=-3)
#ein.memory_access_pattern(fast_osites=-3)

ein(m3,m1,m2)
g.message(g.norm2(m3 - m3ref))


for block_size in [1,4,8,-1,-4,-5,-8]:
for block_size in [1,4,8,-1,-4,-8,-16,-32,-64]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)
Expand Down Expand Up @@ -85,7 +86,7 @@
# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])


for block_size in [1,4,8,-1,-4,-5,-8]:
for block_size in [1,4,8,-1,-4,-8,-16,-32]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)
Expand Down
6 changes: 4 additions & 2 deletions lib/cgpt/lib/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ static void cgpt_tensor_indices_to_memory_offsets(std::vector<long>& t_indices,

long Nsimd, word, simd_word;
std::vector<long> _lshape;
l->describe_data_layout(Nsimd,word,simd_word,_lshape);
l->describe_data_layout(Nsimd,word,simd_word);
l->describe_data_shape(_lshape);
ASSERT(_lshape.size() == dim_indices);

// virtual memory layout
Expand Down Expand Up @@ -280,7 +281,8 @@ static int cgpt_get_vlat_data_layout(GridBase* & grid,

std::vector<long> ishape;
long Nsimd, word, simd_word;
l->describe_data_layout(Nsimd,word,simd_word,ishape);
l->describe_data_layout(Nsimd,word,simd_word);
l->describe_data_shape(ishape);

long _sz_scalar = simd_word;
long _sz_vector = simd_word * Nsimd;
Expand Down
17 changes: 17 additions & 0 deletions lib/cgpt/lib/foundation.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,23 @@ using namespace Grid;
for(uint64_t k=0;k<__ ## v.size();k++) __ ## v[k].ViewClose();


// Open raw element views on a list of cgpt lattices.
// For each lattice l[k] this:
//   - opens a memory view in `mode` and stores the ET* base pointer in _<v>,
//   - queries the data layout and stores word / simd_word (the number of
//     ET elements per site) in __<v>.
// After expansion, `v` is an array of per-lattice ET* base pointers and
// `nelements` the matching per-lattice element counts.  Must be paired with
// VECTOR_ELEMENT_VIEW_CLOSE on the same list `l`.
// NOTE: comments cannot sit on the continuation lines themselves (the
// backslash must be the last character), hence this header block.
#define VECTOR_ELEMENT_VIEW_OPEN(ET, l, v, mode) \
  Vector<ET*> _ ## v; _ ## v.reserve(l.size()); \
  Vector<int> __ ## v; __ ## v.reserve(l.size()); \
  for(uint64_t k=0;k<l.size();k++) { \
    _ ## v.push_back((ET*)l[k]->memory_view_open(mode)); \
    long Nsimd, word, simd_word; \
    l[k]->describe_data_layout(Nsimd,word,simd_word); \
    __ ## v.push_back(word / simd_word); \
  } \
  auto v = & _ ## v[0]; \
  auto nelements = & __ ## v[0];

// Close every view previously opened on the list `l` by
// VECTOR_ELEMENT_VIEW_OPEN (each lattice remembers its own open mode).
#define VECTOR_ELEMENT_VIEW_CLOSE(l) \
  for(uint64_t k=0;k<l.size();k++) l[k]->memory_view_close();



NAMESPACE_BEGIN(Grid);

// aligned vector
Expand Down
9 changes: 8 additions & 1 deletion lib/cgpt/lib/lattice/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
*/
class cgpt_block_map_base;
class cgpt_lattice_term;
class cgpt_stencil_matrix_base;
class cgpt_stencil_matrix_vector_base;
class cgpt_stencil_tensor_base;

class cgpt_Lattice_base {
public:
virtual ~cgpt_Lattice_base() { };
Expand Down Expand Up @@ -56,7 +60,10 @@ class cgpt_Lattice_base {
virtual void basis_rotate(std::vector<cgpt_Lattice_base*> &basis,ComplexD* Qt,int j0, int j1, int k0,int k1,int Nm,bool use_accelerator) = 0;
virtual void linear_combination(std::vector<cgpt_Lattice_base*>& dst, std::vector<cgpt_Lattice_base*> &basis,ComplexD* Qt, long n_virtual, long basis_n_block) = 0;
virtual PyObject* memory_view(memory_type mt) = 0; // access to internal memory storage, can be simd format
virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word, std::vector<long> & ishape) = 0;
virtual void* memory_view_open(ViewMode mode) = 0;
virtual void memory_view_close() = 0;
virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word) = 0;
virtual void describe_data_shape(std::vector<long> & ishape) = 0;
virtual int get_numpy_dtype() = 0;
virtual cgpt_block_map_base* block_map(GridBase* coarse, std::vector<cgpt_Lattice_base*>& basis,
long basis_n_virtual, long basis_virtual_size, long basis_n_block,
Expand Down
18 changes: 15 additions & 3 deletions lib/cgpt/lib/lattice/implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ template<class T>
class cgpt_Lattice : public cgpt_Lattice_base {
public:
Lattice<T> l;
ViewMode view_mode;

typedef typename Lattice<T>::vector_object vobj;
typedef typename vobj::scalar_object sobj;
Expand Down Expand Up @@ -208,6 +209,15 @@ class cgpt_Lattice : public cgpt_Lattice_base {
cgpt_linear_combination(dst,basis,Qt,n_virtual,basis_n_block);
}

  // Open a raw memory view on the lattice payload and return its host base
  // pointer.  The requested ViewMode is remembered in view_mode so that the
  // matching memory_view_close() releases the view with the same mode.
  // NOTE(review): a second open with a different mode overwrites view_mode
  // before the first close — callers must not nest views on one lattice.
  virtual void* memory_view_open(ViewMode mode) {
    view_mode = mode;
    return MemoryManager::ViewOpen(l.getHostPointer(),l.oSites()*sizeof(T), mode, l.Advise());
  }

  // Close the view opened by the preceding memory_view_open(), using the
  // ViewMode remembered at open time.
  virtual void memory_view_close() {
    MemoryManager::ViewClose(l.getHostPointer(), view_mode);
  }

virtual PyObject* memory_view(memory_type mt) {

if (mt == mt_none) {
Expand All @@ -231,11 +241,13 @@ class cgpt_Lattice : public cgpt_Lattice_base {
return r;
}

virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word, std::vector<long> & ishape) {
GridBase* grid = l.Grid();
Nsimd = grid->Nsimd();
  // Report the per-site SIMD layout of this lattice:
  //   Nsimd     — SIMD lanes per vector object (from the grid),
  //   word      — bytes of one scalar site object,
  //   simd_word — bytes of one fundamental coefficient,
  // so word / simd_word is the number of scalar elements per site.
  // (The per-index tensor shape moved to describe_data_shape().)
  virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word) {
    Nsimd = l.Grid()->Nsimd();
    word = sizeof(sobj);
    simd_word = sizeof(Coeff_t);
  }

virtual void describe_data_shape(std::vector<long> & ishape) {
ishape.resize(0);
cgpt_numpy_data_layout(sobj(),ishape);
if (ishape.size() == 0) // treat complex numbers as 1d array with one element
Expand Down
2 changes: 1 addition & 1 deletion lib/cgpt/lib/lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
#include "numpy.h"
#include "distribute.h"
#include "transform.h"
#include "stencil.h"
#include "grid.h"
#include "lattice.h"
#include "stencil.h"
#include "util.h"
#include "precision.h"
#include "expression.h"
Expand Down
3 changes: 2 additions & 1 deletion lib/cgpt/lib/random/parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ PyObject* cgpt_random_sample(DIST & dist,sRNG& srng,pRNG& prng,
for (auto & l : lattices) {
long Nsimd, word, simd_word;
std::vector<long> ishape;
l->describe_data_layout(Nsimd, word, simd_word, ishape);
l->describe_data_layout(Nsimd, word, simd_word);
l->describe_data_shape(ishape);
long l_n_per_site = word / simd_word;
n_per_site += (size_t)l_n_per_site;
if (complex_size) {
Expand Down
161 changes: 138 additions & 23 deletions lib/cgpt/lib/stencil/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,42 +131,48 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
delete sm;
}
}

virtual void execute(PVector<Lattice<T>> &fields, int fast_osites) {
/*
VECTOR_VIEW_OPEN(fields,fields_v,AcceleratorWrite);
TODO:
stencils should return different options for current hardware for performance (including max _npb)
*/
template<int BLOCK_SIZE>
void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) {

#ifndef GRID_HAS_ACCELERATOR
typedef typename T::vector_type element_t;
#define NSIMD 1
#else
typedef typename T::scalar_type element_t;
#define NSIMD (sizeof(typename T::vector_type) / sizeof(typename T::scalar_type))
#endif
typedef typename element_t::scalar_type coeff_t;

VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite);

int n_code = code.size();
const cgpt_stencil_tensor_code_offload_t* p_code = &code[0];

typedef typename T::scalar_type coeff_t;

int nd = fields[0].Grid()->Nd();
int nd = fields[0]->get_grid()->Nd();

int _npb = n_code_parallel_blocks;
int _npbs = n_code_parallel_block_size;

uint64_t osites = fields[0].Grid()->oSites();
uint64_t osites = fields[0]->get_grid()->oSites();
uint64_t osite_blocks = osites;

int _fast_osites, BLOCK_SIZE;
int _fast_osites;

if (local) {

ERR("Not implemented yet");

} else {

CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);
//CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);

if (fast_osites > 0) {
BLOCK_SIZE = fast_osites;
_fast_osites = 1;
} else {
BLOCK_SIZE = -fast_osites;
_fast_osites = 0;
}

#define TC_MOV 0
#define TC_INC 1
#define TC_MOV_NEG 2
Expand All @@ -177,6 +183,11 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
#define TC_DEC_CC 7
#define TC_MUL 8

/*
#ifdef GRID_HAS_ACCELERATOR
#define _ID(a) a
#define _CONJ(a) adj(a)
#define _INC(a,b,c) a + b*c
Expand Down Expand Up @@ -258,19 +269,123 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
}
accelerator_barrier();
#else
*/

// CPU version
ASSERT(osites % BLOCK_SIZE == 0);
osites /= BLOCK_SIZE;

int _fast_osites = fast_osites;

accelerator_for(ss_block,osites * _npb,T::Nsimd(),{

uint64_t ss, oblock;

MAP_INDEXING(ss, oblock);

for (int iblock=0;iblock<_npbs;iblock++) {

int i = oblock * _npbs + iblock;

const auto _p = &p_code[i];
const auto _f0 = &_p->factor[0];
const auto _f1 = &_p->factor[1];

int aNN = nelements[_f0->index] * NSIMD;
int bNN = nelements[_f1->index] * NSIMD;
int cNN = nelements[_p->target] * NSIMD;

int lane = acceleratorSIMTlane(T::Nsimd());
element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * BLOCK_SIZE * ss + _f0->element * NSIMD + lane];
element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * BLOCK_SIZE * ss + _f1->element * NSIMD + lane];
element_t* __restrict__ e_c = &fields_v[_p->target][cNN * BLOCK_SIZE * ss + _p->element * NSIMD + lane];

#define TC_MOV 0
#define TC_INC 1
#define TC_MOV_NEG 2
#define TC_DEC 3
#define TC_MOV_CC 4
#define TC_INC_CC 5
#define TC_MOV_NEG_CC 6
#define TC_DEC_CC 7
#define TC_MUL 8

#define ID(a) a
#define CONJ(a) adj(a)
#define KERNEL(signature, functor) \
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[cNN * ff] signature functor(e_a[aNN * ff]) * e_b[bNN * ff];

switch (_p->instruction) {
case TC_INC:
KERNEL(+=,ID);
break;
case TC_MOV:
KERNEL(=,ID);
break;
case TC_DEC:
KERNEL(-=,ID);
break;
case TC_MOV_NEG:
KERNEL(=-,ID);
break;
case TC_INC_CC:
KERNEL(+=,CONJ);
break;
case TC_MOV_CC:
KERNEL(=,CONJ);
break;
case TC_DEC_CC:
KERNEL(-=,CONJ);
break;
case TC_MOV_NEG_CC:
KERNEL(=-,CONJ);
break;
case TC_MUL:
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[cNN * ff] *= ((coeff_t)_p->weight);
break;
}
}

});


//#endif

// and cleanup
CGPT_CARTESIAN_STENCIL_CLEANUP(T,);
//CGPT_CARTESIAN_STENCIL_CLEANUP(T,);

}

VECTOR_VIEW_CLOSE(fields_v);
VECTOR_ELEMENT_VIEW_CLOSE(fields);
}

virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields, int fast_osites) {
PVector<Lattice<T>> fields;
cgpt_basis_fill(fields,__fields);
execute(fields, fast_osites);
virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int kernel_param) {

int _BLOCK_SIZE, _fast_osites;
if (kernel_param > 0) {
_BLOCK_SIZE = kernel_param;
_fast_osites = 1;
} else {
_BLOCK_SIZE = -kernel_param;
_fast_osites = 0;
}

switch (_BLOCK_SIZE) {
case 1: block_execute<1>(fields, _fast_osites); break;
case 2: block_execute<2>(fields, _fast_osites); break;
case 4: block_execute<4>(fields, _fast_osites); break;
case 8: block_execute<8>(fields, _fast_osites); break;
case 16: block_execute<16>(fields, _fast_osites); break;
case 32: block_execute<32>(fields, _fast_osites); break;
case 64: block_execute<64>(fields, _fast_osites); break;
default: ERR("BLOCK_SIZE = %d not implemented", _BLOCK_SIZE);
}

}
};

Expand Down

0 comments on commit 31d05fb

Please sign in to comment.