Skip to content

Commit

Permalink
towards general stencil architecture
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed Oct 23, 2023
1 parent 6d24604 commit 31d05fb
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 36 deletions.
11 changes: 6 additions & 5 deletions benchmarks/stencil_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import gpt as g
#grid = g.grid([64,64,64,64], g.double)
#grid = g.grid([32,32,32,32], g.double)
grid = g.grid([32,16,16,16], g.double)
#grid = g.grid([32,16,16,16], g.double)
grid = g.grid([16,16,16,32], g.double)
m1 = g.mcolor(grid)
m2 = g.mcolor(grid)
m3 = g.mcolor(grid)
Expand All @@ -20,15 +21,15 @@
(0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)])
)

ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code) // 9)
ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code))# // 9

ein.memory_access_pattern(fast_osites=-3)
#ein.memory_access_pattern(fast_osites=-3)

ein(m3,m1,m2)
g.message(g.norm2(m3 - m3ref))


for block_size in [1,4,8,-1,-4,-5,-8]:
for block_size in [1,4,8,-1,-4,-8,-16,-32,-64]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)
Expand Down Expand Up @@ -85,7 +86,7 @@
# D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])


for block_size in [1,4,8,-1,-4,-5,-8]:
for block_size in [1,4,8,-1,-4,-8,-16,-32]:
ein.memory_access_pattern(fast_osites=block_size)

g.message(block_size)
Expand Down
6 changes: 4 additions & 2 deletions lib/cgpt/lib/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ static void cgpt_tensor_indices_to_memory_offsets(std::vector<long>& t_indices,

long Nsimd, word, simd_word;
std::vector<long> _lshape;
l->describe_data_layout(Nsimd,word,simd_word,_lshape);
l->describe_data_layout(Nsimd,word,simd_word);
l->describe_data_shape(_lshape);
ASSERT(_lshape.size() == dim_indices);

// virtual memory layout
Expand Down Expand Up @@ -280,7 +281,8 @@ static int cgpt_get_vlat_data_layout(GridBase* & grid,

std::vector<long> ishape;
long Nsimd, word, simd_word;
l->describe_data_layout(Nsimd,word,simd_word,ishape);
l->describe_data_layout(Nsimd,word,simd_word);
l->describe_data_shape(ishape);

long _sz_scalar = simd_word;
long _sz_vector = simd_word * Nsimd;
Expand Down
17 changes: 17 additions & 0 deletions lib/cgpt/lib/foundation.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,23 @@ using namespace Grid;
for(uint64_t k=0;k<__ ## v.size();k++) __ ## v[k].ViewClose();


// Open raw element views on a list of cgpt lattices.
// For each lattice l[k] this:
//   - opens a memory view in `mode` and stores the ET* base pointer in _<v>,
//   - queries the data layout and stores word / simd_word (the number of
//     ET elements per site) in __<v>.
// After expansion, `v` is an array of per-lattice ET* base pointers and
// `nelements` the matching per-lattice element counts.  Must be paired with
// VECTOR_ELEMENT_VIEW_CLOSE on the same list `l`.
// NOTE: comments cannot sit on the continuation lines themselves (the
// backslash must be the last character), hence this header block.
#define VECTOR_ELEMENT_VIEW_OPEN(ET, l, v, mode) \
  Vector<ET*> _ ## v; _ ## v.reserve(l.size()); \
  Vector<int> __ ## v; __ ## v.reserve(l.size()); \
  for(uint64_t k=0;k<l.size();k++) { \
    _ ## v.push_back((ET*)l[k]->memory_view_open(mode)); \
    long Nsimd, word, simd_word; \
    l[k]->describe_data_layout(Nsimd,word,simd_word); \
    __ ## v.push_back(word / simd_word); \
  } \
  auto v = & _ ## v[0]; \
  auto nelements = & __ ## v[0];

// Close every view previously opened on the list `l` by
// VECTOR_ELEMENT_VIEW_OPEN (each lattice remembers its own open mode).
#define VECTOR_ELEMENT_VIEW_CLOSE(l) \
  for(uint64_t k=0;k<l.size();k++) l[k]->memory_view_close();



NAMESPACE_BEGIN(Grid);

// aligned vector
Expand Down
9 changes: 8 additions & 1 deletion lib/cgpt/lib/lattice/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
*/
class cgpt_block_map_base;
class cgpt_lattice_term;
class cgpt_stencil_matrix_base;
class cgpt_stencil_matrix_vector_base;
class cgpt_stencil_tensor_base;

class cgpt_Lattice_base {
public:
virtual ~cgpt_Lattice_base() { };
Expand Down Expand Up @@ -56,7 +60,10 @@ class cgpt_Lattice_base {
virtual void basis_rotate(std::vector<cgpt_Lattice_base*> &basis,ComplexD* Qt,int j0, int j1, int k0,int k1,int Nm,bool use_accelerator) = 0;
virtual void linear_combination(std::vector<cgpt_Lattice_base*>& dst, std::vector<cgpt_Lattice_base*> &basis,ComplexD* Qt, long n_virtual, long basis_n_block) = 0;
virtual PyObject* memory_view(memory_type mt) = 0; // access to internal memory storage, can be simd format
virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word, std::vector<long> & ishape) = 0;
virtual void* memory_view_open(ViewMode mode) = 0;
virtual void memory_view_close() = 0;
virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word) = 0;
virtual void describe_data_shape(std::vector<long> & ishape) = 0;
virtual int get_numpy_dtype() = 0;
virtual cgpt_block_map_base* block_map(GridBase* coarse, std::vector<cgpt_Lattice_base*>& basis,
long basis_n_virtual, long basis_virtual_size, long basis_n_block,
Expand Down
18 changes: 15 additions & 3 deletions lib/cgpt/lib/lattice/implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ template<class T>
class cgpt_Lattice : public cgpt_Lattice_base {
public:
Lattice<T> l;
ViewMode view_mode;

typedef typename Lattice<T>::vector_object vobj;
typedef typename vobj::scalar_object sobj;
Expand Down Expand Up @@ -208,6 +209,15 @@ class cgpt_Lattice : public cgpt_Lattice_base {
cgpt_linear_combination(dst,basis,Qt,n_virtual,basis_n_block);
}

  // Open a raw memory view on the lattice payload and return its host base
  // pointer.  The requested ViewMode is remembered in view_mode so that the
  // matching memory_view_close() releases the view with the same mode.
  // NOTE(review): a second open with a different mode overwrites view_mode
  // before the first close — callers must not nest views on one lattice.
  virtual void* memory_view_open(ViewMode mode) {
    view_mode = mode;
    return MemoryManager::ViewOpen(l.getHostPointer(),l.oSites()*sizeof(T), mode, l.Advise());
  }

  // Close the view opened by the preceding memory_view_open(), using the
  // ViewMode remembered at open time.
  virtual void memory_view_close() {
    MemoryManager::ViewClose(l.getHostPointer(), view_mode);
  }

virtual PyObject* memory_view(memory_type mt) {

if (mt == mt_none) {
Expand All @@ -231,11 +241,13 @@ class cgpt_Lattice : public cgpt_Lattice_base {
return r;
}

virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word, std::vector<long> & ishape) {
GridBase* grid = l.Grid();
Nsimd = grid->Nsimd();
  // Report the per-site SIMD layout of this lattice:
  //   Nsimd     — SIMD lanes per vector object (from the grid),
  //   word      — bytes of one scalar site object,
  //   simd_word — bytes of one fundamental coefficient,
  // so word / simd_word is the number of scalar elements per site.
  // (The per-index tensor shape moved to describe_data_shape().)
  virtual void describe_data_layout(long & Nsimd, long & word, long & simd_word) {
    Nsimd = l.Grid()->Nsimd();
    word = sizeof(sobj);
    simd_word = sizeof(Coeff_t);
  }

virtual void describe_data_shape(std::vector<long> & ishape) {
ishape.resize(0);
cgpt_numpy_data_layout(sobj(),ishape);
if (ishape.size() == 0) // treat complex numbers as 1d array with one element
Expand Down
2 changes: 1 addition & 1 deletion lib/cgpt/lib/lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
#include "numpy.h"
#include "distribute.h"
#include "transform.h"
#include "stencil.h"
#include "grid.h"
#include "lattice.h"
#include "stencil.h"
#include "util.h"
#include "precision.h"
#include "expression.h"
Expand Down
3 changes: 2 additions & 1 deletion lib/cgpt/lib/random/parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ PyObject* cgpt_random_sample(DIST & dist,sRNG& srng,pRNG& prng,
for (auto & l : lattices) {
long Nsimd, word, simd_word;
std::vector<long> ishape;
l->describe_data_layout(Nsimd, word, simd_word, ishape);
l->describe_data_layout(Nsimd, word, simd_word);
l->describe_data_shape(ishape);
long l_n_per_site = word / simd_word;
n_per_site += (size_t)l_n_per_site;
if (complex_size) {
Expand Down
161 changes: 138 additions & 23 deletions lib/cgpt/lib/stencil/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,42 +131,48 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
delete sm;
}
}

virtual void execute(PVector<Lattice<T>> &fields, int fast_osites) {
/*
VECTOR_VIEW_OPEN(fields,fields_v,AcceleratorWrite);
TODO:
stencils should return different options for current hardware for performance (including max _npb)
*/
template<int BLOCK_SIZE>
void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) {

#ifndef GRID_HAS_ACCELERATOR
typedef typename T::vector_type element_t;
#define NSIMD 1
#else
typedef typename T::scalar_type element_t;
#define NSIMD (sizeof(typename T::vector_type) / sizeof(typename T::scalar_type))
#endif
typedef typename element_t::scalar_type coeff_t;

VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite);

int n_code = code.size();
const cgpt_stencil_tensor_code_offload_t* p_code = &code[0];

typedef typename T::scalar_type coeff_t;

int nd = fields[0].Grid()->Nd();
int nd = fields[0]->get_grid()->Nd();

int _npb = n_code_parallel_blocks;
int _npbs = n_code_parallel_block_size;

uint64_t osites = fields[0].Grid()->oSites();
uint64_t osites = fields[0]->get_grid()->oSites();
uint64_t osite_blocks = osites;

int _fast_osites, BLOCK_SIZE;
int _fast_osites;

if (local) {

ERR("Not implemented yet");

} else {

CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);
//CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);

if (fast_osites > 0) {
BLOCK_SIZE = fast_osites;
_fast_osites = 1;
} else {
BLOCK_SIZE = -fast_osites;
_fast_osites = 0;
}

#define TC_MOV 0
#define TC_INC 1
#define TC_MOV_NEG 2
Expand All @@ -177,6 +183,11 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
#define TC_DEC_CC 7
#define TC_MUL 8

/*
#ifdef GRID_HAS_ACCELERATOR
#define _ID(a) a
#define _CONJ(a) adj(a)
#define _INC(a,b,c) a + b*c
Expand Down Expand Up @@ -258,19 +269,123 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
}
accelerator_barrier();
#else
*/

// CPU version
ASSERT(osites % BLOCK_SIZE == 0);
osites /= BLOCK_SIZE;

int _fast_osites = fast_osites;

accelerator_for(ss_block,osites * _npb,T::Nsimd(),{

uint64_t ss, oblock;

MAP_INDEXING(ss, oblock);

for (int iblock=0;iblock<_npbs;iblock++) {

int i = oblock * _npbs + iblock;

const auto _p = &p_code[i];
const auto _f0 = &_p->factor[0];
const auto _f1 = &_p->factor[1];

int aNN = nelements[_f0->index] * NSIMD;
int bNN = nelements[_f1->index] * NSIMD;
int cNN = nelements[_p->target] * NSIMD;

int lane = acceleratorSIMTlane(T::Nsimd());
element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * BLOCK_SIZE * ss + _f0->element * NSIMD + lane];
element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * BLOCK_SIZE * ss + _f1->element * NSIMD + lane];
element_t* __restrict__ e_c = &fields_v[_p->target][cNN * BLOCK_SIZE * ss + _p->element * NSIMD + lane];

#define TC_MOV 0
#define TC_INC 1
#define TC_MOV_NEG 2
#define TC_DEC 3
#define TC_MOV_CC 4
#define TC_INC_CC 5
#define TC_MOV_NEG_CC 6
#define TC_DEC_CC 7
#define TC_MUL 8

#define ID(a) a
#define CONJ(a) adj(a)
#define KERNEL(signature, functor) \
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[cNN * ff] signature functor(e_a[aNN * ff]) * e_b[bNN * ff];

switch (_p->instruction) {
case TC_INC:
KERNEL(+=,ID);
break;
case TC_MOV:
KERNEL(=,ID);
break;
case TC_DEC:
KERNEL(-=,ID);
break;
case TC_MOV_NEG:
KERNEL(=-,ID);
break;
case TC_INC_CC:
KERNEL(+=,CONJ);
break;
case TC_MOV_CC:
KERNEL(=,CONJ);
break;
case TC_DEC_CC:
KERNEL(-=,CONJ);
break;
case TC_MOV_NEG_CC:
KERNEL(=-,CONJ);
break;
case TC_MUL:
for (int ff=0;ff<BLOCK_SIZE;ff++) \
e_c[cNN * ff] *= ((coeff_t)_p->weight);
break;
}
}

});


//#endif

// and cleanup
CGPT_CARTESIAN_STENCIL_CLEANUP(T,);
//CGPT_CARTESIAN_STENCIL_CLEANUP(T,);

}

VECTOR_VIEW_CLOSE(fields_v);
VECTOR_ELEMENT_VIEW_CLOSE(fields);
}

virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields, int fast_osites) {
PVector<Lattice<T>> fields;
cgpt_basis_fill(fields,__fields);
execute(fields, fast_osites);
virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int kernel_param) {

int _BLOCK_SIZE, _fast_osites;
if (kernel_param > 0) {
_BLOCK_SIZE = kernel_param;
_fast_osites = 1;
} else {
_BLOCK_SIZE = -kernel_param;
_fast_osites = 0;
}

switch (_BLOCK_SIZE) {
case 1: block_execute<1>(fields, _fast_osites); break;
case 2: block_execute<2>(fields, _fast_osites); break;
case 4: block_execute<4>(fields, _fast_osites); break;
case 8: block_execute<8>(fields, _fast_osites); break;
case 16: block_execute<16>(fields, _fast_osites); break;
case 32: block_execute<32>(fields, _fast_osites); break;
case 64: block_execute<64>(fields, _fast_osites); break;
default: ERR("BLOCK_SIZE = %d not implemented", _BLOCK_SIZE);
}

}
};

Expand Down

0 comments on commit 31d05fb

Please sign in to comment.