Commit
acceptable setup on m1
lehner committed Oct 25, 2023
1 parent e1e55d8 commit d4553c1
Showing 3 changed files with 50 additions and 29 deletions.
benchmarks/stencil_tensor.py (2 changes: 1 addition & 1 deletion)
@@ -30,7 +30,7 @@
 g.message(g.norm2(m3 - m3ref))
 
 
-for osites_per_instruction in [32,128,256]: #[1,8,16,32,64]:
+for osites_per_instruction in [16,32,128,256]: #[1,8,16,32,64]:
     for osites_per_cache_block in [2**15, grid.gsites]: #[2**11, 2**13, 2**15, grid.gsites]:
         ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
lib/cgpt/lib/foundation.h (2 changes: 1 addition & 1 deletion)
@@ -56,7 +56,7 @@ using namespace Grid;
     __ ## v.push_back(word / simd_word); \
   } \
   auto v = & _ ## v[0]; \
-  auto nelements = & __ ## v[0];
+  auto v ## _nelements = & __ ## v[0];
 
 #define VECTOR_ELEMENT_VIEW_CLOSE(l) \
   for(uint64_t k=0;k<l.size();k++) l[k]->memory_view_close();
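This one-line change matters because the macro used to bind the fixed name nelements, so the element-count array could not be tied to a particular view name; pasting the view name yields fields_v_nelements, which the new tensor.h code reads. A minimal sketch of the token-pasting pattern; the macro body and the counts are simplified assumptions:

#include <cstdio>
#include <vector>

// Simplified stand-in for VECTOR_ELEMENT_VIEW_OPEN: `v ## _nelements`
// pastes the view name, so each opened view gets its own counts array.
#define ELEMENT_VIEW_OPEN(v) \
  std::vector<int> __ ## v = {12, 9, 12}; \
  auto v ## _nelements = & __ ## v[0];

int main() {
  ELEMENT_VIEW_OPEN(fields_v);  // defines fields_v_nelements
  ELEMENT_VIEW_OPEN(buffer_v);  // defines buffer_v_nelements, no collision
  printf("%d %d\n", fields_v_nelements[0], buffer_v_nelements[2]);
  return 0;
}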
lib/cgpt/lib/stencil/tensor.h (75 changes: 48 additions & 27 deletions)
@@ -29,13 +29,17 @@ struct cgpt_stencil_tensor_execute_params_t {
 };
 
 struct cgpt_stencil_tensor_factor_t {
+  void* base_ptr;
+  uint64_t stride;
   int16_t index; // index of field
   int16_t point; // index of shift
   uint16_t element; // index of tensor element
   int16_t is_temporary;
 };
 
 struct cgpt_stencil_tensor_code_offload_t {
+  void* base_ptr;
+  uint64_t stride;
   int16_t target;
   int16_t is_temporary;
   uint16_t element;
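The two new fields cache what the kernel previously recomputed per site: stride is the per-site element count (nelements * NSIMD) and base_ptr is a view pointer with the tensor-element offset, and for non-temporaries the cache-block offset, already folded in. A compile-only sketch of the addressing they enable; factor_view and the float element type are stand-ins, not the commit's types:

#include <cstdint>

struct factor_view {   // stand-in for cgpt_stencil_tensor_factor_t
  void* base_ptr;      // precomputed once per cache block
  uint64_t stride;     // nelements(field) * NSIMD
};

// Mirrors the kernel's pattern: ((element_t*)f->base_ptr) + stride * ss + lane
float* site_address(const factor_view& f, uint64_t ss, int lane) {
  return (float*)f.base_ptr + f.stride * ss + lane;
}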
@@ -164,7 +168,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
     VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite);
 
     int n_code = code.size();
-    const cgpt_stencil_tensor_code_offload_t* p_code = &code[0];
+    cgpt_stencil_tensor_code_offload_t* p_code = &code[0];
 
     int nd = fields[0]->get_grid()->Nd();

@@ -211,11 +215,10 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
         } \
         break; \
       }
 
 #define KERNEL_BIN(signature, op, functor, NN) { \
-    int bNN = nelements[_f1->index] * NSIMD; \
-    int ss_b = (_f1->is_temporary) ? ss : ss + oblock0; \
-    element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * NN * ss_b + _f1->element * NSIMD + lane]; \
+    auto bNN = _f1->stride; \
+    element_t* __restrict__ e_b = ((element_t*)_f1->base_ptr) + bNN * NN * MAP_INDEX(_f1,ss) + lane; \
     for (int ff=0;ff<NN;ff++) \
       e_c[cNN * ff] signature functor(e_a[aNN * ff]) op e_b[bNN * ff]; \
 }
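After this change KERNEL_BIN touches memory only through the precomputed base pointers and strides, advancing one stride per site of the instruction block. A runnable sketch of the access pattern the macro expands to, with signature '=', op '*', and an identity functor; all sizes and values are illustrative assumptions:

#include <cstdio>

using element_t = float; // assumption; the real element_t depends on the field type

int main() {
  const int NN = 4;                    // osites_per_instruction
  const int aNN = 8, bNN = 8, cNN = 8; // strides: nelements * NSIMD
  element_t a[aNN * NN], b[bNN * NN], c[cNN * NN];
  for (int i = 0; i < aNN * NN; i++) { a[i] = 1.0f; b[i] = 2.0f; }

  // base_ptr plus element/lane offsets, as precomputed before the kernel
  element_t* e_a = a;
  element_t* e_b = b;
  element_t* e_c = c;

  // Expansion of: e_c[cNN * ff] = e_a[aNN * ff] * e_b[bNN * ff]
  for (int ff = 0; ff < NN; ff++)
    e_c[cNN * ff] = e_a[aNN * ff] * e_b[bNN * ff];

  printf("%g\n", (double)e_c[0]); // prints 2
  return 0;
}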
@@ -226,7 +229,6 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 #ifndef GRID_HAS_ACCELERATOR
     thread_region {
 #endif
-
     uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block;
     for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) {
       uint64_t osites0 = std::min(ocache_block * osites_per_cache_block, osites);
@@ -240,15 +242,38 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
       uint64_t osites_extra_start = oblocks * osites_per_instruction;
       uint64_t osites_extra = osites_in_cache_block - osites_extra_start;
 
+      // set base_ptr for current views
+      for (int i=0;i<n_code;i++) {
+        cgpt_stencil_tensor_code_offload_t* _p = &p_code[i];
+        _p->stride = fields_v_nelements[_p->target] * NSIMD;
+        if (_p->is_temporary) {
+          _p->base_ptr = &fields_v[_p->target][_p->element * NSIMD];
+        } else {
+          _p->base_ptr = &fields_v[_p->target][_p->stride * osites_per_instruction * oblock0 + _p->element * NSIMD];
+        }
+        for (int j=0;j<_p->size;j++) {
+          cgpt_stencil_tensor_factor_t* _f = &_p->factor[j];
+          _f->stride = fields_v_nelements[_f->index] * NSIMD;
+          if (_f->is_temporary) {
+            _f->base_ptr = &fields_v[_f->index][_f->element * NSIMD];
+          } else {
+            _f->base_ptr = &fields_v[_f->index][_f->stride * osites_per_instruction * oblock0 + _f->element * NSIMD];
+          }
+        }
+      }
+
       //std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl;
 #ifdef GRID_HAS_ACCELERATOR
+#define MAP_INDEX(x,ss) ss
       int coffset = 0;
       for (auto & segment : segments) {
         int _npb = segment.number_of_blocks;
         int _npbs = segment.block_size;
         accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), {
           uint64_t cc = ss_block % _npb;
 #else
+#define MAP_INDEX(x,ss) ss
+      //(x->is_temporary ? ss : ss)
 #define _npb 1
 #define _npbs n_code
 #define coffset 0
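The new per-cache-block loop hoists the is_temporary branch and the nelements lookup out of the site loop: the oblock0 offset is folded into base_ptr once, so MAP_INDEX(x,ss) can be a plain ss on both paths. A small check of that pointer-arithmetic identity; all values are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  // cNN = stride, opi = osites_per_instruction; element/lane offsets as in the kernel
  uint64_t cNN = 36, opi = 16, oblock0 = 7, element = 2, NSIMD = 8, lane = 3;
  for (uint64_t ss = 0; ss < 4; ss++) {
    // old: per-site index, with the branch resolved to the non-temporary case
    uint64_t old_idx = cNN * opi * (ss + oblock0) + element * NSIMD + lane;
    // new: the oblock0 term folded into base_ptr once per cache block
    uint64_t base    = cNN * opi * oblock0 + element * NSIMD;
    uint64_t new_idx = base + cNN * opi * ss + lane;
    assert(old_idx == new_idx);
  }
  return 0;
}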
@@ -259,19 +284,17 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
           uint64_t ss = ss_block / _npb;
 
           for (int ic=0;ic<_npbs;ic++) {
 
-            const cgpt_stencil_tensor_code_offload_t* __restrict__ _p = &p_code[coffset + cc * _npbs + ic];
-            const cgpt_stencil_tensor_factor_t* __restrict__ _f0 = &_p->factor[0];
-            const cgpt_stencil_tensor_factor_t* __restrict__ _f1 = &_p->factor[1];
-
-            int aNN = nelements[_f0->index] * NSIMD;
-            int cNN = nelements[_p->target] * NSIMD;
+            const auto _p = &p_code[coffset + cc * _npbs + ic];
+            const auto _f0 = &_p->factor[0];
+            const auto _f1 = &_p->factor[1];
 
             int lane = acceleratorSIMTlane(T::Nsimd());
-            int ss_a = (_f0->is_temporary) ? ss : ss + oblock0;
-            int ss_c = (_p->is_temporary) ? ss : ss + oblock0;
-            element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * osites_per_instruction * ss_a + _f0->element * NSIMD + lane];
-            element_t* __restrict__ e_c = &fields_v[_p->target][cNN * osites_per_instruction * ss_c + _p->element * NSIMD + lane];
+            auto aNN = _f0->stride;
+            element_t* __restrict__ e_a = ((element_t*)_f0->base_ptr) + aNN * osites_per_instruction * MAP_INDEX(_f0,ss) + lane;
+
+            auto cNN = _p->stride;
+            element_t* __restrict__ e_c = ((element_t*)_p->base_ptr) + cNN * osites_per_instruction * MAP_INDEX(_p,ss) + lane;
 
             EXECUTE(KERNEL_BIN, osites_per_instruction);
           }
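For context, the flat index handed to accelerator_forNB encodes both which group of code entries and which instruction block a work item handles; the remainder sites then go through the same kernel with NN = 1 in the hunk below. A sketch of the decomposition; the segment sizes are illustrative assumptions:

#include <cstdint>
#include <cstdio>

int main() {
  int _npb = 3;         // segment.number_of_blocks
  uint64_t oblocks = 4; // full instruction blocks in the cache block
  for (uint64_t ss_block = 0; ss_block < oblocks * _npb; ss_block++) {
    uint64_t cc = ss_block % _npb; // selects the block of _npbs code entries
    uint64_t ss = ss_block / _npb; // selects the instruction block of sites
    printf("ss_block=%llu -> cc=%llu ss=%llu\n",
           (unsigned long long)ss_block, (unsigned long long)cc,
           (unsigned long long)ss);
  }
  return 0;
}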
@@ -291,19 +314,17 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 
           for (int ic=0;ic<_npbs;ic++) {
 
-            const auto _p = &p_code[coffset + cc * _npbs + ic];
-            const auto _f0 = &_p->factor[0];
-            const auto _f1 = &_p->factor[1];
-
-            int aNN = nelements[_f0->index] * NSIMD;
-            int cNN = nelements[_p->target] * NSIMD;
+            const cgpt_stencil_tensor_code_offload_t* __restrict__ _p = &p_code[coffset + cc * _npbs + ic];
+            const cgpt_stencil_tensor_factor_t* __restrict__ _f0 = &_p->factor[0];
+            const cgpt_stencil_tensor_factor_t* __restrict__ _f1 = &_p->factor[1];
 
             int lane = acceleratorSIMTlane(T::Nsimd());
-            int ss_a = (_f0->is_temporary) ? ss : ss + oblock0; // on M1 did not find a single case for which cache re-use for temporaries worked
-            int ss_c = (_p->is_temporary) ? ss : ss + oblock0;
-            element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * ss_a + _f0->element * NSIMD + lane];
-            element_t* __restrict__ e_c = &fields_v[_p->target][cNN * ss_c + _p->element * NSIMD + lane];
+            auto aNN = _f0->stride;
+            element_t* __restrict__ e_a = ((element_t*)_f0->base_ptr) + aNN * MAP_INDEX(_f0,ss) + lane;
+
+            auto cNN = _p->stride;
+            element_t* __restrict__ e_c = ((element_t*)_p->base_ptr) + cNN * MAP_INDEX(_p,ss) + lane;
 
             EXECUTE(KERNEL_BIN, 1);
           }
         });
