diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py index 2b710387..dade597b 100755 --- a/benchmarks/stencil_tensor.py +++ b/benchmarks/stencil_tensor.py @@ -30,7 +30,7 @@ g.message(g.norm2(m3 - m3ref)) -for osites_per_instruction in [32,128,256]: #[1,8,16,32,64]: +for osites_per_instruction in [16,32,128,256]: #[1,8,16,32,64]: for osites_per_cache_block in [2**15, grid.gsites]: #[2**11, 2**13, 2**15, grid.gsites]: ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block) diff --git a/lib/cgpt/lib/foundation.h b/lib/cgpt/lib/foundation.h index 856c1e4d..c076d932 100644 --- a/lib/cgpt/lib/foundation.h +++ b/lib/cgpt/lib/foundation.h @@ -56,7 +56,7 @@ using namespace Grid; __ ## v.push_back(word / simd_word); \ } \ auto v = & _ ## v[0]; \ - auto nelements = & __ ## v[0]; + auto v ## _nelements = & __ ## v[0]; #define VECTOR_ELEMENT_VIEW_CLOSE(l) \ for(uint64_t k=0;kmemory_view_close(); diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h index 45068ce6..0b316774 100644 --- a/lib/cgpt/lib/stencil/tensor.h +++ b/lib/cgpt/lib/stencil/tensor.h @@ -29,6 +29,8 @@ struct cgpt_stencil_tensor_execute_params_t { }; struct cgpt_stencil_tensor_factor_t { + void* base_ptr; + uint64_t stride; int16_t index; // index of field int16_t point; // index of shift uint16_t element; // index of tensor element @@ -36,6 +38,8 @@ struct cgpt_stencil_tensor_factor_t { }; struct cgpt_stencil_tensor_code_offload_t { + void* base_ptr; + uint64_t stride; int16_t target; int16_t is_temporary; uint16_t element; @@ -164,7 +168,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { VECTOR_ELEMENT_VIEW_OPEN(element_t, fields, fields_v, AcceleratorWrite); int n_code = code.size(); - const cgpt_stencil_tensor_code_offload_t* p_code = &code[0]; + cgpt_stencil_tensor_code_offload_t* p_code = &code[0]; int nd = fields[0]->get_grid()->Nd(); @@ -211,11 +215,10 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { } \ break; \ } - + #define KERNEL_BIN(signature, op, functor, NN) { \ - int bNN = nelements[_f1->index] * NSIMD; \ - int ss_b = (_f1->is_temporary) ? ss : ss + oblock0; \ - element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * NN * ss_b + _f1->element * NSIMD + lane]; \ + auto bNN = _f1->stride; \ + element_t* __restrict__ e_b = ((element_t*)_f1->base_ptr) + bNN * NN * MAP_INDEX(_f1,ss) + lane; \ for (int ff=0;ffstride = fields_v_nelements[_p->target] * NSIMD; + if (_p->is_temporary) { + _p->base_ptr = &fields_v[_p->target][_p->element * NSIMD]; + } else { + _p->base_ptr = &fields_v[_p->target][_p->stride * osites_per_instruction * oblock0 + _p->element * NSIMD]; + } + for (int j=0;j<_p->size;j++) { + cgpt_stencil_tensor_factor_t* _f = &_p->factor[j]; + _f->stride = fields_v_nelements[_f->index] * NSIMD; + if (_f->is_temporary) { + _f->base_ptr = &fields_v[_f->index][_f->element * NSIMD]; + } else { + _f->base_ptr = &fields_v[_f->index][_f->stride * osites_per_instruction * oblock0 + _f->element * NSIMD]; + } + } + } + //std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl; #ifdef GRID_HAS_ACCELERATOR +#define MAP_INDEX(x,ss) ss int coffset = 0; for (auto & segment : segments) { int _npb = segment.number_of_blocks; @@ -249,6 +272,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), { uint64_t cc = ss_block % _npb; #else +#define MAP_INDEX(x,ss) ss + //(x->is_temporary ? ss : ss) #define _npb 1 #define _npbs n_code #define coffset 0 @@ -259,19 +284,17 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { uint64_t ss = ss_block / _npb; for (int ic=0;ic<_npbs;ic++) { + + const cgpt_stencil_tensor_code_offload_t* __restrict__ _p = &p_code[coffset + cc * _npbs + ic]; + const cgpt_stencil_tensor_factor_t* __restrict__ _f0 = &_p->factor[0]; + const cgpt_stencil_tensor_factor_t* __restrict__ _f1 = &_p->factor[1]; - const auto _p = &p_code[coffset + cc * _npbs + ic]; - const auto _f0 = &_p->factor[0]; - const auto _f1 = &_p->factor[1]; - - int aNN = nelements[_f0->index] * NSIMD; - int cNN = nelements[_p->target] * NSIMD; - int lane = acceleratorSIMTlane(T::Nsimd()); - int ss_a = (_f0->is_temporary) ? ss : ss + oblock0; - int ss_c = (_p->is_temporary) ? ss : ss + oblock0; - element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * osites_per_instruction * ss_a + _f0->element * NSIMD + lane]; - element_t* __restrict__ e_c = &fields_v[_p->target][cNN * osites_per_instruction * ss_c + _p->element * NSIMD + lane]; + auto aNN = _f0->stride; + element_t* __restrict__ e_a = ((element_t*)_f0->base_ptr) + aNN * osites_per_instruction * MAP_INDEX(_f0,ss) + lane; + + auto cNN = _p->stride; + element_t* __restrict__ e_c = ((element_t*)_p->base_ptr) + cNN * osites_per_instruction * MAP_INDEX(_p,ss) + lane; EXECUTE(KERNEL_BIN, osites_per_instruction); } @@ -291,19 +314,17 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base { for (int ic=0;ic<_npbs;ic++) { - const auto _p = &p_code[coffset + cc * _npbs + ic]; - const auto _f0 = &_p->factor[0]; - const auto _f1 = &_p->factor[1]; - - int aNN = nelements[_f0->index] * NSIMD; - int cNN = nelements[_p->target] * NSIMD; + const cgpt_stencil_tensor_code_offload_t* __restrict__ _p = &p_code[coffset + cc * _npbs + ic]; + const cgpt_stencil_tensor_factor_t* __restrict__ _f0 = &_p->factor[0]; + const cgpt_stencil_tensor_factor_t* __restrict__ _f1 = &_p->factor[1]; int lane = acceleratorSIMTlane(T::Nsimd()); - int ss_a = (_f0->is_temporary) ? ss : ss + oblock0; // on M1 did not find a single case for which cache re-use for temporaries worked - int ss_c = (_p->is_temporary) ? ss : ss + oblock0; - element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * ss_a + _f0->element * NSIMD + lane]; - element_t* __restrict__ e_c = &fields_v[_p->target][cNN * ss_c + _p->element * NSIMD + lane]; + auto aNN = _f0->stride; + element_t* __restrict__ e_a = ((element_t*)_f0->base_ptr) + aNN * MAP_INDEX(_f0,ss) + lane; + auto cNN = _p->stride; + element_t* __restrict__ e_c = ((element_t*)_p->base_ptr) + cNN * MAP_INDEX(_p,ss) + lane; + EXECUTE(KERNEL_BIN, 1); } });