From 830cc51f06594ce9da12407d50cd3b1919e107c4 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Tue, 24 Oct 2023 09:47:20 +0200
Subject: [PATCH] a100 version acceptable

---
 benchmarks/stencil_tensor.py          |  74 +++---
 lib/cgpt/lib/benchmarks.cc            | 172 -------------
 lib/cgpt/lib/lattice/base.h           |   2 +-
 lib/cgpt/lib/lattice/implementation.h |   4 +-
 lib/cgpt/lib/lib.h                    |   1 -
 lib/cgpt/lib/micro_kernel.cc          |  43 ----
 lib/cgpt/lib/micro_kernel.h           |  21 --
 lib/cgpt/lib/micro_kernel/action.h    |  32 ---
 lib/cgpt/lib/micro_kernel/argument.h  |  60 -----
 lib/cgpt/lib/micro_kernel/macros.h    |  42 ----
 lib/cgpt/lib/stencil.cc               |  23 +-
 lib/cgpt/lib/stencil/tensor.h         | 340 +++++++++++---------------
 lib/gpt/core/local_stencil/tensor.py  |  20 +-
 lib/gpt/core/stencil/tensor.py        |   4 +-
 14 files changed, 209 insertions(+), 629 deletions(-)
 delete mode 100644 lib/cgpt/lib/micro_kernel.cc
 delete mode 100644 lib/cgpt/lib/micro_kernel.h
 delete mode 100644 lib/cgpt/lib/micro_kernel/action.h
 delete mode 100644 lib/cgpt/lib/micro_kernel/argument.h
 delete mode 100644 lib/cgpt/lib/micro_kernel/macros.h

diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py
index 194c9504e..d80b672bd 100755
--- a/benchmarks/stencil_tensor.py
+++ b/benchmarks/stencil_tensor.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import gpt as g
 #grid = g.grid([64,64,64,64], g.double)
-#grid = g.grid([32,32,32,32], g.double)
+grid = g.grid([32,32,32,32], g.double)
 #grid = g.grid([32,16,16,16], g.double)
-grid = g.grid([16,16,16,32], g.double)
+#grid = g.grid([16,16,16,32], g.double)
+#grid = g.grid([2*4,4*3,3*4,3*3*4], g.double)
 m1 = g.mcolor(grid)
 m2 = g.mcolor(grid)
 m3 = g.mcolor(grid)
@@ -21,28 +22,30 @@
                 (0,dst,ti.mov if l == 0 else ti.inc,1.0,[(1,0,3*i + l),(2,0,3*l + j)])
             )
 
-ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, len(code))# // 9
-
-#ein.memory_access_pattern(fast_osites=-3)
+segments = [(3, 9)]  # one (block_size, number_of_blocks) tuple per code segment
+ein = g.stencil.tensor(m1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments)
 
 ein(m3,m1,m2)
 g.message(g.norm2(m3 - m3ref))
 
 
-for block_size in [1,4,8,-1,-4,-8,-16,-32,-64]:
-    ein.memory_access_pattern(fast_osites=block_size)
+for osites_per_instruction in [1,4,8,16,32,64]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
 
-    g.message(block_size)
-    t=g.timer("d")
-    t("expr")
-    for i in range(300):
-        g.eval(m3,m1*m2)
-    t("stencil")
-    for i in range(300):
-        ein(m3,m1,m2)
-    t()
-    g.message(t)
-    g.message(g.norm2(m3 - m3ref))
+        g.message(osites_per_instruction, osites_per_cache_block)
+        t=g.timer("d")
+        t("expr")
+        for i in range(300):
+            g.eval(m3,m1*m2)
+        t("stencil")
+        for i in range(300):
+            ein(m3,m1,m2)
+        t()
+        g.message(t)
+        eps2 = g.norm2(m3 - m3ref) / g.norm2(m3)
+        assert eps2 < 1e-25
+        g.message(eps2)
 
 
 # D_{a2,a1} = epsilon_{a1,b1,c1}*epsilon_{a2,b2,c2}*spin_transpose(Q1_{b1,b2})*Q2_{c1,c2}
@@ -73,7 +76,8 @@
                     )
 
 g.message(len(code))
-ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code)
+segments = [(len(code) // 16, 16)]  # (block_size, number_of_blocks); block_size * number_of_blocks must equal len(code)
+ein = g.stencil.tensor(Q1, [(0, 0, 0, 0), (1, 0, 0, 0)], code, segments)
 
 R = g.mspincolor(grid)
 R[:] = 0
@@ -84,20 +88,18 @@
 g.message(g.norm2(R - R2) / g.norm2(R))
 #
 #            D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
-
-
-for block_size in [1,4,8,-1,-4,-8,-16,-32]:
-    ein.memory_access_pattern(fast_osites=block_size)
-
-    g.message(block_size)
-    
-    t=g.timer("d")
-    t("diquark")
-    for i in range(30):
-        g.qcd.baryon.diquark(Q1,Q2)
-    t("stencil")
-    for i in range(30):
-        ein(R, Q1, Q2)
-    t()
-    g.message(t)
-    
+for osites_per_instruction in [1,4,8,16,32,64]:
+    for osites_per_cache_block in [2048*4, 4096*4, 8192*4]:
+        ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
+
+        g.message(osites_per_instruction, osites_per_cache_block)
+        t=g.timer("d")
+        t("diquark")
+        for i in range(30):
+            g.qcd.baryon.diquark(Q1,Q2)
+        t("stencil")
+        for i in range(30):
+            ein(R, Q1, Q2)
+        t()
+        g.message(t)
+        g.message(g.norm2(R - R2) / g.norm2(R))    
diff --git a/lib/cgpt/lib/benchmarks.cc b/lib/cgpt/lib/benchmarks.cc
index ed8208e61..d63d05676 100644
--- a/lib/cgpt/lib/benchmarks.cc
+++ b/lib/cgpt/lib/benchmarks.cc
@@ -19,170 +19,6 @@
 #include "lib.h"
 #include "benchmarks.h"
 
-/*
-E X P O R T(test_legacy_omega,{
-    
-    void* _dst, *_src;
-    long type, mu;
-    if (!PyArg_ParseTuple(args, "llll", &_dst, &_src, &type, &mu)) {
-      return NULL;
-    }
-    
-    cgpt_Lattice_base* dst = (cgpt_Lattice_base*)_dst;
-    cgpt_Lattice_base* src = (cgpt_Lattice_base*)_src;
-
-    auto& src_l = compatible<iMSpin4Color3<vComplexD>>(src)->l;
-    auto& dst_l = compatible<iMSpin4<vComplexD>>(dst)->l;
-
-    omega(dst_l, src_l, type, mu);
-    
-    return PyLong_FromLong(0);
-  });
-*/
-
-template<typename vobj_a, typename vobj_b>
-void mk_binary_mul_ll(const micro_kernel_arg_t & arg, size_t i0, size_t i1, size_t n_subblock) {
-  typedef decltype(vobj_a()*vobj_b()) vobj_c;
-  typedef typename vobj_c::scalar_object sobj_c;
-  
-  micro_kernel_view(vobj_a, a_p, 0);
-  micro_kernel_view(vobj_b, b_p, 1);
-  micro_kernel_view(vobj_c, c_p, 2);
-
-  micro_kernel_for(idx, i1-i0, sizeof(vobj_c)/sizeof(sobj_c), n_subblock, {
-      coalescedWrite(a_p[idx], coalescedRead(b_p[idx]) * coalescedRead(c_p[idx]));
-    });
-}
-
-  
-template<typename Lat>
-void micro_kernels(int lat) {
-  
-  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  Coordinate mpi_layout  = GridDefaultMpi();
-  Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-
-  typedef typename Lat::vector_object vobj;
-  typedef typename Lat::scalar_object sobj;
-  Lat a(&Grid), b(&Grid), c(&Grid), d(&Grid);
-
-  std::cout << GridLogMessage << lat << "^4" << std::endl;
-    
-  GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-  random(pRNG,a);   random(pRNG,b);
-
-  int Nwarm = 10;
-  int N = 500;
-  double gb, t0, t1, t2, t3, t4, t5, t0b, t1b;
-  mk_timer t_et, t_eti;
-  std::map<std::string, mk_timer> t_mk;
-  std::vector<micro_kernel_blocking_t> blockings = {
-#ifdef GRID_HAS_ACCELERATOR
-    { 8*1024, 1 },
-    { 32*1024, 1 },
-    { 256*1024, 1 },
-#else
-    { 512, 8 },
-    { 512, 16 },
-    { 512, 32 },
-    { 512, 64 },
-    { 256, 8 },
-    { 256, 16 },
-    { 256, 32 },
-    { 128, 8 },
-    { 128, 16 }
-#endif
-  };
- 
-  gb = 4.0 * 3.0 * sizeof(sobj) * Grid._fsites / 1e9;
-  for (int i=0;i<Nwarm+N;i++) {
-    t0 = cgpt_time();
-    c = a*b;
-    d = a*c;
-    c = a*b;
-    d = a*c;
-    t1 = cgpt_time();
-    if (i>=Nwarm)
-      t_et.add(t1-t0);
-  }
-
-  Lat d_copy = a*a*b;
-
-  for (int i=0;i<Nwarm+N;i++) {
-    t0 = cgpt_time();
-    d = a*a*b;
-    d = a*a*b;
-    t1 = cgpt_time();
-    if (i>=Nwarm)
-      t_eti.add(t1-t0);
-  }
-
-  d = Zero();
-  
-  t2 = cgpt_time();
-  std::vector<micro_kernel_t> expression;
-  micro_kernel_arg_t views_c_a_b, views_d_a_c;
-
-  views_c_a_b.add(c, AcceleratorWriteDiscard, false);
-  views_c_a_b.add(a, AcceleratorRead);
-  views_c_a_b.add(b, AcceleratorRead);
-
-  views_d_a_c.add(d, AcceleratorWriteDiscard);
-  views_d_a_c.add(a, AcceleratorRead);
-  views_d_a_c.add(c, AcceleratorRead, false);
-
-  // TODO: internal index size
-  expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_c_a_b });
-  expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_d_a_c });
-  expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_c_a_b });
-  expression.push_back({ mk_binary_mul_ll<vobj,vobj>, views_d_a_c });
-
-  t3 = cgpt_time();
-
-  for (auto b : blockings) {
-    mk_timer t;
-    for (int i=0;i<Nwarm+N;i++) {
-      t0 = cgpt_time();
-      eval_micro_kernels(expression, b);
-      t1 = cgpt_time();
-      if (i>=Nwarm)
-        t.add(t1-t0);
-    }
-    char buf[256];
-    sprintf(buf,"MK %d-%d",b.block_size,b.subblock_size);
-    t_mk[buf] = t;
-  }
-  t5 = cgpt_time();
-
-  views_c_a_b.release();
-  views_d_a_c.release();
-
-  d -= d_copy;
-  double err2 = norm2(d);
-
-  t_et.print ("GridET separate", gb);
-  t_eti.print("GridET joint   ", gb);
-  for (auto t : t_mk)
-    t.second.print (t.first, gb);
-  
-}
-
-template<typename Lat>
-void mk_bench_mul() {
-  micro_kernels<Lat>(4);
-  micro_kernels<Lat>(6);
-  micro_kernels<Lat>(8);
-  micro_kernels<Lat>(10);
-  micro_kernels<Lat>(12);
-  micro_kernels<Lat>(16);
-#ifdef GRID_HAS_ACCELERATOR
-  micro_kernels<Lat>(24);
-  micro_kernels<Lat>(32);
-  micro_kernels<Lat>(48);
-#endif
-}
-
 EXPORT(benchmarks,{
     
     //mask();
@@ -190,14 +26,6 @@ EXPORT(benchmarks,{
     //benchmarks(8);
     //benchmarks(16);
     //benchmarks(32);
-    std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ComplexD" << std::endl << std::endl;
-    mk_bench_mul<LatticeComplexD>();
-
-    std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking ColourD" << std::endl << std::endl;
-    mk_bench_mul<LatticeColourMatrixD>();
-
-    //std::cout << GridLogMessage << std::endl << std::endl << "Benchmarking SpinColourD" << std::endl << std::endl;
-    //mk_bench_mul<LatticeSpinColourMatrixD>();
 
     return PyLong_FromLong(0);
   });
diff --git a/lib/cgpt/lib/lattice/base.h b/lib/cgpt/lib/lattice/base.h
index 69fc7a7e3..fa0642d5a 100644
--- a/lib/cgpt/lib/lattice/base.h
+++ b/lib/cgpt/lib/lattice/base.h
@@ -73,7 +73,7 @@ class cgpt_Lattice_base {
   virtual GridBase* get_grid() = 0;
   virtual cgpt_stencil_matrix_base* stencil_matrix(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
   virtual cgpt_stencil_matrix_vector_base* stencil_matrix_vector(cgpt_Lattice_base* matrix, GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
-  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) = 0;
+  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) = 0;
 
 };
 
diff --git a/lib/cgpt/lib/lattice/implementation.h b/lib/cgpt/lib/lattice/implementation.h
index 83eeda516..439de5ae4 100644
--- a/lib/cgpt/lib/lattice/implementation.h
+++ b/lib/cgpt/lib/lattice/implementation.h
@@ -295,7 +295,7 @@ class cgpt_Lattice : public cgpt_Lattice_base {
     return cgpt_stencil_matrix_vector_create<T>(matrix, grid, shifts, code, code_parallel_block_size, local);
   }
 
-  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, long code_parallel_block_size, long local) {
-    return cgpt_stencil_tensor_create<T>(grid, shifts, code, code_parallel_block_size, local);
+  virtual cgpt_stencil_tensor_base* stencil_tensor(GridBase* grid, PyObject* shifts, PyObject* code, PyObject* segments, long local) {
+    return cgpt_stencil_tensor_create<T>(grid, shifts, code, segments, local);
   }
 };
diff --git a/lib/cgpt/lib/lib.h b/lib/cgpt/lib/lib.h
index 856691baa..25516e3bb 100644
--- a/lib/cgpt/lib/lib.h
+++ b/lib/cgpt/lib/lib.h
@@ -37,7 +37,6 @@
 #include "foundation.h"
 #include "reduce.h"
 #include "sort.h"
-#include "micro_kernel.h"
 #include "convert.h"
 #include "checksums.h"
 #include "parameters.h"
diff --git a/lib/cgpt/lib/micro_kernel.cc b/lib/cgpt/lib/micro_kernel.cc
deleted file mode 100644
index 3a95745a5..000000000
--- a/lib/cgpt/lib/micro_kernel.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-    GPT - Grid Python Toolkit
-    Copyright (C) 2020  Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-#include "lib.h"
-
-void eval_micro_kernels(const std::vector<micro_kernel_t> & kernels, const micro_kernel_blocking_t & blocking) {
-
-  size_t n = kernels.size();
-
-  size_t o_sites = kernels[0].arg.o_sites;
-  size_t block_size = blocking.block_size;
-  size_t subblock_size = blocking.subblock_size;
-
-  micro_kernel_region({
-      
-      for (size_t j=0;j<(o_sites + block_size - 1)/block_size;j++) {
-
-        for (size_t i=0;i<n;i++) {
-          auto& k = kernels[i];
-          
-          size_t j0 = std::min(j*block_size, o_sites);
-          size_t j1 = std::min(j0 + block_size, o_sites);
-          k.action(k.arg, j0, j1, subblock_size);
-        }
-
-      }
-    });
-}
diff --git a/lib/cgpt/lib/micro_kernel.h b/lib/cgpt/lib/micro_kernel.h
deleted file mode 100644
index 0fefa14f7..000000000
--- a/lib/cgpt/lib/micro_kernel.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
-    GPT - Grid Python Toolkit
-    Copyright (C) 2020  Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-#include "micro_kernel/argument.h"
-#include "micro_kernel/action.h"
-#include "micro_kernel/macros.h"
diff --git a/lib/cgpt/lib/micro_kernel/action.h b/lib/cgpt/lib/micro_kernel/action.h
deleted file mode 100644
index 9a3e0d463..000000000
--- a/lib/cgpt/lib/micro_kernel/action.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-    GPT - Grid Python Toolkit
-    Copyright (C) 2020  Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-
-typedef void (* micro_kernel_action_t)(const micro_kernel_arg_t & arg, size_t i0, size_t i1, size_t subblock_size);
-
-struct micro_kernel_t {
-  micro_kernel_action_t action;
-  micro_kernel_arg_t arg;
-};
-
-struct micro_kernel_blocking_t {
-  size_t block_size;
-  size_t subblock_size;
-};
-
-void eval_micro_kernels(const std::vector<micro_kernel_t> & kernels, const micro_kernel_blocking_t & blocking);
diff --git a/lib/cgpt/lib/micro_kernel/argument.h b/lib/cgpt/lib/micro_kernel/argument.h
deleted file mode 100644
index 29e3fdd02..000000000
--- a/lib/cgpt/lib/micro_kernel/argument.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-    GPT - Grid Python Toolkit
-    Copyright (C) 2020  Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-
-class ViewContainerBase {
-public:
-  virtual ~ViewContainerBase() {};
-};
-
-template<class View> 
-class ViewContainer : public ViewContainerBase {
-public:
-  View v;
-  
-  ViewContainer(View &_v) : v(_v) {};
-  virtual ~ViewContainer() { v.ViewClose(); }
-};
-
-struct micro_kernel_arg_t {
-  struct tuple_t {
-    ViewContainerBase* view;
-    bool persistent;
-  };
-  
-  std::vector<tuple_t> views;
-  size_t o_sites;
-
-  template<class T>
-  void add(Lattice<T>& l, ViewMode mode, bool persistent = true) {
-    size_t _o_sites = l.Grid()->oSites();
-    if (views.size() == 0) {
-      o_sites = _o_sites;
-    } else {
-      ASSERT(o_sites == _o_sites);
-    }
-    auto l_v = l.View(mode);
-    views.push_back({ new ViewContainer<decltype(l_v)>(l_v), persistent });
-  }
-
-  void release() {
-    for (auto x : views)
-      delete x.view;
-  }
-
-};
diff --git a/lib/cgpt/lib/micro_kernel/macros.h b/lib/cgpt/lib/micro_kernel/macros.h
deleted file mode 100644
index 71074edf3..000000000
--- a/lib/cgpt/lib/micro_kernel/macros.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-    GPT - Grid Python Toolkit
-    Copyright (C) 2020  Christoph Lehner (christoph.lehner@ur.de, https://github.com/lehner/gpt)
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-#ifndef GRID_HAS_ACCELERATOR
-
-#define micro_kernel_for(idx, n_idx, nsimd, nsubblock, ...) {           \
-    int n_thread = thread_num();                                        \
-    int n_threads = thread_max();                                       \
-    for (size_t ib=n_subblock*n_thread;ib<n_idx;ib+=n_subblock*n_threads) { \
-      for (size_t idx=ib;idx<ib+n_subblock && idx<n_idx;idx++) {        \
-        __VA_ARGS__;                                                    \
-      }}}
-#define micro_kernel_region(...) { thread_region { __VA_ARGS__ } }
-
-#else
-
-// TODO: for2d and use nsubblock
-#define micro_kernel_for(idx, n_idx, nsimd, nsubblock, ...) accelerator_forNB(idx, n_idx, nsimd, __VA_ARGS__)
-//#define micro_kernel_for(idx, n_idx, nsimd, nsubblock, ...) accelerator_for2dNB(ib, n_idx / nsubblock, jb, nsubblock, nsimd, uint64_t idx=ib__VA_ARGS__)
-#define micro_kernel_region(...) { __VA_ARGS__; accelerator_barrier(dummy); }
-
-#endif
-
-#define micro_kernel_view(vobj, ptr, idx)                               \
-  auto ptr ## _v = ((ViewContainer<LatticeView<vobj>>*)arg.views[idx].view)->v; \
-  auto ptr = &ptr ## _v[arg.views[idx].persistent ? i0 : 0];
-
diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc
index 63db24402..d58646d89 100644
--- a/lib/cgpt/lib/stencil.cc
+++ b/lib/cgpt/lib/stencil.cc
@@ -64,11 +64,11 @@ EXPORT(stencil_tensor_create,{
 
     void* _grid;
     void* _lattice;
-    PyObject* _shifts, * _code;
+    PyObject* _shifts, * _code, * _segments;
     long _code_parallel_block_size;
     long _local;
-    if (!PyArg_ParseTuple(args, "llOOll", &_lattice, &_grid, &_shifts, &_code,
-			  &_code_parallel_block_size, &_local)) {
+    if (!PyArg_ParseTuple(args, "llOOOl", &_lattice, &_grid, &_shifts, &_code,
+			  &_segments, &_local)) {
       return NULL;
     }
     
@@ -76,8 +76,7 @@ EXPORT(stencil_tensor_create,{
     cgpt_Lattice_base* lattice = (cgpt_Lattice_base*)_lattice;
 
     return PyLong_FromVoidPtr(lattice->stencil_tensor(grid, _shifts, _code,
-						      _code_parallel_block_size,
-						      _local));
+						      _segments, _local));
   });
 
 EXPORT(stencil_matrix_execute,{
@@ -125,8 +124,11 @@ EXPORT(stencil_tensor_execute,{
 
     void* _stencil;
     PyObject* _fields;
-    long fast_osites;
-    if (!PyArg_ParseTuple(args, "lOl", &_stencil, &_fields, &fast_osites)) {
+    long osites_per_instruction;
+    long osites_per_cache_block;
+    if (!PyArg_ParseTuple(args, "lOll", &_stencil, &_fields,
+			  &osites_per_instruction,
+			  &osites_per_cache_block)) {
       return NULL;
     }
     
@@ -135,7 +137,12 @@ EXPORT(stencil_tensor_execute,{
     std::vector<cgpt_Lattice_base*> __fields;
     cgpt_basis_fill(__fields,_fields);
 
-    stencil->execute(__fields, fast_osites);
+    cgpt_stencil_tensor_execute_params_t params =
+      {
+       (int)osites_per_instruction, // explicit casts: the struct members are int, and long -> int inside braces is a narrowing conversion
+       (int)osites_per_cache_block
+      };
+    stencil->execute(__fields, params);
 
     return PyLong_FromLong(0);
   });
diff --git a/lib/cgpt/lib/stencil/tensor.h b/lib/cgpt/lib/stencil/tensor.h
index 369d5d4c0..c0d8a3160 100644
--- a/lib/cgpt/lib/stencil/tensor.h
+++ b/lib/cgpt/lib/stencil/tensor.h
@@ -18,6 +18,16 @@
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 
+struct cgpt_stencil_tensor_code_segment_t {
+  int block_size; // number of consecutive instructions that one parallel work item executes in sequence
+  int number_of_blocks; // number of such instruction blocks in the segment; they are spread over the parallel index
+};
+
+struct cgpt_stencil_tensor_execute_params_t {
+  int osites_per_instruction; // outer sites processed per instruction; selects the block_execute<N> instantiation
+  int osites_per_cache_block; // outer sites handled per pass over all segments; must be a multiple of osites_per_instruction
+};
+
 struct cgpt_stencil_tensor_factor_t {
   uint16_t index; // index of field
   int16_t point; // index of shift
@@ -44,7 +54,8 @@ struct cgpt_stencil_tensor_code_t {
 class cgpt_stencil_tensor_base {
  public:
   virtual ~cgpt_stencil_tensor_base() {};
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) = 0;
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields,
+		       const cgpt_stencil_tensor_execute_params_t& params) = 0;
 };
 
 template<typename T>
@@ -56,8 +67,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
   
   Vector<cgpt_stencil_tensor_code_offload_t> code;
   Vector<cgpt_stencil_tensor_factor_t> factors;
-    
-  int n_code_parallel_block_size, n_code_parallel_blocks;
+
+  std::vector<cgpt_stencil_tensor_code_segment_t> segments;
   int local;
 
   // local == true
@@ -70,13 +81,15 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
   cgpt_stencil_tensor(GridBase* grid,
 		      const std::vector<Coordinate>& shifts,
 		      const std::vector<cgpt_stencil_tensor_code_t>& _code,
-		      int _n_code_parallel_block_size,
+		      const std::vector<cgpt_stencil_tensor_code_segment_t>& _segments,
 		      int _local) :
-    code(_code.size()), local(_local),
-    n_code_parallel_block_size(_n_code_parallel_block_size) {
+    code(_code.size()), segments(_segments), local(_local) { // initializers listed in member declaration order
 
-    ASSERT(_code.size() % n_code_parallel_block_size == 0);
-    n_code_parallel_blocks = (int)_code.size() / n_code_parallel_block_size;
+    // consistency check: the segments together must account for exactly all instructions in _code
+    size_t code_expected_size = 0;
+    for (auto & s : segments)
+      code_expected_size += s.block_size * s.number_of_blocks;
+    ASSERT(_code.size() == code_expected_size);
     
     // total number of factors
     int nfactors = 0;
@@ -138,8 +151,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
     stencils should return different options for current hardware for performance (including max _npb)
 
   */
-  template<int BLOCK_SIZE>
-  void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) {
+  template<int osites_per_instruction>
+  void block_execute(const std::vector<cgpt_Lattice_base*>& fields, int osites_per_cache_block) {
 
 #ifndef GRID_HAS_ACCELERATOR
     typedef typename T::vector_type element_t;
@@ -158,14 +171,8 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 
     int nd = fields[0]->get_grid()->Nd();
 
-    int _npb = n_code_parallel_blocks;
-    int _npbs = n_code_parallel_block_size;
-
     uint64_t osites = fields[0]->get_grid()->oSites();
-    uint64_t osite_blocks = osites;
 
-    int _fast_osites;
-    
     if (local) {
 
       ERR("Not implemented yet");
@@ -173,7 +180,7 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
     } else {
 
       //CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(T,);
-      
+
 #define TC_MOV 0
 #define TC_INC 1
 #define TC_MOV_NEG 2
@@ -183,208 +190,131 @@ class cgpt_stencil_tensor : public cgpt_stencil_tensor_base {
 #define TC_MOV_NEG_CC 6
 #define TC_DEC_CC 7
 #define TC_MUL 8
-
-      /*
-#ifdef GRID_HAS_ACCELERATOR
-
-
+#define ID(a) a
+#define CONJ(a) adj(a)
+	      
+#define EXECUTE(KB, NN)							\
+	      switch (_p->instruction)					\
+		{							\
+		case TC_INC: KB(+=,ID,NN); break;			\
+		case TC_MOV: KB(=,ID,NN); break;			\
+		case TC_DEC: KB(-=,ID,NN); break;			\
+		case TC_MOV_NEG: KB(=-,ID,NN); break;			\
+		case TC_INC_CC: KB(+=,CONJ,NN); break;			\
+		case TC_MOV_CC: KB(=,CONJ,NN); break;			\
+		case TC_DEC_CC: KB(-=,CONJ,NN); break;			\
+		case TC_MOV_NEG_CC: KB(=-,CONJ,NN); break;		\
+		case TC_MUL:						\
+		  {							\
+		    auto w = ((coeff_t)_p->weight);			\
+		    for (int ff=0;ff<NN;ff++)				\
+		      e_c[cNN * ff] = w * e_a[aNN * ff];		\
+		  }							\
+		  break;						\
+		}
+
+#define KERNEL_BIN(signature, functor, NN) {				\
+	int bNN = nelements[_f1->index] * NSIMD;			\
+	element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * NN * ss + _f1->element * NSIMD + lane]; \
+	for (int ff=0;ff<NN;ff++)					\
+	  e_c[cNN * ff] signature functor(e_a[aNN * ff]) * e_b[bNN * ff]; \
+      }
       
-#define _ID(a) a
-#define _CONJ(a) adj(a)
-#define _INC(a,b,c)      a + b*c
-#define _MOV(a,b,c)          b*c
-#define _MOV_NEG(a,b,c)    - b*c
-#define _DEC(a,b,c)      a - b*c
-#define _MUL(a,b,c)      a*((coeff_t)_p->weight)
-#define KERNEL(composition, mod_first)					\
-      for (int ff=0;ff < BLOCK_SIZE;ff++)				\
-	coalescedWriteElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], \
-			      composition(coalescedReadElement(fields_v[_p->target][BLOCK_SIZE * ss + ff], _p->element), \
-					  mod_first(coalescedReadElement(fields_v[_f0->index][BLOCK_SIZE * ss + ff], _f0->element)), \
-					  coalescedReadElement(fields_v[_f1->index][BLOCK_SIZE * ss + ff], _f1->element)), \
-			      _p->element);
+
+      ASSERT(osites_per_cache_block > 0 && osites_per_cache_block % osites_per_instruction == 0); // > 0: it is used as a divisor just below
       
+      uint64_t ocache_blocks = (osites + osites_per_cache_block - 1) / osites_per_cache_block;
+      for (uint64_t ocache_block = 0;ocache_block < ocache_blocks;ocache_block++) {
+	uint64_t osites0 = min(ocache_block * osites_per_cache_block, osites);
+	uint64_t osites1 = min(osites0 + osites_per_cache_block, osites);
 
-      osite_blocks = osites / BLOCK_SIZE;
+	uint64_t osites_in_cache_block = osites1 - osites0;
+	
+	uint64_t oblocks = osites_in_cache_block / osites_per_instruction;
+	uint64_t oblock0 = osites0 / osites_per_instruction;
 
-      for (int iter=0;iter<((osites % BLOCK_SIZE == 0) ? 1 : 2);iter++) {
+	uint64_t osites_extra_start = oblocks * osites_per_instruction;
+	uint64_t osites_extra = osites_in_cache_block - osites_extra_start;
 
-	uint64_t osite_offset = (iter == 0) ? 0 : osite_blocks * BLOCK_SIZE;
-	if (iter == 1) {
-	  BLOCK_SIZE = 1;
-	  osite_blocks = osites - osite_offset;
-	}
-	
-	accelerator_forNB(ss_block,osite_blocks * _npb,T::Nsimd(),{
-	    
-	    uint64_t ss, oblock;
-	    
-	    if (_fast_osites) {
-	      oblock = ss_block / osite_blocks;
-	      ss = osite_offset + ss_block % osite_blocks;
-	    } else {
-	      ss = osite_offset + ss_block / _npb;
-	      oblock = ss_block % _npb;
-	    }
-	    
-	    for (int iblock=0;iblock<_npbs;iblock++) {
+	//std::cout << GridLogMessage<< "Group " << osites0 << " to " << osites1 << " has oblocks " << oblocks << " and extra " << osites_extra << " from " << osites_extra_start << " compare to " << osites << std::endl;
+
+	int coffset = 0;
+	for (auto & segment : segments) {
+	  int _npb = segment.number_of_blocks;
+	  int _npbs = segment.block_size;
+	  
+	  accelerator_forNB(ss_block, oblocks * _npb, T::Nsimd(), {
 	      
-	      int i = oblock * _npbs + iblock;
+	      uint64_t ss = ss_block / _npb + oblock0;
+	      uint64_t cc = ss_block % _npb;
 	      
-	      const auto _p = &p_code[i];
-	      const auto _f0 = &_p->factor[0];
-	      const auto _f1 = &_p->factor[1];
+	      for (int ic=0;ic<_npbs;ic++) {
+		
+		const auto _p = &p_code[coffset + cc * _npbs + ic];
+		const auto _f0 = &_p->factor[0];
+		const auto _f1 = &_p->factor[1];
+		
+		int aNN = nelements[_f0->index] * NSIMD;
+		int cNN = nelements[_p->target] * NSIMD;
 	      
-	      switch (_p->instruction) {
-	      case TC_INC:
-		KERNEL(_INC,_ID);
-		break;
-	      case TC_MOV:
-		KERNEL(_MOV,_ID);
-		break;
-	      case TC_DEC:
-		KERNEL(_DEC,_ID);
-		break;
-	      case TC_MOV_NEG:
-		KERNEL(_MOV_NEG,_ID);
-		break;
-	      case TC_INC_CC:
-		KERNEL(_INC,_CONJ);
-		break;
-	      case TC_MOV_CC:
-		KERNEL(_MOV,_CONJ);
-		break;
-	      case TC_DEC_CC:
-		KERNEL(_DEC,_CONJ);
-		break;
-	      case TC_MOV_NEG_CC:
-		KERNEL(_MOV_NEG,_CONJ);
-		break;
-	      case TC_MUL:
-		KERNEL(_MUL,_ID);
-		break;
+		int lane = acceleratorSIMTlane(T::Nsimd());
+		element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * osites_per_instruction * ss + _f0->element * NSIMD + lane];
+		element_t* __restrict__ e_c = &fields_v[_p->target][cNN * osites_per_instruction * ss + _p->element * NSIMD + lane];
+		
+		EXECUTE(KERNEL_BIN, osites_per_instruction);
 	      }
-	    }
-	    
-	  });
+	      
+	    });
+	  
+	  if (osites_extra) {
+	    accelerator_forNB(ss_block, osites_extra * _npb, T::Nsimd(), {
+		
+		uint64_t ss = ss_block / _npb + osites0 + osites_extra_start;
+		uint64_t cc = ss_block % _npb;
+		
+		for (int ic=0;ic<_npbs;ic++) {
+		  
+		  const auto _p = &p_code[coffset + cc * _npbs + ic];
+		  const auto _f0 = &_p->factor[0];
+		  const auto _f1 = &_p->factor[1];
+		  
+		  int aNN = nelements[_f0->index] * NSIMD;
+		  int cNN = nelements[_p->target] * NSIMD;
+		  
+		  int lane = acceleratorSIMTlane(T::Nsimd());
+		  element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * ss + _f0->element * NSIMD + lane];
+		  element_t* __restrict__ e_c = &fields_v[_p->target][cNN * ss + _p->element * NSIMD + lane];
+		  
+		  EXECUTE(KERNEL_BIN, 1);
+		}	    
+	      });
+	  }
+
+	  coffset += _npb * _npbs;
+	}
       }
       
       accelerator_barrier();
-
-
-#else
-      */
-      
-      // CPU version
-      ASSERT(osites % BLOCK_SIZE == 0);
-      osites /= BLOCK_SIZE;
-
-      int _fast_osites = fast_osites;
-      
-      accelerator_for(ss_block,osites * _npb,T::Nsimd(),{
-
-          uint64_t ss, oblock;
-
-	  MAP_INDEXING(ss, oblock);
-
-	  for (int iblock=0;iblock<_npbs;iblock++) {
-	    
-	    int i = oblock * _npbs + iblock;
-
-	    const auto _p = &p_code[i];
-	    const auto _f0 = &_p->factor[0];
-	    const auto _f1 = &_p->factor[1];
-
-	    int aNN = nelements[_f0->index] * NSIMD;
-	    int bNN = nelements[_f1->index] * NSIMD;
-	    int cNN = nelements[_p->target] * NSIMD;
-
-	    int lane = acceleratorSIMTlane(T::Nsimd());
-	    element_t* __restrict__ e_a = &fields_v[_f0->index][aNN * BLOCK_SIZE * ss + _f0->element * NSIMD + lane];
-	    element_t* __restrict__ e_b = &fields_v[_f1->index][bNN * BLOCK_SIZE * ss + _f1->element * NSIMD + lane];
-	    element_t* __restrict__ e_c = &fields_v[_p->target][cNN * BLOCK_SIZE * ss + _p->element * NSIMD + lane];
-
-#define TC_MOV 0
-#define TC_INC 1
-#define TC_MOV_NEG 2
-#define TC_DEC 3
-#define TC_MOV_CC 4
-#define TC_INC_CC 5
-#define TC_MOV_NEG_CC 6
-#define TC_DEC_CC 7
-#define TC_MUL 8
-	    
-#define ID(a) a
-#define CONJ(a) adj(a)
-#define KERNEL(signature, functor)					\
-	    for (int ff=0;ff<BLOCK_SIZE;ff++)				\
-	      e_c[cNN * ff] signature functor(e_a[aNN * ff]) * e_b[bNN * ff];
-
-	    switch (_p->instruction) {
-	    case TC_INC:
-	      KERNEL(+=,ID);
-	      break;
-	    case TC_MOV:
-	      KERNEL(=,ID);
-	      break;
-	    case TC_DEC:
-	      KERNEL(-=,ID);
-	      break;
-	    case TC_MOV_NEG:
-	      KERNEL(=-,ID);
-	      break;
-	    case TC_INC_CC:
-	      KERNEL(+=,CONJ);
-	      break;
-	    case TC_MOV_CC:
-	      KERNEL(=,CONJ);
-	      break;
-	    case TC_DEC_CC:
-	      KERNEL(-=,CONJ);
-	      break;
-	    case TC_MOV_NEG_CC:
-	      KERNEL(=-,CONJ);
-	      break;
-	    case TC_MUL:
-	      for (int ff=0;ff<BLOCK_SIZE;ff++)	\
-		e_c[cNN * ff] *= ((coeff_t)_p->weight);
-	      break;
-	    }
-	  }
-	  
-	});
-
-
-      //#endif
       
       // and cleanup
       //CGPT_CARTESIAN_STENCIL_CLEANUP(T,);
-      
     }
 
     VECTOR_ELEMENT_VIEW_CLOSE(fields);
   }
 
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int kernel_param) {
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields,
+		       const cgpt_stencil_tensor_execute_params_t& params) {
     
-    int _BLOCK_SIZE, _fast_osites;
-    if (kernel_param > 0) {
-      _BLOCK_SIZE = kernel_param;
-      _fast_osites = 1;
-    } else {
-      _BLOCK_SIZE = -kernel_param;
-      _fast_osites = 0;
-    }
-
-    switch (_BLOCK_SIZE) {
-    case 1: block_execute<1>(fields, _fast_osites); break;
-    case 2: block_execute<2>(fields, _fast_osites); break;
-    case 4: block_execute<4>(fields, _fast_osites); break;
-    case 8: block_execute<8>(fields, _fast_osites); break;
-    case 16: block_execute<16>(fields, _fast_osites); break;
-    case 32: block_execute<32>(fields, _fast_osites); break;
-    case 64: block_execute<64>(fields, _fast_osites); break;
-    default: ERR("BLOCK_SIZE = %d not implemented", _BLOCK_SIZE);
+    switch (params.osites_per_instruction) { // map the run-time value onto the matching compile-time instantiation
+    case 1: block_execute<1>(fields, params.osites_per_cache_block); break;
+    case 2: block_execute<2>(fields, params.osites_per_cache_block); break;
+    case 4: block_execute<4>(fields, params.osites_per_cache_block); break;
+    case 8: block_execute<8>(fields, params.osites_per_cache_block); break;
+    case 16: block_execute<16>(fields, params.osites_per_cache_block); break;
+    case 32: block_execute<32>(fields, params.osites_per_cache_block); break;
+    case 64: block_execute<64>(fields, params.osites_per_cache_block); break;
+    default: ERR("params.osites_per_instruction = %d not implemented", params.osites_per_instruction);
+    }
 
   }
@@ -398,6 +328,13 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_factor_t& out) {
   cgpt_convert(PyTuple_GetItem(in, 2), out.element);
 }
 
+static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_segment_t& out) {
+  ASSERT(PyTuple_Check(in));
+  ASSERT(PyTuple_Size(in) == 2);
+  cgpt_convert(PyTuple_GetItem(in, 0), out.block_size);
+  cgpt_convert(PyTuple_GetItem(in, 1), out.number_of_blocks);
+}
+
 static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) {
   ASSERT(PyDict_Check(in));
 
@@ -411,7 +348,7 @@ static void cgpt_convert(PyObject* in, cgpt_stencil_tensor_code_t& out) {
 
 template<typename T>
 cgpt_stencil_tensor_base* cgpt_stencil_tensor_create(GridBase* grid, PyObject* _shifts,
-						     PyObject* _code, long code_parallel_block_size,
+						     PyObject* _code, PyObject* _segments,
 						     long local) {
 
   std::vector<Coordinate> shifts;
@@ -420,5 +357,8 @@ cgpt_stencil_tensor_base* cgpt_stencil_tensor_create(GridBase* grid, PyObject* _
   std::vector<cgpt_stencil_tensor_code_t> code;
   cgpt_convert(_code,code);
 
-  return new cgpt_stencil_tensor<T>(grid,shifts,code,code_parallel_block_size, local);
+  std::vector<cgpt_stencil_tensor_code_segment_t> segments;
+  cgpt_convert(_segments,segments);
+
+  return new cgpt_stencil_tensor<T>(grid,shifts,code, segments, local);
 }
diff --git a/lib/gpt/core/local_stencil/tensor.py b/lib/gpt/core/local_stencil/tensor.py
index 4d267136a..43700183d 100644
--- a/lib/gpt/core/local_stencil/tensor.py
+++ b/lib/gpt/core/local_stencil/tensor.py
@@ -33,19 +33,20 @@ def parse(c):
 
 
 class tensor:
-    def __init__(self, lat, points, code, code_parallel_block_size=None, local=1):
+    def __init__(self, lat, points, code, segments, local=1):
         self.points = points
         self.code = [parse(c) for c in code]
-        self.code_parallel_block_size = code_parallel_block_size
-        if code_parallel_block_size is None:
-            code_parallel_block_size = len(code)
+        self.segments = segments
         self.obj = cgpt.stencil_tensor_create(
-            lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local
+            lat.v_obj[0], lat.grid.obj, points, self.code, self.segments, local
         )
-        self.fast_osites = 1
+        self.osites_per_instruction = 4
+        self.osites_per_cache_block = 4096
 
     def __call__(self, *fields):
-        cgpt.stencil_tensor_execute(self.obj, list(fields), self.fast_osites)
+        cgpt.stencil_tensor_execute(self.obj, list(fields),
+                                    self.osites_per_instruction,
+                                    self.osites_per_cache_block)
 
     def __del__(self):
         cgpt.stencil_tensor_delete(self.obj)
@@ -53,5 +54,6 @@ def __del__(self):
     def data_access_hints(self, *hints):
         pass
 
-    def memory_access_pattern(self, fast_osites):
-        self.fast_osites = fast_osites
+    def memory_access_pattern(self, osites_per_instruction, osites_per_cache_block):
+        self.osites_per_instruction = osites_per_instruction
+        self.osites_per_cache_block = osites_per_cache_block
diff --git a/lib/gpt/core/stencil/tensor.py b/lib/gpt/core/stencil/tensor.py
index 5d24fd3f2..d2a379b9d 100644
--- a/lib/gpt/core/stencil/tensor.py
+++ b/lib/gpt/core/stencil/tensor.py
@@ -19,9 +19,9 @@
 import gpt as g
 
 
-def tensor(lat, points, code, code_parallel_block_size=None):
+def tensor(lat, points, code, segments):
     # check if all points are cartesian
     for p in points:
         if len([s for s in p if s != 0]) > 1:
             raise Exception("Only cartesian version currently implemented")
-    return g.local_stencil.tensor(lat, points, code, code_parallel_block_size, local=0)
+    return g.local_stencil.tensor(lat, points, code, segments, local=0)