diff --git a/benchmarks/coarse.py b/benchmarks/coarse.py
index 76848376..6adfc52c 100755
--- a/benchmarks/coarse.py
+++ b/benchmarks/coarse.py
@@ -49,7 +49,10 @@
 )
 
 # Warmup
+#dst2 = g.copy(dst)
+#src2 = g.copy(src)
 for n in range(5):
+    #co.mat([dst, dst2], [src, src2])
     co.mat(dst, src)
 
 # Time cgpt
diff --git a/benchmarks/stencil.py b/benchmarks/stencil.py
index 99c7a030..a9a87413 100755
--- a/benchmarks/stencil.py
+++ b/benchmarks/stencil.py
@@ -9,142 +9,146 @@
 evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
 nevec = [tuple([-x for x in y]) for y in evec]
-
+
 for precision in [g.single, g.double]:
-    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
-    N = g.default.get_int("--N", 1000)
+    for fast_osites in [0,1]:
+        grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+        N = g.default.get_int("--N", 1000)
 
-    g.message(
+        g.message(
         f"""
 Local Stencil Benchmark with
     fdimensions  : {grid.fdimensions}
     precision    : {precision.__name__}
+    fast_osites  : {fast_osites}
 """
-    )
-
-    # plaquette
-    U = g.qcd.gauge.random(grid, rng, scale=0.5)
-    _U = [1, 2, 3, 4]
-    _X = 0
-    _Xp = [1, 2, 3, 4]
-    V = g.mcolor(grid)
-    rng.element(V)
-    # U = g.qcd.gauge.transformed(U, V)
-    code = []
-    for mu in range(4):
-        for nu in range(0, mu):
-            code.append(
-                {
-                    "target": 0,
-                    "accumulate": -1 if (mu == 1 and nu == 0) else 0,
-                    "weight": 1.0,
-                    "factor": [
-                        (_U[mu], _X, 0),
-                        (_U[nu], _Xp[mu], 0),
-                        (_U[mu], _Xp[nu], 1),
-                        (_U[nu], _X, 1),
-                    ],
-                }
-            )
-    st = g.stencil.matrix(
-        U[0],
-        [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
-        code,
-    )
-    # test plaquette
-    P = g.lattice(U[0])
-    st(P, *U)
-    pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
-    assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
-
-    # Flops
-    gauge_otype = U[0].otype
-    Nc = gauge_otype.shape[0]
-    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
-    flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
-    flops = flops_per_site * P.grid.gsites * N
-    nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+        )
 
-    # Warmup
-    for n in range(5):
+        # plaquette
+        U = g.qcd.gauge.random(grid, rng, scale=0.5)
+        _U = [1, 2, 3, 4]
+        _X = 0
+        _Xp = [1, 2, 3, 4]
+        V = g.mcolor(grid)
+        rng.element(V)
+        # U = g.qcd.gauge.transformed(U, V)
+        code = []
+        for mu in range(4):
+            for nu in range(0, mu):
+                code.append(
+                    {
+                        "target": 0,
+                        "accumulate": -1 if (mu == 1 and nu == 0) else 0,
+                        "weight": 1.0,
+                        "factor": [
+                            (_U[mu], _X, 0),
+                            (_U[nu], _Xp[mu], 0),
+                            (_U[mu], _Xp[nu], 1),
+                            (_U[nu], _X, 1),
+                        ],
+                    }
+                )
+        st = g.stencil.matrix(
+            U[0],
+            [(0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)],
+            code,
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        # test plaquette
+        P = g.lattice(U[0])
         st(P, *U)
+        pl = g.sum(g.trace(P)).real / P.grid.gsites / 3 / 2 / 3
+        assert abs(g.qcd.gauge.plaquette(U) - pl) < precision.eps * 100
+
+        # Flops
+        gauge_otype = U[0].otype
+        Nc = gauge_otype.shape[0]
+        flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
+        flops_per_site = 3 * flops_per_matrix_multiply * 4 * 3
+        flops = flops_per_site * P.grid.gsites * N
+        nbytes = (5 * Nc * Nc * 2) * precision.nbytes * P.grid.gsites * N
+
+        # Warmup
+        for n in range(5):
+            st(P, *U)
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(P, *U)
-    t1 = g.time()
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(P, *U)
+        t1 = g.time()
 
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
-        f"""
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
+            f"""
 {N} applications of plaquette stencil
     Time to complete            : {t1-t0:.2f} s
     Total performance           : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
 
-    # covariant laplacian stencil
-    src = g.vspincolor(grid)
-    rng.cnormal(src)
-    UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
-    _U = [0,1,2,3]
-    _UdagShift = [4,5,6,7]
-    _X = 0
-    _Xp = [1,2,3,4]
-    _Xm = [5,6,7,8]
-    code = [(0,1,_X,-1,-8.0,[])]
-    for mu in range(4):
-        code.append((0,1,_Xp[mu], 0, 1.0,[(_U[mu], _X, 0)]))
-        code.append((0,1,_Xm[mu], 0, 1.0,[(_UdagShift[mu], _X, 0)]))
-        # can switch last line to next one
-        #code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
-    st = g.stencil.matrix_vector(
-        U[0],
-        src,
-        [(0, 0, 0, 0)] + evec + nevec,
-        code
-    )
-    # test laplace
-    dst = g.lattice(src)
-    st(U + UdagShift, [dst,src])
-
-    lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1]*4), [0,1,2,3])
-    dst2 = lap(src)
-    eps2 = g.norm2(dst - dst2) / g.norm2(dst)
-    assert eps2 ** 0.5 < precision.eps * 100
-
-    # Flops
-    gauge_otype = U[0].otype
-    Nc = gauge_otype.shape[0]
-    Ns = 4
-    flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
-    flops_per_vector_add = Ns * Nc * 2
-    flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
-    flops = flops_per_site * src.grid.gsites * N
-    nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
-
-    # Warmup
-    for n in range(5):
+        # covariant laplacian stencil
+        src = g.vspincolor(grid)
+        rng.cnormal(src)
+        UdagShift = [g(g.adj(g.cshift(U[mu], mu, -1))) for mu in range(4)]
+        _U = [0,1,2,3]
+        _UdagShift = [4,5,6,7]
+        _X = 0
+        _Xp = [1,2,3,4]
+        _Xm = [5,6,7,8]
+        code = [(0,1,_X,-1,-8.0,[])]
+        for mu in range(4):
+            code.append((0,1,_Xp[mu], 0, 1.0,[(_U[mu], _X, 0)]))
+            code.append((0,1,_Xm[mu], 0, 1.0,[(_UdagShift[mu], _X, 0)]))
+            # can switch last line to next one
+            #code.append((0,1,_Xm[mu], 0, 1.0,[(_U[mu], _Xm[mu], 1)]))
+        st = g.stencil.matrix_vector(
+            U[0],
+            src,
+            [(0, 0, 0, 0)] + evec + nevec,
+            code
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        # test laplace
+        dst = g.lattice(src)
         st(U + UdagShift, [dst,src])
+
+        lap = g.create.smear.laplace(g.covariant.shift(U, boundary_phases=[1]*4), [0,1,2,3])
+        dst2 = lap(src)
+        eps2 = g.norm2(dst - dst2) / g.norm2(dst)
+        assert eps2 ** 0.5 < precision.eps * 100
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(U + UdagShift, [dst,src])
-    t1 = g.time()
-
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
+        # Flops
+        gauge_otype = U[0].otype
+        Nc = gauge_otype.shape[0]
+        Ns = 4
+        flops_per_matrix_vector_multiply = Ns * Nc * (Nc * 6 + (Nc - 1) * 2)
+        flops_per_vector_add = Ns * Nc * 2
+        flops_per_site = 8 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+        flops = flops_per_site * src.grid.gsites * N
+        nbytes = (8 * Nc * Nc * 2 + Nc * Ns * 2) * precision.nbytes * src.grid.gsites * N
+
+        # Warmup
+        for n in range(5):
+            st(U + UdagShift, [dst,src])
+
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(U + UdagShift, [dst,src])
+        t1 = g.time()
+
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
         f"""
 {N} applications of laplace stencil
     Time to complete            : {t1-t0:.2f} s
    Total performance           : {GFlopsPerSec:.2f} GFlops/s
    Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
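
A note on the flop accounting used by all three benchmarks touched here: the formulas assume 6 real flops per complex multiply and 2 per complex add, so an Nc x Nc complex matrix product costs Nc**3 multiplies plus Nc**2 * (Nc - 1) adds. A minimal sketch of that bookkeeping (illustration only, not part of the patch):

    # 6 flops per complex multiply, 2 per complex add; an Nc x Nc matrix
    # product is Nc**2 dot products of length Nc.
    Nc = 3
    flops_per_matrix_multiply = Nc**3 * 6 + (Nc - 1) * Nc**2 * 2
    assert flops_per_matrix_multiply == 198  # SU(3) link matrices
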
diff --git a/benchmarks/stencil_coarse_operator.py b/benchmarks/stencil_coarse_operator.py
index 891ac061..a4eda658 100755
--- a/benchmarks/stencil_coarse_operator.py
+++ b/benchmarks/stencil_coarse_operator.py
@@ -9,88 +9,96 @@
 evec = [(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)]
 nevec = [tuple([-x for x in y]) for y in evec]
-
-for precision in [g.single, g.double]:
-    grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
-    N = g.default.get_int("--N", 1000)
-    g.message(
-        f"""
+
+for precision in [g.single, g.double]:
+    for fast_osites in [0,1]:
+        grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), precision)
+        N = g.default.get_int("--N", 1000)
+        n_rhs = g.default.get_int("--n_rhs", 12)
+        g.message(
+            f"""
 Local Stencil Benchmark with
     fdimensions  : {grid.fdimensions}
+    n_rhs        : {n_rhs}
     precision    : {precision.__name__}
+    fast_osites  : {fast_osites}
 """
-    )
-
-    # coarse grid stencil
-    nbasis = g.default.get_int("--nbasis", 12)
-    vcoarse = g.vcomplex(grid, nbasis)
-    g.message(f"Virtual blocks: {len(vcoarse.v_obj)}")
-    nbasis_blocks = len(vcoarse.v_obj)
-    mcoarse = [g.mcomplex(grid, nbasis) for _ in range(9)]
-    rng.cnormal(mcoarse)
-    rng.cnormal(vcoarse)
+        )
+
+        # coarse grid stencil
+        nbasis = g.default.get_int("--nbasis", 12)
+        vcoarse = [g.vcomplex(grid, nbasis) for _ in range(n_rhs)]
+        g.message(f"Virtual blocks: {len(vcoarse[0].v_obj)}")
+        nbasis_blocks = len(vcoarse[0].v_obj)
+        mcoarse = [g.mcomplex(grid, nbasis) for _ in range(9)]
+        rng.cnormal(mcoarse)
+        rng.cnormal(vcoarse)
 
-    # reference solve
-    vref = g.lattice(vcoarse)
-    vref = g(mcoarse[0] * vcoarse)
-    for mu in range(4):
-        vref += mcoarse[1 + 2*mu] * g.cshift(vcoarse, mu, 1)
-        vref += mcoarse[2 + 2*mu] * g.cshift(vcoarse, mu, -1)
-    # now coarse grid dirac operator
-    _X = 0
-    _Xp = [1,2,3,4]
-    _Xm = [5,6,7,8]
-    _ID = 0
-    _M = [1,3,5,7]
-    _Mdag = [2,4,6,8]
-    code = []
-    for iblock in range(nbasis_blocks):
-        # todo block matrix multiply here
-        for jblock in range(nbasis_blocks):
-            matrix_index = nbasis_blocks * jblock + iblock
-            code.append((iblock,nbasis_blocks + jblock,_X,-1 if jblock == 0 else iblock,1.0,[(matrix_index,_X,0)]))
+        # reference solve
+        vref = [g.lattice(v) for v in vcoarse]
+        for i in range(n_rhs):
+            vref[i] = g(mcoarse[0] * vcoarse[i])
             for mu in range(4):
-                code.append((iblock,nbasis_blocks + jblock,_Xp[mu], iblock, 1.0, [(nbasis_blocks**2 * _M[mu] + matrix_index, _X, 0)]))
-                code.append((iblock,nbasis_blocks + jblock,_Xm[mu], iblock, 1.0, [(nbasis_blocks**2 * _Mdag[mu] + matrix_index, _X, 0)]))
-    st = g.stencil.matrix_vector(
-        mcoarse[0],
-        vcoarse,
-        [(0, 0, 0, 0)] + evec + nevec,
-        code, len(code) // nbasis_blocks
-    )
-    vdst = g.lattice(vcoarse)
-    st(mcoarse, [vdst,vcoarse])
-    eps2 = g.norm2(vdst - vref) / g.norm2(vref)
-    # g.message(f"Test: {eps2}")
-    assert eps2 ** 0.5 < precision.eps * 100
-
-    # Flops
-    flops_per_matrix_vector_multiply = nbasis * (nbasis * 6 + (nbasis - 1) * 2)
-    flops_per_vector_add = nbasis * 2
-    flops_per_site = 9 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
-    flops = flops_per_site * vcoarse.grid.gsites * N
-    nbytes = (9 * nbasis ** 2 * 2 + 2 * nbasis * 2) * precision.nbytes * vcoarse.grid.gsites * N
+                vref[i] += mcoarse[1 + 2*mu] * g.cshift(vcoarse[i], mu, 1)
+                vref[i] += mcoarse[2 + 2*mu] * g.cshift(vcoarse[i], mu, -1)
+        # now coarse grid dirac operator
+        _X = 0
+        _Xp = [1,2,3,4]
+        _Xm = [5,6,7,8]
+        _ID = 0
+        _M = [1,3,5,7]
+        _Mdag = [2,4,6,8]
+        code = []
+        for i in range(n_rhs):
+            ioff = nbasis_blocks * i
+            for iblock in range(nbasis_blocks):
+                # todo block matrix multiply here
+                for jblock in range(nbasis_blocks):
+                    matrix_index = nbasis_blocks * jblock + iblock
+                    code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_X,-1 if jblock == 0 else iblock + ioff,1.0,[(matrix_index,_X,0)]))
+                    for mu in range(4):
+                        code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_Xp[mu], iblock + ioff, 1.0, [(nbasis_blocks**2 * _M[mu] + matrix_index, _X, 0)]))
+                        code.append((iblock + ioff,nbasis_blocks*n_rhs + jblock + ioff,_Xm[mu], iblock + ioff, 1.0, [(nbasis_blocks**2 * _Mdag[mu] + matrix_index, _X, 0)]))
+        st = g.stencil.matrix_vector(
+            mcoarse[0],
+            vcoarse[0],
+            [(0, 0, 0, 0)] + evec + nevec,
+            code, len(code) // nbasis_blocks // n_rhs
+        )
+        st.memory_access_pattern(fast_osites=fast_osites)
+        vdst = [g.lattice(v) for v in vcoarse]
+        st(mcoarse, vdst + vcoarse)
+        for i in range(n_rhs):
+            eps2 = g.norm2(vdst[i] - vref[i]) / g.norm2(vref[i])
+            # g.message(f"Test {i}: {eps2}")
+            assert eps2 ** 0.5 < precision.eps * 100
 
-    # Warmup
-    for n in range(5):
-        st(mcoarse, [vdst,vcoarse])
+        # Flops
+        flops_per_matrix_vector_multiply = nbasis * (nbasis * 6 + (nbasis - 1) * 2)
+        flops_per_vector_add = nbasis * 2
+        flops_per_site = 9 * flops_per_matrix_vector_multiply + 8 * flops_per_vector_add
+        flops = flops_per_site * vcoarse[0].grid.gsites * N * n_rhs
+        nbytes = (9 * nbasis ** 2 * 2 + 2 * nbasis * 2) * precision.nbytes * vcoarse[0].grid.gsites * N * n_rhs
+
+        # Warmup
+        for n in range(5):
+            st(mcoarse, vdst + vcoarse)
 
-    # Time
-    t0 = g.time()
-    for n in range(N):
-        st(mcoarse, [vdst,vcoarse])
-    t1 = g.time()
+        # Time
+        t0 = g.time()
+        for n in range(N):
+            st(mcoarse, vdst + vcoarse)
+        t1 = g.time()
 
-    # Report
-    GFlopsPerSec = flops / (t1 - t0) / 1e9
-    GBPerSec = nbytes / (t1 - t0) / 1e9
-    g.message(
-        f"""
+        # Report
+        GFlopsPerSec = flops / (t1 - t0) / 1e9
+        GBPerSec = nbytes / (t1 - t0) / 1e9
+        g.message(
+            f"""
 {N} applications of {nbasis} x {nbasis}, 9-point operator
     Time to complete            : {t1-t0:.2f} s
     Total performance           : {GFlopsPerSec:.2f} GFlops/s
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s"""
-    )
+        )
diff --git a/lib/cgpt/lib/stencil.cc b/lib/cgpt/lib/stencil.cc
index 3d1a60dd..0ce3d872 100644
--- a/lib/cgpt/lib/stencil.cc
+++ b/lib/cgpt/lib/stencil.cc
@@ -64,7 +64,8 @@
 EXPORT(stencil_matrix_execute,{
 
     void* _stencil;
     PyObject* _fields;
-    if (!PyArg_ParseTuple(args, "lO", &_stencil, &_fields)) {
+    long fast_osites;
+    if (!PyArg_ParseTuple(args, "lOl", &_stencil, &_fields, &fast_osites)) {
      return NULL;
     }
 
@@ -73,7 +74,7 @@
     std::vector<cgpt_Lattice_base*> __fields;
     cgpt_basis_fill(__fields,_fields);
 
-    stencil->execute(__fields);
+    stencil->execute(__fields, fast_osites);
 
     return PyLong_FromLong(0);
   });
@@ -84,7 +85,8 @@
 EXPORT(stencil_matrix_vector_execute,{
 
    void* _stencil;
     PyObject* _matrix_fields;
     PyObject* _vector_fields;
-    if (!PyArg_ParseTuple(args, "lOO", &_stencil, &_matrix_fields, &_vector_fields)) {
+    long fast_osites;
+    if (!PyArg_ParseTuple(args, "lOOl", &_stencil, &_matrix_fields, &_vector_fields, &fast_osites)) {
      return NULL;
     }
 
@@ -94,7 +96,7 @@
     cgpt_basis_fill(__matrix_fields,_matrix_fields);
     cgpt_basis_fill(__vector_fields,_vector_fields);
 
-    stencil->execute(__matrix_fields, __vector_fields);
+    stencil->execute(__matrix_fields, __vector_fields, fast_osites);
 
     return PyLong_FromLong(0);
   });
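
The rewritten coarse-operator benchmark above packs all right-hand sides into a single stencil call: st(mcoarse, vdst + vcoarse) passes the destination fields first and the source fields second, and the generated code offsets every field index by ioff = nbasis_blocks * i. A sketch of the resulting slot layout, with hypothetical sizes chosen for illustration (the benchmark defaults to n_rhs = 12):

    # Slot layout implied by the code generation above: destinations occupy
    # slots [0, n_rhs * nbasis_blocks), sources follow directly after.
    n_rhs, nbasis_blocks = 2, 3

    def dst_index(i, iblock):
        # target slot for right-hand side i, virtual block iblock
        return iblock + nbasis_blocks * i

    def src_index(i, jblock):
        # source slot for right-hand side i, virtual block jblock
        return nbasis_blocks * n_rhs + jblock + nbasis_blocks * i

    assert dst_index(1, 2) == 5   # last destination slot
    assert src_index(0, 0) == 6   # first source slot
    assert src_index(1, 2) == 11  # last source slot
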
diff --git a/lib/cgpt/lib/stencil/common.h b/lib/cgpt/lib/stencil/common.h
index 58dc8c53..6cd454c3 100644
--- a/lib/cgpt/lib/stencil/common.h
+++ b/lib/cgpt/lib/stencil/common.h
@@ -17,6 +17,16 @@
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 
+// indexing
+#define MAP_INDEXING(ss, oblock)   \
+  if (_fast_osites) {              \
+    oblock = ss_block / osites;    \
+    ss = ss_block % osites;        \
+  } else {                         \
+    ss = ss_block / _npb;          \
+    oblock = ss_block % _npb;      \
+  }
+
 // cartesian stencil fetch
 #define fetch_cs(sidx, obj, point, site, view, do_adj, tag) {   \
     if (point == -1) {                                          \
@@ -29,8 +39,8 @@
     } else {                                                    \
       obj = coalescedRead(buf ## tag[sidx][SE->_offset]);       \
     }                                                           \
-    acceleratorSynchronise();                                   \
   }                                                             \
+  acceleratorSynchronise();                                     \
   if (do_adj)                                                   \
     obj = adj(obj);                                             \
   }
diff --git a/lib/cgpt/lib/stencil/matrix.h b/lib/cgpt/lib/stencil/matrix.h
index 708f6a8a..2f4729c1 100644
--- a/lib/cgpt/lib/stencil/matrix.h
+++ b/lib/cgpt/lib/stencil/matrix.h
@@ -42,7 +42,7 @@ struct cgpt_stencil_matrix_code_t {
 class cgpt_stencil_matrix_base {
  public:
   virtual ~cgpt_stencil_matrix_base() {};
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields) = 0;
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& fields, int fast_osites) = 0;
 };
 
 template<typename T>
@@ -125,7 +125,7 @@ class cgpt_stencil_matrix : public cgpt_stencil_matrix_base {
     }
   }
 
-  virtual void execute(PVector<Lattice<T>> &fields) {
+  virtual void execute(PVector<Lattice<T>> &fields, int fast_osites) {
 
     VECTOR_VIEW_OPEN(fields,fields_v,AcceleratorWrite);
 
@@ -139,14 +139,19 @@
     int _npb = n_code_parallel_blocks;
     int _npbs = n_code_parallel_block_size;
 
+    uint64_t osites = fields[0].Grid()->oSites();
+
+    int _fast_osites = fast_osites;
+
     if (local) {
 
       auto sview = general_local_stencil->View();
 
-      accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
+      accelerator_for(ss_block,osites * _npb,T::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -179,8 +184,9 @@
       // now loop
       accelerator_for(ss_block,fields[0].Grid()->oSites() * _npb,T::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -214,10 +220,10 @@
     VECTOR_VIEW_CLOSE(fields_v);
   }
 
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields) {
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& __fields, int fast_osites) {
     PVector<Lattice<T>> fields;
     cgpt_basis_fill(fields,__fields);
-    execute(fields);
+    execute(fields, fast_osites);
   }
 };
diff --git a/lib/cgpt/lib/stencil/matrix_vector.h b/lib/cgpt/lib/stencil/matrix_vector.h
index 70b174ed..d73f8277 100644
--- a/lib/cgpt/lib/stencil/matrix_vector.h
+++ b/lib/cgpt/lib/stencil/matrix_vector.h
@@ -45,7 +45,7 @@ struct cgpt_stencil_matrix_vector_code_t {
 class cgpt_stencil_matrix_vector_base {
  public:
   virtual ~cgpt_stencil_matrix_vector_base() {};
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& matrix_fields, const std::vector<cgpt_Lattice_base*>& vector_fields) = 0;
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& matrix_fields, const std::vector<cgpt_Lattice_base*>& vector_fields, int fast_osites) = 0;
 };
 
 template<typename M, typename V>
@@ -146,7 +146,7 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     }
   }
 
-  virtual void execute(PVector<Lattice<M>> &fields_matrix, PVector<Lattice<V>> &fields_vector) {
+  virtual void execute(PVector<Lattice<M>> &fields_matrix, PVector<Lattice<V>> &fields_vector, int fast_osites) {
 
     VECTOR_VIEW_OPEN(fields_matrix,fields_m_v,AcceleratorRead);
     VECTOR_VIEW_OPEN(fields_vector,fields_v_v,AcceleratorWrite);
 
@@ -162,13 +162,18 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     int _npb = n_code_parallel_blocks;
     int _npbs = n_code_parallel_block_size;
 
+    uint64_t osites = fields_matrix[0].Grid()->oSites();
+
+    int _fast_osites = fast_osites;
+
     if (local) {
 
       auto sview = general_local_stencil->View();
 
-      accelerator_for(ss_block,fields_matrix[0].Grid()->oSites() * _npb,M::Nsimd(),{
+      accelerator_for(ss_block,osites * _npb,M::Nsimd(),{
 
-          auto ss = ss_block / _npb;
-          auto oblock = ss_block % _npb;
+          uint64_t ss, oblock;
+
+          MAP_INDEXING(ss, oblock);
 
           for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -196,11 +201,12 @@ class cgpt_stencil_matrix_vector : public cgpt_stencil_matrix_vector_base {
     CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(M, _matrix);
     CGPT_CARTESIAN_STENCIL_HALO_EXCHANGE(V, _vector);
-    
-    accelerator_for(ss_block,fields_matrix[0].Grid()->oSites() * _npb,M::Nsimd(),{
+
+    accelerator_for(ss_block,osites * _npb,M::Nsimd(),{
 
-        auto ss = ss_block / _npb;
-        auto oblock = ss_block % _npb;
+        uint64_t ss, oblock;
+
+        MAP_INDEXING(ss, oblock);
 
         for (int iblock=0;iblock<_npbs;iblock++) {
 
@@ -235,12 +241,12 @@
     VECTOR_VIEW_CLOSE(fields_v_v);
   }
 
-  virtual void execute(const std::vector<cgpt_Lattice_base*>& __matrix_fields, const std::vector<cgpt_Lattice_base*>& __vector_fields) {
+  virtual void execute(const std::vector<cgpt_Lattice_base*>& __matrix_fields, const std::vector<cgpt_Lattice_base*>& __vector_fields, int fast_osites) {
     PVector<Lattice<M>> matrix_fields;
     PVector<Lattice<V>> vector_fields;
     cgpt_basis_fill(matrix_fields,__matrix_fields);
     cgpt_basis_fill(vector_fields,__vector_fields);
-    execute(matrix_fields, vector_fields);
+    execute(matrix_fields, vector_fields, fast_osites);
   }
 };
diff --git a/lib/gpt/core/local_stencil/matrix.py b/lib/gpt/core/local_stencil/matrix.py
index 87fc3b5b..e95dbac4 100644
--- a/lib/gpt/core/local_stencil/matrix.py
+++ b/lib/gpt/core/local_stencil/matrix.py
@@ -36,12 +36,16 @@ def __init__(self, lat, points, code, code_parallel_block_size=None, local=1):
         self.obj = cgpt.stencil_matrix_create(
             lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local
         )
+        self.fast_osites = 1
 
     def __call__(self, *fields):
-        cgpt.stencil_matrix_execute(self.obj, list(fields))
+        cgpt.stencil_matrix_execute(self.obj, list(fields), self.fast_osites)
 
     def __del__(self):
         cgpt.stencil_matrix_delete(self.obj)
 
     def data_access_hints(self, *hints):
         pass
+
+    def memory_access_pattern(self, fast_osites):
+        self.fast_osites = fast_osites
diff --git a/lib/gpt/core/local_stencil/matrix_vector.py b/lib/gpt/core/local_stencil/matrix_vector.py
index b266a85a..9227052f 100644
--- a/lib/gpt/core/local_stencil/matrix_vector.py
+++ b/lib/gpt/core/local_stencil/matrix_vector.py
@@ -49,12 +49,16 @@ def __init__(self, lat_matrix, lat_vector, points, code, code_parallel_block_siz
             code_parallel_block_size,
             local
         )
+        self.fast_osites = 1
 
     def __call__(self, matrix_fields, vector_fields):
-        cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields)
+        cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields, self.fast_osites)
 
     def __del__(self):
         cgpt.stencil_matrix_vector_delete(self.obj)
 
     def data_access_hints(self, *hints):
         pass
+
+    def memory_access_pattern(self, fast_osites):
+        self.fast_osites = fast_osites
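
The MAP_INDEXING macro introduced in common.h decides which index varies fastest as the flat kernel index ss_block increases: with _fast_osites set, neighbouring iterations touch neighbouring sites within the same parallel code block, which should be the friendlier pattern for coalesced GPU accesses; otherwise they cycle through the code blocks of a single site. A pure-Python model of the mapping (illustration only):

    # Model of MAP_INDEXING: ss is the outer-site index in [0, osites),
    # oblock the parallel code-block index in [0, npb).
    def map_indexing(ss_block, osites, npb, fast_osites):
        if fast_osites:
            oblock, ss = divmod(ss_block, osites)  # site index runs fastest
        else:
            ss, oblock = divmod(ss_block, npb)  # block index runs fastest
        return ss, oblock

    assert map_indexing(1, osites=64, npb=4, fast_osites=1) == (1, 0)
    assert map_indexing(1, osites=64, npb=4, fast_osites=0) == (0, 1)

From Python the ordering is selected per stencil object through the memory_access_pattern method added above; both wrappers default to fast_osites = 1.
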
diff --git a/lib/gpt/core/object_type/container.py b/lib/gpt/core/object_type/container.py
index 8a7555df..61f9640d 100644
--- a/lib/gpt/core/object_type/container.py
+++ b/lib/gpt/core/object_type/container.py
@@ -30,8 +30,11 @@
 
 ###
 # Helper
+max_fundamental_singlet_size = gpt.default.get_int("--max_fundamental_singlet_size", 100000)
 def decompose(n, ns, rank):
     for x in reversed(sorted(ns)):
+        if x > max_fundamental_singlet_size:
+            continue
         if n % x == 0:
             return [x] * ((n // x) ** rank)
     raise Exception("Cannot decompose %d in available fundamentals %s" % (n, ns))
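
The new guard lets decompose skip fundamental sizes that divide n but exceed the --max_fundamental_singlet_size command-line limit. A standalone behavior sketch with hypothetical inputs:

    # Guarded decompose as above; max_size mirrors max_fundamental_singlet_size.
    def decompose(n, ns, rank, max_size=100000):
        for x in reversed(sorted(ns)):
            if x > max_size:
                continue  # skip fundamentals above the limit even if they divide n
            if n % x == 0:
                return [x] * ((n // x) ** rank)
        raise Exception("Cannot decompose %d in available fundamentals %s" % (n, ns))

    assert decompose(8, [2, 4], 1) == [4, 4]  # largest admissible divisor wins
    assert decompose(8, [2, 4], 1, max_size=3) == [2, 2, 2, 2]  # 4 is skipped
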