tune
lehner committed Jun 6, 2024
1 parent d9984ba commit 758392a
Showing 4 changed files with 24 additions and 15 deletions.
3 changes: 0 additions & 3 deletions lib/gpt/core/local_stencil/matrix.py
@@ -46,6 +46,3 @@ def __del__(self):

    def data_access_hints(self, *hints):
        pass
-
-    def memory_access_pattern(self, fast_osites):
-        self.fast_osites = fast_osites
3 changes: 0 additions & 3 deletions lib/gpt/core/local_stencil/matrix_vector.py
@@ -61,6 +61,3 @@ def __del__(self):

    def data_access_hints(self, *hints):
        pass
-
-    def memory_access_pattern(self, fast_osites):
-        self.fast_osites = fast_osites
15 changes: 7 additions & 8 deletions lib/gpt/core/local_stencil/tensor.py
@@ -47,22 +47,21 @@ def __init__(self, lat, points, code, segments, local=1):
        self.obj = cgpt.stencil_tensor_create(
            lat.v_obj[0], lat.grid.obj, points, self.code, self.segments, local
        )
-        self.osites_per_cache_block = lat.grid.gsites

        # auto tuner
+        gsites = int(lat.grid.gsites)
        tag = f"local_tensor({lat.otype.__name__}, {lat.grid.describe()}, {hash_code(code)}, {len(segments)}, {local})"
-        super().__init__(tag, [2, 4, 8, 16, 32, 64, 128, 256], 4)
+        super().__init__(tag, [
+            (opi, opi * opcb) for opi in [2, 4, 8, 16, 32, 64, 128, 256] for opcb in [256, 1024, 8192, gsites]
+        ], (4, gsites))

    @auto_tuned_method
-    def __call__(self, opi, *fields):
-        cgpt.stencil_tensor_execute(self.obj, list(fields), opi, self.osites_per_cache_block)
+    def __call__(self, performance_args, *fields):
+        opi, opcb = performance_args
+        cgpt.stencil_tensor_execute(self.obj, list(fields), opi, opcb)

    def __del__(self):
        cgpt.stencil_tensor_delete(self.obj)

    def data_access_hints(self, *hints):
        pass
-
-    def memory_access_pattern(self, osites_per_instruction, osites_per_cache_block):
-        self.osites_per_instruction = osites_per_instruction
-        self.osites_per_cache_block = osites_per_cache_block
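With this change the auto tuner for the local tensor stencil searches pairs of (osites_per_instruction, osites_per_cache_block) instead of a one-dimensional list of instruction block sizes, and the cache-block size is no longer a fixed attribute but part of the tuned parameters unpacked in __call__. A minimal sketch of the candidate space built by the new super().__init__ call; the gsites value below is a hypothetical stand-in, since the real value comes from int(lat.grid.gsites), and the last tuple element is presumably the default used before tuning results are available:

# Sketch only: reproduce the tuning candidates enumerated by the new super().__init__ call.
gsites = 16 * 16 * 16 * 32  # hypothetical local volume; the real value is int(lat.grid.gsites)

candidates = [
    (opi, opi * opcb)  # (osites_per_instruction, osites_per_cache_block)
    for opi in [2, 4, 8, 16, 32, 64, 128, 256]
    for opcb in [256, 1024, 8192, gsites]
]
default = (4, gsites)  # third argument of super().__init__ in the diff

print(len(candidates), "candidate pairs, default =", default)  # -> 32 candidate pairs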
18 changes: 17 additions & 1 deletion lib/gpt/qcd/gauge/smear/local_stout.py
@@ -384,7 +384,7 @@ def gradient(self, U, dU):
for b in range(ng):
    dJdX[b] = g(-dJdX[b])

-t("Invert M_ab")
+t("invert M_ab")
inv_M_ab = g.matrix.inv(M_ab)

t("N M^-1")
@@ -425,7 +425,9 @@ def gradient(self, U, dU):
PlaqR = g((-rho) * csf(U[nu], nu, csf(U[mu], mu, csb(U[nu], nu, csb(U_mu_masked, mu)))))

dJdXe_nMpInv_y = dJdXe_nMpInv
+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_nu = g(g.transpose(Nxy) * dJdXe_nMpInv_y)

PlaqR = g((-1.0) * PlaqR)
@@ -439,7 +441,9 @@
PlaqL = csb(U_mu_masked, mu)

dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv, mu, -1)
+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_nu += g.transpose(Nxy) * dJdXe_nMpInv_y

MpInvJx_nu = g.cshift(MpInvJx, mu, -1)
@@ -453,7 +457,9 @@
PlaqR = csf(U[nu], nu)

dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv, nu, 1)
+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_nu += g.transpose(Nxy) * dJdXe_nMpInv_y

MpInvJx_nu = g.cshift(MpInvJx, nu, 1)
@@ -469,7 +475,9 @@
dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv, mu, -1)
dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv_y, nu, 1)

+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_nu += g.transpose(Nxy) * dJdXe_nMpInv_y

MpInvJx_nu = g.cshift(MpInvJx, mu, -1)
@@ -480,15 +488,19 @@
Fdet2_nu += FdetV

# force contributions to fundamental representation
+t("adj_to_fund")
adjoint_to_fundamental(Fdet1[nu], Fdet1_nu, generators)
adjoint_to_fundamental(Fdet2[nu], Fdet2_nu, generators)
+t("non-local")

# mu cw
PlaqL = g((-rho) * csf(U[mu], mu, csb(U[nu], nu, csb(U_mu_masked, mu))))
PlaqR = csb(U[nu], nu)

dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv, nu, -1)
+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_mu += g.transpose(Nxy) * dJdXe_nMpInv_y

MpInvJx_nu = g.cshift(MpInvJx, nu, -1)
@@ -503,7 +515,9 @@

dJdXe_nMpInv_y = g.cshift(dJdXe_nMpInv, nu, 1)

+t("compute_adj_ab")
compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab)
+t("non-local")
Fdet1_mu += g.transpose(Nxy) * dJdXe_nMpInv_y

MpInvJx_nu = g.cshift(MpInvJx, nu, 1)
@@ -516,8 +530,10 @@
t("aggregate")
Fdet1_mu += g.transpose(NxxAd) * dJdXe_nMpInv

+t("adj_to_fund")
adjoint_to_fundamental(Fdet1[mu], Fdet1_mu, generators)
adjoint_to_fundamental(Fdet2[mu], Fdet2_mu, generators)
+t("aggregate")

force = [g((0.5 * 1j) * (x + y)) for x, y in zip(Fdet1, Fdet2)]

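The local_stout.py changes only add finer-grained profiling: the existing t(...) calls now separate the compute_adj_ab and adjoint_to_fundamental work from the surrounding non-local shift and aggregate phases, so repeated labels accumulate per phase across the mu/nu loops. A rough illustration of that phase-label timing pattern follows; this is a toy stand-in, not the gpt timer implementation:

import time
from collections import defaultdict

class PhaseTimer:
    # Toy stand-in: calling the timer with a label closes the current phase
    # and opens the named one; repeated labels accumulate their durations.
    def __init__(self):
        self.totals = defaultdict(float)
        self.current = None
        self.started = None

    def __call__(self, label=None):
        now = time.perf_counter()
        if self.current is not None:
            self.totals[self.current] += now - self.started
        self.current, self.started = label, now

t = PhaseTimer()
t("compute_adj_ab")   # ... compute_adj_ab(PlaqL, PlaqR, Nxy, generators, cache_ab) would run here ...
t("non-local")        # ... Fdet1_nu += g.transpose(Nxy) * dJdXe_nMpInv_y ...
t("adj_to_fund")      # ... adjoint_to_fundamental(Fdet1[nu], Fdet1_nu, generators) ...
t(None)               # close the last phase
print(dict(t.totals))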
