optimize

lehner · Oct 26, 2023 · 199b9c5 · 199b9c5
1 parent 6800f15
commit 199b9c5
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 9 deletions.
diff --git a/benchmarks/matrix_multiply.py b/benchmarks/matrix_multiply.py
@@ -30,8 +30,11 @@
         three = g.lattice(grid, tp)
         rng.cnormal([one, two])
 
-        # Rank inner product
+        # matrix multiply
         nbytes = 3.0 * one.global_bytes() * N
+        n = (one.otype.nfloats // 2) ** 0.5  # matrix dimension; assumes nfloats = 2 * n * n (re/im per entry) -- TODO confirm
+        flops_per_matrix_multiply = n * n * (n * 6 + (n - 1) * 2)  # per entry: n products at 6 flops, n - 1 sums at 2 flops
+        flops = grid.gsites * N * flops_per_matrix_multiply  # total over all sites and N multiplies; per-multiply count left intact
 
         # Time
         dt = 0.0
@@ -44,10 +47,12 @@
 
         # Report
         GBPerSec = nbytes / dt / 1e9
+        GFLPerSec = flops / dt / 1e9
         g.message(
             f"""{N} matrix_multiply
     Object type                 : {tp.__name__}
     Time to complete            : {dt:.2g} s
+    GFlops/s                    : {GFLPerSec:.2f}
     Effective memory bandwidth  : {GBPerSec:.2f} GB/s
 """
         )
diff --git a/benchmarks/stencil_tensor.py b/benchmarks/stencil_tensor.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
 import gpt as g
 
-#grid = g.grid([64,64,64,64], g.double)
-grid = g.grid([32,32,32,32], g.double)
-#grid = g.grid([32,16,16,16], g.double)
-#grid = g.grid([16,16,16,32], g.double)
-#grid = g.grid([2*4,4*3,3*4,3*3*4], g.double)
+grid = g.grid(g.default.get_ivec("--grid", [16, 16, 16, 32], 4), g.double)
+
 m1 = g.mcolor(grid)
 m2 = g.mcolor(grid)
 m3 = g.mcolor(grid)
@@ -69,6 +66,7 @@
                     (0,dst,ti.mov if l == 0 else ti.inc,1.0,[(2,0,3*i + l),(-1,0,3*l + j)])
                 )
     segments = [(3, 9), (3, 9)]
+    #segments = [(27*2, 1)]
 else:
     for i in range(3):
         for j in range(3):
@@ -153,7 +151,7 @@
 g.message(g.norm2(R - R2) / g.norm2(R))
 #
 #            D[i2[0], i1[0]] += sign1 * sign2 * Q1[i1[1], i2[1]] * g.transpose(Q2[i1[2], i2[2]])
-for osites_per_instruction in [4,16,32,64,256]:
+for osites_per_instruction in [1,2,4,16,32,64,256]:
     for osites_per_cache_block in [ grid.gsites]:
         ein.memory_access_pattern(osites_per_instruction, osites_per_cache_block)
 

diff --git a/lib/gpt/core/local_stencil/matrix.py b/lib/gpt/core/local_stencil/matrix.py
@@ -36,7 +36,7 @@ def __init__(self, lat, points, code, code_parallel_block_size=None, local=1):
         self.obj = cgpt.stencil_matrix_create(
             lat.v_obj[0], lat.grid.obj, points, self.code, code_parallel_block_size, local
         )
-        self.fast_osites = 1
+        self.fast_osites = 0
 
     def __call__(self, *fields):
         cgpt.stencil_matrix_execute(self.obj, list(fields), self.fast_osites)

diff --git a/lib/gpt/core/local_stencil/matrix_vector.py b/lib/gpt/core/local_stencil/matrix_vector.py
@@ -51,7 +51,7 @@ def __init__(
             code_parallel_block_size,
             local,
         )
-        self.fast_osites = 1
+        self.fast_osites = 0
 
     def __call__(self, matrix_fields, vector_fields):
         cgpt.stencil_matrix_vector_execute(self.obj, matrix_fields, vector_fields, self.fast_osites)