[ci skip] set limit for pragma unrolls

getkeops · Aug 11, 2021 · 31f7280 · 31f7280
1 parent 6f63977
commit 31f7280
Show file tree

Hide file tree

Showing 8 changed files with 34 additions and 23 deletions.
diff --git a/keops/python_engine/formulas/maths/TensorDot.py b/keops/python_engine/formulas/maths/TensorDot.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from keops.python_engine.formulas.Operation import Operation
-
+from keops.python_engine.utils.code_gen_utils import use_pragma_unroll
 
 ####################################
 ######  Tensor Dot Product     #####
@@ -152,11 +152,11 @@ def Op(self, out, table, arg0, arg1):
 
         return f"""
                     #if C_CONTIGUOUS     // row major
-                        #pragma unroll
+                        {use_pragma_unroll()}
                         for (int i = 0; i < {out.dim}; i++)
                             {out.id}[i] = ({out.dtype})(0.0f);
                         
-                        #pragma unroll                        
+                        {use_pragma_unroll()}                       
                         {str_code}
                     #else               // column major
                         

diff --git a/keops/python_engine/formulas/reductions/KMin_ArgKMin_Reduction.py b/keops/python_engine/formulas/reductions/KMin_ArgKMin_Reduction.py
@@ -7,6 +7,7 @@
     new_c_varname,
     c_if,
     c_array,
+    use_pragma_unroll
 )
 from keops.python_engine.formulas.reductions.Reduction import Reduction
 
@@ -79,10 +80,10 @@ def ReducePairShort(self, acc, xi, ind):
                     {{
                         {xik.declare()}
                         {l.declare()}
-                        #pragma unroll
+                        {use_pragma_unroll()}
                         for(int {k.id}=0; {k.id}<{fdim}; {k.id}++) {{
                             {xik.assign(xi[k])}
-                            #pragma unroll                     
+                            {use_pragma_unroll()}                 
                             for({l.id}={(k+(K-1)*2*fdim).id}; {l.id}>={k.id} && {(xik<acc[l]).id}; {l.id}-={2*fdim}) {{
                                 {tmpl.declare_assign(acc[l])}
                                 {indtmpl.declare_assign(acc[l+fdim])}

diff --git a/keops/python_engine/mapreduce/GpuReduc1D_chunks.py b/keops/python_engine/mapreduce/GpuReduc1D_chunks.py
@@ -11,7 +11,8 @@
     sizeof,
     pointer,
     table,
-    table4
+    table4,
+    use_pragma_unroll
 )
 from keops.python_engine.formulas.reductions.sum_schemes import *
 from keops.python_engine.compilation import Gpu_link_compile
@@ -186,7 +187,7 @@ def get_code(self):
                             }}
                             
                             // looping on chunks (except the last)
-                    		#pragma unroll
+                    		{use_pragma_unroll()}
                     		for (int chunk=0; chunk<{chk.nchunks}-1; chunk++) {{
                               {chunk_sub_routine}
                             }}

diff --git a/keops/python_engine/mapreduce/GpuReduc1D_finalchunks.py b/keops/python_engine/mapreduce/GpuReduc1D_finalchunks.py
@@ -12,7 +12,8 @@
     pointer,
     table,
     table4,
-    Var_loader
+    Var_loader,
+    use_pragma_unroll
 )
 from keops.python_engine.formulas.reductions.sum_schemes import *
 from keops.python_engine.formulas.reductions.Sum_Reduction import Sum_Reduction
@@ -39,15 +40,15 @@ def do_finalchunk_sub(dtype, varfinal, dimfinalchunk_curr,
                 __syncthreads();
                 for (int jrel = 0; (jrel < blockDim.x) && (jrel < {ny.id} - {jstart.id}); jrel++, yjrel += {dimfinalchunk}) {{          
                     if ({i.id} < {nx.id}) {{ // we compute only if needed
-                        #pragma unroll
+                        {use_pragma_unroll()}
                         for (int k=0; k<{dimfinalchunk_curr}; k++) {{
                             {acc.id}[k] += yjrel[k] * fout[jrel];
                         }}
                     }}
                     __syncthreads();
                 }}
                 if ({i.id} < {nx.id}) {{
-                    #pragma unroll
+                    {use_pragma_unroll()}
                     for (int k=0; k<{dimfinalchunk_curr}; k++)
                         {out.id}[i*{dimout}+{chunk.id}*{dimfinalchunk}+k] += {acc.id}[k];
                 }}
@@ -145,7 +146,7 @@ def get_code(self):
                           {xi.declare()}
                           if (i < nx) {{
                               {load_vars(dimsx, indsi, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
-                              #pragma unroll
+                              {use_pragma_unroll()}
                               for (int k=0; k<{dimout}; k++) {{
                                   out[i*{dimout}+k] = 0.0f;
                               }}

diff --git a/keops/python_engine/mapreduce/GpuReduc1D_ranges_chunks.py b/keops/python_engine/mapreduce/GpuReduc1D_ranges_chunks.py
@@ -13,7 +13,8 @@
     pointer,
     table,
     table4,
-    Var_loader
+    Var_loader,
+    use_pragma_unroll
 )
 from keops.python_engine.formulas.reductions.sum_schemes import *
 from keops.python_engine.compilation import Gpu_link_compile
@@ -286,7 +287,7 @@ def get_code(self):
                                   }}
                                   
                                   // looping on chunks (except the last)
-                          		  #pragma unroll
+                          		  {use_pragma_unroll()}
                           		  for (int chunk=0; chunk<{chk.nchunks}-1; chunk++) {{
                                       {chunk_sub_routine}
                                   }}

diff --git a/keops/python_engine/mapreduce/GpuReduc1D_ranges_finalchunks.py b/keops/python_engine/mapreduce/GpuReduc1D_ranges_finalchunks.py
@@ -13,7 +13,8 @@
     pointer,
     table,
     table4,
-    Var_loader
+    Var_loader,
+    use_pragma_unroll
 )
 from keops.python_engine.formulas.reductions.sum_schemes import *
 from keops.python_engine.formulas.reductions.Sum_Reduction import Sum_Reduction
@@ -48,15 +49,15 @@ def do_finalchunk_sub_ranges(dtype, fun_global, varfinal, dimfinalchunk_curr,
                 __syncthreads();
                 for (int jrel = 0; (jrel < blockDim.x) && (jrel < {end_y.id} - {jstart.id}); jrel++, yjrel += {dimfinalchunk}) {{          
                     if ({i.id} < {end_x.id}) {{ // we compute only if needed
-                        #pragma unroll
+                        {use_pragma_unroll()}
                         for (int k=0; k<{dimfinalchunk_curr}; k++) {{
                             {acc.id}[k] += yjrel[k] * fout[jrel];
                         }}
                     }}
                     __syncthreads();
                 }}
                 if ({i.id} < {end_x.id}) {{
-                    #pragma unroll
+                    {use_pragma_unroll()}
                     for (int k=0; k<{dimfinalchunk_curr}; k++)
                         {out.id}[i*{dimout}+{chunk.id}*{dimfinalchunk}+k] += {acc.id}[k];
                 }}
@@ -219,7 +220,7 @@ def get_code(self):
                                   {varloader.load_vars("i", xi, args, row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global)} // Possibly, with offsets as we support broadcasting over batch dimensions
                               }}
                               
-                              #pragma unroll
+                              {use_pragma_unroll()}
                               for (int k=0; k<{dimout}; k++) {{
                                   out[i*{dimout}+k] = 0.0f;
                               }}

diff --git a/keops/python_engine/mapreduce/GpuReduc2D.py b/keops/python_engine/mapreduce/GpuReduc2D.py
@@ -6,7 +6,8 @@
     c_include,
     signature_list,
     call_list,
-    pointer
+    pointer,
+    use_pragma_unroll
 )
 from keops.python_engine.compilation import Gpu_link_compile
 from keops.python_engine.formulas.reductions.sum_schemes import block_sum, kahan_scheme
@@ -188,7 +189,7 @@ def get_code(self):
                             // lines of size DIMRED. The final reduction, which "sums over the block lines",
                             // shall be done in a later step.
                             if(i<nx) {{
-                                #pragma unroll
+                                {use_pragma_unroll()}
                                 for(int k=0; k<{dimred}; k++) {{
                                     out[blockIdx.y*{dimred}*nx+i*{dimred}+k] = acc[k];
                                 }}

diff --git a/keops/python_engine/utils/code_gen_utils.py b/keops/python_engine/utils/code_gen_utils.py
@@ -190,6 +190,11 @@ def __getitem__(self, other):
         else:
             raise ValueError("not implemented")
 
+def use_pragma_unroll(n=64):
+    """Return a newline-wrapped '#pragma unroll' line, bounded by n unless n is None."""
+    if n is None:
+        return "\n#pragma unroll\n"
+    return f"\n#pragma unroll({n})\n"
 
 def c_for_loop(start, end, incr, pragma_unroll=False):
     def to_string(x):
@@ -208,7 +213,7 @@ def to_string(x):
     def print(body_code):
         string = ""
         if pragma_unroll:
-            string += "\n#pragma unroll\n"
+            string += use_pragma_unroll()
         string += f""" for(int {k.id}={start}; {k.id}<{end}; {k.id}+=({incr})) {{
                             {body_code}
                         }}
@@ -615,7 +620,7 @@ def load_vars(dims, inds, xloc, args, row_index=c_zero_int, offsets=None, indsre
             row_index_str = (
                     f"({row_index.id}+{offsets.id}[{l}])" if offsets else row_index.id
                 )
-            string += "#pragma unroll\n"
+            string += use_pragma_unroll()
             string += f"for(int v=0; v<{dims[u]}; v++) {{\n"
             string += f"    {xloc.id}[a] = {args[inds[u]].id}[{row_index_str}*{dims[u]}+v];\n"
             string += "     a++;\n"
@@ -657,7 +662,7 @@ def load_vars_chunks(inds, dim_chunk, dim_chunk_load, dim_org,
         string += "{"
         string += "int a=0;\n"
         for u in range(len(inds)):
-            string += "#pragma unroll\n"
+            string += use_pragma_unroll()
             string += f"for(int v=0; v<{dim_chunk_load}; v++) {{\n"
             string += f"    {xloc.id}[a] = {args[inds[u]].id}[{row_index.id}*{dim_org}+{k.id}*{dim_chunk}+v];\n"
             string += "     a++;\n"
@@ -686,7 +691,7 @@ def load_vars_chunks_offsets(inds, indsref, dim_chunk, dim_chunk_load, dim_org,
         string += "int a=0;\n"
         for u in range(len(inds)):
             l = indsref.index(inds[u])
-            string += "#pragma unroll\n"
+            string += use_pragma_unroll()  # unroll directive emitted just before the generated copy loop
             string += f"for(int v=0; v<{dim_chunk_load}; v++) {{\n"
             string += f"    {xloc.id}[a] = {args[inds[u]].id}[({row_index.id}+{offsets.id}[{l}])*{dim_org}+{k.id}*{dim_chunk}+v];\n"
             string += "     a++;\n"