Merge branch 'global_var_v2' into Index

getkeops · Apr 16, 2024 · bab8ad5 · bab8ad5
2 parents ad3930c + 9cc98b3
commit bab8ad5
Show file tree

Hide file tree

Showing 22 changed files with 367 additions and 103 deletions.
diff --git a/keopscore/keopscore/config/config.py b/keopscore/keopscore/config/config.py
@@ -10,6 +10,7 @@
 # global parameters can be set here :
 use_cuda = True  # use cuda if possible
 use_OpenMP = True  # use OpenMP if possible (see function set_OpenMP below)
+lim_dim_local_var = 10000
 
 # System Path
 base_dir_path = os.path.abspath(join(os.path.dirname(os.path.realpath(__file__)), ".."))

diff --git a/keopscore/keopscore/mapreduce/Chunk_Mode_Constants.py b/keopscore/keopscore/mapreduce/Chunk_Mode_Constants.py
@@ -5,7 +5,7 @@
 
 class Chunk_Mode_Constants:
     def __init__(self, red_formula):
-        varloader = Var_loader(red_formula)
+        varloader = Var_loader(red_formula, force_all_local=True)
 
         self.red_formula = red_formula
         self.dimred = red_formula.dimred  # dimension of reduction operation

diff --git a/keopscore/keopscore/mapreduce/MapReduce.py b/keopscore/keopscore/mapreduce/MapReduce.py
@@ -40,7 +40,9 @@ def __init__(
         self.use_half = use_half
         self.use_fast_math = use_fast_math
         self.device_id = device_id
-        self.varloader = Var_loader(self.red_formula)
+        self.varloader = Var_loader(
+            self.red_formula, force_all_local=self.force_all_local
+        )
 
     def get_code(self):
         self.headers = "#define C_CONTIGUOUS 1\n"
@@ -64,8 +66,8 @@ def get_code(self):
         nx = c_variable("signed long int", "nx")
         ny = c_variable("signed long int", "ny")
 
-        self.xi = c_array(dtype, self.varloader.dimx, "xi")
-        self.param_loc = c_array(dtype, self.varloader.dimp, "param_loc")
+        self.xi = c_array(dtype, self.varloader.dimx_local, "xi")
+        self.param_loc = c_array(dtype, self.varloader.dimp_local, "param_loc")
 
         argname = new_c_varname("arg")
         self.arg = c_variable(pointer(pointer(dtype)), argname)

diff --git a/keopscore/keopscore/mapreduce/cpu/CpuAssignZero.py b/keopscore/keopscore/mapreduce/cpu/CpuAssignZero.py
@@ -11,6 +11,8 @@
 class CpuAssignZero(MapReduce, Cpu_link_compile):
     # class for generating the final C++ code, Cpu version
 
+    force_all_local = True
+
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
         Cpu_link_compile.__init__(self)

diff --git a/keopscore/keopscore/mapreduce/cpu/CpuReduc.py b/keopscore/keopscore/mapreduce/cpu/CpuReduc.py
@@ -12,6 +12,7 @@ class for generating the final C++ code, Cpu version
     """
 
     AssignZero = CpuAssignZero
+    force_all_local = True
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)

diff --git a/keopscore/keopscore/mapreduce/cpu/CpuReduc_ranges.py b/keopscore/keopscore/mapreduce/cpu/CpuReduc_ranges.py
@@ -15,6 +15,7 @@ class CpuReduc_ranges(MapReduce, Cpu_link_compile):
     # class for generating the final C++ code, Cpu version
 
     AssignZero = CpuAssignZero
+    force_all_local = True
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -41,7 +42,6 @@ def get_code(self):
         param_loc = self.param_loc
 
         varloader = self.varloader
-        table = varloader.table(xi, yj, param_loc)
 
         nvarsi, nvarsj, nvarsp = (
             len(self.varloader.Varsi),
@@ -60,8 +60,13 @@ def get_code(self):
         indices_i = c_array("int", nvarsi, "indices_i")
         indices_j = c_array("int", nvarsj, "indices_j")
         indices_p = c_array("int", nvarsp, "indices_p")
-        imstartx = c_variable("int", "i-start_x")
-        jmstarty = c_variable("int", "j-start_y")
+        imstartx = c_variable("int", "(i-start_x)")
+        jmstarty = c_variable("int", "(j-start_y)")
+
+        table_nobatchmode = varloader.table(xi, yj, param_loc, args, i, j)
+        table_batchmode = varloader.table(
+            xi, yj, param_loc, args, imstartx, jmstarty, indices_i, indices_j, indices_p
+        )
 
         headers = ["cmath", "stdlib.h"]
         if keopscore.config.config.use_OpenMP:
@@ -210,13 +215,13 @@ def get_code(self):
                 if (nbatchdims == 0) {{
                     for (signed long int j = start_y; j < end_y; j++) {{
                         {varloader.load_vars("j", yj, args, row_index=j)}
-                        {red_formula.formula(fout,table)}
+                        {red_formula.formula(fout,table_nobatchmode)}
                         {sum_scheme.accumulate_result(acc, fout, j)}
                     }}
                 }} else {{
                     for (signed long int j = start_y; j < end_y; j++) {{
                         {varloader.load_vars("j", yj, args, row_index=jmstarty, offsets=indices_j)}
-                        {red_formula.formula(fout,table)}
+                        {red_formula.formula(fout,table_batchmode)}
                         {sum_scheme.accumulate_result(acc, fout, jmstarty)}
                     }}
                 }}

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuAssignZero.py b/keopscore/keopscore/mapreduce/gpu/GpuAssignZero.py
@@ -9,6 +9,8 @@
 class GpuAssignZero(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
+    force_all_local = True
+
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
         Gpu_link_compile.__init__(self)

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuReduc1D.py b/keopscore/keopscore/mapreduce/gpu/GpuReduc1D.py
@@ -11,6 +11,7 @@ class GpuReduc1D(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
     AssignZero = GpuAssignZero
+    force_all_local = False
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -35,9 +36,12 @@ def get_code(self):
 
         param_loc = self.param_loc
         xi = self.xi
-        yjloc = c_array(dtype, varloader.dimy, f"(yj + threadIdx.x * {varloader.dimy})")
-        yjrel = c_array(dtype, varloader.dimy, "yjrel")
-        table = varloader.table(self.xi, yjrel, self.param_loc)
+        yjloc = c_array(
+            dtype, varloader.dimy_local, f"(yj + threadIdx.x * {varloader.dimy_local})"
+        )
+        yjrel = c_array(dtype, varloader.dimy_local, "yjrel")
+        j_call = c_variable("signed long int", "(jstart+jrel)")
+        table = varloader.table(self.xi, yjrel, self.param_loc, args, i, j_call)
         jreltile = c_variable("signed long int", "(jrel + tile * blockDim.x)")
 
         self.code = f"""
@@ -80,7 +84,7 @@ def get_code(self):
                             if (i < nx) {{ // we compute x1i only if needed
                               {dtype} * yjrel = yj;
                               {sum_scheme.initialize_temporary_accumulator_block_init()}
-                              for (signed long int jrel = 0; (jrel < blockDim.x) && (jrel < ny - jstart); jrel++, yjrel += {varloader.dimy}) {{
+                              for (signed long int jrel = 0; (jrel < blockDim.x) && (jrel < ny - jstart); jrel++, yjrel += {varloader.dimy_local}) {{
                                 {red_formula.formula(fout, table)} // Call the function, which outputs results in fout
                                 {sum_scheme.accumulate_result(acc, fout, jreltile)}
                               }}

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_chunks.py b/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_chunks.py
@@ -91,6 +91,10 @@ def do_chunk_sub(
         xi,
         yjrel,
         param_loc,
+        [True] * chk.nminargs,
+        None,
+        None,
+        None,
     )
     foutj = c_variable(pointer(dtype), "foutj")
 
@@ -122,6 +126,7 @@ class GpuReduc1D_chunks(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
     AssignZero = GpuAssignZero
+    force_all_local = True
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -266,7 +271,7 @@ def get_code(self):
 
                           // load parameters variables from global memory to local thread memory
                           {param_loc.declare()}
-                          {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args)}
+                          {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, is_local=varloader.is_local_var)}
                           
                           {acc.declare()}
                           
@@ -282,7 +287,7 @@ def get_code(self):
                           {fout_chunk.declare()}
                           
                           if (i < nx) {{
-                            {load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
+                            {load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i, is_local=varloader.is_local_var)} // load xi variables from global memory to local thread memory
                           }}
 
                           for (signed long int jstart = 0, tile = 0; jstart < ny; jstart += blockDim.x, tile++) {{
@@ -291,7 +296,7 @@ def get_code(self):
                             signed long int j = tile * blockDim.x + threadIdx.x;
 
                             if (j < ny) {{ // we load yj from device global memory only if j<ny
-                              {load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j)} 
+                              {load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j, is_local=varloader.is_local_var)} 
                             }}
                             __syncthreads();
 

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_finalchunks.py b/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_finalchunks.py
@@ -73,6 +73,7 @@ class GpuReduc1D_finalchunks(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
     AssignZero = GpuAssignZero
+    force_all_local = True
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -100,7 +101,7 @@ def get_code(self):
         varfinal = self.red_formula.formula.children[1 - ind_fun_internal]
         nchunks = 1 + (varfinal.dim - 1) // dimfinalchunk
         dimlastfinalchunk = varfinal.dim - (nchunks - 1) * dimfinalchunk
-        varloader = Var_loader(fun_internal)
+        varloader = Var_loader(fun_internal, force_all_local=self.force_all_local)
         dimsx = varloader.dimsx
         dimsy = varloader.dimsy
         dimsp = varloader.dimsp
@@ -130,7 +131,7 @@ def get_code(self):
         yjloc = c_array(dtype, dimy, f"(yj + threadIdx.x * {dimy})")
         foutjrel = c_array(dtype, dimfout, f"({fout.id}+jrel*{dimfout})")
         yjrel = c_array(dtype, dimy, "yjrel")
-        table = self.varloader.table(xi, yjrel, param_loc)
+        table = self.varloader.table(xi, yjrel, param_loc, None, None, None)
 
         last_chunk = c_variable("signed long int", f"{nchunks-1}")
 
@@ -189,7 +190,7 @@ def get_code(self):
                           // get the value of variable (index with i)
                           {xi.declare()}
                           if (i < nx) {{
-                              {load_vars(dimsx, indsi, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
+                              {load_vars(dimsx, indsi, xi, args, row_index=i, is_local=varloader.is_local_var)} // load xi variables from global memory to local thread memory
                               {use_pragma_unroll()}
                               for (signed long int k=0; k<{dimout}; k++) {{
                                   out[i*{dimout}+k] = 0.0f;
@@ -203,7 +204,7 @@ def get_code(self):
                               // get the current column
                               signed long int j = tile * blockDim.x + threadIdx.x;
                               if (j < ny) {{ // we load yj from device global memory only if j<ny
-                                  {load_vars(dimsy, indsj, yjloc, args, row_index=j)} // load yj variables from global memory to shared memory
+                                  {load_vars(dimsy, indsj, yjloc, args, row_index=j, is_local=varloader.is_local_var)} // load yj variables from global memory to shared memory
                               }}
                               __syncthreads();
 

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges.py b/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges.py
@@ -12,6 +12,7 @@ class GpuReduc1D_ranges(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
     AssignZero = GpuAssignZero
+    force_all_local = False
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -43,15 +44,36 @@ def get_code(self):
 
         param_loc = self.param_loc
         xi = self.xi
-        yjloc = c_array(dtype, varloader.dimy, f"(yj + threadIdx.x * {varloader.dimy})")
-        yjrel = c_array(dtype, varloader.dimy, "yjrel")
-        table = varloader.table(self.xi, yjrel, self.param_loc)
+        yjloc = c_array(
+            dtype, varloader.dimy_local, f"(yj + threadIdx.x * {varloader.dimy_local})"
+        )
+        yjrel = c_array(dtype, varloader.dimy_local, "yjrel")
         jreltile = c_variable("signed long int", "(jrel + tile * blockDim.x)")
 
         indices_i = c_array("signed long int", nvarsi, "indices_i")
         indices_j = c_array("signed long int", nvarsj, "indices_j")
         indices_p = c_array("signed long int", nvarsp, "indices_p")
 
+        threadIdx_x = c_variable("signed long int", "threadIdx.x")
+
+        starty = c_variable("signed long int", "start_y")
+        j_call = c_variable("signed long int", "(jstart+jrel-start_y)")
+
+        table_batchmode = varloader.table(
+            self.xi,
+            yjrel,
+            self.param_loc,
+            args,
+            threadIdx_x,
+            j_call,
+            indices_i,
+            indices_j,
+            indices_p,
+        )
+        table_nobatchmode = varloader.table(
+            self.xi, yjrel, self.param_loc, args, i, j_call
+        )
+
         declare_assign_indices_i = (
             "signed long int *indices_i = offsets;" if nvarsi > 0 else ""
         )
@@ -64,10 +86,6 @@ def get_code(self):
             else ""
         )
 
-        starty = c_variable("signed long int", "start_y")
-
-        threadIdx_x = c_variable("signed long int", "threadIdx.x")
-
         if dtype == "half2":
             self.headers += c_include("cuda_fp16.h")
 
@@ -148,13 +166,13 @@ def get_code(self):
                                           {dtype} * yjrel = yj; // Loop on the columns of the current block.
                                           {sum_scheme.initialize_temporary_accumulator_block_init()}
                                           if (nbatchdims == 0) {{
-                                              for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy}) {{
-                                                  {red_formula.formula(fout,table)} // Call the function, which outputs results in xi[0:DIMX1]
+                                              for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy_local}) {{
+                                                  {red_formula.formula(fout,table_nobatchmode)} // Call the function, which outputs results in xi[0:DIMX1]
                                                   {sum_scheme.accumulate_result(acc, fout, jreltile+starty)}
                                               }} 
                                           }} else {{
-                                              for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy}) {{
-                                                  {red_formula.formula(fout,table)} // Call the function, which outputs results in fout
+                                              for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy_local}) {{
+                                                  {red_formula.formula(fout,table_batchmode)} // Call the function, which outputs results in fout
                                                   {sum_scheme.accumulate_result(acc, fout, jreltile)}
                                               }}
                                           }}

diff --git a/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges_chunks.py b/keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges_chunks.py
@@ -70,7 +70,7 @@ def do_chunk_sub_ranges(
         row_index=i,
     )
 
-    varloader_global = Var_loader(red_formula)
+    varloader_global = Var_loader(red_formula, force_all_local=True)
     indsi_global = varloader_global.indsi
     indsj_global = varloader_global.indsj
     indsp_global = varloader_global.indsp
@@ -146,6 +146,10 @@ def do_chunk_sub_ranges(
         xi,
         yjrel,
         param_loc,
+        [True] * chk.nminargs,
+        None,
+        None,
+        None,
     )
     foutj = c_variable(pointer(dtype), "foutj")
 
@@ -187,6 +191,7 @@ class GpuReduc1D_ranges_chunks(MapReduce, Gpu_link_compile):
     # class for generating the final C++ code, Gpu version
 
     AssignZero = GpuAssignZero
+    force_all_local = True
 
     def __init__(self, *args):
         MapReduce.__init__(self, *args)
@@ -204,7 +209,7 @@ def get_code(self):
         dtype = self.dtype
         dtypeacc = self.dtypeacc
 
-        varloader_global = Var_loader(red_formula)
+        varloader_global = Var_loader(red_formula, force_all_local=self.force_all_local)
         indsi_global = varloader_global.indsi
         indsj_global = varloader_global.indsj
         indsp_global = varloader_global.indsp
@@ -401,9 +406,9 @@ def get_code(self):
                           // load parameters variables from global memory to local thread memory
                           {param_loc.declare()}
                           if (nbatchdims == 0) {{
-                              {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args)}
+                              {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, is_local=varloader_global.is_local_var)}
                           }} else {{
-                              {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, offsets=indices_p)}
+                              {load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, offsets=indices_p, is_local=varloader_global.is_local_var)}
                           }}
                           
                           {acc.declare()}
@@ -422,10 +427,10 @@ def get_code(self):
                           if (i < end_x) {{
                               // load xi variables from global memory to local thread memory
                               if (nbatchdims == 0) {{
-                                  {load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i)} 
+                                  {load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i, is_local=varloader_global.is_local_var)} 
                               }} else {{
                                   {load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, 
-                                              row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global)}
+                                              row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global, is_local=varloader_global.is_local_var)}
                               }} 
                           }}
                           
@@ -442,11 +447,11 @@ def get_code(self):
                                       if(j<end_y) // we load yj from device global memory only if j<end_y
                                           if (nbatchdims == 0) {{
                                               // load yj variables from global memory to shared memory
-                                              {load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j)} 
+                                              {load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j, is_local=varloader_global.is_local_var)} 
                                           }} else {{
                                               // Possibly, with offsets as we support broadcasting over batch dimensions
                                               {load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, 
-                                                          row_index=j-starty, offsets=indices_j, indsref=indsj_global)}
+                                                          row_index=j-starty, offsets=indices_j, indsref=indsj_global, is_local=varloader_global.is_local_var)}
                                           }}
                                       __syncthreads();