Skip to content

Commit

Permalink
Merge branch 'global_var_v2' into Index
Browse files Browse the repository at this point in the history
  • Loading branch information
Joan Alexis Glaunès committed Apr 16, 2024
2 parents ad3930c + 9cc98b3 commit bab8ad5
Show file tree
Hide file tree
Showing 22 changed files with 367 additions and 103 deletions.
1 change: 1 addition & 0 deletions keopscore/keopscore/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# global parameters can be set here :
use_cuda = True # use cuda if possible
use_OpenMP = True # use OpenMP if possible (see function set_OpenMP below)
lim_dim_local_var = 10000

# System Path
base_dir_path = os.path.abspath(join(os.path.dirname(os.path.realpath(__file__)), ".."))
Expand Down
2 changes: 1 addition & 1 deletion keopscore/keopscore/mapreduce/Chunk_Mode_Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class Chunk_Mode_Constants:
def __init__(self, red_formula):
varloader = Var_loader(red_formula)
varloader = Var_loader(red_formula, force_all_local=True)

self.red_formula = red_formula
self.dimred = red_formula.dimred # dimension of reduction operation
Expand Down
8 changes: 5 additions & 3 deletions keopscore/keopscore/mapreduce/MapReduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ def __init__(
self.use_half = use_half
self.use_fast_math = use_fast_math
self.device_id = device_id
self.varloader = Var_loader(self.red_formula)
self.varloader = Var_loader(
self.red_formula, force_all_local=self.force_all_local
)

def get_code(self):
self.headers = "#define C_CONTIGUOUS 1\n"
Expand All @@ -64,8 +66,8 @@ def get_code(self):
nx = c_variable("signed long int", "nx")
ny = c_variable("signed long int", "ny")

self.xi = c_array(dtype, self.varloader.dimx, "xi")
self.param_loc = c_array(dtype, self.varloader.dimp, "param_loc")
self.xi = c_array(dtype, self.varloader.dimx_local, "xi")
self.param_loc = c_array(dtype, self.varloader.dimp_local, "param_loc")

argname = new_c_varname("arg")
self.arg = c_variable(pointer(pointer(dtype)), argname)
Expand Down
2 changes: 2 additions & 0 deletions keopscore/keopscore/mapreduce/cpu/CpuAssignZero.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
class CpuAssignZero(MapReduce, Cpu_link_compile):
# class for generating the final C++ code, Cpu version

force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Cpu_link_compile.__init__(self)
Expand Down
1 change: 1 addition & 0 deletions keopscore/keopscore/mapreduce/cpu/CpuReduc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class for generating the final C++ code, Cpu version
"""

AssignZero = CpuAssignZero
force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand Down
15 changes: 10 additions & 5 deletions keopscore/keopscore/mapreduce/cpu/CpuReduc_ranges.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class CpuReduc_ranges(MapReduce, Cpu_link_compile):
# class for generating the final C++ code, Cpu version

AssignZero = CpuAssignZero
force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand All @@ -41,7 +42,6 @@ def get_code(self):
param_loc = self.param_loc

varloader = self.varloader
table = varloader.table(xi, yj, param_loc)

nvarsi, nvarsj, nvarsp = (
len(self.varloader.Varsi),
Expand All @@ -60,8 +60,13 @@ def get_code(self):
indices_i = c_array("int", nvarsi, "indices_i")
indices_j = c_array("int", nvarsj, "indices_j")
indices_p = c_array("int", nvarsp, "indices_p")
imstartx = c_variable("int", "i-start_x")
jmstarty = c_variable("int", "j-start_y")
imstartx = c_variable("int", "(i-start_x)")
jmstarty = c_variable("int", "(j-start_y)")

table_nobatchmode = varloader.table(xi, yj, param_loc, args, i, j)
table_batchmode = varloader.table(
xi, yj, param_loc, args, imstartx, jmstarty, indices_i, indices_j, indices_p
)

headers = ["cmath", "stdlib.h"]
if keopscore.config.config.use_OpenMP:
Expand Down Expand Up @@ -210,13 +215,13 @@ def get_code(self):
if (nbatchdims == 0) {{
for (signed long int j = start_y; j < end_y; j++) {{
{varloader.load_vars("j", yj, args, row_index=j)}
{red_formula.formula(fout,table)}
{red_formula.formula(fout,table_nobatchmode)}
{sum_scheme.accumulate_result(acc, fout, j)}
}}
}} else {{
for (signed long int j = start_y; j < end_y; j++) {{
{varloader.load_vars("j", yj, args, row_index=jmstarty, offsets=indices_j)}
{red_formula.formula(fout,table)}
{red_formula.formula(fout,table_batchmode)}
{sum_scheme.accumulate_result(acc, fout, jmstarty)}
}}
}}
Expand Down
2 changes: 2 additions & 0 deletions keopscore/keopscore/mapreduce/gpu/GpuAssignZero.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
class GpuAssignZero(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Gpu_link_compile.__init__(self)
Expand Down
12 changes: 8 additions & 4 deletions keopscore/keopscore/mapreduce/gpu/GpuReduc1D.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class GpuReduc1D(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

AssignZero = GpuAssignZero
force_all_local = False

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand All @@ -35,9 +36,12 @@ def get_code(self):

param_loc = self.param_loc
xi = self.xi
yjloc = c_array(dtype, varloader.dimy, f"(yj + threadIdx.x * {varloader.dimy})")
yjrel = c_array(dtype, varloader.dimy, "yjrel")
table = varloader.table(self.xi, yjrel, self.param_loc)
yjloc = c_array(
dtype, varloader.dimy_local, f"(yj + threadIdx.x * {varloader.dimy_local})"
)
yjrel = c_array(dtype, varloader.dimy_local, "yjrel")
j_call = c_variable("signed long int", "(jstart+jrel)")
table = varloader.table(self.xi, yjrel, self.param_loc, args, i, j_call)
jreltile = c_variable("signed long int", "(jrel + tile * blockDim.x)")

self.code = f"""
Expand Down Expand Up @@ -80,7 +84,7 @@ def get_code(self):
if (i < nx) {{ // we compute x1i only if needed
{dtype} * yjrel = yj;
{sum_scheme.initialize_temporary_accumulator_block_init()}
for (signed long int jrel = 0; (jrel < blockDim.x) && (jrel < ny - jstart); jrel++, yjrel += {varloader.dimy}) {{
for (signed long int jrel = 0; (jrel < blockDim.x) && (jrel < ny - jstart); jrel++, yjrel += {varloader.dimy_local}) {{
{red_formula.formula(fout, table)} // Call the function, which outputs results in fout
{sum_scheme.accumulate_result(acc, fout, jreltile)}
}}
Expand Down
11 changes: 8 additions & 3 deletions keopscore/keopscore/mapreduce/gpu/GpuReduc1D_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ def do_chunk_sub(
xi,
yjrel,
param_loc,
[True] * chk.nminargs,
None,
None,
None,
)
foutj = c_variable(pointer(dtype), "foutj")

Expand Down Expand Up @@ -122,6 +126,7 @@ class GpuReduc1D_chunks(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

AssignZero = GpuAssignZero
force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand Down Expand Up @@ -266,7 +271,7 @@ def get_code(self):
// load parameters variables from global memory to local thread memory
{param_loc.declare()}
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args)}
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, is_local=varloader.is_local_var)}
{acc.declare()}
Expand All @@ -282,7 +287,7 @@ def get_code(self):
{fout_chunk.declare()}
if (i < nx) {{
{load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
{load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i, is_local=varloader.is_local_var)} // load xi variables from global memory to local thread memory
}}
for (signed long int jstart = 0, tile = 0; jstart < ny; jstart += blockDim.x, tile++) {{
Expand All @@ -291,7 +296,7 @@ def get_code(self):
signed long int j = tile * blockDim.x + threadIdx.x;
if (j < ny) {{ // we load yj from device global memory only if j<ny
{load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j)}
{load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j, is_local=varloader.is_local_var)}
}}
__syncthreads();
Expand Down
9 changes: 5 additions & 4 deletions keopscore/keopscore/mapreduce/gpu/GpuReduc1D_finalchunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class GpuReduc1D_finalchunks(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

AssignZero = GpuAssignZero
force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand Down Expand Up @@ -100,7 +101,7 @@ def get_code(self):
varfinal = self.red_formula.formula.children[1 - ind_fun_internal]
nchunks = 1 + (varfinal.dim - 1) // dimfinalchunk
dimlastfinalchunk = varfinal.dim - (nchunks - 1) * dimfinalchunk
varloader = Var_loader(fun_internal)
varloader = Var_loader(fun_internal, force_all_local=self.force_all_local)
dimsx = varloader.dimsx
dimsy = varloader.dimsy
dimsp = varloader.dimsp
Expand Down Expand Up @@ -130,7 +131,7 @@ def get_code(self):
yjloc = c_array(dtype, dimy, f"(yj + threadIdx.x * {dimy})")
foutjrel = c_array(dtype, dimfout, f"({fout.id}+jrel*{dimfout})")
yjrel = c_array(dtype, dimy, "yjrel")
table = self.varloader.table(xi, yjrel, param_loc)
table = self.varloader.table(xi, yjrel, param_loc, None, None, None)

last_chunk = c_variable("signed long int", f"{nchunks-1}")

Expand Down Expand Up @@ -189,7 +190,7 @@ def get_code(self):
// get the value of variable (index with i)
{xi.declare()}
if (i < nx) {{
{load_vars(dimsx, indsi, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
{load_vars(dimsx, indsi, xi, args, row_index=i, is_local=varloader.is_local_var)} // load xi variables from global memory to local thread memory
{use_pragma_unroll()}
for (signed long int k=0; k<{dimout}; k++) {{
out[i*{dimout}+k] = 0.0f;
Expand All @@ -203,7 +204,7 @@ def get_code(self):
// get the current column
signed long int j = tile * blockDim.x + threadIdx.x;
if (j < ny) {{ // we load yj from device global memory only if j<ny
{load_vars(dimsy, indsj, yjloc, args, row_index=j)} // load yj variables from global memory to shared memory
{load_vars(dimsy, indsj, yjloc, args, row_index=j, is_local=varloader.is_local_var)} // load yj variables from global memory to shared memory
}}
__syncthreads();
Expand Down
40 changes: 29 additions & 11 deletions keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class GpuReduc1D_ranges(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

AssignZero = GpuAssignZero
force_all_local = False

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand Down Expand Up @@ -43,15 +44,36 @@ def get_code(self):

param_loc = self.param_loc
xi = self.xi
yjloc = c_array(dtype, varloader.dimy, f"(yj + threadIdx.x * {varloader.dimy})")
yjrel = c_array(dtype, varloader.dimy, "yjrel")
table = varloader.table(self.xi, yjrel, self.param_loc)
yjloc = c_array(
dtype, varloader.dimy_local, f"(yj + threadIdx.x * {varloader.dimy_local})"
)
yjrel = c_array(dtype, varloader.dimy_local, "yjrel")
jreltile = c_variable("signed long int", "(jrel + tile * blockDim.x)")

indices_i = c_array("signed long int", nvarsi, "indices_i")
indices_j = c_array("signed long int", nvarsj, "indices_j")
indices_p = c_array("signed long int", nvarsp, "indices_p")

threadIdx_x = c_variable("signed long int", "threadIdx.x")

starty = c_variable("signed long int", "start_y")
j_call = c_variable("signed long int", "(jstart+jrel-start_y)")

table_batchmode = varloader.table(
self.xi,
yjrel,
self.param_loc,
args,
threadIdx_x,
j_call,
indices_i,
indices_j,
indices_p,
)
table_nobatchmode = varloader.table(
self.xi, yjrel, self.param_loc, args, i, j_call
)

declare_assign_indices_i = (
"signed long int *indices_i = offsets;" if nvarsi > 0 else ""
)
Expand All @@ -64,10 +86,6 @@ def get_code(self):
else ""
)

starty = c_variable("signed long int", "start_y")

threadIdx_x = c_variable("signed long int", "threadIdx.x")

if dtype == "half2":
self.headers += c_include("cuda_fp16.h")

Expand Down Expand Up @@ -148,13 +166,13 @@ def get_code(self):
{dtype} * yjrel = yj; // Loop on the columns of the current block.
{sum_scheme.initialize_temporary_accumulator_block_init()}
if (nbatchdims == 0) {{
for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy}) {{
{red_formula.formula(fout,table)} // Call the function, which outputs results in xi[0:DIMX1]
for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy_local}) {{
{red_formula.formula(fout,table_nobatchmode)} // Call the function, which outputs results in xi[0:DIMX1]
{sum_scheme.accumulate_result(acc, fout, jreltile+starty)}
}}
}} else {{
for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy}) {{
{red_formula.formula(fout,table)} // Call the function, which outputs results in fout
for(signed long int jrel = 0; (jrel < blockDim.x) && (jrel<end_y-jstart); jrel++, yjrel+={varloader.dimy_local}) {{
{red_formula.formula(fout,table_batchmode)} // Call the function, which outputs results in fout
{sum_scheme.accumulate_result(acc, fout, jreltile)}
}}
}}
Expand Down
21 changes: 13 additions & 8 deletions keopscore/keopscore/mapreduce/gpu/GpuReduc1D_ranges_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def do_chunk_sub_ranges(
row_index=i,
)

varloader_global = Var_loader(red_formula)
varloader_global = Var_loader(red_formula, force_all_local=True)
indsi_global = varloader_global.indsi
indsj_global = varloader_global.indsj
indsp_global = varloader_global.indsp
Expand Down Expand Up @@ -146,6 +146,10 @@ def do_chunk_sub_ranges(
xi,
yjrel,
param_loc,
[True] * chk.nminargs,
None,
None,
None,
)
foutj = c_variable(pointer(dtype), "foutj")

Expand Down Expand Up @@ -187,6 +191,7 @@ class GpuReduc1D_ranges_chunks(MapReduce, Gpu_link_compile):
# class for generating the final C++ code, Gpu version

AssignZero = GpuAssignZero
force_all_local = True

def __init__(self, *args):
MapReduce.__init__(self, *args)
Expand All @@ -204,7 +209,7 @@ def get_code(self):
dtype = self.dtype
dtypeacc = self.dtypeacc

varloader_global = Var_loader(red_formula)
varloader_global = Var_loader(red_formula, force_all_local=self.force_all_local)
indsi_global = varloader_global.indsi
indsj_global = varloader_global.indsj
indsp_global = varloader_global.indsp
Expand Down Expand Up @@ -401,9 +406,9 @@ def get_code(self):
// load parameters variables from global memory to local thread memory
{param_loc.declare()}
if (nbatchdims == 0) {{
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args)}
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, is_local=varloader_global.is_local_var)}
}} else {{
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, offsets=indices_p)}
{load_vars(chk.dimsp_notchunked, chk.indsp_notchunked, param_loc, args, offsets=indices_p, is_local=varloader_global.is_local_var)}
}}
{acc.declare()}
Expand All @@ -422,10 +427,10 @@ def get_code(self):
if (i < end_x) {{
// load xi variables from global memory to local thread memory
if (nbatchdims == 0) {{
{load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i)}
{load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args, row_index=i, is_local=varloader_global.is_local_var)}
}} else {{
{load_vars(chk.dimsx_notchunked, chk.indsi_notchunked, xi, args,
row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global)}
row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global, is_local=varloader_global.is_local_var)}
}}
}}
Expand All @@ -442,11 +447,11 @@ def get_code(self):
if(j<end_y) // we load yj from device global memory only if j<end_y
if (nbatchdims == 0) {{
// load yj variables from global memory to shared memory
{load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j)}
{load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args, row_index=j, is_local=varloader_global.is_local_var)}
}} else {{
// Possibly, with offsets as we support broadcasting over batch dimensions
{load_vars(chk.dimsy_notchunked, chk.indsj_notchunked, yjloc, args,
row_index=j-starty, offsets=indices_j, indsref=indsj_global)}
row_index=j-starty, offsets=indices_j, indsref=indsj_global, is_local=varloader_global.is_local_var)}
}}
__syncthreads();
Expand Down
Loading

0 comments on commit bab8ad5

Please sign in to comment.