Skip to content

Commit

Permalink
[ci skip] set limit for pragma unrolls
Browse files Browse the repository at this point in the history
  • Loading branch information
joanglaunes committed Aug 11, 2021
1 parent 6f63977 commit 31f7280
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 23 deletions.
6 changes: 3 additions & 3 deletions keops/python_engine/formulas/maths/TensorDot.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np

from keops.python_engine.formulas.Operation import Operation

from keops.python_engine.utils.code_gen_utils import use_pragma_unroll

####################################
###### Tensor Dot Product #####
Expand Down Expand Up @@ -152,11 +152,11 @@ def Op(self, out, table, arg0, arg1):

return f"""
#if C_CONTIGUOUS // row major
#pragma unroll
{use_pragma_unroll()}
for (int i = 0; i < {out.dim}; i++)
{out.id}[i] = ({out.dtype})(0.0f);
#pragma unroll
{use_pragma_unroll()}
{str_code}
#else // column major
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
new_c_varname,
c_if,
c_array,
use_pragma_unroll
)
from keops.python_engine.formulas.reductions.Reduction import Reduction

Expand Down Expand Up @@ -79,10 +80,10 @@ def ReducePairShort(self, acc, xi, ind):
{{
{xik.declare()}
{l.declare()}
#pragma unroll
{use_pragma_unroll()}
for(int {k.id}=0; {k.id}<{fdim}; {k.id}++) {{
{xik.assign(xi[k])}
#pragma unroll
{use_pragma_unroll()}
for({l.id}={(k+(K-1)*2*fdim).id}; {l.id}>={k.id} && {(xik<acc[l]).id}; {l.id}-={2*fdim}) {{
{tmpl.declare_assign(acc[l])}
{indtmpl.declare_assign(acc[l+fdim])}
Expand Down
5 changes: 3 additions & 2 deletions keops/python_engine/mapreduce/GpuReduc1D_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
sizeof,
pointer,
table,
table4
table4,
use_pragma_unroll
)
from keops.python_engine.formulas.reductions.sum_schemes import *
from keops.python_engine.compilation import Gpu_link_compile
Expand Down Expand Up @@ -186,7 +187,7 @@ def get_code(self):
}}
// looping on chunks (except the last)
#pragma unroll
{use_pragma_unroll()}
for (int chunk=0; chunk<{chk.nchunks}-1; chunk++) {{
{chunk_sub_routine}
}}
Expand Down
9 changes: 5 additions & 4 deletions keops/python_engine/mapreduce/GpuReduc1D_finalchunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
pointer,
table,
table4,
Var_loader
Var_loader,
use_pragma_unroll
)
from keops.python_engine.formulas.reductions.sum_schemes import *
from keops.python_engine.formulas.reductions.Sum_Reduction import Sum_Reduction
Expand All @@ -39,15 +40,15 @@ def do_finalchunk_sub(dtype, varfinal, dimfinalchunk_curr,
__syncthreads();
for (int jrel = 0; (jrel < blockDim.x) && (jrel < {ny.id} - {jstart.id}); jrel++, yjrel += {dimfinalchunk}) {{
if ({i.id} < {nx.id}) {{ // we compute only if needed
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimfinalchunk_curr}; k++) {{
{acc.id}[k] += yjrel[k] * fout[jrel];
}}
}}
__syncthreads();
}}
if ({i.id} < {nx.id}) {{
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimfinalchunk_curr}; k++)
{out.id}[i*{dimout}+{chunk.id}*{dimfinalchunk}+k] += {acc.id}[k];
}}
Expand Down Expand Up @@ -145,7 +146,7 @@ def get_code(self):
{xi.declare()}
if (i < nx) {{
{load_vars(dimsx, indsi, xi, args, row_index=i)} // load xi variables from global memory to local thread memory
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimout}; k++) {{
out[i*{dimout}+k] = 0.0f;
}}
Expand Down
5 changes: 3 additions & 2 deletions keops/python_engine/mapreduce/GpuReduc1D_ranges_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
pointer,
table,
table4,
Var_loader
Var_loader,
use_pragma_unroll
)
from keops.python_engine.formulas.reductions.sum_schemes import *
from keops.python_engine.compilation import Gpu_link_compile
Expand Down Expand Up @@ -286,7 +287,7 @@ def get_code(self):
}}
// looping on chunks (except the last)
#pragma unroll
{use_pragma_unroll()}
for (int chunk=0; chunk<{chk.nchunks}-1; chunk++) {{
{chunk_sub_routine}
}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
pointer,
table,
table4,
Var_loader
Var_loader,
use_pragma_unroll
)
from keops.python_engine.formulas.reductions.sum_schemes import *
from keops.python_engine.formulas.reductions.Sum_Reduction import Sum_Reduction
Expand Down Expand Up @@ -48,15 +49,15 @@ def do_finalchunk_sub_ranges(dtype, fun_global, varfinal, dimfinalchunk_curr,
__syncthreads();
for (int jrel = 0; (jrel < blockDim.x) && (jrel < {end_y.id} - {jstart.id}); jrel++, yjrel += {dimfinalchunk}) {{
if ({i.id} < {end_x.id}) {{ // we compute only if needed
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimfinalchunk_curr}; k++) {{
{acc.id}[k] += yjrel[k] * fout[jrel];
}}
}}
__syncthreads();
}}
if ({i.id} < {end_x.id}) {{
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimfinalchunk_curr}; k++)
{out.id}[i*{dimout}+{chunk.id}*{dimfinalchunk}+k] += {acc.id}[k];
}}
Expand Down Expand Up @@ -219,7 +220,7 @@ def get_code(self):
{varloader.load_vars("i", xi, args, row_index=threadIdx_x, offsets=indices_i, indsref=indsi_global)} // Possibly, with offsets as we support broadcasting over batch dimensions
}}
#pragma unroll
{use_pragma_unroll()}
for (int k=0; k<{dimout}; k++) {{
out[i*{dimout}+k] = 0.0f;
}}
Expand Down
5 changes: 3 additions & 2 deletions keops/python_engine/mapreduce/GpuReduc2D.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
c_include,
signature_list,
call_list,
pointer
pointer,
use_pragma_unroll
)
from keops.python_engine.compilation import Gpu_link_compile
from keops.python_engine.formulas.reductions.sum_schemes import block_sum, kahan_scheme
Expand Down Expand Up @@ -188,7 +189,7 @@ def get_code(self):
// lines of size DIMRED. The final reduction, which "sums over the block lines",
// shall be done in a later step.
if(i<nx) {{
#pragma unroll
{use_pragma_unroll()}
for(int k=0; k<{dimred}; k++) {{
out[blockIdx.y*{dimred}*nx+i*{dimred}+k] = acc[k];
}}
Expand Down
13 changes: 9 additions & 4 deletions keops/python_engine/utils/code_gen_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,11 @@ def __getitem__(self, other):
else:
raise ValueError("not implemented")

def use_pragma_unroll(n=64):
    """Return a ``#pragma unroll`` directive to splice into generated C/CUDA code.

    The directive is wrapped in newlines so that it always sits on a line of
    its own, whatever text precedes or follows it in the code template.

    Args:
        n: Value placed between the parentheses of the pragma. Defaults to 64.
            Pass ``None`` to emit the bare ``#pragma unroll`` form, with no
            explicit value.

    Returns:
        ``"\\n#pragma unroll\\n"`` when ``n`` is ``None``, otherwise
        ``"\\n#pragma unroll(<n>)\\n"``.
    """
    # Only the optional "(n)" suffix differs between the two forms.
    limit = "" if n is None else f"({n})"
    return f"\n#pragma unroll{limit}\n"

def c_for_loop(start, end, incr, pragma_unroll=False):
def to_string(x):
Expand All @@ -208,7 +213,7 @@ def to_string(x):
def print(body_code):
string = ""
if pragma_unroll:
string += "\n#pragma unroll\n"
string += use_pragma_unroll()
string += f""" for(int {k.id}={start}; {k.id}<{end}; {k.id}+=({incr})) {{
{body_code}
}}
Expand Down Expand Up @@ -615,7 +620,7 @@ def load_vars(dims, inds, xloc, args, row_index=c_zero_int, offsets=None, indsre
row_index_str = (
f"({row_index.id}+{offsets.id}[{l}])" if offsets else row_index.id
)
string += "#pragma unroll\n"
string += use_pragma_unroll()
string += f"for(int v=0; v<{dims[u]}; v++) {{\n"
string += f" {xloc.id}[a] = {args[inds[u]].id}[{row_index_str}*{dims[u]}+v];\n"
string += " a++;\n"
Expand Down Expand Up @@ -657,7 +662,7 @@ def load_vars_chunks(inds, dim_chunk, dim_chunk_load, dim_org,
string += "{"
string += "int a=0;\n"
for u in range(len(inds)):
string += "#pragma unroll\n"
string += use_pragma_unroll()
string += f"for(int v=0; v<{dim_chunk_load}; v++) {{\n"
string += f" {xloc.id}[a] = {args[inds[u]].id}[{row_index.id}*{dim_org}+{k.id}*{dim_chunk}+v];\n"
string += " a++;\n"
Expand Down Expand Up @@ -686,7 +691,7 @@ def load_vars_chunks_offsets(inds, indsref, dim_chunk, dim_chunk_load, dim_org,
string += "int a=0;\n"
for u in range(len(inds)):
l = indsref.index(inds[u])
string += "#pragma unroll\n"
string += use_pragma_unroll()
string += f"for(int v=0; v<{dim_chunk_load}; v++) {{\n"
string += f" {xloc.id}[a] = {args[inds[u]].id}[({row_index.id}+{offsets.id}[{l}])*{dim_org}+{k.id}*{dim_chunk}+v];\n"
string += " a++;\n"
Expand Down

0 comments on commit 31f7280

Please sign in to comment.