Core: variadic templates for backend.
The `assign` and `reduce` parallelisation functions can now handle an arbitrary number of field arguments, which are forwarded element-wise to the lambda.
Therefore, the lambda is now placed before the variadic field arguments.
Note that they can now also be called without any field arguments, allowing e.g. the assignment of zero to an entire `field`.
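
For illustration (not part of the diff), a minimal sketch of the new call order: the lambda first, followed by the fields that are forwarded to it element-wise. The `example` function and its arguments are hypothetical; `scalarfield`, `vectorfield`, `Vector3`, `scalar` and `SPIRIT_LAMBDA` are the usual core definitions.

#include <engine/Backend_par.hpp>

void example(const vectorfield & spins, const vectorfield & gradient, scalarfield & energies)
{
    // energies[i] = 0.5 * gradient[i].dot(spins[i])
    Engine::Backend::par::assign(energies,
        [] SPIRIT_LAMBDA (const Vector3 & g, const Vector3 & s) -> scalar { return 0.5 * g.dot(s); },
        gradient, spins);

    // No field arguments at all: set every entry to zero
    Engine::Backend::par::assign(energies, [] SPIRIT_LAMBDA () -> scalar { return 0; });

    // result = sum_i gradient[i].dot(spins[i])
    scalar result = Engine::Backend::par::reduce(
        [] SPIRIT_LAMBDA (const Vector3 & g, const Vector3 & s) -> scalar { return g.dot(s); },
        gradient, spins);
    (void) result;
}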
GPMueller committed Mar 28, 2020
1 parent 8f89c90 commit 2830b1f
Showing 5 changed files with 99 additions and 154 deletions.
195 changes: 64 additions & 131 deletions core/include/engine/Backend_par.hpp
@@ -4,6 +4,8 @@

#include <engine/Vectormath_Defines.hpp>

#include <tuple>

#ifdef SPIRIT_USE_CUDA
#include <cub/cub.cuh>
#define SPIRIT_LAMBDA __device__
@@ -20,6 +22,7 @@ namespace par

#ifdef SPIRIT_USE_CUDA

// f(i) for all i
template<typename F>
__global__
void cu_apply(int N, F f)
@@ -28,91 +31,45 @@
if(idx < N)
f(idx);
}

// vf1[i] = f( vf2[i] )
// f(i) for all i
template<typename F>
void apply(int N, F f)
{
cu_apply<<<(N+1023)/1024, 1024>>>(N, f);
CU_CHECK_AND_SYNC();
}

template<typename F>
scalar reduce(int N, const F f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args>
__global__
void cu_assign_lambda(int N, A * vf_to, Lambda lambda, const args *... arg_fields)
{
static scalarfield sf(N, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != N)
sf.resize(N);

auto s = sf.data();
apply( N, [f, s] SPIRIT_LAMBDA (int idx) { s[idx] = f(idx); } );

static scalarfield ret(1, 0);

// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
CU_CHECK_AND_SYNC();
cudaFree(d_temp_storage);
return ret[0];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
vf_to[idx] = lambda(arg_fields[idx]...);
}

template<typename A, typename F>
scalar reduce(const field<A> & vf1, const F f)
// field[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args>
void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
{
// TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
// We also use this workaround for a single field as argument, because cub does not support non-commutative reduction operations

int n = vf1.size();
static scalarfield sf(n, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != vf1.size())
sf.resize(vf1.size());

auto s = sf.data();
auto v1 = vf1.data();
apply( n, [f, s, v1] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx]); } );

static scalarfield ret(1, 0);
// Vectormath::fill(ret, 0);

// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
int N = vf_to.size();
cu_assign_lambda<<<(N+1023)/1024, 1024>>>(N, vf_to.data(), lambda, vf_args.data()...);
CU_CHECK_AND_SYNC();
cudaFree(d_temp_storage);
return ret[0];
}

template<typename A, typename B, typename F>
scalar reduce(const field<A> & vf1, const field<B> & vf2, const F f)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
{
// TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
int n = vf1.size();
static scalarfield sf(n, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != vf1.size())
sf.resize(vf1.size());
static scalarfield sf(N, 0);
if(sf.size() != N)
sf.resize(N);

auto s = sf.data();
auto v1 = vf1.data();
auto v2 = vf2.data();
apply( n, [f, s, v1, v2] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx], v2[idx]); } );
apply( N, [s, lambda, vf_args...] SPIRIT_LAMBDA (int idx) { s[idx] = lambda(vf_args[idx]...); } );

static scalarfield ret(1, 0);
// Vectormath::fill(ret, 0);
// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
@@ -124,93 +81,69 @@
cudaFree(d_temp_storage);
return ret[0];
}


// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
__global__
void cu_set_lambda(A * vf1, const B * vf2, F f, int N)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar reduce(const Lambda & lambda, const field<args> &... vf_args)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
{
vf1[idx] = f(vf2[idx]);
}
auto N = std::get<0>(std::tuple<const field<args> &...>(vf_args...)).size();
// We take this detour so that `.data()` won't be called for every element of each field
return pointer_reduce(N, lambda, vf_args.data()...);
}

// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
void set(field<A> & vf1, const field<B> & vf2, F f)
{
int N = vf1.size();
cu_set_lambda<<<(N+1023)/1024, 1024>>>(vf1.data(), vf2.data(), f, N);
CU_CHECK_AND_SYNC();
}

#else

// f(i) for all i
template<typename F>
scalar reduce(int N, const F f)
void apply(int N, const F & f)
{
scalar res = 0;
#pragma omp parallel for reduction(+:res)
#pragma omp parallel for
for(unsigned int idx = 0; idx < N; ++idx)
{
res += f(idx);
}
return res;
f(idx);
}

template<typename A, typename F>
scalar reduce(const field<A> & vf1, const F f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args> inline
void pointer_assign(int N, A * vf_to, Lambda lambda, const args *... vf_args)
{
scalar res=0;
#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
res += f(vf1[idx]);
}
return res;
#pragma omp parallel for
for(unsigned int idx = 0; idx < N; ++idx)
vf_to[idx] = lambda(vf_args[idx]...);
}

// result = sum_i f( vf1[i], vf2[i] )
template<typename A, typename B, typename F>
scalar reduce(const field<A> & vf1, const field<B> & vf2, const F & f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args> inline
void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
{
scalar res=0;
#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
res += f(vf1[idx], vf2[idx]);
}
return res;
auto N = vf_to.size();
// We take this detour so that `.data()` won't be called for every element of each field
pointer_assign(N, vf_to.data(), lambda, vf_args.data()...);
}

// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
void set(field<A> & vf1, const field<B> & vf2, const F & f)
{
#pragma omp parallel for
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
vf1[idx] = f(vf2[idx]);
}
}

// f( vf1[idx], idx ) for all i
template<typename F>
void apply(int N, const F & f)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
{
#pragma omp parallel for
scalar res = 0;

#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < N; ++idx)
{
f(idx);
}
}
res += lambda(vf_args[idx]...);

return res;
}
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar reduce(const Lambda & lambda, const field<args> &... vf_args)
{
auto N = std::get<0>(std::tuple<const field<args> &...>(vf_args...)).size();
// We take this detour so that `.data()` won't be called for every element of each field
return pointer_reduce(N, lambda, vf_args.data()...);
}

#endif

} // namespace par
} // namespace Backend
} // namespace Engine

#endif
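
As a side note (not part of the diff): the pack expansions that drive both backends, `vf_args.data()...` at the call site and `vf_args[idx]...` inside the lambda, can be exercised in isolation. Below is a minimal standalone sketch, assuming `std::vector` in place of `field` and `double` in place of `scalar`; all names here are illustrative.

#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the core's `scalar` and `field<>`, so the sketch is self-contained.
using scalar = double;
template<typename T> using field = std::vector<T>;

// vf_to[i] = lambda(vf_args[i]...) for all i; the same forwarding pattern as
// pointer_assign above, without the OpenMP pragma.
template<typename A, typename Lambda, typename... args>
void assign_sketch(std::size_t N, A * vf_to, Lambda lambda, const args *... vf_args)
{
    for(std::size_t idx = 0; idx < N; ++idx)
        vf_to[idx] = lambda(vf_args[idx]...); // the pack expands to one element per argument field
}

int main()
{
    field<scalar> out(5), a(5, 1.0), b(5, 2.0);

    // Two argument fields: out[i] = a[i] + b[i]
    assign_sketch(out.size(), out.data(),
                  [](const scalar & x, const scalar & y) { return x + y; },
                  a.data(), b.data());

    // No argument fields at all: out[i] = 0 (the zero-assignment case from the commit message)
    assign_sketch(out.size(), out.data(), []() { return scalar(0); });
}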
36 changes: 21 additions & 15 deletions core/include/engine/Solver_Kernels.hpp
@@ -38,8 +38,8 @@ namespace Solver_Kernels
std::vector<field<field<Vec>>> & delta_a, std::vector<field<field<Vec>>> & delta_grad, const std::vector<field<Vec>> & grad, std::vector<field<Vec>> & grad_pr,
const int num_mem, const scalar maxmove) {
// std::cerr << "lbfgs searchdir \n";
static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) {return v1.dot(v2);};
static auto set = [] SPIRIT_LAMBDA (const Vec & x) {return x;};
static auto set = [] SPIRIT_LAMBDA (const Vec & x) -> Vec {return x;};
static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) -> scalar {return v1.dot(v2);};

scalar epsilon = sizeof(scalar) == sizeof(float) ? 1e-30 : 1e-300;

@@ -50,11 +50,15 @@

if (local_iter == 0) // gradient descent
{
for(int img=0; img<noi; img++) {
Backend::par::set(grad_pr[img], grad[img], set);
auto & dir = searchdir[img];
auto & g_cur = grad[img];
Backend::par::set(dir, g_cur, [] SPIRIT_LAMBDA (const Vec & x){return -x;});
for(int img=0; img<noi; img++)
{
auto g_ptr = grad[img].data();
auto g_pr_ptr = grad_pr[img].data();
auto dir_ptr = searchdir[img].data();
Backend::par::apply(nos, [g_ptr, g_pr_ptr, dir_ptr] SPIRIT_LAMBDA (int idx){
g_pr_ptr[idx] = g_ptr[idx];
dir_ptr[idx] = -g_ptr[idx];
});
auto & da = delta_a[img];
auto & dg = delta_grad[img];
for (int i = 0; i < num_mem; i++)
@@ -68,7 +72,9 @@
});
}
}
} else {
}
else
{
for (int img=0; img<noi; img++)
{
auto da = delta_a[img][m_index].data();
@@ -84,7 +90,7 @@

scalar rinv_temp = 0;
for (int img=0; img<noi; img++)
rinv_temp += Backend::par::reduce(delta_grad[img][m_index], delta_a[img][m_index], dot);
rinv_temp += Backend::par::reduce(dot, delta_grad[img][m_index], delta_a[img][m_index]);

if (rinv_temp > epsilon)
rho[m_index] = 1.0 / rinv_temp;
@@ -96,14 +102,14 @@
}

for (int img=0; img<noi; img++)
Backend::par::set(q_vec[img], grad[img], set);
Backend::par::assign(q_vec[img], set, grad[img]);

for (int k = num_mem - 1; k > -1; k--)
{
c_ind = (k + m_index + 1) % num_mem;
scalar temp=0;
for (int img=0; img<noi; img++)
temp += Backend::par::reduce(delta_a[img][c_ind], q_vec[img], dot);
temp += Backend::par::reduce(dot, delta_a[img][c_ind], q_vec[img]);

alpha[c_ind] = rho[c_ind] * temp;
for (int img=0; img<noi; img++)
Expand All @@ -119,7 +125,7 @@ namespace Solver_Kernels

scalar dy2=0;
for (int img=0; img<noi; img++)
dy2 += Backend::par::reduce(delta_grad[img][m_index], delta_grad[img][m_index], dot);
dy2 += Backend::par::reduce(dot, delta_grad[img][m_index], delta_grad[img][m_index]);

for (int img=0; img<noi; img++)
{
Expand All @@ -129,9 +135,9 @@ namespace Solver_Kernels
inv_rhody2 = 1.0 / rhody2;
else
inv_rhody2 = 1.0/(epsilon);
Backend::par::set(searchdir[img], q_vec[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
Backend::par::assign(searchdir[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
return inv_rhody2 * q;
} );
}, q_vec[img] );
}

for (int k = 0; k < num_mem; k++)
Expand All @@ -143,7 +149,7 @@ namespace Solver_Kernels

scalar rhopdg = 0;
for(int img=0; img<noi; img++)
rhopdg += Backend::par::reduce(delta_grad[img][c_ind], searchdir[img], dot);
rhopdg += Backend::par::reduce(dot, delta_grad[img][c_ind], searchdir[img]);

rhopdg *= rho[c_ind];

4 changes: 3 additions & 1 deletion core/include/engine/Solver_LBFGS_Atlas.hpp
@@ -80,7 +80,9 @@ void Method_Solver<Solver::LBFGS_Atlas>::Iteration()
// Scale by averaging
for(int img=0; img<noi; img++)
{
a_norm_rms = std::max(a_norm_rms, scalar( sqrt( Backend::par::reduce(this->atlas_directions[img], [] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }) / nos )));
a_norm_rms = std::max( a_norm_rms, scalar( sqrt(
Backend::par::reduce([] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }, this->atlas_directions[img]) / nos
)));
}
scalar scaling = (a_norm_rms > maxmove) ? maxmove/a_norm_rms : 1.0;

8 changes: 6 additions & 2 deletions core/src/engine/Hamiltonian_Heisenberg.cpp
@@ -623,14 +623,18 @@ namespace Engine
if(idx_ddi >= 0)
this->Gradient_DDI(spins, gradient);

energy += Backend::par::reduce( N, [s,g] SPIRIT_LAMBDA ( int idx ) { return 0.5 * g[idx].dot(s[idx]) ;} );
energy += Backend::par::reduce( [] SPIRIT_LAMBDA ( const Vector3 & s, const Vector3 & g ) {
return 0.5 * g.dot(s);
}, spins, gradient );

// External field
if(idx_zeeman >= 0)
{
Vector3 ext_field = external_field_normal * external_field_magnitude;
this->Gradient_Zeeman(gradient);
energy += Backend::par::reduce( N, [s, ext_field, mu_s] SPIRIT_LAMBDA ( int idx ) { return -mu_s[idx] * ext_field.dot(s[idx]) ;} );
energy += Backend::par::reduce( [ext_field] SPIRIT_LAMBDA ( const Vector3 & s, const scalar & mu_s ) {
return -mu_s * ext_field.dot(s);
}, spins, geometry->mu_s );
}

// Quadruplets