Core: variadic templates for backend. #559

Status: Draft. Wants to merge 5 commits into base: develop.
149 changes: 66 additions & 83 deletions core/include/engine/Backend_par.hpp
@@ -4,6 +4,8 @@
 
 #include <engine/Vectormath_Defines.hpp>
 
+#include <tuple>
+
 #ifdef SPIRIT_USE_CUDA
 #include <cub/cub.cuh>
 #define SPIRIT_LAMBDA __device__
@@ -20,28 +22,46 @@ namespace par
 
 #ifdef SPIRIT_USE_CUDA
 
-template<typename F>
+// f(i) for all i
+template<typename Lambda>
 __global__
-void cu_apply(int N, F f)
+void cu_apply(int N, Lambda f)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if(idx < N)
         f(idx);
 }
 
-// vf1[i] = f( vf2[i] )
-template<typename F>
-void apply(int N, F f)
+// f(i) for all i
+template<typename Lambda>
+void apply(int N, Lambda f)
 {
     cu_apply<<<(N+1023)/1024, 1024>>>(N, f);
     CU_CHECK_AND_SYNC();
 }
 
-template<typename F>
-scalar reduce(int N, const F f)
+// vf_to[i] = f(args[i]...) for all i
+template<typename A, typename Lambda, typename... args>
+__global__
+void cu_assign_lambda(int N, A * vf_to, Lambda lambda, const args *... arg_fields)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if(idx < N)
+        vf_to[idx] = lambda(arg_fields[idx]...);
+}
+// field[i] = f(args[i]...) for all i
+template<typename A, typename Lambda, typename... args>
+void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
+{
+    int N = vf_to.size();
+    cu_assign_lambda<<<(N+1023)/1024, 1024>>>(N, vf_to.data(), lambda, vf_args.data()...);
+    CU_CHECK_AND_SYNC();
+}
+
+
+template<typename Lambda>
+scalar reduce(int N, const Lambda f)
 {
     static scalarfield sf(N, 0);
-    // Vectormath::fill(sf, 0);
 
     if(sf.size() != N)
         sf.resize(N);
@@ -63,15 +83,14 @@ namespace par
     return ret[0];
 }
 
-template<typename A, typename F>
-scalar reduce(const field<A> & vf1, const F f)
+template<typename A, typename Lambda>
+scalar reduce(int N, const Lambda f, const field<A> & vf1)
 {
     // TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
     // We also use this workaround for a single field as argument, because cub does not support non-commutative reduction operations
 
     int n = vf1.size();
     static scalarfield sf(n, 0);
-    // Vectormath::fill(sf, 0);
 
     if(sf.size() != vf1.size())
         sf.resize(vf1.size());
@@ -81,7 +100,6 @@ namespace par
     apply( n, [f, s, v1] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx]); } );
 
     static scalarfield ret(1, 0);
-    // Vectormath::fill(ret, 0);
 
     // Determine temporary storage size and allocate
     void * d_temp_storage = NULL;
@@ -96,12 +114,11 @@ namespace par
 }
 
 template<typename A, typename B, typename F>
-scalar reduce(const field<A> & vf1, const field<B> & vf2, const F f)
+scalar reduce(int N, const F f, const field<A> & vf1, const field<B> & vf2)
 {
     // TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
     int n = vf1.size();
     static scalarfield sf(n, 0);
-    // Vectormath::fill(sf, 0);
 
     if(sf.size() != vf1.size())
         sf.resize(vf1.size());
@@ -112,7 +129,6 @@ namespace par
     apply( n, [f, s, v1, v2] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx], v2[idx]); } );
 
     static scalarfield ret(1, 0);
-    // Vectormath::fill(ret, 0);
     // Determine temporary storage size and allocate
     void * d_temp_storage = NULL;
     size_t temp_storage_bytes = 0;
@@ -125,92 +141,59 @@ namespace par
     return ret[0];
 }
 
-
-// vf1[i] = f( vf2[i] )
-template<typename A, typename B, typename F>
-__global__
-void cu_set_lambda(A * vf1, const B * vf2, F f, int N)
-{
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if(idx < N)
-    {
-        vf1[idx] = f(vf2[idx]);
-    }
-}
-
-// vf1[i] = f( vf2[i] )
-template<typename A, typename B, typename F>
-void set(field<A> & vf1, const field<B> & vf2, F f)
-{
-    int N = vf1.size();
-    cu_set_lambda<<<(N+1023)/1024, 1024>>>(vf1.data(), vf2.data(), f, N);
-    CU_CHECK_AND_SYNC();
-}
 
 #else
 
+// f(i) for all i
 template<typename F>
-scalar reduce(int N, const F f)
+void apply(int N, const F & f)
 {
-    scalar res = 0;
-    #pragma omp parallel for reduction(+:res)
+    #pragma omp parallel for
     for(unsigned int idx = 0; idx < N; ++idx)
-    {
-        res += f(idx);
-    }
-    return res;
+        f(idx);
 }
 
-template<typename A, typename F>
-scalar reduce(const field<A> & vf1, const F f)
+// vf_to[i] = f(args[i]...) for all i
+template<typename A, typename Lambda, typename... args> inline
+void pointer_assign(int N, A * vf_to, Lambda lambda, const args *... vf_args)
 {
-    scalar res=0;
-    #pragma omp parallel for reduction(+:res)
-    for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-    {
-        res += f(vf1[idx]);
-    }
-    return res;
+    #pragma omp parallel for
+    for(unsigned int idx = 0; idx < N; ++idx)
+        vf_to[idx] = lambda(vf_args[idx]...);
 }
 
-// result = sum_i f( vf1[i], vf2[i] )
-template<typename A, typename B, typename F>
-scalar reduce(const field<A> & vf1, const field<B> & vf2, const F & f)
+// vf_to[i] = f(args[i]...) for all i
+template<typename A, typename Lambda, typename... args> inline
+void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
 {
-    scalar res=0;
-    #pragma omp parallel for reduction(+:res)
-    for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-    {
-        res += f(vf1[idx], vf2[idx]);
-    }
-    return res;
+    auto N = vf_to.size();
+    // We take this detour so that `.data()` won't be called for every element of each field
+    pointer_assign(N, vf_to.data(), lambda, vf_args.data()...);
 }
 
-// vf1[i] = f( vf2[i] )
-template<typename A, typename B, typename F>
-void set(field<A> & vf1, const field<B> & vf2, const F & f)
-{
-    #pragma omp parallel for
-    for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-    {
-        vf1[idx] = f(vf2[idx]);
-    }
-}
-
-// f( vf1[idx], idx ) for all i
-template<typename F>
-void apply(int N, const F & f)
+// result = sum_i f(args[i]...)
+template<typename Lambda, typename... args> inline
+scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
 {
-    #pragma omp parallel for
+    scalar res = 0;
+
+    #pragma omp parallel for reduction(+:res)
     for(unsigned int idx = 0; idx < N; ++idx)
-    {
-        f(idx);
-    }
-}
+        res += lambda(vf_args[idx]...);
+
+    return res;
+}
+// result = sum_i f(args[i]...)
+template<typename Lambda, typename... args> inline
+scalar reduce(int N, const Lambda & lambda, const field<args> &... vf_args)
+{
+    // We take this detour so that `.data()` won't be called for every element of each field
+    return pointer_reduce(N, lambda, vf_args.data()...);
+}
 
 #endif
 
 } // namespace par
 } // namespace Backend
 } // namespace Engine
 
 #endif
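
For illustration, a minimal usage sketch of the new variadic interface (hypothetical code, not part of this diff: vf_out, vf_a, vf_b and the constant c are invented names, while field, scalar, Vector3 and SPIRIT_LAMBDA come from the existing Spirit headers):

    // vf_out[i] = vf_a[i] + c * vf_b[i], same call on the OpenMP and CUDA backends
    Backend::par::assign( vf_out,
        [c] SPIRIT_LAMBDA ( const Vector3 & a, const Vector3 & b ) -> Vector3 { return a + c * b; },
        vf_a, vf_b );

    // result = sum_i vf_a[i].dot( vf_b[i] )
    scalar result = Backend::par::reduce( vf_a.size(),
        [] SPIRIT_LAMBDA ( const Vector3 & a, const Vector3 & b ) { return a.dot( b ); },
        vf_a, vf_b );

Because only the element types appear in the lambda signature, the same functor compiles for both the pointer-based CUDA kernels and the OpenMP loops.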
5 changes: 3 additions & 2 deletions core/include/engine/Hamiltonian_Heisenberg.hpp
@@ -152,11 +152,12 @@ namespace Engine
         // Calculate the Quadruplet energy
         void E_Quadruplet(const vectorfield & spins, scalarfield & Energy);
 
-    private:
-        int idx_zeeman, idx_anisotropy, idx_exchange, idx_dmi, idx_ddi, idx_quadruplet;
         void Gradient_DDI_Cutoff(const vectorfield& spins, vectorfield & gradient);
         void Gradient_DDI_Direct(const vectorfield& spins, vectorfield & gradient);
         void Gradient_DDI_FFT(const vectorfield& spins, vectorfield & gradient);
 
+    private:
+        int idx_zeeman, idx_anisotropy, idx_exchange, idx_dmi, idx_ddi, idx_quadruplet;
         void E_DDI_Direct(const vectorfield& spins, scalarfield & Energy);
         void E_DDI_Cutoff(const vectorfield& spins, scalarfield & Energy);
         void E_DDI_FFT(const vectorfield& spins, scalarfield & Energy);
34 changes: 20 additions & 14 deletions core/include/engine/Solver_Depondt.hpp
@@ -41,12 +41,15 @@ void Method_Solver<Solver::Depondt>::Iteration ()
     {
         auto& conf = *this->configurations[i];
         auto& conf_predictor = *this->configurations_predictor[i];
+        auto anglep = angle.data();
+        auto axisp = rotationaxis[i].data();
+        auto f_virtual = forces_virtual[i].data();
 
         // For Rotation matrix R := R( H_normed, angle )
-        Vectormath::norm( forces_virtual[i], angle ); // angle = |forces_virtual|
-
-        Vectormath::set_c_a( 1, forces_virtual[i], rotationaxis[i] ); // rotationaxis = forces_virtual
-        Vectormath::normalize_vectors( rotationaxis[i] ); // normalize rotation axis
+        Backend::par::apply(nos, [anglep, axisp, f_virtual] SPIRIT_LAMBDA (int idx) {
+            anglep[idx] = f_virtual[idx].norm();      // angle = |forces_virtual|
+            axisp[idx] = f_virtual[idx].normalized(); // rotationaxis = forces_virtual/|forces_virtual|
+        });
 
         // Get spin predictor n' = R(H) * n
         Vectormath::rotate( conf, rotationaxis[i], angle, conf_predictor );
@@ -60,16 +63,19 @@ void Method_Solver<Solver::Depondt>::Iteration ()
     for (int i=0; i < this->noi; i++)
     {
         auto& conf = *this->configurations[i];
-
-        // Calculate the linear combination of the two forces_virtuals
-        Vectormath::set_c_a( 0.5, forces_virtual[i], temp1 ); // H = H/2
-        Vectormath::add_c_a( 0.5, forces_virtual_predictor[i], temp1 ); // H = (H + H')/2
-
-        // Get the rotation angle as norm of temp1 ...For Rotation matrix R' := R( H'_normed, angle' )
-        Vectormath::norm( temp1, angle ); // angle' = |forces_virtual lin combination|
-
-        // Normalize temp1 to get rotation axes
-        Vectormath::normalize_vectors( temp1 );
+        auto temp1p = temp1.data();
+        auto anglep = angle.data();
+        auto f_virtual = forces_virtual[i].data();
+        auto f_virtual_predictor = forces_virtual_predictor[i].data();
+
+        Backend::par::apply(nos, [temp1p, anglep, f_virtual, f_virtual_predictor] SPIRIT_LAMBDA (int idx) {
+            // Calculate the linear combination of the two virtual forces
+            temp1p[idx] = 0.5 * (f_virtual[idx] + f_virtual_predictor[idx]);
+            // Get the rotation angle as the norm of temp1, for the rotation matrix R' := R( H'_normed, angle' )
+            anglep[idx] = temp1p[idx].norm();
+            // Normalize temp1 to get the rotation axis
+            temp1p[idx].normalize();
+        });
 
         // Get new spin conf n_new = R( (H+H')/2 ) * n
         Vectormath::rotate( conf, temp1, angle, conf );
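
A possible follow-up, sketched here only and not part of this diff: the remaining Vectormath::rotate pass could be fused into the same kernel. This assumes Vectormath::rotate applies the standard axis-angle (Rodrigues) rotation, and that conf and conf_predictor are taken as .data() pointers like the other captures:

    // Rodrigues: n' = n*cos(a) + (k x n)*sin(a) + k*(k . n)*(1 - cos(a)), with k = axis, a = angle
    Backend::par::apply( nos, [conf, conf_predictor, anglep, axisp, f_virtual] SPIRIT_LAMBDA (int idx) {
        anglep[idx] = f_virtual[idx].norm();
        axisp[idx]  = f_virtual[idx].normalized();
        const scalar c = cos( anglep[idx] );
        const scalar s = sin( anglep[idx] );
        const Vector3 & k = axisp[idx];
        const Vector3 & n = conf[idx];
        conf_predictor[idx] = n * c + k.cross( n ) * s + k * k.dot( n ) * ( 1 - c );
    } );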
37 changes: 15 additions & 22 deletions core/include/engine/Solver_Heun.hpp
@@ -40,17 +40,14 @@ void Method_Solver<Solver::Heun>::Iteration ()
     // Predictor for each image
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& conf = *this->configurations[i];
-        auto& conf_temp = *this->configurations_temp[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
+        auto conf = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto f_virtual = this->forces_virtual[i].data();
 
         // First step - Predictor
-        Vectormath::set_c_cross( -1, conf, forces_virtual[i], conf_temp ); // temp1 = -( conf x A )
-        Vectormath::set_c_a( 1, conf, conf_predictor ); // configurations_predictor = conf
-        Vectormath::add_c_a( 1, conf_temp, conf_predictor ); // configurations_predictor = conf + dt*temp1
-
-        // Normalize spins
-        Vectormath::normalize_vectors( conf_predictor );
+        Backend::par::apply( nos, [conf, conf_predictor, f_virtual] SPIRIT_LAMBDA (int idx) {
+            conf_predictor[idx] = ( conf[idx] - conf[idx].cross( f_virtual[idx] ) ).normalized();
+        } );
     }
 
     // Calculate_Force for the Corrector
@@ -60,21 +57,17 @@ void Method_Solver<Solver::Heun>::Iteration ()
     // Corrector step for each image
     for (int i=0; i < this->noi; i++)
     {
-        auto& conf = *this->configurations[i];
-        auto& conf_temp = *this->configurations_temp[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
+        auto conf = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto f_virtual = this->forces_virtual[i].data();
+        auto f_virtual_predictor = this->forces_virtual_predictor[i].data();
 
         // Second step - Corrector
-        Vectormath::scale( conf_temp, 0.5 ); // configurations_temp = 0.5 * configurations_temp
-        Vectormath::add_c_a( 1, conf, conf_temp ); // configurations_temp = conf + 0.5 * configurations_temp
-        Vectormath::set_c_cross( -1, conf_predictor, forces_virtual_predictor[i], temp1 ); // temp1 = - ( conf' x A' )
-        Vectormath::add_c_a( 0.5, temp1, conf_temp ); // configurations_temp = conf + 0.5 * configurations_temp + 0.5 * temp1
-
-        // Normalize spins
-        Vectormath::normalize_vectors( conf_temp );
-
-        // Copy out
-        conf = conf_temp;
+        Backend::par::apply( nos, [conf, conf_predictor, f_virtual, f_virtual_predictor] SPIRIT_LAMBDA (int idx) {
+            conf[idx] = (
+                conf[idx] - 0.5 * ( conf[idx].cross( f_virtual[idx] ) + conf_predictor[idx].cross( f_virtual_predictor[idx] ) )
+            ).normalized();
+        } );
     }
 };
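
Written out per spin, the two fused lambdas above implement the usual Heun predictor-corrector scheme: the predictor is n' = normalize( n - n x A(n) ) and the corrector is n_new = normalize( n - 1/2 * ( n x A(n) + n' x A(n') ) ), where A denotes the virtual force and x the vector cross product.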
