Core: variadic templates for backend.
The `assign` and `reduce` parallelisation functions can now handle an arbitrary number of field arguments, which are forwarded element-wise to the lambda.
Therefore, the lambda is now placed before the variadic field arguments.
Note that they can now also be called without any field arguments, allowing e.g. the assignment of zero to an entire `field`.
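
For illustration (not part of the diff), a minimal sketch of the new call order: the lambda first, followed by the fields that are forwarded to it element-wise. The `example` function and its arguments are hypothetical; `scalarfield`, `vectorfield`, `Vector3`, `scalar` and `SPIRIT_LAMBDA` are the usual core definitions.

#include <engine/Backend_par.hpp>

void example(const vectorfield & spins, const vectorfield & gradient, scalarfield & energies)
{
    // energies[i] = 0.5 * gradient[i].dot(spins[i])
    Engine::Backend::par::assign(energies,
        [] SPIRIT_LAMBDA (const Vector3 & g, const Vector3 & s) -> scalar { return 0.5 * g.dot(s); },
        gradient, spins);

    // No field arguments at all: set every entry to zero
    Engine::Backend::par::assign(energies, [] SPIRIT_LAMBDA () -> scalar { return 0; });

    // result = sum_i gradient[i].dot(spins[i])
    scalar result = Engine::Backend::par::reduce(
        [] SPIRIT_LAMBDA (const Vector3 & g, const Vector3 & s) -> scalar { return g.dot(s); },
        gradient, spins);
    (void) result;
}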
GPMueller committed Mar 28, 2020
1 parent 8f89c90 commit 2830b1f
Showing 5 changed files with 99 additions and 154 deletions.
195 changes: 64 additions & 131 deletions core/include/engine/Backend_par.hpp
@@ -4,6 +4,8 @@

#include <engine/Vectormath_Defines.hpp>

#include <tuple>

#ifdef SPIRIT_USE_CUDA
#include <cub/cub.cuh>
#define SPIRIT_LAMBDA __device__
@@ -20,6 +22,7 @@ namespace par

#ifdef SPIRIT_USE_CUDA

// f(i) for all i
template<typename F>
__global__
void cu_apply(int N, F f)
@@ -28,91 +31,45 @@
if(idx < N)
f(idx);
}

// vf1[i] = f( vf2[i] )
// f(i) for all i
template<typename F>
void apply(int N, F f)
{
cu_apply<<<(N+1023)/1024, 1024>>>(N, f);
CU_CHECK_AND_SYNC();
}

template<typename F>
scalar reduce(int N, const F f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args>
__global__
void cu_assign_lambda(int N, A * vf_to, Lambda lambda, const args *... arg_fields)
{
static scalarfield sf(N, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != N)
sf.resize(N);

auto s = sf.data();
apply( N, [f, s] SPIRIT_LAMBDA (int idx) { s[idx] = f(idx); } );

static scalarfield ret(1, 0);

// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
CU_CHECK_AND_SYNC();
cudaFree(d_temp_storage);
return ret[0];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
vf_to[idx] = lambda(arg_fields[idx]...);
}

template<typename A, typename F>
scalar reduce(const field<A> & vf1, const F f)
// field[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args>
void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
{
// TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
// We also use this workaround for a single field as argument, because cub does not support non-commutative reduction operations

int n = vf1.size();
static scalarfield sf(n, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != vf1.size())
sf.resize(vf1.size());

auto s = sf.data();
auto v1 = vf1.data();
apply( n, [f, s, v1] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx]); } );

static scalarfield ret(1, 0);
// Vectormath::fill(ret, 0);

// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, sf.data(), ret.data(), sf.size());
int N = vf_to.size();
cu_assign_lambda<<<(N+1023)/1024, 1024>>>(N, vf_to.data(), lambda, vf_args.data()...);
CU_CHECK_AND_SYNC();
cudaFree(d_temp_storage);
return ret[0];
}

template<typename A, typename B, typename F>
scalar reduce(const field<A> & vf1, const field<B> & vf2, const F f)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
{
// TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
int n = vf1.size();
static scalarfield sf(n, 0);
// Vectormath::fill(sf, 0);

if(sf.size() != vf1.size())
sf.resize(vf1.size());
static scalarfield sf(N, 0);
if(sf.size() != N)
sf.resize(N);

auto s = sf.data();
auto v1 = vf1.data();
auto v2 = vf2.data();
apply( n, [f, s, v1, v2] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx], v2[idx]); } );
apply( N, [s, lambda, vf_args...] SPIRIT_LAMBDA (int idx) { s[idx] = lambda(vf_args[idx]...); } );

static scalarfield ret(1, 0);
// Vectormath::fill(ret, 0);
// Determine temporary storage size and allocate
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
@@ -124,93 +81,69 @@
cudaFree(d_temp_storage);
return ret[0];
}


// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
__global__
void cu_set_lambda(A * vf1, const B * vf2, F f, int N)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar reduce(const Lambda & lambda, const field<args> &... vf_args)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < N)
{
vf1[idx] = f(vf2[idx]);
}
auto N = std::get<0>(std::tuple<const field<args> &...>(vf_args...)).size();
// We take this detour so that `.data()` won't be called for every element of each field
return pointer_reduce(N, lambda, vf_args.data()...);
}

// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
void set(field<A> & vf1, const field<B> & vf2, F f)
{
int N = vf1.size();
cu_set_lambda<<<(N+1023)/1024, 1024>>>(vf1.data(), vf2.data(), f, N);
CU_CHECK_AND_SYNC();
}

#else

// f(i) for all i
template<typename F>
scalar reduce(int N, const F f)
void apply(int N, const F & f)
{
scalar res = 0;
#pragma omp parallel for reduction(+:res)
#pragma omp parallel for
for(unsigned int idx = 0; idx < N; ++idx)
{
res += f(idx);
}
return res;
f(idx);
}

template<typename A, typename F>
scalar reduce(const field<A> & vf1, const F f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args> inline
void pointer_assign(int N, A * vf_to, Lambda lambda, const args *... vf_args)
{
scalar res=0;
#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
res += f(vf1[idx]);
}
return res;
#pragma omp parallel for
for(unsigned int idx = 0; idx < N; ++idx)
vf_to[idx] = lambda(vf_args[idx]...);
}

// result = sum_i f( vf1[i], vf2[i] )
template<typename A, typename B, typename F>
scalar reduce(const field<A> & vf1, const field<B> & vf2, const F & f)
// vf_to[i] = f(args[i]...) for all i
template<typename A, typename Lambda, typename... args> inline
void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
{
scalar res=0;
#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
res += f(vf1[idx], vf2[idx]);
}
return res;
auto N = vf_to.size();
// We take this detour so that `.data()` won't be called for every element of each field
pointer_assign(N, vf_to.data(), lambda, vf_args.data()...);
}

// vf1[i] = f( vf2[i] )
template<typename A, typename B, typename F>
void set(field<A> & vf1, const field<B> & vf2, const F & f)
{
#pragma omp parallel for
for(unsigned int idx = 0; idx < vf1.size(); ++idx)
{
vf1[idx] = f(vf2[idx]);
}
}

// f( vf1[idx], idx ) for all i
template<typename F>
void apply(int N, const F & f)
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
{
#pragma omp parallel for
scalar res = 0;

#pragma omp parallel for reduction(+:res)
for(unsigned int idx = 0; idx < N; ++idx)
{
f(idx);
}
}
res += lambda(vf_args[idx]...);

return res;
}
// result = sum_i f(args[i]...)
template<typename Lambda, typename... args> inline
scalar reduce(const Lambda & lambda, const field<args> &... vf_args)
{
auto N = std::get<0>(std::tuple<const field<args> &...>(vf_args...)).size();
// We take this detour so that `.data()` won't be called for every element of each field
return pointer_reduce(N, lambda, vf_args.data()...);
}

#endif

} // namespace par
} // namespace Backend
} // namespace Engine

#endif
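
As a side note (not part of the diff): the pack expansions that drive both backends, `vf_args.data()...` at the call site and `vf_args[idx]...` inside the lambda, can be exercised in isolation. Below is a minimal standalone sketch, assuming `std::vector` in place of `field` and `double` in place of `scalar`; all names here are illustrative.

#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the core's `scalar` and `field<>`, so the sketch is self-contained.
using scalar = double;
template<typename T> using field = std::vector<T>;

// vf_to[i] = lambda(vf_args[i]...) for all i; the same forwarding pattern as
// pointer_assign above, without the OpenMP pragma.
template<typename A, typename Lambda, typename... args>
void assign_sketch(std::size_t N, A * vf_to, Lambda lambda, const args *... vf_args)
{
    for(std::size_t idx = 0; idx < N; ++idx)
        vf_to[idx] = lambda(vf_args[idx]...); // the pack expands to one element per argument field
}

int main()
{
    field<scalar> out(5), a(5, 1.0), b(5, 2.0);

    // Two argument fields: out[i] = a[i] + b[i]
    assign_sketch(out.size(), out.data(),
                  [](const scalar & x, const scalar & y) { return x + y; },
                  a.data(), b.data());

    // No argument fields at all: out[i] = 0 (the zero-assignment case from the commit message)
    assign_sketch(out.size(), out.data(), []() { return scalar(0); });
}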
36 changes: 21 additions & 15 deletions core/include/engine/Solver_Kernels.hpp
@@ -38,8 +38,8 @@ namespace Solver_Kernels
std::vector<field<field<Vec>>> & delta_a, std::vector<field<field<Vec>>> & delta_grad, const std::vector<field<Vec>> & grad, std::vector<field<Vec>> & grad_pr,
const int num_mem, const scalar maxmove) {
// std::cerr << "lbfgs searchdir \n";
static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) {return v1.dot(v2);};
static auto set = [] SPIRIT_LAMBDA (const Vec & x) {return x;};
static auto set = [] SPIRIT_LAMBDA (const Vec & x) -> Vec {return x;};
static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) -> scalar {return v1.dot(v2);};

scalar epsilon = sizeof(scalar) == sizeof(float) ? 1e-30 : 1e-300;

@@ -50,11 +50,15 @@

if (local_iter == 0) // gradient descent
{
for(int img=0; img<noi; img++) {
Backend::par::set(grad_pr[img], grad[img], set);
auto & dir = searchdir[img];
auto & g_cur = grad[img];
Backend::par::set(dir, g_cur, [] SPIRIT_LAMBDA (const Vec & x){return -x;});
for(int img=0; img<noi; img++)
{
auto g_ptr = grad[img].data();
auto g_pr_ptr = grad_pr[img].data();
auto dir_ptr = searchdir[img].data();
Backend::par::apply(nos, [g_ptr, g_pr_ptr, dir_ptr] SPIRIT_LAMBDA (int idx){
g_pr_ptr[idx] = g_ptr[idx];
dir_ptr[idx] = -g_ptr[idx];
});
auto & da = delta_a[img];
auto & dg = delta_grad[img];
for (int i = 0; i < num_mem; i++)
@@ -68,7 +72,9 @@
});
}
}
} else {
}
else
{
for (int img=0; img<noi; img++)
{
auto da = delta_a[img][m_index].data();
@@ -84,7 +90,7 @@

scalar rinv_temp = 0;
for (int img=0; img<noi; img++)
rinv_temp += Backend::par::reduce(delta_grad[img][m_index], delta_a[img][m_index], dot);
rinv_temp += Backend::par::reduce(dot, delta_grad[img][m_index], delta_a[img][m_index]);

if (rinv_temp > epsilon)
rho[m_index] = 1.0 / rinv_temp;
@@ -96,14 +102,14 @@
}

for (int img=0; img<noi; img++)
Backend::par::set(q_vec[img], grad[img], set);
Backend::par::assign(q_vec[img], set, grad[img]);

for (int k = num_mem - 1; k > -1; k--)
{
c_ind = (k + m_index + 1) % num_mem;
scalar temp=0;
for (int img=0; img<noi; img++)
temp += Backend::par::reduce(delta_a[img][c_ind], q_vec[img], dot);
temp += Backend::par::reduce(dot, delta_a[img][c_ind], q_vec[img]);

alpha[c_ind] = rho[c_ind] * temp;
for (int img=0; img<noi; img++)
Expand All @@ -119,7 +125,7 @@ namespace Solver_Kernels

scalar dy2=0;
for (int img=0; img<noi; img++)
dy2 += Backend::par::reduce(delta_grad[img][m_index], delta_grad[img][m_index], dot);
dy2 += Backend::par::reduce(dot, delta_grad[img][m_index], delta_grad[img][m_index]);

for (int img=0; img<noi; img++)
{
Expand All @@ -129,9 +135,9 @@ namespace Solver_Kernels
inv_rhody2 = 1.0 / rhody2;
else
inv_rhody2 = 1.0/(epsilon);
Backend::par::set(searchdir[img], q_vec[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
Backend::par::assign(searchdir[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
return inv_rhody2 * q;
} );
}, q_vec[img] );
}

for (int k = 0; k < num_mem; k++)
Expand All @@ -143,7 +149,7 @@ namespace Solver_Kernels

scalar rhopdg = 0;
for(int img=0; img<noi; img++)
rhopdg += Backend::par::reduce(delta_grad[img][c_ind], searchdir[img], dot);
rhopdg += Backend::par::reduce(dot, delta_grad[img][c_ind], searchdir[img]);

rhopdg *= rho[c_ind];

4 changes: 3 additions & 1 deletion core/include/engine/Solver_LBFGS_Atlas.hpp
@@ -80,7 +80,9 @@ void Method_Solver<Solver::LBFGS_Atlas>::Iteration()
// Scale by averaging
for(int img=0; img<noi; img++)
{
a_norm_rms = std::max(a_norm_rms, scalar( sqrt( Backend::par::reduce(this->atlas_directions[img], [] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }) / nos )));
a_norm_rms = std::max( a_norm_rms, scalar( sqrt(
Backend::par::reduce([] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }, this->atlas_directions[img]) / nos
)));
}
scalar scaling = (a_norm_rms > maxmove) ? maxmove/a_norm_rms : 1.0;

8 changes: 6 additions & 2 deletions core/src/engine/Hamiltonian_Heisenberg.cpp
@@ -623,14 +623,18 @@ namespace Engine
if(idx_ddi >= 0)
this->Gradient_DDI(spins, gradient);

energy += Backend::par::reduce( N, [s,g] SPIRIT_LAMBDA ( int idx ) { return 0.5 * g[idx].dot(s[idx]) ;} );
energy += Backend::par::reduce( [] SPIRIT_LAMBDA ( const Vector3 & s, const Vector3 & g ) {
return 0.5 * g.dot(s);
}, spins, gradient );

// External field
if(idx_zeeman >= 0)
{
Vector3 ext_field = external_field_normal * external_field_magnitude;
this->Gradient_Zeeman(gradient);
energy += Backend::par::reduce( N, [s, ext_field, mu_s] SPIRIT_LAMBDA ( int idx ) { return -mu_s[idx] * ext_field.dot(s[idx]) ;} );
energy += Backend::par::reduce( [ext_field] SPIRIT_LAMBDA ( const Vector3 & s, const scalar & mu_s ) {
return -mu_s * ext_field.dot(s);
}, spins, geometry->mu_s );
}

// Quadruplets