diff --git a/core/include/engine/Backend_par.hpp b/core/include/engine/Backend_par.hpp
index 8099aa5ec..b2312e1b5 100644
--- a/core/include/engine/Backend_par.hpp
+++ b/core/include/engine/Backend_par.hpp
@@ -4,6 +4,8 @@
 
 #include <engine/Vectormath_Defines.hpp>
 
+#include <tuple>
+
 #ifdef SPIRIT_USE_CUDA
     #include <cub/cub.cuh>
     #define SPIRIT_LAMBDA __device__
@@ -20,28 +22,46 @@ namespace par
 
 #ifdef SPIRIT_USE_CUDA
 
-    template<typename F>
+    // f(i) for all i
+    template<typename Lambda>
     __global__
-    void cu_apply(int N, F f)
+    void cu_apply(int N, Lambda f)
     {
         int idx = blockIdx.x * blockDim.x + threadIdx.x;
         if(idx < N)
             f(idx);
     }
-
-    // vf1[i] = f( vf2[i] )
-    template<typename F>
-    void apply(int N, F f)
+    // f(i) for all i
+    template<typename Lambda>
+    void apply(int N, Lambda f)
     {
         cu_apply<<<(N+1023)/1024, 1024>>>(N, f);
         CU_CHECK_AND_SYNC();
     }
 
-    template<typename F>
-    scalar reduce(int N, const F f)
+    // vf_to[i] = f(args[i]...) for all i
+    template<typename A, typename Lambda, typename... args>
+    __global__
+    void cu_assign_lambda(int N, A * vf_to, Lambda lambda, const args *... arg_fields)
+    {
+        int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        if(idx < N)
+            vf_to[idx] = lambda(arg_fields[idx]...);
+    }
+    // field[i] = f(args[i]...) for all i
+    template<typename A, typename Lambda, typename... args>
+    void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
+    {
+        int N = vf_to.size();
+        cu_assign_lambda<<<(N+1023)/1024, 1024>>>(N, vf_to.data(), lambda, vf_args.data()...);
+        CU_CHECK_AND_SYNC();
+    }
+
+
+    template<typename Lambda>
+    scalar reduce(int N, const Lambda f)
     {
         static scalarfield sf(N, 0);
-        // Vectormath::fill(sf, 0);
 
         if(sf.size() != N)
             sf.resize(N);
@@ -63,15 +83,14 @@ namespace par
         return ret[0];
     }
 
-    template<typename A, typename F>
-    scalar reduce(const field<A> & vf1, const F f)
+    template<typename A, typename Lambda>
+    scalar reduce(int N, const Lambda f, const field<A> & vf1)
     {
         // TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
         // We also use this workaround for a single field as argument, because cub does not support non-commutative reduction operations
 
         int n = vf1.size();
         static scalarfield sf(n, 0);
-        // Vectormath::fill(sf, 0);
 
         if(sf.size() != vf1.size())
             sf.resize(vf1.size());
@@ -81,7 +100,6 @@ namespace par
         apply( n, [f, s, v1] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx]); } );
 
         static scalarfield ret(1, 0);
-        // Vectormath::fill(ret, 0);
 
         // Determine temporary storage size and allocate
         void * d_temp_storage = NULL;
@@ -96,12 +114,11 @@ namespace par
     }
 
     template<typename A, typename B, typename F>
-    scalar reduce(const field<A> & vf1, const field<B> & vf2, const F f)
+    scalar reduce(int N, const F f, const field<A> & vf1, const field<B> & vf2)
     {
         // TODO: remove the reliance on a temporary scalar field (maybe thrust::dot with generalized operations)
         int n = vf1.size();
         static scalarfield sf(n, 0);
-        // Vectormath::fill(sf, 0);
 
         if(sf.size() != vf1.size())
             sf.resize(vf1.size());
@@ -112,7 +129,6 @@ namespace par
         apply( n, [f, s, v1, v2] SPIRIT_LAMBDA (int idx) { s[idx] = f(v1[idx], v2[idx]); } );
 
         static scalarfield ret(1, 0);
-        // Vectormath::fill(ret, 0);
         // Determine temporary storage size and allocate
         void * d_temp_storage = NULL;
         size_t temp_storage_bytes = 0;
@@ -125,92 +141,59 @@ namespace par
         return ret[0];
     }
 
-
-    // vf1[i] = f( vf2[i] )
-    template<typename A, typename B, typename F>
-    __global__
-    void cu_set_lambda(A * vf1, const B * vf2, F f, int N)
-    {
-        int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if(idx < N)
-        {
-            vf1[idx] = f(vf2[idx]);
-        }
-    }
-
-    // vf1[i] = f( vf2[i] )
-    template<typename A, typename B, typename F>
-    void set(field<A> & vf1, const field<B> & vf2, F f)
-    {
-        int N = vf1.size();
-        cu_set_lambda<<<(N+1023)/1024, 1024>>>(vf1.data(), vf2.data(), f, N);
-        CU_CHECK_AND_SYNC();
-    }
-    
 #else
 
+    // f(i) for all i
     template<typename F>
-    scalar reduce(int N, const F f)
+    void apply(int N, const F & f)
     {
-        scalar res = 0;
-        #pragma omp parallel for reduction(+:res)
+        #pragma omp parallel for
         for(unsigned int idx = 0; idx < N; ++idx)
-        {
-            res += f(idx);
-        }
-        return res;
+            f(idx);
     }
 
-    template<typename A, typename F>
-    scalar reduce(const field<A> & vf1, const F f)
+    // vf_to[i] = f(args[i]...) for all i
+    template<typename A, typename Lambda, typename... args> inline
+    void pointer_assign(int N, A * vf_to, Lambda lambda, const args *... vf_args)
     {
-        scalar res=0;
-        #pragma omp parallel for reduction(+:res)
-        for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-        {
-            res += f(vf1[idx]);
-        }
-        return res;
+        #pragma omp parallel for
+        for(unsigned int idx = 0; idx < N; ++idx)
+            vf_to[idx] = lambda(vf_args[idx]...);
     }
-
-    // result = sum_i  f( vf1[i], vf2[i] )
-    template<typename A, typename B, typename F>
-    scalar reduce(const field<A> & vf1, const field<B> & vf2, const F & f)
+    // vf_to[i] = f(args[i]...) for all i
+    template<typename A, typename Lambda, typename... args> inline
+    void assign(field<A> & vf_to, Lambda lambda, const field<args> &... vf_args)
     {
-        scalar res=0;
-        #pragma omp parallel for reduction(+:res)
-        for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-        {
-            res += f(vf1[idx], vf2[idx]);
-        }
-        return res;
+        auto N = vf_to.size();
+        // We take this umweg so that `.data()` won't be called for every element of each field
+        pointer_assign(N, vf_to.data(), lambda, vf_args.data()...);
     }
 
-    // vf1[i] = f( vf2[i] )
-    template<typename A, typename B, typename F>
-    void set(field<A> & vf1, const field<B> & vf2, const F & f)
-    {
-        #pragma omp parallel for
-        for(unsigned int idx = 0; idx < vf1.size(); ++idx)
-        {
-            vf1[idx] = f(vf2[idx]);
-        }
-    }
 
-    // f( vf1[idx], idx ) for all i
-    template<typename F>
-    void apply(int N, const F & f)
+    // result = sum_i f(args[i]...)
+    template<typename Lambda, typename... args> inline
+    scalar pointer_reduce(int N, const Lambda & lambda, const args *... vf_args)
     {
-        #pragma omp parallel for
+        scalar res = 0;
+
+        #pragma omp parallel for reduction(+:res)
         for(unsigned int idx = 0; idx < N; ++idx)
-        {
-            f(idx);
-        }
-    }
+            res += lambda(vf_args[idx]...);
 
+        return res;
+    }
+    // result = sum_i f(args[i]...)
+    template<typename Lambda, typename... args> inline
+    scalar reduce(int N, const Lambda & lambda, const field<args> &... vf_args)
+    {
+        // We take this umweg so that `.data()` won't be called for every element of each field
+        return pointer_reduce(N, lambda, vf_args.data()...);
+    }
 
 #endif
+
 } // namespace par
 } // namespace Backend
 } // namespace Engine
+
 #endif
\ No newline at end of file
diff --git a/core/include/engine/Hamiltonian_Heisenberg.hpp b/core/include/engine/Hamiltonian_Heisenberg.hpp
index e0cfa4008..918a84634 100644
--- a/core/include/engine/Hamiltonian_Heisenberg.hpp
+++ b/core/include/engine/Hamiltonian_Heisenberg.hpp
@@ -152,11 +152,12 @@ namespace Engine
         // Calculate the Quadruplet energy
         void E_Quadruplet(const vectorfield & spins, scalarfield & Energy);
 
-        private:
-        int idx_zeeman, idx_anisotropy, idx_exchange, idx_dmi, idx_ddi, idx_quadruplet;
         void Gradient_DDI_Cutoff(const vectorfield& spins, vectorfield & gradient);
         void Gradient_DDI_Direct(const vectorfield& spins, vectorfield & gradient);
         void Gradient_DDI_FFT(const vectorfield& spins, vectorfield & gradient);
+
+    private:
+        int idx_zeeman, idx_anisotropy, idx_exchange, idx_dmi, idx_ddi, idx_quadruplet;
         void E_DDI_Direct(const vectorfield& spins, scalarfield & Energy);
         void E_DDI_Cutoff(const vectorfield& spins, scalarfield & Energy);
         void E_DDI_FFT(const vectorfield& spins, scalarfield & Energy);
diff --git a/core/include/engine/Solver_Depondt.hpp b/core/include/engine/Solver_Depondt.hpp
index 88435a53b..cef342cfd 100644
--- a/core/include/engine/Solver_Depondt.hpp
+++ b/core/include/engine/Solver_Depondt.hpp
@@ -41,12 +41,15 @@ void Method_Solver<Solver::Depondt>::Iteration ()
     {
         auto& conf           = *this->configurations[i];
         auto& conf_predictor = *this->configurations_predictor[i];
+        auto anglep = angle.data();
+        auto axisp = rotationaxis[i].data();
+        auto f_virtual = forces_virtual[i].data();
 
         // For Rotation matrix R := R( H_normed, angle )
-        Vectormath::norm( forces_virtual[i], angle );   // angle = |forces_virtual|
-
-        Vectormath::set_c_a( 1, forces_virtual[i], rotationaxis[i] );  // rotationaxis = |forces_virtual|
-        Vectormath::normalize_vectors( rotationaxis[i] );            // normalize rotation axis
+        Backend::par::apply(nos, [anglep, axisp, f_virtual] SPIRIT_LAMBDA (int idx) {
+            anglep[idx] = f_virtual[idx].norm(); // angle = |forces_virtual|
+            axisp[idx] = f_virtual[idx].normalized(); // rotationaxis = forces_virtual/|forces_virtual|
+        });
 
         // Get spin predictor n' = R(H) * n
         Vectormath::rotate( conf, rotationaxis[i], angle, conf_predictor );
@@ -60,16 +63,19 @@ void Method_Solver<Solver::Depondt>::Iteration ()
     for (int i=0; i < this->noi; i++)
     {
         auto& conf   = *this->configurations[i];
-
-        // Calculate the linear combination of the two forces_virtuals
-        Vectormath::set_c_a( 0.5, forces_virtual[i], temp1);   // H = H/2
-        Vectormath::add_c_a( 0.5, forces_virtual_predictor[i], temp1 ); // H = (H + H')/2
-
-        // Get the rotation angle as norm of temp1 ...For Rotation matrix R' := R( H'_normed, angle' )
-        Vectormath::norm( temp1, angle );   // angle' = |forces_virtual lin combination|
-
-        // Normalize temp1 to get rotation axes
-        Vectormath::normalize_vectors( temp1 );
+        auto temp1p = temp1.data();
+        auto anglep = angle.data();
+        auto f_virtual = forces_virtual[i].data();
+        auto f_virtual_predictor = forces_virtual_predictor[i].data();
+
+        Backend::par::apply(nos, [temp1p, anglep, f_virtual, f_virtual_predictor] SPIRIT_LAMBDA (int idx) {
+            // Calculate the linear combination of the two forces_virtuals
+            temp1p[idx] = 0.5 * (f_virtual[idx] + f_virtual_predictor[idx]);
+            // Get the rotation angle as norm of temp1 ...For Rotation matrix R' := R( H'_normed, angle' )
+            anglep[idx] = temp1p[idx].norm();
+            // Normalize temp1 to get rotation axes
+            temp1p[idx].normalize();
+        });
 
         // Get new spin conf n_new = R( (H+H')/2 ) * n
         Vectormath::rotate( conf, temp1, angle, conf );
diff --git a/core/include/engine/Solver_Heun.hpp b/core/include/engine/Solver_Heun.hpp
index 8a403b759..d72820c60 100644
--- a/core/include/engine/Solver_Heun.hpp
+++ b/core/include/engine/Solver_Heun.hpp
@@ -40,17 +40,14 @@ void Method_Solver<Solver::Heun>::Iteration ()
     // Predictor for each image
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& conf           = *this->configurations[i];
-        auto& conf_temp      = *this->configurations_temp[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto f_virtual      = this->forces_virtual[i].data();
 
         // First step - Predictor
-        Vectormath::set_c_cross( -1, conf, forces_virtual[i], conf_temp );  // temp1 = -( conf x A )
-        Vectormath::set_c_a( 1, conf, conf_predictor );                   // configurations_predictor = conf
-        Vectormath::add_c_a( 1, conf_temp, conf_predictor );         // configurations_predictor = conf + dt*temp1
-
-        // Normalize spins
-        Vectormath::normalize_vectors( conf_predictor );
+        Backend::par::apply( nos, [conf, conf_predictor, f_virtual] SPIRIT_LAMBDA (int idx) {
+            conf_predictor[idx] = ( conf[idx] - conf[idx].cross( f_virtual[idx] ) ).normalized();
+        } );
     }
 
     // Calculate_Force for the Corrector
@@ -60,21 +57,17 @@ void Method_Solver<Solver::Heun>::Iteration ()
     // Corrector step for each image
     for (int i=0; i < this->noi; i++)
     {
-        auto& conf           = *this->configurations[i];
-        auto& conf_temp      = *this->configurations_temp[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto f_virtual      = this->forces_virtual[i].data();
+        auto f_virtual_predictor = this->forces_virtual_predictor[i].data();
 
         // Second step - Corrector
-        Vectormath::scale( conf_temp, 0.5 );                                     // configurations_temp = 0.5 * configurations_temp
-        Vectormath::add_c_a( 1, conf, conf_temp );                               // configurations_temp = conf + 0.5 * configurations_temp
-        Vectormath::set_c_cross( -1, conf_predictor, forces_virtual_predictor[i], temp1 );   // temp1 = - ( conf' x A' )
-        Vectormath::add_c_a( 0.5, temp1, conf_temp );                            // configurations_temp = conf + 0.5 * configurations_temp + 0.5 * temp1
-
-        // Normalize spins
-        Vectormath::normalize_vectors( conf_temp );
-
-        // Copy out
-        conf = conf_temp;
+        Backend::par::apply( nos, [conf, conf_predictor, f_virtual, f_virtual_predictor] SPIRIT_LAMBDA (int idx) {
+            conf[idx] = (
+                conf[idx] - 0.5 * ( conf[idx].cross( f_virtual[idx] ) + conf_predictor[idx].cross(f_virtual_predictor[idx]) )
+            ).normalized();
+        } );
     }
 };
 
diff --git a/core/include/engine/Solver_Kernels.hpp b/core/include/engine/Solver_Kernels.hpp
index 7a1cb343b..a703ddb16 100644
--- a/core/include/engine/Solver_Kernels.hpp
+++ b/core/include/engine/Solver_Kernels.hpp
@@ -38,8 +38,8 @@ namespace Solver_Kernels
                              std::vector<field<field<Vec>>> & delta_a, std::vector<field<field<Vec>>> & delta_grad, const std::vector<field<Vec>> & grad, std::vector<field<Vec>> & grad_pr,
                              const int num_mem, const scalar maxmove) {
         // std::cerr << "lbfgs searchdir \n";
-        static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) {return v1.dot(v2);};
-        static auto set = [] SPIRIT_LAMBDA (const Vec & x) {return x;};
+        static auto set = [] SPIRIT_LAMBDA (const Vec & x) -> Vec {return x;};
+        static auto dot = [] SPIRIT_LAMBDA (const Vec & v1, const Vec &v2) -> scalar {return v1.dot(v2);};
 
         scalar epsilon = sizeof(scalar) == sizeof(float) ? 1e-30 : 1e-300;
 
@@ -50,11 +50,15 @@ namespace Solver_Kernels
 
         if (local_iter == 0) // gradient descent
         {
-            for(int img=0; img<noi; img++) {
-                Backend::par::set(grad_pr[img], grad[img], set);
-                auto & dir = searchdir[img];
-                auto & g_cur = grad[img];
-                Backend::par::set(dir, g_cur, [] SPIRIT_LAMBDA (const Vec & x){return -x;});
+            for(int img=0; img<noi; img++)
+            {
+                auto g_ptr = grad[img].data();
+                auto g_pr_ptr = grad_pr[img].data();
+                auto dir_ptr = searchdir[img].data();
+                Backend::par::apply(nos, [g_ptr, g_pr_ptr, dir_ptr] SPIRIT_LAMBDA (int idx){
+                    g_pr_ptr[idx] = g_ptr[idx];
+                    dir_ptr[idx] = -g_ptr[idx];
+                });
                 auto & da = delta_a[img];
                 auto & dg = delta_grad[img];
                 for (int i = 0; i < num_mem; i++)
@@ -68,7 +72,9 @@ namespace Solver_Kernels
                     });
                 }
             }
-        } else {
+        }
+        else
+        {
             for (int img=0; img<noi; img++)
             {
                 auto da = delta_a[img][m_index].data();
@@ -84,7 +90,7 @@ namespace Solver_Kernels
 
             scalar rinv_temp = 0;
             for (int img=0; img<noi; img++)
-                rinv_temp += Backend::par::reduce(delta_grad[img][m_index], delta_a[img][m_index], dot);
+                rinv_temp += Backend::par::reduce(nos, dot, delta_grad[img][m_index], delta_a[img][m_index]);
 
             if (rinv_temp > epsilon)
                 rho[m_index] = 1.0 / rinv_temp;
@@ -96,14 +102,14 @@ namespace Solver_Kernels
             }
 
             for (int img=0; img<noi; img++)
-                Backend::par::set(q_vec[img], grad[img], set);
+                Backend::par::assign(q_vec[img], set, grad[img]);
 
             for (int k = num_mem - 1; k > -1; k--)
             {
                 c_ind = (k + m_index + 1) % num_mem;
                 scalar temp=0;
                 for (int img=0; img<noi; img++)
-                    temp += Backend::par::reduce(delta_a[img][c_ind], q_vec[img], dot);
+                    temp += Backend::par::reduce(nos, dot, delta_a[img][c_ind], q_vec[img]);
 
                 alpha[c_ind] = rho[c_ind] * temp;
                 for (int img=0; img<noi; img++)
@@ -119,7 +125,7 @@ namespace Solver_Kernels
 
             scalar dy2=0;
             for (int img=0; img<noi; img++)
-                dy2 += Backend::par::reduce(delta_grad[img][m_index], delta_grad[img][m_index], dot);
+                dy2 += Backend::par::reduce(nos, dot, delta_grad[img][m_index], delta_grad[img][m_index]);
 
             for (int img=0; img<noi; img++)
             {
@@ -129,9 +135,9 @@ namespace Solver_Kernels
                     inv_rhody2 = 1.0 / rhody2;
                 else
                     inv_rhody2 = 1.0/(epsilon);
-                Backend::par::set(searchdir[img], q_vec[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
+                Backend::par::assign(searchdir[img], [inv_rhody2] SPIRIT_LAMBDA (const Vec & q){
                     return inv_rhody2 * q;
-                } );
+                }, q_vec[img] );
             }
 
             for (int k = 0; k < num_mem; k++)
@@ -143,7 +149,7 @@ namespace Solver_Kernels
 
                 scalar rhopdg = 0;
                 for(int img=0; img<noi; img++)
-                    rhopdg += Backend::par::reduce(delta_grad[img][c_ind], searchdir[img], dot);
+                    rhopdg += Backend::par::reduce(nos, dot, delta_grad[img][c_ind], searchdir[img]);
 
                 rhopdg *= rho[c_ind];
 
diff --git a/core/include/engine/Solver_LBFGS_Atlas.hpp b/core/include/engine/Solver_LBFGS_Atlas.hpp
index 0f2f85fe1..f1dc0f8bc 100644
--- a/core/include/engine/Solver_LBFGS_Atlas.hpp
+++ b/core/include/engine/Solver_LBFGS_Atlas.hpp
@@ -80,7 +80,9 @@ void Method_Solver<Solver::LBFGS_Atlas>::Iteration()
     // Scale by averaging
     for(int img=0; img<noi; img++)
     {
-        a_norm_rms = std::max(a_norm_rms, scalar( sqrt( Backend::par::reduce(this->atlas_directions[img], [] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }) / nos )));
+        a_norm_rms = std::max( a_norm_rms, scalar( sqrt(
+            Backend::par::reduce(nos, [] SPIRIT_LAMBDA (const Vector2 & v){ return v.squaredNorm(); }, this->atlas_directions[img]) / nos
+        )));
     }
     scalar scaling = (a_norm_rms > maxmove) ? maxmove/a_norm_rms : 1.0;
 
diff --git a/core/include/engine/Solver_RK4.hpp b/core/include/engine/Solver_RK4.hpp
index 19a8bcfa9..a16ecd826 100644
--- a/core/include/engine/Solver_RK4.hpp
+++ b/core/include/engine/Solver_RK4.hpp
@@ -51,19 +51,15 @@ void Method_Solver<Solver::RungeKutta4>::Iteration ()
     // Predictor for each image
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& conf           = *this->configurations[i];
-        auto& k1             = *this->configurations_k1[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
-        auto& force          =  this->forces_virtual[i];
-
-        // k1
-        Vectormath::set_c_cross( -1, conf, force, k1 );
-
-        // Predictor for k2
-        Vectormath::set_c_a( 1, conf, conf_predictor );
-        Vectormath::add_c_a( 0.5, k1, conf_predictor );
-        // Normalize
-        Vectormath::normalize_vectors( conf_predictor );
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto force          = this->forces_virtual[i].data();
+        auto k1             = this->configurations_k1[i]->data();
+
+        Backend::par::apply( nos, [conf, conf_predictor, force, k1] SPIRIT_LAMBDA (int idx) {
+            k1[idx] = - conf[idx].cross(force[idx]);
+            conf_predictor[idx] = (conf[idx] + 0.5*k1[idx]).normalized();
+        } );
     }
 
     // Calculate_Force for the predictor
@@ -73,19 +69,15 @@ void Method_Solver<Solver::RungeKutta4>::Iteration ()
     // Predictor for each image
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& conf           = *this->configurations[i];
-        auto& k2             = *this->configurations_k2[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
-        auto& force          =  this->forces_virtual_predictor[i];
-
-        // k2
-        Vectormath::set_c_cross( -1, conf_predictor, force, k2 );
-
-        // Predictor for k3
-        Vectormath::set_c_a( 1, conf, conf_predictor );
-        Vectormath::add_c_a( 0.5, k2, conf_predictor );
-        // Normalize
-        Vectormath::normalize_vectors( conf_predictor );
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto force          = this->forces_virtual_predictor[i].data();
+        auto k2             = this->configurations_k2[i]->data();
+
+        Backend::par::apply( nos, [conf, conf_predictor, force, k2] SPIRIT_LAMBDA (int idx) {
+            k2[idx] = - conf_predictor[idx].cross(force[idx]);
+            conf_predictor[idx] = (conf[idx] + 0.5*k2[idx]).normalized();
+        } );
     }
 
     // Calculate_Force for the predictor (k3)
@@ -95,19 +87,15 @@ void Method_Solver<Solver::RungeKutta4>::Iteration ()
     // Predictor for each image
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& conf           = *this->configurations[i];
-        auto& k3             = *this->configurations_k3[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
-        auto& force          =  this->forces_virtual_predictor[i];
-
-        // k3
-        Vectormath::set_c_cross( -1, conf_predictor, force, k3 );
-
-        // Predictor for k4
-        Vectormath::set_c_a( 1, conf, conf_predictor );
-        Vectormath::add_c_a( 1, k3, conf_predictor );
-        // Normalize
-        Vectormath::normalize_vectors( conf_predictor );
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto force          = this->forces_virtual_predictor[i].data();
+        auto k3             = this->configurations_k3[i]->data();
+
+        Backend::par::apply( nos, [conf, conf_predictor, force, k3] SPIRIT_LAMBDA (int idx) {
+            k3[idx] = - conf_predictor[idx].cross(force[idx]);
+            conf_predictor[idx] = (conf[idx] + k3[idx]).normalized();
+        } );
     }
 
     // Calculate_Force for the predictor (k4)
@@ -117,30 +105,18 @@ void Method_Solver<Solver::RungeKutta4>::Iteration ()
     // Corrector step for each image
     for (int i=0; i < this->noi; i++)
     {
-        auto& conf           = *this->configurations[i];
-        auto& k1             = *this->configurations_k1[i];
-        auto& k2             = *this->configurations_k2[i];
-        auto& k3             = *this->configurations_k3[i];
-        auto& k4             = *this->configurations_k4[i];
-        auto& conf_predictor = *this->configurations_predictor[i];
-        auto& conf_temp      = *this->configurations_temp[i];
-        auto& force          =  this->forces_virtual_predictor[i];
-
-        // k4
-        Vectormath::set_c_cross( -1, conf_predictor, force, k4 );
-
-        // 4th order Runge Kutta step
-        Vectormath::set_c_a( 1, conf, conf_temp );
-        Vectormath::add_c_a( 1.0/6.0, k1, conf_temp );
-        Vectormath::add_c_a( 1.0/3.0, k2, conf_temp );
-        Vectormath::add_c_a( 1.0/3.0, k3, conf_temp );
-        Vectormath::add_c_a( 1.0/6.0, k4, conf_temp );
-
-        // Normalize spins
-        Vectormath::normalize_vectors( conf_temp );
-
-        // Copy out
-        conf = conf_temp;
+        auto conf           = this->configurations[i]->data();
+        auto conf_predictor = this->configurations_predictor[i]->data();
+        auto force          = this->forces_virtual_predictor[i].data();
+        auto k1             = this->configurations_k1[i]->data();
+        auto k2             = this->configurations_k2[i]->data();
+        auto k3             = this->configurations_k3[i]->data();
+        auto k4             = this->configurations_k4[i]->data();
+
+        Backend::par::apply( nos, [conf, conf_predictor, force, k1, k2, k3, k4] SPIRIT_LAMBDA (int idx) {
+            k4[idx] = - conf_predictor[idx].cross(force[idx]);
+            conf[idx] = (conf[idx] + 1.0/6.0*k1[idx] + 1.0/3.0*k2[idx] + 1.0/3.0*k3[idx] + 1.0/6.0*k4[idx]).normalized();
+        } );
     }
 };
 
diff --git a/core/include/engine/Solver_SIB.hpp b/core/include/engine/Solver_SIB.hpp
index 6b361502f..e79be815d 100644
--- a/core/include/engine/Solver_SIB.hpp
+++ b/core/include/engine/Solver_SIB.hpp
@@ -32,10 +32,13 @@ void Method_Solver<Solver::SIB>::Iteration ()
     {
         auto& image     = *this->systems[i]->spins;
         auto& predictor = *this->configurations_predictor[i];
-
         Solver_Kernels::sib_transform(image, forces_virtual[i], predictor);
-        Vectormath::add_c_a(1, image, predictor);
-        Vectormath::scale(predictor, 0.5);
+
+        auto imagep     = this->systems[i]->spins->data();
+        auto predictorp = this->configurations_predictor[i]->data();
+        Backend::par::apply( nos, [imagep, predictorp] SPIRIT_LAMBDA (int idx) {
+            predictorp[idx] = 0.5 * (imagep[idx] + predictorp[idx]);
+        } );
     }
 
     // Second part of the step
@@ -43,8 +46,7 @@ void Method_Solver<Solver::SIB>::Iteration ()
     this->Calculate_Force_Virtual(this->configurations_predictor, this->forces_predictor, this->forces_virtual_predictor);
     for (int i = 0; i < this->noi; ++i)
     {
-        auto& image     = *this->systems[i]->spins;
-
+        auto& image = *this->systems[i]->spins;
         Solver_Kernels::sib_transform(image, forces_virtual_predictor[i], image);
     }
 };
diff --git a/core/include/engine/Solver_VP.hpp b/core/include/engine/Solver_VP.hpp
index 8be52fa76..8c3c1556a 100644
--- a/core/include/engine/Solver_VP.hpp
+++ b/core/include/engine/Solver_VP.hpp
@@ -20,11 +20,11 @@ void Method_Solver<Solver::VP>::Initialize ()
 
 /*
     Template instantiation of the Simulation class for use with the VP Solver.
-		The velocity projection method is often efficient for direct minimization,
-		but deals poorly with quickly varying fields or stochastic noise.
-	Paper: P. F. Bessarab et al., Method for finding mechanism and activation energy
-		   of magnetic transitions, applied to skyrmion and antivortex annihilation,
-		   Comp. Phys. Comm. 196, 335 (2015).
+        The velocity projection method is often efficient for direct minimization,
+        but deals poorly with quickly varying fields or stochastic noise.
+    Paper: P. F. Bessarab et al., Method for finding mechanism and activation energy
+            of magnetic transitions, applied to skyrmion and antivortex annihilation,
+            Comp. Phys. Comm. 196, 335 (2015).
 */
 template <> inline
 void Method_Solver<Solver::VP>::Iteration ()
@@ -39,7 +39,6 @@ void Method_Solver<Solver::VP>::Iteration ()
         auto f_pr = forces_previous[i].data();
         auto v    = velocities[i].data();
         auto v_pr = velocities_previous[i].data();
-
         Backend::par::apply( forces[i].size(), [f, f_pr, v, v_pr] SPIRIT_LAMBDA (int idx) {
             f_pr[idx] = f[idx];
             v_pr[idx] = v[idx];
@@ -52,21 +51,17 @@ void Method_Solver<Solver::VP>::Iteration ()
 
     for (int i = 0; i < noi; ++i)
     {
-        auto& velocity      = velocities[i];
-        auto& force         = forces[i];
-        auto& force_prev    = forces_previous[i];
-
         auto f    = forces[i].data();
         auto f_pr = forces_previous[i].data();
         auto v    = velocities[i].data();
         auto m_temp = this->m;
-
-        // Calculate the new velocity
-        Backend::par::apply(force.size(), [f,f_pr,v,m_temp] SPIRIT_LAMBDA (int idx) {
+        Backend::par::apply(nos, [f,f_pr,v,m_temp] SPIRIT_LAMBDA (int idx) {
             v[idx] += 0.5/m_temp * (f_pr[idx] + f[idx]);
         });
 
         // Get the projection of the velocity on the force
+        auto& velocity      = velocities[i];
+        auto& force         = forces[i];
         projection[i]  = Vectormath::dot(velocity, force);
         force_norm2[i] = Vectormath::dot(force, force);
     }
@@ -77,11 +72,6 @@ void Method_Solver<Solver::VP>::Iteration ()
     }
     for (int i = 0; i < noi; ++i)
     {
-        auto& velocity           = velocities[i];
-        auto& force              = forces[i];
-        auto& configuration      = *(configurations[i]);
-        auto& configuration_temp = *(configurations_temp[i]);
-
         auto f    = forces[i].data();
         auto v    = velocities[i].data();
         auto conf = (configurations[i])->data();
@@ -93,18 +83,18 @@ void Method_Solver<Solver::VP>::Iteration ()
 
         // Calculate the projected velocity
         if (projection_full <= 0)
+            Backend::par::assign(velocities[i], [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
+        else
         {
-            Vectormath::fill(velocity, { 0,0,0 });
-        } else {
-            Backend::par::apply(force.size(), [f,v,ratio] SPIRIT_LAMBDA (int idx) {
+            Backend::par::apply(nos, [f,v,ratio] SPIRIT_LAMBDA (int idx) {
                 v[idx] = f[idx] * ratio;
             });
         }
 
-        Backend::par::apply( force.size(), [conf, conf_temp, dt, m_temp, v, f] SPIRIT_LAMBDA (int idx) {
+        Backend::par::apply( nos, [conf, conf_temp, dt, m_temp, v, f] SPIRIT_LAMBDA (int idx) {
             conf_temp[idx] = conf[idx] + dt * v[idx] + 0.5/m_temp * dt * f[idx];
             conf[idx] = conf_temp[idx].normalized();
-        }); 
+        });
     }
 };
 
diff --git a/core/include/engine/Solver_VP_OSO.hpp b/core/include/engine/Solver_VP_OSO.hpp
index de8034b2c..f500b763f 100644
--- a/core/include/engine/Solver_VP_OSO.hpp
+++ b/core/include/engine/Solver_VP_OSO.hpp
@@ -21,11 +21,11 @@ void Method_Solver<Solver::VP_OSO>::Initialize ()
 
 /*
     Template instantiation of the Simulation class for use with the VP Solver.
-		The velocity projection method is often efficient for direct minimization,
-		but deals poorly with quickly varying fields or stochastic noise.
-	Paper: P. F. Bessarab et al., Method for finding mechanism and activation energy
-		   of magnetic transitions, applied to skyrmion and antivortex annihilation,
-		   Comp. Phys. Comm. 196, 335 (2015).
+        The velocity projection method is often efficient for direct minimization,
+        but deals poorly with quickly varying fields or stochastic noise.
+    Paper: P. F. Bessarab et al., Method for finding mechanism and activation energy
+            of magnetic transitions, applied to skyrmion and antivortex annihilation,
+            Comp. Phys. Comm. 196, 335 (2015).
 
     Instead of the cartesian update scheme with re-normalization, this implementation uses the orthogonal spin optimization scheme,
     described by A. Ivanov in https://arxiv.org/abs/1904.02669.
@@ -97,9 +97,9 @@ void Method_Solver<Solver::VP_OSO>::Iteration ()
 
         // Calculate the projected velocity
         if (projection_full <= 0)
+            Backend::par::assign(velocities[img], [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
+        else
         {
-            Vectormath::fill(velocities[img], { 0,0,0 });
-        } else {
             Backend::par::apply(nos, [g,v,ratio] SPIRIT_LAMBDA (int idx) {
                 v[idx] = g[idx] * ratio;
             });
@@ -107,7 +107,7 @@ void Method_Solver<Solver::VP_OSO>::Iteration ()
 
         Backend::par::apply( nos, [sd, dt, m_temp, v, g] SPIRIT_LAMBDA (int idx) {
             sd[idx] = dt * v[idx] + 0.5/m_temp * dt * g[idx];
-        }); 
+        });
     }
     Solver_Kernels::oso_rotate( this->configurations, this->searchdir);
 }
diff --git a/core/include/engine/Vectormath.hpp b/core/include/engine/Vectormath.hpp
index 077f02bf9..2eb94d476 100644
--- a/core/include/engine/Vectormath.hpp
+++ b/core/include/engine/Vectormath.hpp
@@ -550,14 +550,6 @@ namespace Engine
         /////////////////////////////////////////////////////////////////
         //////// Vectormath-like operations
 
-        // sets sf := s
-        // sf is a scalarfield
-        // s is a scalar
-        void fill(scalarfield & sf, scalar s);
-
-        // TODO: Add the test
-        void fill(scalarfield & sf, scalar s, const intfield & mask);
-
         // Scale a scalarfield by a given value
         void scale(scalarfield & sf, scalar s);
 
@@ -573,14 +565,6 @@ namespace Engine
         // Cut off all values to remain in a certain range
         void set_range(scalarfield & sf, scalar sf_min, scalar sf_max);
 
-        // sets vf := v
-        // vf is a vectorfield
-        // v is a vector
-        void fill(vectorfield & vf, const Vector3 & v);
-        void fill(vectorfield & vf, const Vector3 & v, const intfield & mask);
-
-        // Normalize the vectors of a vectorfield
-        void normalize_vectors(vectorfield & vf);
 
         // Get the norm of a vectorfield
         void norm( const vectorfield & vf, scalarfield & norm );
diff --git a/core/src/engine/Hamiltonian_Gaussian.cpp b/core/src/engine/Hamiltonian_Gaussian.cpp
index 55a22129d..616ca7bda 100644
--- a/core/src/engine/Hamiltonian_Gaussian.cpp
+++ b/core/src/engine/Hamiltonian_Gaussian.cpp
@@ -1,5 +1,6 @@
 #include <engine/Hamiltonian_Gaussian.hpp>
 #include <engine/Vectormath.hpp>
+#include <engine/Backend_par.hpp>
 #include <utility/Exception.hpp>
 
 using namespace Data;
@@ -82,7 +83,8 @@ namespace Engine
         if (this->energy_contributions_per_spin[0].second.size() != nos) this->energy_contributions_per_spin = { { "Gaussian", scalarfield(nos,0) } };
 
         // Set to zero
-        for (auto& pair : energy_contributions_per_spin) Vectormath::fill(pair.second, 0);
+        for (auto& pair : energy_contributions_per_spin)
+            Backend::par::assign(pair.second, [] SPIRIT_LAMBDA () {return 0;});
 
         for (int i = 0; i < this->n_gaussians; ++i)
         {
diff --git a/core/src/engine/Hamiltonian_Heisenberg.cpp b/core/src/engine/Hamiltonian_Heisenberg.cpp
index ce0969bd0..981ccd7c1 100644
--- a/core/src/engine/Hamiltonian_Heisenberg.cpp
+++ b/core/src/engine/Hamiltonian_Heisenberg.cpp
@@ -227,9 +227,11 @@ namespace Engine
         for( auto& contrib : contributions )
         {
             // Allocate if not already allocated
-            if (contrib.second.size() != nos) contrib.second = scalarfield(nos, 0);
+            if (contrib.second.size() != nos)
+                contrib.second = scalarfield(nos, 0);
             // Otherwise set to zero
-            else Vectormath::fill(contrib.second, 0);
+            else
+                Backend::par::assign(contrib.second, [] SPIRIT_LAMBDA () {return 0;});
         }
 
         // External field
@@ -339,7 +341,7 @@ namespace Engine
     {
         vectorfield gradients_temp;
         gradients_temp.resize(geometry->nos);
-        Vectormath::fill(gradients_temp, {0,0,0});
+        Backend::par::assign(gradients_temp, [] SPIRIT_LAMBDA () -> Vector3 {return {0,0,0};});
         this->Gradient_DDI_Direct(spins, gradients_temp);
 
         #pragma omp parallel for
@@ -388,13 +390,13 @@ namespace Engine
         scalar Energy_DDI = 0;
         vectorfield gradients_temp;
         gradients_temp.resize(geometry->nos);
-        Vectormath::fill(gradients_temp, {0,0,0});
+        Backend::par::assign(gradients_temp, [] SPIRIT_LAMBDA () -> Vector3 {return {0,0,0};});
         this->Gradient_DDI_FFT(spins, gradients_temp);
 
         // === DEBUG: begin gradient comparison ===
             // vectorfield gradients_temp_dir;
             // gradients_temp_dir.resize(this->geometry->nos);
-            // Vectormath::fill(gradients_temp_dir, {0,0,0});
+            // Backend::par::assign(gradients_temp_dir, [] SPIRIT_LAMBDA (int idx) {return {0,0,0};});
             // Gradient_DDI_Direct(spins, gradients_temp_dir);
 
             // //get deviation
@@ -568,7 +570,7 @@ namespace Engine
     void Hamiltonian_Heisenberg::Gradient(const vectorfield & spins, vectorfield & gradient)
     {
         // Set to zero
-        Vectormath::fill(gradient, {0,0,0});
+        Backend::par::assign(gradient, [] SPIRIT_LAMBDA () -> Vector3 {return {0,0,0};});
 
         // External field
         if(idx_zeeman >= 0)
@@ -597,9 +599,8 @@ namespace Engine
 
     void Hamiltonian_Heisenberg::Gradient_and_Energy(const vectorfield & spins, vectorfield & gradient, scalar & energy)
     {
-
         // Set to zero
-        Vectormath::fill(gradient, {0,0,0});
+        Backend::par::assign(gradient, [] SPIRIT_LAMBDA () { return Vector3{0,0,0}; });
         energy = 0;
 
         auto N = spins.size();
@@ -623,14 +624,18 @@ namespace Engine
         if(idx_ddi >= 0)
             this->Gradient_DDI(spins, gradient);
 
-        energy += Backend::par::reduce( N, [s,g] SPIRIT_LAMBDA ( int idx ) { return 0.5 * g[idx].dot(s[idx]) ;} );
+        energy += Backend::par::reduce( spins.size(), [] SPIRIT_LAMBDA ( const Vector3 & s, const Vector3 & g ) {
+                return 0.5 * g.dot(s);
+            }, spins, gradient );
 
         // External field
         if(idx_zeeman >= 0)
         {
             Vector3 ext_field = external_field_normal * external_field_magnitude;
             this->Gradient_Zeeman(gradient);
-            energy += Backend::par::reduce( N, [s, ext_field, mu_s] SPIRIT_LAMBDA ( int idx ) { return -mu_s[idx] * ext_field.dot(s[idx]) ;} );
+            energy += Backend::par::reduce( spins.size(), [ext_field] SPIRIT_LAMBDA ( const Vector3 & s, const scalar & mu_s ) {
+                    return -mu_s * ext_field.dot(s);
+                }, spins, geometry->mu_s );
         }
 
         // Quadruplets
diff --git a/core/src/engine/Hamiltonian_Heisenberg.cu b/core/src/engine/Hamiltonian_Heisenberg.cu
index b35a92cff..7efbcef83 100644
--- a/core/src/engine/Hamiltonian_Heisenberg.cu
+++ b/core/src/engine/Hamiltonian_Heisenberg.cu
@@ -225,7 +225,11 @@ namespace Engine
             // Allocate if not already allocated
             if (pair.second.size() != nos) pair.second = scalarfield(nos, 0);
             // Otherwise set to zero
-            else for (auto& pair : contributions) Vectormath::fill(pair.second, 0);
+            else
+            {
+                for (auto& pair : contributions)
+                    Backend::par::assign(pair.second, [] SPIRIT_LAMBDA () {return 0;});
+            }
         }
 
         // External field
@@ -366,13 +370,11 @@ namespace Engine
 
     void Hamiltonian_Heisenberg::E_DDI_Direct(const vectorfield & spins, scalarfield & Energy)
     {
-        vectorfield gradients_temp;
-        gradients_temp.resize(geometry->nos);
-        Vectormath::fill(gradients_temp, {0,0,0});
+        vectorfield gradients_temp(spins.size(), {0, 0, 0});
         this->Gradient_DDI_Direct(spins, gradients_temp);
 
         #pragma omp parallel for
-        for (int ispin = 0; ispin < geometry->nos; ispin++)
+        for (int ispin = 0; ispin < spins.size(); ispin++)
         {
             Energy[ispin] += 0.5 * spins[ispin].dot(gradients_temp[ispin]);
         }
@@ -420,17 +422,17 @@ namespace Engine
     }
     void Hamiltonian_Heisenberg::E_DDI_FFT(const vectorfield & spins, scalarfield & Energy)
     {
+        auto N = spins.size();
         //todo maybe the gradient should be cached somehow, it is quite inefficient to calculate it
         //again just for the energy
-        vectorfield gradients_temp(geometry->nos);
-        Vectormath::fill(gradients_temp, {0,0,0});
-        this->Gradient_DDI(spins, gradients_temp);
-        CU_E_DDI_FFT<<<(geometry->nos + 1023)/1024, 1024>>>(Energy.data(), spins.data(), gradients_temp.data(), geometry->nos, geometry->n_cell_atoms, geometry->mu_s.data());
+        vectorfield gradients_temp(N, {0, 0, 0});
+        this->Gradient_DDI_FFT(spins, gradients_temp);
+        CU_E_DDI_FFT<<<(N + 1023)/1024, 1024>>>(Energy.data(), spins.data(), gradients_temp.data(), N, geometry->n_cell_atoms, geometry->mu_s.data());
 
         // === DEBUG: begin gradient comparison ===
             // vectorfield gradients_temp_dir;
             // gradients_temp_dir.resize(this->geometry->nos);
-            // Vectormath::fill(gradients_temp_dir, {0,0,0});
+            // Backend::par::assign(gradients_temp_dir, [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
             // Gradient_DDI_Direct(spins, gradients_temp_dir);
 
             // //get deviation
@@ -622,7 +624,7 @@ namespace Engine
     void Hamiltonian_Heisenberg::Gradient(const vectorfield & spins, vectorfield & gradient)
     {
         // Set to zero
-        Vectormath::fill(gradient, {0,0,0});
+        Backend::par::assign(gradient, [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
 
         // External field
         if(idx_zeeman >= 0)
@@ -652,7 +654,7 @@ namespace Engine
     void Hamiltonian_Heisenberg::Gradient_and_Energy(const vectorfield & spins, vectorfield & gradient, scalar & energy)
     {
         // Set to zero
-        Vectormath::fill(gradient, {0,0,0});
+        Backend::par::assign(gradient, [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
         energy = 0;
 
         auto N = spins.size();
diff --git a/core/src/engine/Method_GNEB.cpp b/core/src/engine/Method_GNEB.cpp
index 37c9d1bce..94d9b7819 100644
--- a/core/src/engine/Method_GNEB.cpp
+++ b/core/src/engine/Method_GNEB.cpp
@@ -97,10 +97,10 @@ namespace Engine
             //      while the gradient force is manipulated (e.g. projected)
             auto eff_field = this->chain->images[img]->effective_field.data();
             auto f_grad    = F_gradient[img].data();
-            Backend::par::apply( image.size(), 
+            Backend::par::apply( image.size(),
                 [eff_field, f_grad] SPIRIT_LAMBDA (int idx)
                 {
-                    eff_field[idx] *= -1; 
+                    eff_field[idx] *= -1;
                     f_grad[idx] = eff_field[idx];
                 }
             );
@@ -231,7 +231,7 @@ namespace Engine
             }
             else
             {
-                Vectormath::fill(F_total[img], { 0,0,0 });
+                Backend::par::assign(F_total[img], [] SPIRIT_LAMBDA () -> Vector3 {return { 0,0,0 };});
             }
             // Apply pinning mask
             #ifdef SPIRIT_ENABLE_PINNING
@@ -373,20 +373,22 @@ namespace Engine
         // TODO: Find a better way to do this without so much code duplication. Probably requires extension of the Hamiltonian in a way that allows to request information about the different contributions.
         for(int img=0; img<noi; img++)
         {
+            auto tf = temp_field.data();
+            auto te = temp_energy.data();
+            Backend::par::apply(nos, [tf, te] SPIRIT_LAMBDA (int idx) {
+                tf[idx] = {0,0,0};
+                te[idx] = 0;
+            });
             auto& image = *this->configurations[img];
             if(ham.Idx_Exchange() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_Exchange(image, temp_energy);
-                temp_energies[ham.Idx_Exchange()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_Exchange(image, temp_field);
+                temp_energies[ham.Idx_Exchange()][img] = Vectormath::sum(temp_energy);
                 temp_dE_dRx[ham.Idx_Exchange()][img] = -Vectormath::dot(temp_field, this->tangents[img]);
             }
             if(ham.Idx_Zeeman() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_Zeeman(image, temp_energy);
                 temp_energies[ham.Idx_Zeeman()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_Zeeman(temp_field);
@@ -394,8 +396,6 @@ namespace Engine
             }
             if(ham.Idx_Anisotropy() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_Anisotropy(image, temp_energy);
                 temp_energies[ham.Idx_Anisotropy()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_Anisotropy(image, temp_field);
@@ -403,8 +403,6 @@ namespace Engine
             }
             if(ham.Idx_DMI() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_DMI(image, temp_energy);
                 temp_energies[ham.Idx_DMI()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_DMI(image, temp_field);
@@ -412,8 +410,6 @@ namespace Engine
             }
             if(ham.Idx_DDI() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_DDI(image, temp_energy);
                 temp_energies[ham.Idx_DDI()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_DDI(image, temp_field);
@@ -421,8 +417,6 @@ namespace Engine
             }
             if(ham.Idx_Quadruplet() >= 0)
             {
-                Vectormath::fill(temp_field, Vector3::Zero());
-                Vectormath::fill(temp_energy, 0);
                 ham.E_Quadruplet(image, temp_energy);
                 temp_energies[ham.Idx_Quadruplet()][img] = Vectormath::sum(temp_energy);
                 ham.Gradient_Quadruplet(image, temp_field);
diff --git a/core/src/engine/Method_MMF.cpp b/core/src/engine/Method_MMF.cpp
index bc91b435d..623374050 100644
--- a/core/src/engine/Method_MMF.cpp
+++ b/core/src/engine/Method_MMF.cpp
@@ -368,7 +368,7 @@ namespace Engine
             // Spectra was not successful in calculating an eigenvector
             Log(Log_Level::Error, Log_Sender::MMF, "Failed to calculate eigenvectors of the Hessian!");
             Log(Log_Level::Info,  Log_Sender::MMF, "Zeroing the MMF force...");
-            Vectormath::fill(force, Vector3{0,0,0});
+            Backend::par::assign(force, [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 0};});
         }
     }
 
diff --git a/core/src/engine/Solver_Kernels.cpp b/core/src/engine/Solver_Kernels.cpp
index a2ff7296c..131031c22 100644
--- a/core/src/engine/Solver_Kernels.cpp
+++ b/core/src/engine/Solver_Kernels.cpp
@@ -61,11 +61,11 @@ namespace Solver_Kernels
 
             auto s  = configurations[img]->data();
             auto sd = searchdir[img].data();
-            
-            Backend::par::apply( nos, [s, sd] SPIRIT_LAMBDA (int idx) 
+
+            Backend::par::apply( nos, [s, sd] SPIRIT_LAMBDA (int idx)
                 {
                     scalar theta = (sd[idx]).norm();
-                    scalar q = cos(theta), w = 1-q, 
+                    scalar q = cos(theta), w = 1-q,
                            x = -sd[idx][0]/theta, y = -sd[idx][1]/theta, z = -sd[idx][2]/theta,
                            s1 = -y*z*w, s2 = x*z*w, s3 = -x*y*w,
                            p1 = x*sin(theta), p2 = y*sin(theta), p3 = z*sin(theta);
@@ -89,7 +89,7 @@ namespace Solver_Kernels
     {
         int nos = searchdir.size();
         scalar theta_rms = 0;
-        theta_rms = sqrt( Backend::par::reduce(searchdir, [] SPIRIT_LAMBDA (const Vector3 & v){ return v.squaredNorm(); }) / nos );
+        theta_rms = sqrt( Backend::par::reduce( nos, [] SPIRIT_LAMBDA (const Vector3 & v){ return v.squaredNorm(); }, searchdir) / nos );
         scalar scaling = (theta_rms > maxmove) ? maxmove/theta_rms : 1.0;
         return scaling;
     }
@@ -167,7 +167,7 @@ namespace Solver_Kernels
         {
             rho[n] = 1/rho[n];
         }
-        
+
         for(int img=0; img<noi; img++)
         {
             auto s = (*configurations[img]).data();
diff --git a/core/src/engine/Vectormath.cpp b/core/src/engine/Vectormath.cpp
index ec8172a66..92d38f913 100644
--- a/core/src/engine/Vectormath.cpp
+++ b/core/src/engine/Vectormath.cpp
@@ -62,19 +62,6 @@ namespace Vectormath
 
     /////////////////////////////////////////////////////////////////
 
-    void fill(scalarfield & sf, scalar s)
-    {
-        #pragma omp parallel for
-        for (unsigned int i = 0; i<sf.size(); ++i)
-            sf[i] = s;
-    }
-    void fill(scalarfield & sf, scalar s, const intfield & mask)
-    {
-        #pragma omp parallel for
-        for (unsigned int i=0; i<sf.size(); ++i)
-            sf[i] = mask[i]*s;
-    }
-
     void scale(scalarfield & sf, scalar s)
     {
         #pragma omp parallel for
@@ -111,26 +98,6 @@ namespace Vectormath
             sf[i] = std::min( std::max( sf_min, sf[i] ), sf_max );
     }
 
-    void fill(vectorfield & vf, const Vector3 & v)
-    {
-        #pragma omp parallel for
-        for (unsigned int i=0; i<vf.size(); ++i)
-            vf[i] = v;
-    }
-    void fill(vectorfield & vf, const Vector3 & v, const intfield & mask)
-    {
-        #pragma omp parallel for
-        for (unsigned int i=0; i<vf.size(); ++i)
-            vf[i] = mask[i]*v;
-    }
-
-    void normalize_vectors(vectorfield & vf)
-    {
-        #pragma omp parallel for
-        for (unsigned int i=0; i<vf.size(); ++i)
-            vf[i].normalize();
-    }
-
     void norm( const vectorfield & vf, scalarfield & norm )
     {
         for (unsigned int i=0; i<vf.size(); ++i)
diff --git a/core/src/engine/Vectormath.cu b/core/src/engine/Vectormath.cu
index 3aff53af1..1c7acbb37 100644
--- a/core/src/engine/Vectormath.cu
+++ b/core/src/engine/Vectormath.cu
@@ -113,38 +113,9 @@ namespace Engine
             }
         }
 
-       
-        /////////////////////////////////////////////////////////////////
 
+        /////////////////////////////////////////////////////////////////
 
-        __global__ void cu_fill(scalar *sf, scalar s, size_t N)
-        {
-            int idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if(idx < N)
-            {
-                sf[idx] = s;
-            }
-        }
-        void fill(scalarfield & sf, scalar s)
-        {
-            int n = sf.size();
-            cu_fill<<<(n+1023)/1024, 1024>>>(sf.data(), s, n);
-            CU_CHECK_AND_SYNC();
-        }
-        __global__ void cu_fill_mask(scalar *sf, scalar s, const int * mask, size_t N)
-        {
-            int idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if(idx < N)
-            {
-                sf[idx] = mask[idx]*s;
-            }
-        }
-        void fill(scalarfield & sf, scalar s, const intfield & mask)
-        {
-            int n = sf.size();
-            cu_fill_mask<<<(n+1023)/1024, 1024>>>(sf.data(), s, mask.data(), n);
-            CU_CHECK_AND_SYNC();
-        }
 
         __global__ void cu_scale(scalar *sf, scalar s, size_t N)
         {
@@ -179,7 +150,7 @@ namespace Engine
         scalar sum(const scalarfield & sf)
         {
             static scalarfield ret(1, 0);
-            Vectormath::fill(ret, 0);
+            Backend::par::assign(ret, [] SPIRIT_LAMBDA (int idx) {return 0;});
             // Determine temporary storage size and allocate
             void * d_temp_storage = NULL;
             size_t temp_storage_bytes = 0;
@@ -219,50 +190,6 @@ namespace Engine
                 sf[i] = std::min( std::max( sf_min, sf[i] ), sf_max );
         }
 
-        __global__ void cu_fill(Vector3 *vf1, Vector3 v2, size_t N)
-        {
-            int idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if(idx < N)
-            {
-                vf1[idx] = v2;
-            }
-        }
-        void fill(vectorfield & vf, const Vector3 & v)
-        {
-            int n = vf.size();
-            cu_fill<<<(n+1023)/1024, 1024>>>(vf.data(), v, n);
-            CU_CHECK_AND_SYNC();
-        }
-        __global__ void cu_fill_mask(Vector3 *vf1, Vector3 v2, const int * mask, size_t N)
-        {
-            int idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if(idx < N)
-            {
-                vf1[idx] = v2;
-            }
-        }
-        void fill(vectorfield & vf, const Vector3 & v, const intfield & mask)
-        {
-            int n = vf.size();
-            cu_fill_mask<<<(n+1023)/1024, 1024>>>(vf.data(), v, mask.data(), n);
-            CU_CHECK_AND_SYNC();
-        }
-
-        __global__ void cu_normalize_vectors(Vector3 *vf, size_t N)
-        {
-            int idx = blockIdx.x * blockDim.x + threadIdx.x;
-            if(idx < N)
-            {
-                vf[idx].normalize();
-            }
-        }
-        void normalize_vectors(vectorfield & vf)
-        {
-            int n = vf.size();
-            cu_normalize_vectors<<<(n+1023)/1024, 1024>>>(vf.data(), n);
-            CU_CHECK_AND_SYNC();
-        }
-
         __global__ void cu_norm(const Vector3 * vf, scalar * norm, size_t N)
         {
             int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -387,7 +314,7 @@ namespace Engine
         Vector3 sum(const vectorfield & vf)
         {
             static vectorfield ret(1, {0,0,0});
-            Vectormath::fill(ret, {0,0,0});
+            Backend::par::assign(ret, [] SPIRIT_LAMBDA (int idx) -> Vector3 {return {0, 0, 0};});
             // Declare, allocate, and initialize device-accessible pointers for input and output
             CustomAdd    add_op;
             static const Vector3 init{0,0,0};
@@ -423,11 +350,11 @@ namespace Engine
         {
             int n = vf1.size();
             static scalarfield sf(n, 0);
-            
+
             if(sf.size() != vf1.size())
                 sf.resize(vf1.size());
 
-            Vectormath::fill(sf, 0);
+            Backend::par::assign(sf, [] SPIRIT_LAMBDA (int idx) {return 0;});
             scalar ret;
 
             // Dot product
diff --git a/core/src/io/Dataparser.cpp b/core/src/io/Dataparser.cpp
index 3ae70baaf..850be0664 100644
--- a/core/src/io/Dataparser.cpp
+++ b/core/src/io/Dataparser.cpp
@@ -46,10 +46,9 @@ namespace IO
                 geometry.atom_types[i] = -1;
             #endif
             }
-        }
 
-        // normalize read in spins
-        Vectormath::normalize_vectors( spins );
+            spins[i].normalize();
+        }
     }
 
 
diff --git a/core/src/utility/Configurations.cpp b/core/src/utility/Configurations.cpp
index 24533e0a9..4fbcbae53 100644
--- a/core/src/utility/Configurations.cpp
+++ b/core/src/utility/Configurations.cpp
@@ -1,6 +1,7 @@
 #include <utility/Configurations.hpp>
 #include <data/Spin_System.hpp>
 #include <engine/Vectormath.hpp>
+#include <engine/Backend_par.hpp>
 #include <utility/Constants.hpp>
 #include <utility/Logging.hpp>
 #include <utility/Exception.hpp>
@@ -137,9 +138,15 @@ namespace Utility
             auto distribution = std::uniform_real_distribution<scalar>(-1, 1);
 
             Engine::Vectormath::get_random_vectorfield_unitsphere(*prng, xi);
-            Engine::Vectormath::scale(xi, epsilon);
-            Engine::Vectormath::add_c_a(1, xi, *s.spins, mask);
-            Engine::Vectormath::normalize_vectors(*s.spins);
+            auto xip = xi.data();
+            auto maskp = mask.data();
+            auto spinsp = spins.data();
+            Engine::Backend::par::apply(spins.size(), [epsilon, spinsp, maskp, xip] SPIRIT_LAMBDA (int idx) {
+                xip[idx] *= epsilon;
+                if( maskp[idx] )
+                    spinsp[idx] += xip[idx];
+                spinsp[idx].normalize();
+            });
         }
 
         void Hopfion(Data::Spin_System & s, Vector3 pos, scalar r, int order, filterfunction filter)
diff --git a/core/test/test_manifoldmath.cpp b/core/test/test_manifoldmath.cpp
index e7cef2de7..ac61f3156 100644
--- a/core/test/test_manifoldmath.cpp
+++ b/core/test/test_manifoldmath.cpp
@@ -2,6 +2,7 @@
 #include <engine/Vectormath_Defines.hpp>
 #include <engine/Vectormath.hpp>
 #include <engine/Manifoldmath.hpp>
+#include <engine/Backend_par.hpp>
 
 
 TEST_CASE( "Manifold operations", "[manifold]" )
@@ -53,56 +54,68 @@ TEST_CASE( "Manifold operations", "[manifold]" )
 		Engine::Manifoldmath::invert_orthogonal(v1,v2);
 		REQUIRE( Engine::Vectormath::dot(v1, v3) == Approx(-proj_prev) );
 	}
-  
+
   SECTION( "Projection: tangetial")
   {
     REQUIRE_FALSE( Engine::Vectormath::dot(v1,v2) == Approx(0) );  // Assert they are not orthogonal
-    Engine::Vectormath::normalize_vectors( v2 );    // Normalize all vector3 of v2 for projection
-    Engine::Manifoldmath::project_tangential(v1,v2); 
+    // Normalize all vector3 of v2 for projection
+    auto v2p = v2.data();
+    Engine::Backend::par::apply(v2.size(), [v2p] SPIRIT_LAMBDA (int idx) {
+        v2p[idx].normalize();
+    });
+    Engine::Manifoldmath::project_tangential(v1,v2);
     for (int i=0; i<N_check; i++)
     {
-      
+
       REQUIRE( v2[i].dot(v1[i]) == Approx(0) );
       REQUIRE( Engine::Vectormath::dot(v1,v2) == Approx(0) );
     }
   }
-  
+
   SECTION( "Distance on Greatcircle" )
   {
     // orthogonal vectors ( angle pi/2 between them )
-    
+
     vectorfield vf3( N, Vector3{ 0.0, 0.0, 1.0 } );
     vectorfield vf4( N, Vector3{ 0.0, 1.0, 0.0 } );
-    
+
     REQUIRE( Engine::Vectormath::dot( vf3, vf4) == Approx(0) );
-    Engine::Vectormath::normalize_vectors( vf3 );             // normalize components of vf3
-    Engine::Vectormath::normalize_vectors( vf4 );             // normalize components of vf4
-    
+    auto vf3p = vf3.data();
+    auto vf4p = vf4.data();
+    Engine::Backend::par::apply(vf3.size(), [vf3p, vf4p] SPIRIT_LAMBDA (int idx) {
+        vf3p[idx].normalize();
+        vf4p[idx].normalize();
+    });
+
     scalar dist = Engine::Vectormath::angle( vf3[0], vf4[0] );
     REQUIRE( dist == Approx( std::acos(-1)*0.5 ) );   // distance should be pi/2
-    
+
     // antiparallel vectors ( angle pi between them )
-    
+
     vectorfield vf5( N, Vector3{ 0.0,  1.0, 0.0 } );
     vectorfield vf6( N, Vector3{ 0.0, -1.0, 0.0 } );
-    
+
     REQUIRE( Engine::Vectormath::dot( vf5, vf6 ) == Approx(-1*N) );
-    Engine::Vectormath::normalize_vectors( vf5 );             // normalize components of vf5
-    Engine::Vectormath::normalize_vectors( vf6 );             // normalize components of vf6
-    
+    auto vf5p = vf5.data();
+    auto vf6p = vf6.data();
+    Engine::Backend::par::apply(vf5.size(), [vf5p, vf6p] SPIRIT_LAMBDA (int idx) {
+        vf5p[idx].normalize();
+        vf6p[idx].normalize();
+    });
+
     scalar dist2 = Engine::Vectormath::angle( vf5[0], vf6[0] );
     REQUIRE( dist2 == Approx( std::acos(-1) ) );      // distance should be pi
   }
-  
+
   SECTION( "Distance on Geodesic" )
   {
     vectorfield v3( N, Vector3{ 0.0, 0.0, 1.0 } );
     vectorfield v4( N, Vector3{ 0.0, 1.0, 0.0 } );
-    
+
     REQUIRE( Engine::Vectormath::dot( v3, v4 ) == Approx( 0 ) );
     Engine::Manifoldmath::normalize( v3 );
     Engine::Manifoldmath::normalize( v4 );
-    
+
     scalar dist = Engine::Manifoldmath::dist_geodesic( v3, v4 );
     scalar dist_gc = Engine::Vectormath::angle( v3[0], v4[0] );
     REQUIRE( dist == Approx( sqrt( N * dist_gc * dist_gc ) ) );
diff --git a/core/test/test_vectormath.cpp b/core/test/test_vectormath.cpp
index 1faca3f87..43bb10d73 100644
--- a/core/test/test_vectormath.cpp
+++ b/core/test/test_vectormath.cpp
@@ -1,19 +1,20 @@
 #include <catch.hpp>
 #include <engine/Vectormath_Defines.hpp>
 #include <engine/Vectormath.hpp>
+#include <engine/Backend_par.hpp>
 
 
 TEST_CASE( "Vectormath operations", "[vectormath]" )
 {
     int N = 10000;
-	int N_check = std::min(100, N);
+    int N_check = std::min(100, N);
     scalarfield sf(N, 1);
     vectorfield vf1(N, Vector3{ 1.0, 1.0, 1.0 });
     vectorfield vf2(N, Vector3{ -1.0, 1.0, 1.0 });
 
     SECTION("Magnetization")
     {
-        Engine::Vectormath::fill( vf1, { 0, 0, 1 } );
+        Engine::Backend::par::assign(vf1, [] SPIRIT_LAMBDA () -> Vector3 {return {0, 0, 1};});
         auto m = Engine::Vectormath::Magnetization( vf1 );
         REQUIRE( m[0] == Approx( 0 ) );
         REQUIRE( m[1] == Approx( 0 ) );
@@ -50,8 +51,8 @@ TEST_CASE( "Vectormath operations", "[vectormath]" )
     {
         scalar stest = 333;
         Vector3 vtest{ 0, stest, 0 };
-        Engine::Vectormath::fill(sf, stest);
-        Engine::Vectormath::fill(vf1, vtest);
+        Engine::Backend::par::assign(sf, [stest] SPIRIT_LAMBDA () -> scalar {return stest;});
+        Engine::Backend::par::assign(vf1, [vtest] SPIRIT_LAMBDA () -> Vector3 {return vtest;});
         for (int i = 0; i < N_check; ++i)
         {
             REQUIRE(sf[i] == stest);
@@ -114,8 +115,12 @@ TEST_CASE( "Vectormath operations", "[vectormath]" )
 
         Vector3 vtest1 = vf1[0].normalized();
         Vector3 vtest2 = vf2[0].normalized();
-        Engine::Vectormath::normalize_vectors(vf1);
-        Engine::Vectormath::normalize_vectors(vf2);
+        auto vf1p = vf1.data();
+        auto vf2p = vf2.data();
+        Engine::Backend::par::apply(vf1.size(), [vf1p, vf2p] SPIRIT_LAMBDA (int idx) {
+            vf1p[idx].normalize();
+            vf2p[idx].normalize();
+        });
         for (int i = 0; i < N_check; ++i)
         {
             REQUIRE(vf1[i] == vtest1);
@@ -130,8 +135,12 @@ TEST_CASE( "Vectormath operations", "[vectormath]" )
         scalar vmc1 = std::max( vtest1[0], std::max( vtest1[1], vtest1[2] ) );
         scalar vmc2 = std::max( vtest2[0], std::max( vtest2[1], vtest2[2] ) );
 
-        Engine::Vectormath::normalize_vectors(vf1);
-        Engine::Vectormath::normalize_vectors(vf2);
+        auto vf1p = vf1.data();
+        auto vf2p = vf2.data();
+        Engine::Backend::par::apply(vf1.size(), [vf1p, vf2p] SPIRIT_LAMBDA (int idx) {
+            vf1p[idx].normalize();
+            vf2p[idx].normalize();
+        });
         scalar vfmc1 = Engine::Vectormath::max_abs_component( vf1 );
         scalar vfmc2 = Engine::Vectormath::max_abs_component( vf2 );