From 2ab0bf576dede9215c3c37feb35a66caef93ce95 Mon Sep 17 00:00:00 2001
From: ShuliangLu <graphicslsl@gmail.com>
Date: Wed, 1 Mar 2023 10:32:11 +0800
Subject: [PATCH] heterogeneous material and supporter bones

---
 projects/CuLagrange/CMakeLists.txt            |   10 +-
 .../CuLagrange/fem/FastQuasiStaticStepping.cu |  719 -------
 .../CuLagrange/fem/FleshDynamicStepping.cu    | 1321 ++++++------
 projects/CuLagrange/fem/FleshQuasiStepping.cu |  588 ------
 .../CuLagrange/fem/QuasiStaticStepping.cu     |  349 ----
 .../fem/collision_energy/collision_utils.hpp  |  696 ++++---
 .../collision_energy/edge_edge_collision.hpp  |   13 +-
 .../edge_edge_sqrt_collision.hpp              |  227 --
 .../collision_energy/evaluate_collision.hpp   |  891 ++++++--
 .../vertex_face_sqrt_collision.hpp            |  106 +-
 .../geometry/BaryCentricInterpolator.cu       |  224 +-
 .../geometry/BiharmonicBoundedWeight.cu       |    2 +-
 projects/CuLagrange/geometry/CollisionVis.cu  | 1824 ++++++++++++-----
 .../CuLagrange/geometry/DeformationField.cu   |    8 +-
 .../CuLagrange/geometry/SolveLaplacian.cu     |    2 +-
 projects/CuLagrange/geometry/Topology.cu      |  345 +++-
 projects/CuLagrange/geometry/VectorField.cu   |   78 +-
 .../geometry/file_parser/read_vtk_mesh.hpp    |    7 +
 .../geometry/kernel/bary_centric_weights.hpp  |   94 +-
 .../kernel/calculate_bisector_normal.hpp      |    2 +-
 .../geometry/kernel/calculate_edge_normal.hpp |   34 +-
 .../kernel/calculate_facet_center.hpp         |    6 +-
 .../kernel/calculate_facet_normal.hpp         |   34 +-
 .../kernel/compute_characteristic_length.hpp  |    4 +-
 .../CuLagrange/geometry/kernel/geo_math.hpp   |  386 ++++
 .../CuLagrange/geometry/kernel/laplacian.hpp  |  351 ++++
 .../geometry/kernel/tiled_vector_ops.hpp      |  223 +-
 .../CuLagrange/geometry/kernel/topology.hpp   |  393 +++-
 .../geometry/linear_system/mfcg.hpp           |   21 +-
 29 files changed, 5253 insertions(+), 3705 deletions(-)
 delete mode 100644 projects/CuLagrange/fem/FastQuasiStaticStepping.cu
 delete mode 100644 projects/CuLagrange/fem/FleshQuasiStepping.cu
 delete mode 100644 projects/CuLagrange/fem/QuasiStaticStepping.cu
 delete mode 100644 projects/CuLagrange/fem/collision_energy/edge_edge_sqrt_collision.hpp
 create mode 100644 projects/CuLagrange/geometry/kernel/laplacian.hpp
diff --git a/projects/CuLagrange/CMakeLists.txt b/projects/CuLagrange/CMakeLists.txt
index a9d5fdaaf1..25b4e4b5be 100644
--- a/projects/CuLagrange/CMakeLists.txt
+++ b/projects/CuLagrange/CMakeLists.txt
@@ -70,17 +70,11 @@ target_sources(zeno PRIVATE
   # fem/Check.cu
   fem/Generation.cpp
 
-  # fem/test.cpp
-  # fem/QuasiStaticStepping.cu
-
-  # fem/FastQuasiStaticStepping.cu
-  fem/FleshQuasiStepping.cu
-
   fem/FleshDynamicStepping.cu # CHECK THIS
   fem/collision_energy/vertex_face_collision.hpp
   fem/collision_energy/vertex_face_sqrt_collision.hpp
-  fem/collision_energy/edge_edge_collision.hpp
-  fem/collision_energy/edge_edge_sqrt_collition.hpp
+  # fem/collision_energy/edge_edge_collision.hpp
+  # fem/collision_energy/edge_edge_sqrt_collition.hpp
   fem/collision_energy/evaluate_collision.hpp
   fem/collision_energy/collision_utils.hpp
 )
diff --git a/projects/CuLagrange/fem/FastQuasiStaticStepping.cu b/projects/CuLagrange/fem/FastQuasiStaticStepping.cu
deleted file mode 100644
index c9f1bb3a8c..0000000000
--- a/projects/CuLagrange/fem/FastQuasiStaticStepping.cu
+++ /dev/null
@@ -1,719 +0,0 @@
-#include "Structures.hpp"
-#include "Utils.hpp"
-#include "zensim/Logger.hpp"
-#include "zensim/cuda/execution/ExecutionPolicy.cuh"
-#include "zensim/geometry/PoissonDisk.hpp"
-#include "zensim/geometry/VdbLevelSet.h"
-#include "zensim/geometry/VdbSampler.h"
-#include "zensim/io/MeshIO.hpp"
-#include "zensim/math/bit/Bits.h"
-#include "zensim/types/Property.h"
-#include <atomic>
-#include <zeno/VDBGrid.h>
-#include <zeno/types/ListObject.h>
-#include <zeno/types/NumericObject.h>
-#include <zeno/types/PrimitiveObject.h>
-#include <zeno/types/StringObject.h>
-
-namespace zeno {
-struct FastQuasiStaticStepping : INode {
-    using T = float;
-    using dtiles_t = zs::TileVector<T,32>;
-    using tiles_t = typename ZenoParticles::particles_t;
-    using vec3 = zs::vec<T, 3>;
-    using mat3 = zs::vec<T, 3, 3>;
-
-    struct FastFEMSystem {
-        template <typename Pol, typename Model>
-        T energy(Pol &pol, const Model &model, const zs::SmallString tag, dtiles_t& vtemp) {
-            using namespace zs;
-            constexpr auto space = execspace_e::cuda;
-            Vector<T> res{verts.get_allocator(), 1};
-            res.setVal(0);
-            //   elastic potential
-            pol(range(eles.size()), [verts = proxy<space>({}, verts),
-                                    eles = proxy<space>({}, eles),
-                                    vtemp = proxy<space>({}, vtemp),
-                                    res = proxy<space>(res), tag, model = model,volf = volf] 
-                                    ZS_LAMBDA (int ei) mutable {
-                auto DmInv = eles.pack(dim_c<3, 3>, "IB", ei);
-                auto inds = eles.pack(dim_c<4>, "inds", ei, int_c);
-                vec3 xs[4] = {vtemp.pack<3>(tag, inds[0]), vtemp.pack<3>(tag, inds[1]),
-                            vtemp.pack<3>(tag, inds[2]), vtemp.pack<3>(tag, inds[3])};
-                mat3 F{};
-                {
-                auto x1x0 = xs[1] - xs[0];
-                auto x2x0 = xs[2] - xs[0];
-                auto x3x0 = xs[3] - xs[0];
-                auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                                x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                F = Ds * DmInv;
-                }
-                auto psi = model.psi(F);
-                auto vole = eles("vol", ei);
-
-                T gpsi = 0;
-                for(int i = 0;i != 4;++i)
-                    gpsi += (-volf.dot(xs[i])/4); 
-
-                atomic_add(exec_cuda, &res[0], (T)(vole * (psi + gpsi)));
-            });
-        // Bone Driven Potential Energy
-            T lambda = model.lam;
-            T mu = model.mu;
-            auto nmEmbedVerts = b_verts.size();
-            if(b_bcws.size() != b_verts.size()){
-                fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-                throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-            }
-            pol(range(nmEmbedVerts), [vtemp = proxy<space>({},vtemp),
-                eles = proxy<space>({},eles),
-                b_verts = proxy<space>({},b_verts),
-                bcws = proxy<space>({},b_bcws),lambda,mu,tag,res = proxy<space>(res),bone_driven_weight = bone_driven_weight]
-                ZS_LAMBDA(int vi) mutable {
-                    auto ei = bcws("inds",vi, int_c);
-                    if(ei < 0)
-                        return;
-                    auto inds = eles.pack(dim_c<4>, "inds",ei, int_c);
-                    auto w = bcws.pack(dim_c<4>, "w",vi);
-
-                    auto tpos = vec3::zeros();
-                    for(size_t i = 0;i != 4;++i)
-                        tpos += w[i] * vtemp.pack<3>(tag,inds[i]);
-                    auto pdiff = tpos - b_verts.pack<3>("x",vi);
-
-                    T stiffness = 2.0066 * mu + 1.0122 * lambda;
-                    T bpsi = (0.5 * bcws("cnorm",vi) * stiffness * bone_driven_weight * eles("vol",ei)) * pdiff.l2NormSqr();
-                    atomic_add(exec_cuda, &res[0], (T)bpsi);
-            });
-
-            return res.getVal();
-        }
-
-        template <typename Model>
-        void gradient(zs::CudaExecutionPolicy& cudaPol,
-                                        const Model& model,
-                                        const zs::SmallString tag, 
-                                        dtiles_t& vtemp,
-                                        dtiles_t& etemp) {
-            using namespace zs;
-            constexpr auto space = execspace_e::cuda;
-            cudaPol(zs::range(eles.size()), [vtemp = proxy<space>({}, vtemp),
-                                        etemp = proxy<space>({}, etemp),
-                                        bcws = proxy<space>({},b_bcws),
-                                        b_verts = proxy<space>({},b_verts),
-                                        verts = proxy<space>({}, verts),
-                                        eles = proxy<space>({}, eles),tag, model, volf = volf] ZS_LAMBDA (int ei) mutable {
-                auto DmInv = eles.pack(dim_c<3, 3>,"IB", ei);
-                auto dFdX = dFdXMatrix(DmInv);
-                auto inds = eles.pack(dim_c<4>,"inds", ei, int_c);
-                vec3 xs[4] = {vtemp.pack(dim_c<3>, tag, inds[0]), vtemp.pack(dim_c<3>, tag, inds[1]),
-                                vtemp.pack(dim_c<3>, tag, inds[2]), vtemp.pack(dim_c<3>, tag, inds[3])};
-                mat3 F{};
-                {
-                    auto x1x0 = xs[1] - xs[0];
-                    auto x2x0 = xs[2] - xs[0];
-                    auto x3x0 = xs[3] - xs[0];
-                    auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                                x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                    F = Ds * DmInv;
-                }
-                auto P = model.first_piola(F);
-                auto vole = eles("vol", ei);
-                auto vecP = flatten(P);
-                auto dFdXT = dFdX.transpose();
-                auto vf = -vole * (dFdXT * vecP);
-                auto mg = volf * vole / 4;
-                for (int i = 0; i != 4; ++i) {
-                    auto vi = inds[i];
-                    for (int d = 0; d != 3; ++d)
-                        atomic_add(exec_cuda, &vtemp("grad", d, vi), vf(i * 3 + d) + mg(d));
-                }
-
-            });
-
-            if(b_bcws.size() != b_verts.size()){
-                fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-                throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-            }
-
-            T stiffness = 2.0066 * model.mu + 1.0122 * model.lam;
-            auto nmEmbedVerts = b_verts.size();
-            cudaPol(zs::range(nmEmbedVerts),
-                [bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                            eles = proxy<space>({},eles),stiffness,tag,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
-                auto ei = bcws("inds",vi, int_c);
-                if(ei < 0)
-                    return;
-                auto inds = eles.pack(dim_c<4>, "inds",ei, int_c);
-                auto w = bcws.pack<4>("w",vi);
-                auto tpos = vec3::zeros();
-                for(size_t i = 0;i != 4;++i)
-                    tpos += w[i] * vtemp.pack<3>(tag,inds[i]);
-                auto pdiff = tpos - b_verts.pack<3>("x",vi);
-
-                for(size_t i = 0;i != 4;++i){
-                    auto tmp = pdiff * (-stiffness * bcws("cnorm",vi) * bone_driven_weight * w[i] * eles("vol",ei)); 
-                    // tmp = pdiff * (-lambda * bcws("cnorm",vi) * bone_driven_weight * w[i]);
-                    for(size_t d = 0;d != 3;++d)
-                        atomic_add(exec_cuda,&vtemp("grad",d,inds[i]),(T)tmp[d]);
-                }
-            });
-        }
-
-        template <typename Model>
-        void laplacian(zs::CudaExecutionPolicy& cudaPol,
-                                const Model& model,
-                                const zs::SmallString tag, 
-                                const zs::SmallString Htag,
-                                dtiles_t& vtemp,
-                                dtiles_t& etemp) {
-            using namespace zs;
-            constexpr auto space = execspace_e::cuda;
-            T stiffness = 2.0066 * model.mu + 1.0122 * model.lam;    
-            cudaPol(zs::range(eles.size()),
-                [vtemp = proxy<space>({}, vtemp),etemp = proxy<space>({}, etemp),
-                    bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),
-                    verts = proxy<space>({},verts),eles = proxy<space>({},eles),tag,
-                    Htag,stiffness,bone_driven_weight = bone_driven_weight]
-                        ZS_LAMBDA(int ei) mutable {
-                auto DmInv = eles.template pack<3, 3>("IB", ei);
-                auto dFdX = dFdXMatrix(DmInv);
-                auto vol = eles("vol",ei);
-                etemp.template tuple<12*12>(Htag,ei) = stiffness * vol * dFdX.transpose() * dFdX;            
-            });   
-
-            cudaPol(zs::range(b_bcws.size()),
-                    [bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                    eles = proxy<space>({},eles),stiffness,tag,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
-                auto ei = bcws("inds",vi, int_c);
-                if(ei < 0)
-                    return;
-                auto inds = eles.pack(dim_c<4>, "inds",ei, int_c);
-                auto w = bcws.pack<4>("w",vi);
-
-                for(int i = 0;i != 4;++i)
-                    for(int j = 0;j != 4;++j){
-                        T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi) * eles("vol",ei);
-                        for(int d = 0;d != 3;++d){
-                            atomic_add(exec_cuda,&etemp("He",(i * 3 + d) * 12 + j * 3 + d,ei),alpha);
-                        }
-                    }
-
-            });                                         
-        }
-
-        template <typename Model>
-        void hessian(zs::CudaExecutionPolicy& cudaPol,
-                                                const Model& model,
-                                                const zs::SmallString xTag,
-                                                const zs::SmallString HTag, 
-                                                dtiles_t& vtemp,
-                                                dtiles_t& etemp) {
-            using namespace zs;
-            constexpr auto space = execspace_e::cuda;
-            // fmt::print("check here 0");
-            cudaPol(zs::range(eles.size()), [vtemp = proxy<space>({}, vtemp),
-                                            etemp = proxy<space>({}, etemp),
-                                            bcws = proxy<space>({},b_bcws),
-                                            b_verts = proxy<space>({},b_verts),
-                                            verts = proxy<space>({}, verts),
-                                            eles = proxy<space>({}, eles),tag = xTag,HTag, model, volf = volf] ZS_LAMBDA (int ei) mutable {
-                auto DmInv = eles.pack(dim_c<3, 3>, "IB", ei);
-                auto dFdX = dFdXMatrix(DmInv);
-                auto inds = eles.pack(dim_c<4>, "inds", ei, int_c);
-                vec3 xs[4] = {vtemp.pack(dim_c<3>, tag, inds[0]), vtemp.pack(dim_c<3>, tag, inds[1]),
-                                vtemp.pack(dim_c<3>, tag, inds[2]), vtemp.pack(dim_c<3>, tag, inds[3])};
-                mat3 F{};
-                {
-                    auto x1x0 = xs[1] - xs[0];
-                    auto x2x0 = xs[2] - xs[0];
-                    auto x3x0 = xs[3] - xs[0];
-                    auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                                x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                    F = Ds * DmInv;
-                }
-                auto vole = eles("vol", ei);
-                auto dFdXT = dFdX.transpose();
-
-                auto Hq = model.first_piola_derivative(F, true_c);
-                auto H = dFdXT * Hq * dFdX * vole;
-
-                etemp.tuple<12 * 12>(HTag, ei) = H;
-
-            });
-            T stiffness = 2.0066 * model.mu + 1.0122 * model.lam;   
-            cudaPol(zs::range(b_bcws.size()),
-                    [bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                    eles = proxy<space>({},eles),stiffness,HTag,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
-                auto ei = bcws("inds",vi, int_c);
-                if(ei < 0)
-                    return;
-                auto inds = eles.pack(dim_c<4>,"inds",ei, int_c);
-                auto w = bcws.pack(dim_c<4>,"w",vi);
-
-                for(int i = 0;i != 4;++i)
-                    for(int j = 0;j != 4;++j){
-                        T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi) * eles("vol",ei);
-                        for(int d = 0;d != 3;++d){
-                            atomic_add(exec_cuda,&etemp(HTag,(i * 3 + d) * 12 + j * 3 + d,ei),alpha);
-                        }
-                    }
-
-            }); 
-        }
-
-
-        template <typename Pol>
-        void precondition(Pol &pol, const zs::SmallString srcTag,
-                        const zs::SmallString dstTag,dtiles_t& vtemp) {
-        using namespace zs;
-        constexpr execspace_e space = execspace_e::cuda;
-        // precondition
-        pol(zs::range(verts.size()),
-            [vtemp = proxy<space>({}, vtemp), verts = proxy<space>({}, verts),
-            srcTag, dstTag] ZS_LAMBDA(int vi) mutable {
-                vtemp.tuple<3>(dstTag, vi) =
-                    vtemp.pack<3, 3>("P", vi) * vtemp.pack<3>(srcTag, vi);
-                // vtemp.tuple<3>(dstTag, vi) = vtemp.pack<3>(srcTag, vi);
-            });
-        }
-
-        template <typename Pol>
-        void multiply(Pol &pol, const zs::SmallString dxTag,
-                    const zs::SmallString bTag,
-                    const zs::SmallString HTag,
-                    dtiles_t& vtemp,
-                    const dtiles_t& etemp) {
-            using namespace zs;
-            constexpr execspace_e space = execspace_e::cuda;
-            constexpr auto execTag = wrapv<space>{};
-            const auto numVerts = verts.size();
-            const auto numEles = eles.size();
-            // dx -> b
-            pol(range(numVerts),
-                [execTag, vtemp = proxy<space>({}, vtemp), bTag] ZS_LAMBDA(
-                    int vi) mutable { vtemp.tuple<3>(bTag, vi) = vec3::zeros(); });
-            // elastic energy
-            pol(range(numEles), [execTag, etemp = proxy<space>({}, etemp),
-                                vtemp = proxy<space>({}, vtemp),
-                                eles = proxy<space>({}, eles), dxTag, bTag, HTag] ZS_LAMBDA(int ei) mutable {
-                constexpr int dim = 3;
-                constexpr auto dimp1 = dim + 1;
-                auto inds = eles.template pack<dimp1>("inds", ei, int_c);
-                zs::vec<T, dimp1 * dim> temp{};
-                for (int vi = 0; vi != dimp1; ++vi)
-                for (int d = 0; d != dim; ++d) {
-                    temp[vi * dim + d] = vtemp(dxTag, d, inds[vi]);
-                }
-                auto He = etemp.pack<dim * dimp1, dim * dimp1>(HTag, ei);
-
-                temp = He * temp;
-
-                for (int vi = 0; vi != dimp1; ++vi)
-                for (int d = 0; d != dim; ++d) {
-                    atomic_add(execTag, &vtemp(bTag, d, inds[vi]), temp[vi * dim + d]);
-                }
-            });
-        }
-
-        FastFEMSystem(const tiles_t &verts, const tiles_t &eles, const tiles_t &b_bcws, const tiles_t& b_verts,T bone_driven_weight,vec3 volf)
-            : verts{verts}, eles{eles}, b_bcws{b_bcws}, b_verts{b_verts}, bone_driven_weight{bone_driven_weight},volf{volf}{}
-
-        const tiles_t &verts;
-        const tiles_t &eles;
-        const tiles_t &b_bcws;  // the barycentric interpolation of embeded bones 
-        const tiles_t &b_verts; // the position of embeded bones
-
-        T bone_driven_weight;
-        vec3 volf;
-
-    };
-
-    template<typename Equation,typename Model>
-    constexpr void backtracking_line_search(zs::CudaExecutionPolicy &cudaPol,Equation& A,Model& models,int max_line_search,T armijo,
-            const zs::SmallString& dtag,const zs::SmallString& gtag,const zs::SmallString& xtag,T init_step,dtiles_t& vtemp) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        T dg = dot(cudaPol,vtemp,gtag,dtag);
-        T E0;
-        match([&](auto &elasticModel) {
-            E0 = A.energy(cudaPol, elasticModel, xtag,vtemp);
-        })(models.getElasticModel());
-        T E{E0};
-        int line_search = 0;
-        std::vector<T> armijo_buffer(max_line_search);
-        T step = init_step;
-        cudaPol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({},vtemp),xtag,dtag,step] ZS_LAMBDA(int vi) mutable {
-                vtemp.pack<3>(xtag,vi) += step * vtemp.pack<3>(dtag,vi);
-            });
-
-        do {
-            match([&](auto &elasticModel) {
-            E = A.energy(cudaPol,elasticModel,xtag,vtemp);
-            })(models.getElasticModel());
-            // fmt::print("E: {} at alpha {}. E0 {}\n", E, alpha, E0);
-            // fmt::print("Armijo : {} < {}\n",(E - E0)/alpha,dg);
-            armijo_buffer[line_search] = (E - E0)/step;
-            // test Armojo condition
-            if(((double)E - (double)E0) < (double)armijo * (double)dg * (double)step)
-                break;
-            step /= 2;
-            cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),step,xtag,dtag] ZS_LAMBDA(int vi) mutable {
-                vtemp.tuple<3>(xtag, vi) = vtemp.pack<3>(xtag, vi) - step * vtemp.pack<3>(dtag, vi);
-            });
-            ++line_search;
-        } while (line_search < max_line_search);
-        // return line_search;
-    }
-
-    template<typename Equation,typename Model>
-    constexpr int solve_equation_using_pcg(zs::CudaExecutionPolicy &cudaPol,Equation& A,Model& models,const zs::SmallString& btag,const zs::SmallString& xtag,const zs::SmallString& Ptag,dtiles_t& vtemp,
-            zs::SmallString Htag,dtiles_t& etemp,T accuracy) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        // set b = 0 outside the function call
-        // cudaPol(zs::range(vtemp.size()),[vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-        //     vtemp.pack<3>("b",vi) = vec3::zeros();
-        // });
-
-        A.multiply(cudaPol,xtag,"temp","L",vtemp,etemp);
-        cudaPol(zs::range(vtemp.size()),[vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-            vtemp.tuple<3>("r",vi) = vtemp.pack<3>("b",vi) - vtemp.pack<3>("temp",vi);
-        });
-        // no projection here
-        // A.project(cudaPol,"btag",verts,"r",vtemp);
-        A.precondition(cudaPol,"r","q",vtemp);
-        cudaPol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-                vtemp.tuple<3>("p",vi) = vtemp.pack<3>("q",vi);
-            });
-
-        T zTrk = dot(cudaPol,vtemp,"r","q");
-
-        auto residualPreconditionedNorm = std::sqrt(zTrk);
-        auto localTol = accuracy * residualPreconditionedNorm;
-        int iter = 0;
-        for(;iter != 1000;++iter){
-            if(residualPreconditionedNorm <= localTol){
-                fmt::print("finish with cg iter: {}, norm: {} zTrk: {}\n",iter,residualPreconditionedNorm,zTrk);
-                break;
-            }
-
-
-            A.multiply(cudaPol,"p","temp","L",vtemp,etemp);
-            T alpha = zTrk / dot(cudaPol,vtemp,"temp","p");
-
-            cudaPol(range(vtemp.size()), [vtemp = proxy<space>({},vtemp),alpha,xtag] ZS_LAMBDA(int vi) mutable {
-                vtemp.pack<3>(xtag, vi) += alpha * vtemp.pack<3>("p", vi);
-                vtemp.pack<3>("r", vi) -= alpha * vtemp.pack<3>("temp", vi);
-            });
-            if(iter % 51 == 50){
-                A.multiply(cudaPol,xtag,"temp","L",vtemp,etemp);
-                cudaPol(zs::range(vtemp.size()),
-                    [vtemp = proxy<space>({},vtemp),btag] ZS_LAMBDA(int vi) mutable {
-                        vtemp.template tuple<3>("r",vi) = vtemp.pack<3>(btag,vi) - vtemp.pack<3>("temp",vi);
-                    });
-            }   
-
-            A.precondition(cudaPol,"r","q",vtemp);
-            auto zTrkLast = zTrk;
-            zTrk = dot(cudaPol,vtemp,"q","r");
-            auto beta = zTrk / zTrkLast;
-
-            cudaPol(range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),beta] ZS_LAMBDA(int vi) mutable {
-                vtemp("p", vi) = vtemp("q", vi) + beta * vtemp("p", vi);
-            });
-
-            residualPreconditionedNorm = std::sqrt(zTrk);
-            ++iter;
-        }
-
-        return iter;
-    }
-
-    static T reduce(zs::CudaExecutionPolicy &cudaPol, const zs::Vector<T> &res) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        Vector<T> ret{res.get_allocator(), 1};
-        auto sid = cudaPol.getStreamid();
-        auto procid = cudaPol.getProcid();
-        auto &context = Cuda::context(procid);
-        auto stream = (cudaStream_t)context.streamSpare(sid);
-        std::size_t temp_bytes = 0;
-        cub::DeviceReduce::Reduce(nullptr, temp_bytes, res.data(), ret.data(),
-                                res.size(), std::plus<T>{}, (T)0, stream);
-        Vector<std::max_align_t> temp{res.get_allocator(),
-                                    temp_bytes / sizeof(std::max_align_t) + 1};
-        cub::DeviceReduce::Reduce(temp.data(), temp_bytes, res.data(), ret.data(),
-                                res.size(), std::plus<T>{}, (T)0, stream);
-        context.syncStreamSpare(sid);
-        return (T)ret.getVal();
-    }
-    template<int pack_dim = 3>
-    T dot(zs::CudaExecutionPolicy &cudaPol, dtiles_t &vertData,
-            const zs::SmallString tag0, const zs::SmallString tag1,int offset0 = 0,int offset1 = 0) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        Vector<T> res{vertData.get_allocator(), vertData.size()},ret{vertData.get_allocator(),1};
-        cudaPol(range(vertData.size()),
-            [data = proxy<space>({},vertData),res = proxy<space>(res),tag0,tag1,offset0,offset1] ZS_LAMBDA(int pi) mutable {
-                res[pi] = (T)0.;
-                for(int i = 0;i < pack_dim;++i)
-                    res[pi] += data(tag0,offset0*pack_dim + i,pi) * data(tag1,offset1*pack_dim + i,pi);
-            });
-        //zs::reduce(cudaPol,std::begin(res),std:end(res),std::begin(ret), (T)0);
-        //return (T)ret.getVal();
-        return reduce(cudaPol, res);
-    }
-
-    T infNorm(zs::CudaExecutionPolicy &cudaPol, dtiles_t &vertData,
-                const zs::SmallString tag = "dir") {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        Vector<T> res{vertData.get_allocator(), 1};
-        res.setVal(0);
-        cudaPol(range(vertData.size()),
-                [data = proxy<space>({}, vertData), res = proxy<space>(res),
-                tag] __device__(int pi) mutable {
-                auto v = data.pack<3>(tag, pi);
-                atomic_max(exec_cuda, res.data(), v.abs().max());
-                });
-        return res.getVal();
-    }
-
-    virtual void apply() override {
-        using namespace zs;
-        auto zstets = get_input<ZenoParticles>("ZSParticles");
-        auto gravity = get_input<zeno::NumericObject>("gravity")->get<zeno::vec<3,T>>();
-        auto zsbones = get_input<ZenoParticles>("driven_bones");            // driven bones
-
-        auto armijo = get_param<float>("armijo");
-        auto curvature = get_param<float>("wolfe");
-        auto cg_res = get_param<float>("cg_res");                           // cg_res for inner loop of quasi-newton solver
-        auto btl_res = get_param<float>("btl_res");                         // a termination criterion for line search
-        
-        auto epsilon = get_param<float>("epsilon");
-        auto rel_epsilon = get_param<float>("rel_epsilon");
-        
-        auto models = zstets->getModel();           
-        auto& verts = zstets->getParticles();
-        auto& eles = zstets->getQuadraturePoints();
-
-        auto tag = get_param<std::string>("driven_tag");                    // tag channel where the bones are binded
-        auto bone_driven_weight = get_param<float>("bone_driven_weight");   // the weight of bone-driven potential
-        auto nm_newton_iters = get_param<int>("nm_newton_iters");
-        auto quasi_newton_window_size = get_param<int>("window_size");
-
-        auto volf = vec3::from_array(gravity * models.density);
-
-        static dtiles_t vtemp{verts.get_allocator(),
-            {
-                {"grad", 3},
-                {"gradp",3},
-                {"P", 9},
-                {"dir", 3},
-                {"xn", 3},
-                {"xn0", 3},
-                {"xp",3},
-                {"temp", 3},
-                {"r", 3},
-                {"p", 3},
-                {"q", 3},
-                {"fx", quasi_newton_window_size},
-                {"s", 3 * quasi_newton_window_size},
-                {"y", 3 * quasi_newton_window_size}
-            },verts.size()};
-        // buffer storage for laplace matrix
-        static dtiles_t etemp{eles.get_allocator(),{{"L", 12 * 12},{"H",12 * 12}},eles.size()};  
-        vtemp.resize(verts.size());
-        etemp.resize(eles.size());
-        FastFEMSystem A{verts,eles,(*zstets)[tag],zsbones->getParticles(),bone_driven_weight,volf};
-
-        constexpr auto space = execspace_e::cuda;
-        auto cudaPol = cuda_exec();   
-
-        // use the initial guess if given
-        if(verts.hasProperty("init_x")) {
-            fmt::print("set up initial guess for equation solution\n");
-            cudaPol(zs::range(verts.size()),
-                    [vtemp = proxy<space>({}, vtemp),verts = proxy<space>({}, verts)] __device__(int vi) mutable {
-                        auto x = verts.pack<3>("init_x", vi);
-                        vtemp.tuple<3>("xn", vi) = x;
-                    });      
-        } else {// use the previous simulation result
-            cudaPol(zs::range(verts.size()),
-                    [vtemp = proxy<space>({}, vtemp),
-                    verts = proxy<space>({}, verts)] __device__(int vi) mutable {
-                        auto x = verts.pack<3>("x", vi);
-                        vtemp.tuple<3>("xn", vi) = x;
-                    });
-        }
-        match([&](auto &elasticModel){
-            A.laplacian(cudaPol,elasticModel,"xn","L",vtemp,etemp);
-        })(models.getElasticModel());
-
-        match([&](auto &elasticModel){
-            A.hessian(cudaPol,elasticModel,"xn","H",vtemp,etemp);
-        })(models.getElasticModel());
-
-        // build preconditioner for fast cg convergence
-        cudaPol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({}, vtemp),
-                verts = proxy<space>({}, verts)] ZS_LAMBDA (int vi) mutable {
-                    vtemp.tuple<9>("P", vi) = mat3::zeros();
-        });        
-        cudaPol(zs::range(eles.size()),
-            [vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),eles = proxy<space>({},eles)]
-                ZS_LAMBDA (int ei) mutable {
-                constexpr int dim = 3;
-                constexpr auto dimp1 = dim + 1;
-                auto inds = eles.pack(dim_c<dimp1>,"inds",ei, int_c);
-                auto He = etemp.pack<dim * dimp1,dim * dimp1>("L",ei);
-
-                for (int vi = 0; vi != dimp1; ++vi) {
-                #if 1
-                    for (int i = 0; i != dim; ++i)
-                    for (int j = i; j != dim; ++j){ 
-                        atomic_add(exec_cuda, &vtemp("P", i * dim + j, inds[vi]),He(vi * dim + i, vi * dim + j));
-                    //   atomic_add(exec_cuda, &vtemp("P", j * dim + i, inds[vi]),He(vi * dim + i, vi * dim + j));
-                    }
-                #else
-                    for (int j = 0; j != dim; ++j) {
-                        atomic_add(exec_cuda, &vtemp("P", j * dim + j, inds[vi]),
-                                He(vi * dim + j, vi * dim + j));
-                    }
-                #endif
-                }
-        });
-        cudaPol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({}, vtemp),
-                verts = proxy<space>({}, verts)] ZS_LAMBDA (int vi) mutable {
-                    constexpr int dim = 3;
-                    for (int i = 0; i != dim; ++i)
-                        for (int j = i+1; j != dim; ++j){ 
-                            vtemp("P", j * dim + i, vi) = vtemp("P", i * dim + j, vi);
-                    //   atomic_add(exec_cuda, &vtemp("P", j * dim + i, inds[vi]),He(vi * dim + i, vi * dim + j));
-                    }
-        });
-
-        cudaPol(zs::range(vtemp.size()),
-                [vtemp = proxy<space>({},vtemp)] __device__(int vi) mutable {
-                    // we need to use double-precision inverse here, when the P matrix is nearly singular or has very large coeffs
-                    vtemp.tuple<9>("P",vi) = inverse(vtemp.pack<3,3>("P",vi).cast<double>());
-        });
-
-        // solve the problem using quasi-newton solver
-        match([&](auto &elasticModel){
-            A.gradient(cudaPol,elasticModel,"xn",vtemp,etemp);
-        })(models.getElasticModel());
-
-        T gn = std::sqrt(dot(cudaPol,vtemp,"grad","grad"));
-        T xn = std::sqrt(dot(cudaPol,vtemp,"xn","xn"));
-
-        if(gn > epsilon && gn > xn * rel_epsilon && false) {
-            int k = 0;
-            T step = 1. / gn;
-            // solve for cg newton dir might be better?
-            cudaPol(zs::range(vtemp.size()),
-                [vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-                    vtemp.tuple<3>("dir",vi) = -vtemp.pack<3,3>("P",vi) * vtemp.pack<3>("grad",vi);
-            }); 
-
-            int nm_corr = 0;
-            std::vector<T> m_alpha(quasi_newton_window_size);
-            std::vector<T> m_ys(quasi_newton_window_size);
-
-            fmt::print("SOLVE EQUA USING QUASI_NEWTON\n");
-
-            while(k < nm_newton_iters) {
-                // copy the x and grad
-                cudaPol(zs::range(vtemp.size()),
-                    [vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-                        vtemp.tuple<3>("xp",vi) = vtemp.pack<3>("xn",vi);
-                        vtemp.tuple<3>("gradp",vi) = vtemp.pack<3>("grad",vi);
-                });
-                // do line search along the searching direction using armijo condition.../ consider wolfe only when the spd is not enforced
-                backtracking_line_search(cudaPol,A,models,10,armijo,"dir","grad","xn",step,vtemp);
-                T gn = std::sqrt(dot(cudaPol,vtemp,"grad","grad"));
-                T xn = std::sqrt(dot(cudaPol,vtemp,"xn","xn"));
-                // gradient termination criterion test
-                if(gn <= epsilon || gn <= epsilon * xn)
-                    break;
-                // add correction to hessian approximation
-                cudaPol(zs::range(vtemp.size()),
-                    [vtemp = proxy<space>({},vtemp),ws = quasi_newton_window_size,k] ZS_LAMBDA(int vi) mutable {
-                        for(int i = 0;i != 3;++i){
-                            vtemp("s",(k % ws)*3 + i,vi) = vtemp("xn",i,vi) - vtemp("xp",i,vi);
-                            vtemp("y",(k % ws)*3 + i,vi) = vtemp("grad",i,vi) - vtemp("gradp",i,vi);
-                            // vtemp.tuple<3>("s",k % ws,vi) = vtemp.pack<3>("xn",vi) - vtemp.pack<3>("xp",vi);
-                            // vtemp.tuple<3>("y",k % ws,vi) = vtemp.pack<3>("grad",vi) - vtemp.pack<3>("gradp",vi);
-                        }
-                });
-                // some problem use atomic add
-                m_ys[k % quasi_newton_window_size] = dot(cudaPol,vtemp,"s","y",k % quasi_newton_window_size,k % quasi_newton_window_size);
-                ++nm_corr;
-                // apply Hv 
-                // recursively compute d = -H*g
-                {
-                    // Loop1
-                    // m_dir = -m_g
-                    cudaPol(zs::range(vtemp.size()),
-                        [vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-                            vtemp.tuple<3>("temp",vi) = -vtemp.pack<3>("grad",vi);
-                        });
-                    // point to the most recent correction buffer
-                    int j = (k+1) % quasi_newton_window_size;
-                    for(int i = 0;i < nm_corr;++i){
-                        // moving backward
-                        j = (j + quasi_newton_window_size - 1) % quasi_newton_window_size;
-                        m_alpha[j] = dot(cudaPol,vtemp,"s","temp",k % quasi_newton_window_size) / m_ys[j];
-                        cudaPol(zs::range(vtemp.size()),
-                            [vtemp = proxy<space>({},vtemp),alpha = m_alpha[j],ws = quasi_newton_window_size,k]
-                                ZS_LAMBDA(int vi) mutable {
-                                    for(int i = 0;i != 3;++i)
-                                        vtemp("temp",i,vi) -= alpha * vtemp("y",(k % ws)*3 + i,vi);
-                        });
-                    }
-                    // solve laplace equation using cg, do not have to be that accurate?
-                    solve_equation_using_pcg(cudaPol,A,models,"temp","dir","P",vtemp,"L",etemp,cg_res);
-                    // Loop 2
-                    for(int i = 0;i < nm_corr;++i){
-                        T beta = dot(cudaPol,vtemp,"y","dir",j) / m_ys[j];
-                        cudaPol(zs::range(vtemp.size()),
-                            [vtemp = proxy<space>({},vtemp),offset = k % quasi_newton_window_size,alpha = m_alpha[j],beta,j] ZS_LAMBDA(int vi) mutable{
-                                for(int i = 0;i != 3;++i)
-                                    vtemp("dir",i,vi) += (alpha - beta) * vtemp("s",j*3 + i,vi);
-                            });
-                        j = (j+1) % quasi_newton_window_size;
-                    }
-                }
-
-                step = 1.;
-                ++k;
-            }
-        }else{
-            fmt::print("EARLY TERMINATION\n");
-        }
-        cudaPol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({},vtemp),verts = proxy<space>({},verts)] ZS_LAMBDA(int vi) mutable {
-                verts.template tuple<3>("x",vi) = vtemp.pack<3>("xn",vi);
-        });
-
-        set_output("ZSParticles", zstets);
-    }
-};
-
-
-ZENDEFNODE(FastQuasiStaticStepping, {{"ZSParticles","driven_bones","gravity"},
-                                  {"ZSParticles"},
-                                  {{"float","armijo","0.1"},{"float","wolfe","0.9"},
-                                    {"float","cg_res","0.1"},{"float","btl_res","0.0001"},{"float","epsilon","1e-5"},
-                                    {"float","rel_epsilon","1e-3"},
-                                    {"string","driven_tag","bone_bw"},{"float","bone_driven_weight","0.0"},
-                                    {"int","nm_newton_iters","20"},{"int","window_size","8"}
-                                    },
-                                  {"FEM"}});
-
-};
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/FleshDynamicStepping.cu b/projects/CuLagrange/fem/FleshDynamicStepping.cu
index 6c15619002..0403cb5c7c 100644
--- a/projects/CuLagrange/fem/FleshDynamicStepping.cu
+++ b/projects/CuLagrange/fem/FleshDynamicStepping.cu
@@ -33,23 +33,14 @@
 
 #include "collision_energy/vertex_face_sqrt_collision.hpp"
 #include "collision_energy/vertex_face_collision.hpp"
-#include "collision_energy/edge_edge_sqrt_collision.hpp"
-#include "collision_energy/edge_edge_collision.hpp"
-
-
-
+// #include "collision_energy/edge_edge_sqrt_collision.hpp"
+// #include "collision_energy/edge_edge_collision.hpp"
 
 #include "collision_energy/evaluate_collision.hpp"
 
-#define DEBUG_FLESH_DYN_STEPPING 1
-
 namespace zeno {
 
-// TODO : boundary force
-// TODO : fixed points
-// Anisotropic Cardiac
-
-#define MAX_FP_COLLISION_PAIRS 6
+#define MAX_FP_COLLISION_PAIRS 4
 
 struct FleshDynamicStepping : INode {
 
@@ -57,6 +48,7 @@ struct FleshDynamicStepping : INode {
     using Ti = int;
     using dtiles_t = zs::TileVector<T,32>;
     using tiles_t = typename ZenoParticles::particles_t;
+    using vec2 = zs::vec<T,2>;
     using vec3 = zs::vec<T, 3>;
     using mat3 = zs::vec<T, 3, 3>;
     using mat9 = zs::vec<T,9,9>;
@@ -71,43 +63,134 @@ struct FleshDynamicStepping : INode {
     // currently only backward euler integrator is supported
     // topology evaluation should be called before applying this node
     struct FEMDynamicSteppingSystem {
-
-        constexpr auto dFAdF(const mat3& A) {
-            mat9 M{};
-            M(0,0) = M(1,1) = M(2,2) = A(0,0);
-            M(3,0) = M(4,1) = M(5,2) = A(0,1);
-            M(6,0) = M(7,1) = M(8,2) = A(0,2);
-
-            M(0,3) = M(1,4) = M(2,5) = A(1,0);
-            M(3,3) = M(4,4) = M(5,5) = A(1,1);
-            M(6,3) = M(7,4) = M(8,5) = A(1,2);
-
-            M(0,6) = M(1,7) = M(2,8) = A(2,0);
-            M(3,6) = M(4,7) = M(5,8) = A(2,1);
-            M(6,6) = M(7,7) = M(8,8) = A(2,2);
-
-            return M;        
-        }
-
         template <typename Model>
         void computeCollisionEnergy(zs::CudaExecutionPolicy& cudaPol,const Model& model,
                 dtiles_t& vtemp,
                 dtiles_t& etemp,
                 dtiles_t& sttemp,
                 dtiles_t& setemp,
-                dtiles_t& cptemp,
-                // const bvh_t& stBvh,
-                // const bvh_t& seBvh,
-                const T& thickness) {
+                dtiles_t& ee_buffer, // NOTE(review): not referenced anywhere in the body of this function
+                dtiles_t& fe_buffer) { // NOTE(review): not referenced anywhere in the body of this function
             using namespace zs;
             constexpr auto space = execspace_e::cuda;
 
             T lambda = model.lam;
             T mu = model.mu;
+        } // NOTE(review): stub - the body only copies model.lam / model.mu into locals and returns; no collision energy is evaluated or written
+
 
+        void findInversion(zs::CudaExecutionPolicy& cudaPol,dtiles_t& vtemp,dtiles_t& etemp) { // marks elements with det(F) < 0 in etemp("is_inverted") and their four vertices in vtemp("is_inverted")
+            using namespace zs;
+            constexpr auto space = execspace_e::cuda;
+            TILEVEC_OPS::fill(cudaPol,vtemp,"is_inverted",reinterpret_bits<T>((int)0));  // clear per-vertex flags; the int 0 is stored bitwise in a T-typed slot
+            TILEVEC_OPS::fill(cudaPol,etemp,"is_inverted",reinterpret_bits<T>((int)0));  // clear per-element flags
+            cudaPol(zs::range(eles.size()), // pass 1: one work item per element; `eles` is not a parameter - presumably a member of the enclosing struct, TODO confirm
+                [vtemp = proxy<space>({},vtemp),
+                        quads = proxy<space>({},eles),
+                        etemp = proxy<space>({},etemp)] ZS_LAMBDA(int ei) mutable {
+                    auto DmInv = quads.template pack<3,3>("IB",ei); // presumably the inverse rest-shape matrix - TODO confirm against where "IB" is written
+                    auto inds = quads.template pack<4>("inds",ei).reinterpret_bits(int_c); // the element's four vertex indices, read back as ints from T-typed storage
+                    vec3 x1[4] = {vtemp.template pack<3>("xn", inds[0]), // gather the "xn" positions of the four vertices
+                            vtemp.template pack<3>("xn", inds[1]),
+                            vtemp.template pack<3>("xn", inds[2]),
+                            vtemp.template pack<3>("xn", inds[3])};   
 
+                    mat3 F{};
+                    {
+                        auto x1x0 = x1[1] - x1[0]; // edge vectors measured from vertex 0
+                        auto x2x0 = x1[2] - x1[0];
+                        auto x3x0 = x1[3] - x1[0];
+                        auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1], // edges laid out as columns, assuming mat3 brace-init is row-major - TODO confirm
+                                        x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
+                        F = Ds * DmInv; // presumably the deformation gradient of this element
+                    } 
+                    if(zs::determinant(F) < 0.0){ // negative determinant => element is flagged as inverted
+                        // for(int i = 0;i < 4;++i)
+                        //     vtemp("is_inverted",inds[i]) = reinterpret_bits<T>((int)1);   
+                        etemp("is_inverted",ei) = reinterpret_bits<T>((int)1);   
+                    }else {
+                        etemp("is_inverted",ei) = reinterpret_bits<T>((int)0);   // writes the same value the fill above already stored
+                    }               
+            });
+            cudaPol(zs::range(eles.size()), // pass 2: copy each set element flag onto that element's four vertices
+                [vtemp = proxy<space>({},vtemp),
+                        quads = proxy<space>({},eles),
+                        etemp = proxy<space>({},etemp)] ZS_LAMBDA(int ei) mutable {
+                auto inds = quads.template pack<4>("inds",ei).reinterpret_bits(int_c);
+                auto is_inverted = reinterpret_bits<int>(etemp("is_inverted",ei));  
+                if(is_inverted)
+                    for(int i = 0;i != 4;++i){
+                        vtemp("is_inverted",inds[i]) = reinterpret_bits<T>((int)1);     // elements sharing a vertex may all write here, but every writer stores the same value
+                    }       
+            });
         }
 
+        // template <typename Model>
+        // void computeKinematicCollisionGradientAndHessian(zs::CudaExecutionPolicy& cudaPol,const Model& model,
+        //     dtiles_t& vtemp,
+        //     dtiles_t& sptemp,
+        //     dtiles_t& sttemp,
+        //     const dtiles_t& kvtemp,
+        //     const dtiles_t& kltemp,
+        //     const dtiles_t& kttemp,
+        //     dtiles_t& kc_buffer,
+        //     dtiles_t& gh_buffer,
+        //     bool neglect_inverted = true) {
+        //         using namespace zs;
+        //         constexpr auto space = execspace_e::cuda;
+
+        //         int offset = eles.size() + b_verts.size() + points.size() * MAX_FP_COLLISION_PAIRS;
+        //         T lambda = model.lam;
+        //         T mu = model.mu;
+
+        //         // COLLISION_UTILS::do_kinematic_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
+        //         //     vtemp,"xn",
+        //         //     points,
+        //         //     lines,
+        //         //     tris,
+        //         //     kvtemp,
+        //         //     kltemp,
+        //         //     kttemp,
+        //         //     kc_buffer,
+        //         //     in_collisionEps,out_collisionEps);
+                
+        //         // COLLISION_UTILS::evaluate_kinematic_fp_collision_grad_and_hessian(cudaPol,
+        //         //     vtemp,"xn",
+        //         //     kvtemp,
+        //         //     kc_buffer,
+        //         //     gh_buffer,offset,
+        //         //     in_collisionEps,out_collisionEps,
+        //         //     (T)collisionStiffness,
+        //         //     (T)mu,(T)lambda);    
+
+        //         if(neglect_inverted) {
+        //             cudaPol(zs::range(kc_buffer.size()),
+        //                 [gh_buffer = proxy<space>({},gh_buffer),
+        //                         vtemp = proxy<space>({},vtemp),
+        //                         kc_buffer = proxy<space>({},kc_buffer),
+        //                         offset] ZS_LAMBDA(int cpi) {
+        //                     auto inds = gh_buffer.template pack<4>("inds",cpi + offset).reinterpret_bits(int_c);
+        //                     for(int i = 0;i != 4;++i)
+        //                         if(inds[i] < 0)
+        //                             return;
+                            
+        //                     bool is_inverted = false;
+        //                     int is_fp = reinterpret_bits<int>(kc_buffer("is_fp",cpi));
+        //                     int check_len = is_fp > 0 ? 3 : 1;
+        //                     for(int i = 0;i != check_len;++i){
+        //                         auto vi = inds[i];
+        //                         auto is_vertex_inverted = reinterpret_bits<int>(vtemp("is_inverted",vi));
+        //                         if(is_vertex_inverted)
+        //                             is_inverted = true;
+        //                     }
+ 
+        //                     if(is_inverted){
+        //                         gh_buffer.template tuple<12*12>("H",cpi + offset) = zs::vec<T,12,12>::zeros();
+        //                         gh_buffer.template tuple<12>("grad",cpi + offset) = zs::vec<T,12>::zeros();
+        //                     }
+        //             });    
+        //         }                            
+        // }
 
         template <typename Model>
         void computeCollisionGradientAndHessian(zs::CudaExecutionPolicy& cudaPol,const Model& model,
@@ -115,435 +198,370 @@ struct FleshDynamicStepping : INode {
                             dtiles_t& etemp,
                             dtiles_t& sttemp,
                             dtiles_t& setemp,
-                            dtiles_t& cptemp,
-                            // const bvh_t& stBvh,
-                            // const bvh_t& seBvh,
-                            const T& thickness,
+                            // dtiles_t& ee_buffer,
+                            dtiles_t& fp_buffer,
+                            dtiles_t& kverts,
+                            dtiles_t& kc_buffer,
+                            dtiles_t& gh_buffer,
+                            T kd_theta = (T)0.0,
                             bool explicit_collision = false,
                             bool neglect_inverted = true) {
             using namespace zs;
             constexpr auto space = execspace_e::cuda;
 
+            int offset = eles.size();
+
             T lambda = model.lam;
             T mu = model.mu; 
 
-            #if DEBUG_FLESH_DYN_STEPPING
-                if(!vtemp.hasProperty("grad"))
-                    fmt::print(fg(fmt::color::red),"the vtemp has no 'grad' channel\n");
-                if(!vtemp.hasProperty("xn"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'xn' channel\n");
-                if(!vtemp.hasProperty("xp"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'xn' channel\n");
-                if(!vtemp.hasProperty("is_inverted"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'is_inverted' channel\n");
-                if(!vtemp.hasProperty("vp"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'vp' channel\n");
-
-                if(!etemp.hasProperty("H"))
-                    fmt::print(fg(fmt::color::red),"the etemp has no 'H' channel\n");
-                if(!etemp.hasProperty("ActInv"))
-                    fmt::print(fg(fmt::color::red),"the etemp has no 'ActInv' channel\n");
-                
-                if(!verts.hasProperty("m"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'm' channel\n");
-
-                if(!eles.hasProperty("inds"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'IB' channel\n");        
-                if(!eles.hasProperty("IB"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'IB' channel\n");
-                if(!eles.hasProperty("m"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'm' channel\n");
-                if(!eles.hasProperty("vol"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'vol' channel\n");
-
-                // fmt::print(fg(fmt::color::blue),"the size of tris : {}\n",tris.size());
-                if(!tris.hasProperty("inds"))
-                    fmt::print(fg(fmt::color::red),"the tris has no 'inds' channel\n");
-                if(!tris.hasProperty("area"))
-                    fmt::print(fg(fmt::color::red),"the tris has no 'area' channel\n");
-                if(!points.hasProperty("area"))
-                    fmt::print(fg(fmt::color::red),"the points has no 'area' channel\n");
-
-            #endif            
-
-            auto xtag = zs::SmallString("xn");
-            if(explicit_collision)
-                xtag = zs::SmallString("xp");
-
-
-            if(neglect_inverted) {
-            // // figure out all the vertices which is incident to an inverted tet
-                TILEVEC_OPS::fill(cudaPol,vtemp,"is_inverted",reinterpret_bits<T>((int)0));  
-                cudaPol(zs::range(eles.size()),
-                    [vtemp = proxy<space>({},vtemp),quads = proxy<space>({},eles),xtag] ZS_LAMBDA(int ei) mutable {
-                        auto DmInv = quads.template pack<3,3>("IB",ei);
-                        auto inds = quads.template pack<4>("inds",ei).reinterpret_bits(int_c);
-                        vec3 x1[4] = {vtemp.template pack<3>(xtag, inds[0]),
-                                vtemp.template pack<3>(xtag, inds[1]),
-                                vtemp.template pack<3>(xtag, inds[2]),
-                                vtemp.template pack<3>(xtag, inds[3])};   
-
-                        mat3 F{};
-                        {
-                            auto x1x0 = x1[1] - x1[0];
-                            auto x2x0 = x1[2] - x1[0];
-                            auto x3x0 = x1[3] - x1[0];
-                            auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                                            x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                            F = Ds * DmInv;
-                        } 
-                        if(zs::determinant(F) < 0.0)
-                            for(int i = 0;i < 4;++i)
-                                vtemp("is_inverted",inds[i]) = reinterpret_bits<T>((int)1);                  
-                });
-
-            }
-
-
-#if 0
-            TILEVEC_OPS::fill<4>(cudaPol,cptemp,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>());
-            // TILEVEC_OPS::fill<12*12>(cudaPol,cptemp,"H",zs::vec<T,12*12>::zeros());
-
-            // compute vertex facet contact pairs
-            cudaPol(zs::range(points.size()),[lambda = lambda,mu = mu,collisionStiffness = collisionStiffness,
-                            in_collisionEps = in_collisionEps,out_collisionEps = out_collisionEps,
-                            vtemp = proxy<space>({},vtemp),
-                            etemp = proxy<space>({},etemp),
-                            sttemp = proxy<space>({},sttemp),
-                            setemp = proxy<space>({},setemp),
-                            cptemp = proxy<space>({},cptemp),
-                            points = proxy<space>({},points),
-                            lines = proxy<space>({},lines),
-                            tris = proxy<space>({},tris),
-                            stbvh = proxy<space>(stBvh),thickness = thickness,
-                            neglect_inverted = neglect_inverted,xtag] ZS_LAMBDA(int svi) mutable {
-                // if(svi == 0)    {
-                //     if(tris.hasProperty("inds"))
-                //         printf("compare size : %d %d %d\n",(int)vtemp.size(),(int)tris.size(),(int)tris.propertySize("inds"));
-                //     else
-                //         printf("the tris has no inds channel!!!\n"); 
-                // }
-
-
-                auto vi = reinterpret_bits<int>(points("inds",svi));
-
-                if(neglect_inverted)   {
-                    auto is_vertex_inverted = reinterpret_bits<int>(vtemp("is_inverted",vi));
-                    if(is_vertex_inverted)
-                        return;
-                }
-
-                auto p = vtemp.template pack<3>(xtag,vi);
-                auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
-
-
-                // check whether there is collision happening, and if so, apply the collision force and addup the collision hessian
-                int nm_collision_pairs = 0;
-                auto process_vertex_face_collision_pairs = [&](int stI) {
-
-                    if(nm_collision_pairs >= MAX_FP_COLLISION_PAIRS)     
-                        return;   
-
-                    auto tri = tris.pack(dim_c<3>, "inds",stI).reinterpret_bits(int_c);
-                    if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
-                        return;
-
-                    auto t0 = vtemp.template pack<3>(xtag,tri[0]);
-                    auto t1 = vtemp.template pack<3>(xtag,tri[1]);
-                    auto t2 = vtemp.template pack<3>(xtag,tri[2]);
-                    // check whether the triangle is degenerate
-                    auto restArea = tris("area",stI);
-                    // skip the triangle too small at rest configuration
-                    // if(restArea < (T)1e-6)
-                    //     return;
-
-                    const auto e10 = t1 - t0;
-                    const auto e20 = t2 - t0;
-                    auto deformedArea = (T)0.5 * e10.cross(e20).norm();
-                    const T degeneracyEps = 1e-4;
-                    // skip the degenerate triangles
-                    const T relativeArea = deformedArea / (restArea + (T)1e-6);
-                    if(relativeArea < degeneracyEps)
-                        return;
-
-                    bool collide = false;
-
-                    if(COLLISION_UTILS::is_inside_the_cell(vtemp,xtag,
-                            lines,tris,
-                            sttemp,"nrm",
-                            setemp,"nrm",
-                            stI,p,in_collisionEps,out_collisionEps)){
-                        // printf("find collision facet-vertex collision in-cell pair : %d %d\n",stI,svi);
-                        collide = true;
-                    }
-
-                    if(!collide)
-                        return;
-
-                    // now there is collision, build the "collision tets"
-                    // if(!vtemp.hasProperty("oneRingArea"))
-                    //     printf("vtemp has no oneRingArea");
-
-                    cptemp.template tuple<4>("inds",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,4>(vi,tri[0],tri[1],tri[2]).template reinterpret_bits<T>();
-
-                    auto vertexFaceCollisionAreas = restArea + points("area",svi);
-                    
-                    vec3 collision_verts[4] = {};
-                    collision_verts[0] = p;
-                    collision_verts[1] = t0;
-                    collision_verts[1] = t1;
-                    collision_verts[1] = t2;
-
-                    auto collisionEps = in_collisionEps;
-
-                    auto grad = collisionStiffness * VERTEX_FACE_SQRT_COLLISION::gradient(collision_verts,mu,lambda,collisionEps) * vertexFaceCollisionAreas;
-                    auto hessian = collisionStiffness * VERTEX_FACE_SQRT_COLLISION::hessian(collision_verts,mu,lambda,collisionEps) * vertexFaceCollisionAreas;
-                    cptemp.template tuple<12*12>("H",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = hessian;
-
-                    for(int i = 0;i != 4;++i) {
-                        auto g_vi = i == 0 ? vi : tri[i-1];
-                        for (int d = 0; d != 3; ++d)
-                            atomic_add(exec_cuda, &vtemp("grad", d, g_vi), grad(i * 3 + d));
-                    }
-                    nm_collision_pairs++;
-
-                };
-                stbvh.iter_neighbors(bv,process_vertex_face_collision_pairs);
-            });
-
-#else
-
-        COLLISION_UTILS::do_facet_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
-            vtemp,"xn",
-            points,
-            lines,
-            tris,
-            sttemp,
-            setemp,
-            cptemp,
-            // stBvh,
-            in_collisionEps,out_collisionEps);
-
-
-        // output all the collision pairs
-        // cudaPol(zs::range(cptemp.size()),
-        //     [cptemp = proxy<space>({},cptemp)] ZS_LAMBDA(int cpi) mutable {
-        //         auto inds = cptemp.template pack<4>("inds",cpi).reinterpret_bits(int_c);
-        //         bool collide = true;
-        //         for(int i = 0;i != 4;++i)
-        //             if(inds[i] < 0)
-        //                 collide = false;
-        //         if(collide)
-        //             printf("collision_pair[%d] : %d %d %d %d\n",
-        //                 cpi,inds[0],inds[1],inds[2],inds[3]);
-        // });
-
-        COLLISION_UTILS::evaluate_collision_grad_and_hessian<MAX_FP_COLLISION_PAIRS>(cudaPol,
-            vtemp,"xn",
-            cptemp,
-            in_collisionEps,out_collisionEps,
-            (T)collisionStiffness,
-            (T)mu,(T)lambda);
-
-
-
-        // project out all the neglect verts
-        if(neglect_inverted) {
-            cudaPol(zs::range(cptemp.size()),
-                [cptemp = proxy<space>({},cptemp),vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int cpi) {
-                    auto inds = cptemp.template pack<4>("inds",cpi).reinterpret_bits(int_c);
-                    for(int i = 0;i != 4;++i)
-                        if(inds[i] < 0)
-                            return;
-
-                    bool is_inverted = false;
-                    for(int i = 0;i != 4;++i){
-                        auto vi = inds[i];
-                        auto is_vertex_inverted = reinterpret_bits<int>(vtemp("is_inverted",vi));
-                        if(is_vertex_inverted)
-                            is_inverted = true;
-                    }
-
-                    if(is_inverted){
-                        cptemp.template tuple<12*12>("H",cpi) = zs::vec<T,12,12>::zeros();
-                        cptemp.template tuple<12>("grad",cpi) = zs::vec<T,12>::zeros();
-                    }
-            });    
-        }
-
-        // auto gradN = TILEVEC_OPS::inf_norm<12>(cudaPol,cptemp,"grad");
-        // fmt::print(fg(fmt::color::red),"collision gradN = {}\n",gradN);
-        // TILEVEC_OPS::fill<12*12>(cudaPol,cptemp,"H",zs::vec<T,12*12>::zeros());
-
-        TILEVEC_OPS::assemble<3,4>(cudaPol,cptemp,"grad",vtemp,"grad");
-
+            // auto stBvh = bvh_t{};
+            // auto bvs = retrieve_bounding_volumes(cudaPol,vtemp,tris,wrapv<3>{},(T)0.0,"xn");
+            // stBvh.build(cudaPol,bvs);
+            // auto avgl = compute_average_edge_length(cudaPol,vtemp,"xn",tris);
+            // auto bvh_thickness = 5 * avgl;            
+            // if(!calculate_facet_normal(cudaPol,vtemp,"xn",tris,sttemp,"nrm")){
+            //     throw std::runtime_error("fail updating facet normal");
+            // }       
+            // if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
+            //     vtemp,"xn",
+            //     lines,
+            //     tris,
+            //     sttemp,"nrm",
+            //     setemp,"nrm")){
+            //         throw std::runtime_error("fail calculate cell bisector normal");
+            // }    
+
+
+            COLLISION_UTILS::do_facet_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
+                vtemp,"xn",
+                points,
+                lines,
+                tris,
+                sttemp,
+                setemp,
+                fp_buffer,
+                in_collisionEps,out_collisionEps);
+
+            COLLISION_UTILS::evaluate_fp_collision_grad_and_hessian(cudaPol,
+                vtemp,"xn","vn",dt,
+                fp_buffer,
+                gh_buffer,offset,
+                in_collisionEps,out_collisionEps,
+                (T)collisionStiffness,
+                (T)mu,(T)lambda,(T)kd_theta);
+            
 
-#endif
 
+            COLLISION_UTILS::do_kinematic_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
+                vtemp,"xn",
+                points,
+                lines,
+                tris,
+                setemp,
+                sttemp,
+                kverts,
+                kc_buffer,
+                (T)kine_in_collisionEps,(T)kine_out_collisionEps,false);
+
+            offset = 0;
+
+            COLLISION_UTILS::evaluate_kinematic_fp_collision_grad_and_hessian(cudaPol,
+                eles,
+                vtemp,"xn","vn",dt,
+                tris,
+                kverts,
+                kc_buffer,
+                gh_buffer,offset,
+                (T)kine_in_collisionEps,(T)kine_out_collisionEps,
+                (T)kineCollisionStiffness,
+                (T)mu,(T)lambda,(T)kd_theta);
+
+
+            // adding collision damping on self collision
+            // int offset = eles.size() + b_verts.size();
+            // cudaPol(zs::range(fp_buffer.size() + kc_buffer.size()),
+            //     [vtemp = proxy<space>({},vtemp),
+            //         gh_buffer = proxy<space>({},gh_buffer),offset,kd_theta] ZS_LAMBDA(int ci) mutable {
+            //     auto inds = gh_buffer.pack(dim_c<4>,"inds",ci).reinterpret_bits(int_c);
+            //     for(int i = 0;i != 4;++i)
+            //         if(inds[i] < 0)
+            //             return;
+            //     vec3 vs[4] = {};
+            //     for(int i = 0;i != 4;++i)
+            //         vs[i] = vtemp.pack(dim_c<3>,"vn",inds[i]);
+            //     auto H = gh_buffer.pack(dim_c<12*12>,"H",ci);
+            //     gh_buffer.tuple(dim_c<12*12>,"H",ci) = H;
+            // });
+        
 
         }
 
 
-        template <typename Model>
+        template <typename ElasticModel,typename AnisoElasticModel>
         void computeGradientAndHessian(zs::CudaExecutionPolicy& cudaPol,
-                            const Model& model,
-                            dtiles_t& vtemp,
-                            dtiles_t& etemp) {        
+                            const ElasticModel& model,
+                            const AnisoElasticModel& amodel,
+                            const dtiles_t& vtemp,
+                            const dtiles_t& etemp,
+                            dtiles_t& gh_buffer,
+                            T kd_alpha = (T)0.0,
+                            T kd_beta = (T)0.0) {        
             using namespace zs;
             constexpr auto space = execspace_e::cuda;
 
-            #if DEBUG_FLESH_DYN_STEPPING
-                // std::cout << "CHECK THE PROPERTY CHANNEL" << std::endl;
-                if(!vtemp.hasProperty("grad"))
-                    fmt::print(fg(fmt::color::red),"the vtemp has no 'grad' channel\n");
-                if(!vtemp.hasProperty("xn"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'xn' channel\n");
-                if(!vtemp.hasProperty("xp"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'xp' channel\n");
-                if(!vtemp.hasProperty("vp"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'vp' channel\n");
-
-                if(!etemp.hasProperty("H"))
-                    fmt::print(fg(fmt::color::red),"the etemp has no 'H' channel\n");
-                if(!etemp.hasProperty("ActInv"))
-                    fmt::print(fg(fmt::color::red),"the etemp has no 'ActInv' channel\n");
-                
-                if(!verts.hasProperty("m"))
-                    fmt::print(fg(fmt::color::red),"the verts has no 'm' channel\n");
-
-                if(!eles.hasProperty("IB"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'IB' channel\n");
-                if(!eles.hasProperty("m"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'm' channel\n");
-                if(!eles.hasProperty("vol"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'vol' channel\n");
-                if(!eles.hasProperty("inds"))
-                    fmt::print(fg(fmt::color::red),"the eles has no 'inds' channel\n");
-            #endif
-
-            TILEVEC_OPS::fill<3>(cudaPol,vtemp,"grad",zs::vec<T,3>::zeros());
-            TILEVEC_OPS::fill<144>(cudaPol,etemp,"H",zs::vec<T,144>::zeros());         
-            
+            int offset = 0;
+            TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",gh_buffer,"inds",offset);   
             // eval the inertia term gradient
-            cudaPol(zs::range(vtemp.size()), [dt2 = dt2,
-                        vtemp = proxy<space>({},vtemp),
+            cudaPol(zs::range(eles.size()),[dt2 = dt2,
                         verts = proxy<space>({},verts),
-                        dt = dt] ZS_LAMBDA(int vi) mutable {
-                auto m = verts("m",vi);// nodal mass
-                auto x1 = vtemp.pack<3>("xn",vi);
-                auto x0 = vtemp.pack<3>("xp",vi);
-                auto v0 = vtemp.pack<3>("vp",vi);
-                vtemp.template tuple<3>("grad",vi) = -m * (x1 - x0 - v0 * dt) / dt2;                    
+                        eles = proxy<space>({},eles),
+                        vtemp = proxy<space>({},vtemp),
+                        gh_buffer = proxy<space>({},gh_buffer),
+                        dt = dt,offset = offset] ZS_LAMBDA(int ei) mutable {
+                auto m = eles("m",ei)/(T)4.0;
+                auto inds = eles.pack(dim_c<4>,"inds",ei).reinterpret_bits(int_c);
+                auto pgrad = zs::vec<T,12>::zeros();
+                // auto H  = zs::vec<T,12,12>::zeros();
+                // if(eles.hasProperty("dt")) {
+                //     dt2 = eles("dt",ei) * eles("dt",ei);
+                // }
+
+                auto inertia = (T)1.0;
+                if(eles.hasProperty("inertia"))
+                    inertia = eles("inertia",ei);
+                for(int i = 0;i != 4;++i){
+                    auto x1 = vtemp.pack(dim_c<3>,"xn",inds[i]);
+                    auto x0 = vtemp.pack(dim_c<3>,"xp",inds[i]);
+                    auto v0 = vtemp.pack(dim_c<3>,"vp",inds[i]);
+
+                    auto alpha = inertia * m/dt2;
+                    auto nodal_pgrad = -alpha * (x1 - x0 - v0 * dt);
+                    for(int d = 0;d != 3;++d){
+                        auto idx = i * 3 + d;
+                        gh_buffer("grad",idx,ei) = nodal_pgrad[d];
+                        gh_buffer("H",idx*12 + idx,ei + offset) = alpha;
+                    }
+                    
+                }
+                // gh_buffer.tuple(dim_c<12>,"grad",ei + offset) = pgrad;
+                // gh_buffer.template tuple<12*12>("H",ei + offset) = H;
             });
 
-            cudaPol(zs::range(eles.size()), [this,dt2 = dt2,
+
+            cudaPol(zs::range(eles.size()), [dt = dt,dt2 = dt2,aniso_strength = aniso_strength,
+                            verts = proxy<space>({},verts),
                             vtemp = proxy<space>({}, vtemp),
                             etemp = proxy<space>({}, etemp),
-                            bcws = proxy<space>({},b_bcws),
-                            b_verts = proxy<space>({},b_verts),
-                            verts = proxy<space>({}, verts),
+                            gh_buffer = proxy<space>({},gh_buffer),
                             eles = proxy<space>({}, eles),
-                            model, volf = volf] ZS_LAMBDA (int ei) mutable {
-                    auto DmInv = eles.template pack<3,3>("IB",ei);
-                    auto dFdX = dFdXMatrix(DmInv);
-                    auto inds = eles.template pack<4>("inds",ei).reinterpret_bits(int_c);
-                    vec3 x1[4] = {vtemp.template pack<3>("xn", inds[0]),
-                            vtemp.template pack<3>("xn", inds[1]),
-                            vtemp.template pack<3>("xn", inds[2]),
-                            vtemp.template pack<3>("xn", inds[3])};   
+                            kd_alpha = kd_alpha,kd_beta = kd_beta,
+                            model = model,amodel = amodel, volf = volf,offset = offset] ZS_LAMBDA (int ei) mutable {
+                auto DmInv = eles.pack(dim_c<3,3>,"IB",ei);
+                auto dFdX = dFdXMatrix(DmInv);
+                auto inds = eles.pack(dim_c<4>,"inds",ei).reinterpret_bits(int_c);
+                vec3 x1[4] = {vtemp.pack(dim_c<3>,"xn", inds[0]),
+                                vtemp.pack(dim_c<3>,"xn", inds[1]),
+                                vtemp.pack(dim_c<3>,"xn", inds[2]),
+                                vtemp.pack(dim_c<3>,"xn", inds[3])};
+
+
+                mat3 FAct{};
+                {
+                    auto x1x0 = x1[1] - x1[0];
+                    auto x2x0 = x1[2] - x1[0];
+                    auto x3x0 = x1[3] - x1[0];
+                    auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
+                                    x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
+                    FAct = Ds * DmInv;
+                    FAct = FAct * etemp.template pack<3,3>("ActInv",ei);
+                } 
+                auto dFActdF = dFAdF(etemp.template pack<3,3>("ActInv",ei));
+
+                // add the force term in gradient
+                if(eles.hasProperty("mu") && eles.hasProperty("lam")) {
+                    model.mu = eles("mu",ei);
+                    model.lam = eles("lam",ei);
+                }
+                auto P = model.first_piola(FAct);
+                auto vole = eles("vol", ei);
+                auto vecP = flatten(P);
+                vecP = dFActdF.transpose() * vecP;
+                auto dFdXT = dFdX.transpose();
+                auto vf = -vole * (dFdXT * vecP);     
+
+                auto mg = volf * vole / (T)4.0;
+                for(int i = 0;i != 4;++i)
+                    for(int d = 0;d !=3 ;++d){
+                        vf[i*3 + d] += mg[d];
+                    }
 
-                    mat3 FAct{};
-                    {
-                        auto x1x0 = x1[1] - x1[0];
-                        auto x2x0 = x1[2] - x1[0];
-                        auto x3x0 = x1[3] - x1[0];
-                        auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                                        x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                        FAct = Ds * DmInv;
 
-                        FAct = FAct * etemp.template pack<3,3>("ActInv",ei);
-                    } 
-                    auto dFActdF = dFAdF(etemp.template pack<3,3>("ActInv",ei));
-
-                    // add the force term in gradient
-                    auto P = model.first_piola(FAct);
-                    auto vole = eles("vol", ei);
-                    auto vecP = flatten(P);
-                    vecP = dFActdF.transpose() * vecP;
-                    auto dFdXT = dFdX.transpose();
-                    auto vf = -vole * (dFdXT * vecP);     
-
-                    auto mg = volf * vole / 4;
-                    for (int i = 0; i != 4; ++i) {
-                        auto vi = inds[i];
-                        for (int d = 0; d != 3; ++d)
-                            atomic_add(exec_cuda, &vtemp("grad", d, vi), vf(i * 3 + d) + mg(d));
+                // assemble element-wise hessian matrix
+                auto Hq = model.first_piola_derivative(FAct, true_c);
+                auto dFdAct_dFdX = dFActdF * dFdX; 
+                // elastic hessian term (the inertia diagonal was already written into gh_buffer "H" by the previous kernel)
+                auto H = dFdAct_dFdX.transpose() * Hq * dFdAct_dFdX * vole;
+
+                if(eles.hasProperty("Muscle_ID") && (int)eles("Muscle_ID",ei) >= 0) { // elements with a non-negative Muscle_ID also get the fiber (anisotropic) term
+                    auto fiber = eles.pack(dim_c<3>,"fiber",ei);
+                    if(zs::abs(fiber.norm() - (T)1.0) < (T)1e-3) { // skip elements whose stored fiber is not (nearly) unit length
+                        fiber /= fiber.norm();
+                        // if(eles.hasProperty("mu")) {
+                        //     amodel.mu = eles("mu",ei);
+                        //     // amodel.lam = eles("lam",ei);
+                            
+                        // }
+                        auto aP = amodel.do_first_piola(FAct,fiber);
+                        auto vecAP = flatten(aP); // flatten the anisotropic stress aP (previously the isotropic P was used by mistake)
+                        vecAP = dFActdF.transpose() * vecAP; // map through the activation once (previously re-applied to the already-mapped vecP)
+                        vf -= vole  * dFdXT * vecAP *aniso_strength;
+
+                        auto aHq = amodel.do_first_piola_derivative(FAct,fiber);
+                        H += dFdAct_dFdX.transpose() * aHq * dFdAct_dFdX * vole * aniso_strength;
+                        // if((int)eles("Muscle_ID",ei) == 0){
+                        //     printf("fiber : %f %f %f,Fa = %f,aP = %f,aHq = %f,H = %f\n",fiber[0],fiber[1],fiber[2],(float)FAct.norm(),(float)aP.norm(),(float)aHq.norm(),(float)H.norm());
+                        // }
                     }
+                }
 
-                    // assemble element-wise hessian matrix
-                    auto Hq = model.first_piola_derivative(FAct, true_c);
-                    auto dFdAct_dFdX = dFActdF * dFdX; 
-                    // dFdAct_dFdX = dFdX; 
-                    auto H = dFdAct_dFdX.transpose() * Hq * dFdAct_dFdX * vole;
-                    etemp.template tuple<12 * 12>("H", ei) = H;
-
-                    // add inertia hessian term
-                    auto m = eles("m",ei);// element-wise mass
-                    for(int i = 0;i < 12;++i){
-                        // Mass(i,i) = 1;
-                        etemp("H",i * 12 + i,ei) += m /dt2/4;
-                    }
 
+                // adding rayleigh damping term
+                vec3 v0[4] = {vtemp.pack(dim_c<3>,"vn", inds[0]),
+                vtemp.pack(dim_c<3>,"vn", inds[1]),
+                vtemp.pack(dim_c<3>,"vn", inds[2]),
+                vtemp.pack(dim_c<3>,"vn", inds[3])}; 
+
+                auto inertia = (T)1.0;
+                if(eles.hasProperty("inertia"))
+                    inertia = eles("inertia",ei);
+
+                auto vel = COLLISION_UTILS::flatten(v0); 
+                auto m = eles("m",ei)/(T)4.0;
+                auto C = kd_beta * H + kd_alpha * inertia * m * zs::vec<T,12,12>::identity();
+                auto rdamping = C * vel;  
 
+                gh_buffer.tuple(dim_c<12>,"grad",ei + offset) = gh_buffer.pack(dim_c<12>,"grad",ei + offset) + vf - rdamping; 
+                // gh_buffer.tuple(dim_c<12>,"grad",ei + offset) = gh_buffer.pack(dim_c<12>,"grad",ei + offset) - rdamping; 
+                // H += kd_beta*H/dt;
+
+                gh_buffer.template tuple<12*12>("H",ei + offset) = gh_buffer.template pack<12,12>("H",ei + offset) + H + C/dt;
             });
         // Bone Driven Potential Energy
-            T lambda = model.lam;
-            T mu = model.mu;
+            // T lambda = model.lam;
+            // T mu = model.mu;
+
             auto nmEmbedVerts = b_verts.size();
-            cudaPol(zs::range(nmEmbedVerts), [this,
-                    bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                    eles = proxy<space>({},eles),lambda,mu,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
+
+            // TILEVEC_OPS::fill_range<4>(cudaPol,gh_buffer,"inds",zs::vec<int,4>::uniform(-1).reinterpret_bits(float_c),eles.size() + offset,b_verts.size());
+            // TILEVEC_OPS::fill_range<3>(cudaPol,gh_buffer,"grad",zs::vec<T,3>::zeros(),eles.size() + offset,b_verts.size());
+            // TILEVEC_OPS::fill_range<144>(cudaPol,gh_buffer,"H",zs::vec<T,144>::zeros(),eles.size() + offset,b_verts.size());
+
+            // TODO: inverted elements should be skipped here; the is_inverted check inside the kernel below is currently commented out
+            // std::cout << "nmEmbedVerts : " << nmEmbedVerts << std::endl;
+            // std::cout << "bcwsize :  " << b_bcws.size() << std::endl;
+            // return;
+            cudaPol(zs::range(nmEmbedVerts), [
+                    gh_buffer = proxy<space>({},gh_buffer),model = model,
+                    bcws = proxy<space>({},b_bcws),b_verts = proxy<space>(b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
+                    eles = proxy<space>({},eles),bone_driven_weight = bone_driven_weight,offset = offset] ZS_LAMBDA(int vi) mutable {
                         auto ei = reinterpret_bits<int>(bcws("inds",vi));
-                        if(ei < 0)
+ 
+                        if(ei < 0){
+
                             return;
-                        auto inds = eles.pack<4>("inds",ei).reinterpret_bits<int>();
-                        auto w = bcws.pack<4>("w",vi);
+                        }
+                        // if(ei >= etemp.size()){
+                        //     printf("ei too big for etemp\n");
+                        //     return;
+                        // }
+                        // auto is_inverted = reinterpret_bits<int>(etemp("is_inverted",ei));
+                        // if(is_inverted){
+                        //     if(vi == 0)
+                        //         printf("inverted tet\n");
+                        //     return;
+                        // }
+
+                        // auto FatID = eles("FatID",ei);
+                        // if(FatID > 0)
+                        //     return;
+
+                        auto lambda = model.lam;
+                        auto mu = model.mu;
+                        // if(eles.hasProperty("mu") && eles.hasProperty("lam")) {
+                        //     mu = eles("mu",ei);
+                        //     lambda = eles("lam",ei);
+                        // }
+
+                        auto inds = eles.pack(dim_c<4>,"inds",ei).reinterpret_bits(int_c);
+                        // gh_buffer.tuple(dim_c<4>,"inds",vi + offset + eles.size()) = eles.pack(dim_c<4>,"inds",ei);
+                        auto w = bcws.pack(dim_c<4>,"w",vi);
+                        if(w[0] < 1e-4 || w[1] < 1e-4 || w[2] < 1e-4 || w[3] < 1e-4){
+                            if(vi == 0)
+                                printf("boundary tet\n");
+                            return;
+                        }
                         auto tpos = vec3::zeros();
-                        for(size_t i = 0;i != 4;++i)
-                            tpos += w[i] * vtemp.pack<3>("xn",inds[i]);
-                        auto pdiff = tpos - b_verts.pack<3>("x",vi);
+                        for(int i = 0;i != 4;++i)
+                            tpos += w[i] * vtemp.pack(dim_c<3>,"xn",inds[i]);
+                        // auto pdiff = tpos - b_verts.pack<3>("x",vi);
+                        auto pdiff = tpos - b_verts[vi];
 
                         T stiffness = 2.0066 * mu + 1.0122 * lambda;
 
+                        zs::vec<T,12> elm_grad{};
+                        // auto elm_H = zs::vec<T,12,12>::zeros();
+
                         for(size_t i = 0;i != 4;++i){
-                            auto tmp = pdiff * (-stiffness * bcws("cnorm",vi) * bone_driven_weight * w[i] * eles("vol",ei)); 
-                            // tmp = pdiff * (-lambda * bcws("cnorm",vi) * bone_driven_weight * w[i]);
-                            for(size_t d = 0;d != 3;++d)
-                                atomic_add(exec_cuda,&vtemp("grad",d,inds[i]),(T)tmp[d]);
+                            auto tmp = pdiff * (-stiffness *  bcws("strength",vi) * bcws("cnorm",vi) * bone_driven_weight * w[i] * eles("vol",ei)) * eles("bdw",ei); 
+                            // if(vi == 0 && i == 0) {
+                            //     printf("check: %f %f %f\n",(float)tmp[0],(float)tmp[1],(float)tmp[2]);
+                            // }
+                            for(size_t d = 0;d != 3;++d){
+                                atomic_add(exec_cuda,&gh_buffer("grad",i*3 + d,ei),tmp[d]);
+                                // elm_grad[i*3 + d] = tmp[d];
+                                // atomic_add(exec_cuda,&gh_buffer("grad",i * 3 + d,ei),tmp[d]);
+                            }
                         }
                         for(int i = 0;i != 4;++i)
                             for(int j = 0;j != 4;++j){
-                                T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi) * eles("vol",ei);
+                                T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("strength",vi) * bcws("cnorm",vi) * eles("vol",ei) * eles("bdw",ei);
                                 for(int d = 0;d != 3;++d){
-                                    atomic_add(exec_cuda,&etemp("H",(i * 3 + d) * 12 + j * 3 + d,ei),alpha);
+                                    // elm_H(i*3 + d,j*3 + d) = alpha;
+                                    atomic_add(exec_cuda,&gh_buffer("H",(i*3 + d)*12 + j*3 + d,ei),alpha);
                                 }
                             }
-
+                        
+                        // for(int i = 0;i != 12;++i){
+                            // atomic_add(exec_cuda,&gh_buffer("grad",i,ei),elm_grad[i]);
+                            // for(int j = 0;j != 12;++j)
+                            //     atomic_add(exec_cuda,&gh_buffer("H",i*12 + j,ei),elm_H(i,j));
+                        // }
+                        // gh_buffer.tuple(dim_c<12>,"grad",vi + eles.size() + offset) = elm_grad;
+                        // gh_buffer.tuple(dim_c<12*12>,"H",vi + eles.size() + offset) = elm_H;
             });
 
-        }
+            // cudaPol(zs::range(eles.size()), [gh_buffer = proxy<space>({},gh_buffer)] ZS_LAMBDA (int ei) mutable {
+            //     auto H = gh_buffer.template pack<12,12>("H",ei);
+            //     make_pd(H);
+            //     gh_buffer.template tuple<12*12>("H",ei) = H;
+            // });
 
+        }
 
         FEMDynamicSteppingSystem(const tiles_t &verts, const tiles_t &eles,
                 const tiles_t& points,const tiles_t& lines,const tiles_t& tris,
                 T in_collisionEps,T out_collisionEps,
-                const tiles_t &b_bcws, const tiles_t& b_verts,T bone_driven_weight,
-                vec3 volf,const T& _dt,const T& collisionStiffness)
+                const tiles_t &b_bcws, const zs::Vector<zs::vec<T,3>>& b_verts,T bone_driven_weight,
+                const vec3& volf,const T& _dt,const T& collisionStiffness,
+                const T& kine_in_collisionEps,const T& kine_out_collisionEps,
+                const T& kineCollisionStiffness,const T& aniso_strength)
             : verts{verts}, eles{eles},points{points}, lines{lines}, tris{tris},
                     in_collisionEps{in_collisionEps},out_collisionEps{out_collisionEps},
                     b_bcws{b_bcws}, b_verts{b_verts}, bone_driven_weight{bone_driven_weight},
                     volf{volf},
-                    dt{_dt}, dt2{dt * dt},collisionStiffness{collisionStiffness},use_edge_edge_collision{true}, use_vertex_facet_collision{true} {}
+                    kine_in_collisionEps{kine_in_collisionEps},kine_out_collisionEps{kine_out_collisionEps},
+                    kineCollisionStiffness{kineCollisionStiffness},aniso_strength{aniso_strength},
+                    dt{_dt}, dt2{_dt * _dt},collisionStiffness{collisionStiffness},use_edge_edge_collision{true}, use_vertex_facet_collision{true} {}
 
         const tiles_t &verts;
         const tiles_t &eles;
@@ -551,7 +569,7 @@ struct FleshDynamicStepping : INode {
         const tiles_t &lines;
         const tiles_t &tris;
         const tiles_t &b_bcws;  // the barycentric interpolation of embeded bones 
-        const tiles_t &b_verts; // the position of embeded bones
+        const zs::Vector<zs::vec<T,3>> &b_verts; // the positions of the embedded bones
 
         T bone_driven_weight;
         vec3 volf;
@@ -566,6 +584,12 @@ struct FleshDynamicStepping : INode {
         bool use_edge_edge_collision;
         bool use_vertex_facet_collision;
 
+        T kine_in_collisionEps;
+        T kine_out_collisionEps;
+        T kineCollisionStiffness;
+
+        T aniso_strength;
+
         // int default_muscle_id;
         // zs::vec<T,3> default_muscle_dir;
         // T default_act;
@@ -576,22 +600,27 @@ struct FleshDynamicStepping : INode {
 
 
 
+
     void apply() override {
         using namespace zs;
         auto zsparticles = get_input<ZenoParticles>("ZSParticles");
         auto gravity = zeno::vec<3,T>(0);
         if(has_input("gravity"))
-            gravity = get_input<zeno::NumericObject>("gravity")->get<zeno::vec<3,T>>();
+            gravity = get_input2<zeno::vec<3,T>>("gravity");
         T armijo = (T)1e-4;
         T wolfe = (T)0.9;
-        // T cg_res = (T)0.001;
-        T cg_res = (T)0.0001;
+        // T cg_res = (T)0.01;
+        // T cg_res = (T)0.0001;
+        T cg_res = get_param<float>("cg_res");
         T btl_res = (T)0.1;
         auto models = zsparticles->getModel();
         auto& verts = zsparticles->getParticles();
         auto& eles = zsparticles->getQuadraturePoints();
 
-        if(eles.getPropertySize("inds") != 4)
+        // zs::Vector<vec3>(MAX_VERTS)
+        // TileVec("pos","tag","deleted","")
+
+        if(eles.getChannelSize("inds") != 4)
             throw std::runtime_error("the input zsparticles is not a tetrahedra mesh");
         if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
             throw std::runtime_error("the input zsparticles has no surface tris");
@@ -599,55 +628,94 @@ struct FleshDynamicStepping : INode {
             throw std::runtime_error("the input zsparticles has no surface lines");
         if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
             throw std::runtime_error("the input zsparticles has no surface points");
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfTriTag)) {
-        //     throw std::runtime_error("the input zsparticles has no surface tris's spacial structure");
-        // }
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfEdgeTag)) {
-        //     throw std::runtime_error("the input zsparticles has no surface edge's spacial structure");
-        // }
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfVertTag))  {
-        //     throw std::runtime_error("the input zsparticles has no surface vert's spacial structure");
-        // }
 
         auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
         auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
         auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
 
-        // auto& stBvh = zsparticles->bvh(ZenoParticles::s_surfTriTag);
-        // auto& seBvh = zsparticles->bvh(ZenoParticles::s_surfEdgeTag);
-
-
-        auto zsbones = get_input<ZenoParticles>("driven_boudary");
+        auto zsbones = get_input<PrimitiveObject>("driven_boudary");
         auto driven_tag = get_input2<std::string>("driven_tag");
         auto bone_driven_weight = get_input2<float>("driven_weight");
         auto muscle_id_tag = get_input2<std::string>("muscle_id_tag");
-        // auto bone_driven_weight = (T)0.02;
 
 
 
-        auto newton_res = (T)0.01;
+        // auto bone_driven_weight = (T)0.02;
+
+        auto newton_res = get_input2<float>("newton_res");
 
         auto dt = get_input2<float>("dt");
 
         auto volf = vec3::from_array(gravity * models.density);
 
-        std::vector<float> act_;    
+        std::vector<zeno::vec2f> act_;    
         std::size_t nm_acts = 0;
 
         if(has_input("Acts")) {
-            act_ = get_input<zeno::ListObject>("Acts")->getLiterial<float>();
+            act_ = get_input<zeno::ListObject>("Acts")->getLiterial<zeno::vec2f>();
             nm_acts = act_.size();
         }
 
         constexpr auto host_space = zs::execspace_e::openmp;
         auto ompExec = zs::omp_exec();
-        auto act_buffer = dtiles_t{{{"act",1}},nm_acts,zs::memsrc_e::host};
+        auto act_buffer = dtiles_t{{{"act",2}},nm_acts,zs::memsrc_e::host};
         ompExec(zs::range(act_buffer.size()),
             [act_buffer = proxy<host_space>({},act_buffer),act_] (int i) mutable {
-                act_buffer("act",i) = act_[i];
+                act_buffer.tuple(dim_c<2>,"act",i) = vec2(act_[i][0],act_[i][1]);
         });
+
         act_buffer = act_buffer.clone({zs::memsrc_e::device, 0});
 
+        const auto& zsbones_verts = zsbones->verts;
+        zs::Vector<zs::vec<T,3>> bverts{zsbones_verts.size()};
+        ompExec(zs::range(zsbones_verts.size()),
+            [bverts = proxy<host_space>(bverts),&zsbones_verts] (int i) mutable {
+                auto v = zsbones_verts[i];
+                bverts[i] = zs::vec<T,3>{v[0],v[1],v[2]};
+        });
+        bverts = bverts.clone({zs::memsrc_e::device,0});
+
+
+        constexpr auto space = execspace_e::cuda;
+        auto cudaPol = cuda_exec();
+
+        auto kverts = typename ZenoParticles::particles_t({
+                {"x",3},
+                {"xp",3},
+                {"area",1}},0,zs::memsrc_e::device,0);
+        if(has_input<ZenoParticles>("kinematic_boundary")){
+            auto kinematic_boundary = get_input<ZenoParticles>("kinematic_boundary");
+            // if (kinematic_boundary.empty())
+
+            // const auto& prim_kverts = kinematic_boundary.verts;
+            // auto& prim_kverts_area = kinematic_boundary.attr<float>("area");
+            auto& kb_verts = kinematic_boundary->getParticles();
+
+            // auto& kb_tris = kinematic_boundary->getQuadraturePoints();
+            // if(kb_tris.getPropertySize("inds") != 3){
+            //     fmt::print(fg(fmt::color::red),"the kinematic boundary is not a surface triangulate mesh\n");
+            //     throw std::runtime_error("the kinematic boundary is not a surface triangulate mesh");
+            // }
+            // if(!kb_tris.hasProperty("area")){
+            //     fmt::print(fg(fmt::color::red),"the kinematic boundary has no 'area' channel\n");
+            //     throw std::runtime_error("the kinematic boundary has no 'area' channel");
+            // }     
+            kverts.resize(kb_verts.size());
+            TILEVEC_OPS::copy<3>(cudaPol,kb_verts,"x",kverts,"x");
+            TILEVEC_OPS::copy<3>(cudaPol,kb_verts,"x",kverts,"xp");
+            TILEVEC_OPS::fill(cudaPol,kverts,"area",(T)1.0);
+        }
+        // std::cout << "nm_kb_tris : " << kb_tris.size() << " nm_kb_verts : " << kb_verts.size() << std::endl;
+        // cudaPol(zs::range(kb_tris.size()),
+        //     [kb_verts = proxy<space>({},kb_verts),kb_tris = proxy<space>({},kb_tris),kverts = proxy<space>({},kverts)] ZS_LAMBDA(int ti) mutable {
+        //         auto tri = kb_tris.pack(dim_c<3>,"inds",ti).reinterpret_bits(int_c);
+        //         for(int i = 0;i != 3;++i)
+        //             atomic_add(exec_cuda,&kverts("area",tri[i]),(T)kb_tris("area",ti)/(T)3.0);
+        //         if(ti == 0)
+        //             printf("tri[0] area : %f\n",(float)kb_tris("area",ti));
+        // });
+
+        const auto& bbw = (*zsparticles)[driven_tag];
         // the temp buffer only store the data that will change every iterations or every frame
         static dtiles_t vtemp{verts.get_allocator(),
                             {
@@ -657,19 +725,23 @@ struct FleshDynamicStepping : INode {
                                 {"dir", 3},
                                 {"xn", 3},
                                 {"xp",3},
+                                {"vn",3},
                                 {"vp",3},
                                 {"is_inverted",1},
-                                {"active",1}
+                                {"active",1},
+                                {"k_active",1},
+                                // {"inertia",1},
+                                {"k_thickness",1},
                             },verts.size()};
 
         // auto max_collision_pairs = tris.size() / 10; 
-        static dtiles_t etemp{eles.get_allocator(), {
-                {"H", 12 * 12},
-                {"inds",4},
-                {"ActInv",3*3},
+        static dtiles_t etemp(eles.get_allocator(), {
+                // {"H", 12 * 12},
+                    {"ActInv",3*3},
                 // {"muscle_ID",1},
-                // {"fiber",3}
-                }, eles.size()};
+                    {"is_inverted",1}
+                }, eles.size()
+        );
 
                 // {{tags}, cnt, memsrc_e::um, 0}
         static dtiles_t sttemp(tris.get_allocator(),
@@ -683,66 +755,105 @@ struct FleshDynamicStepping : INode {
             },lines.size()
         );
 
-        static dtiles_t cptemp(points.get_allocator(),{
+        // std::cout << "sttemp.size() << " << sttemp.size() << std::endl;
+        // std::cout << "setemp.size() << " << setemp.size() << std::endl;
+
+        int fp_buffer_size = points.size() * MAX_FP_COLLISION_PAIRS;
+        // int fp_buffer_size = 0;
+
+        static dtiles_t fp_buffer(points.get_allocator(),{
             {"inds",4},
             {"area",1},
-            {"grad",12},
             {"inverted",1},
-            {"H",12 * 12}
-        },points.size() * MAX_FP_COLLISION_PAIRS);
+        },fp_buffer_size);
 
+        // static dtiles_t ee_buffer(lines.get_allocator(),{
+        //     {"inds",4},
+        //     {"area",1},
+        //     {"inverted",1},
+        //     {"abary",2},
+        //     {"bbary",2},
+        //     {"bary",4}
+        // },lines.size());
 
-        constexpr auto space = execspace_e::cuda;
-        auto cudaPol = cuda_exec();
-    
+        // int ee_buffer_size = ee_buffer.size();
+        int ee_buffer_size = 0;
 
-        // TILEVEC_OPS::fill<4>(cudaPol,etemp,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>())
-        TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
 
-        auto avgl = compute_average_edge_length(cudaPol,verts,"x",tris);
-        // auto avgl = (T)1.0;
+        int kc_buffer_size = kverts.size() * MAX_FP_COLLISION_PAIRS;
+        // int kc_buffer_size = 0;
 
-        auto collisionStiffness = get_input2<float>("cstiffness");
+        static dtiles_t kc_buffer(points.get_allocator(),{
+            {"inds",2},
+            {"area",1},
+            {"inverted",1},
+        },kc_buffer_size);
 
+        // int kc_buffer_size = kc_buffer.size();
+        // int kc_buffer_size = 0;
 
-        // auto inset_ratio = get_input2<float>("collision_inset");
-        // auto outset_ratio = get_input2<float>("collision_outset");    
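+        // NOTE(review): the active gh_buffer below is sized eles.size() + fp_buffer.size(); the commented-out variant also reserved bbw.size() + kc_buffer_size slots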
+        // static dtiles_t gh_buffer(eles.get_allocator(),{
+        //     {"inds",4},
+        //     {"H",12*12},
+        //     {"grad",12}
+        // },eles.size() + bbw.size() + fp_buffer.size() + kc_buffer_size);
 
-        auto in_collisionEps = get_input2<float>("in_collisionEps");
-        auto out_collisionEps = get_input2<float>("out_collisionEps");
+        static dtiles_t gh_buffer(eles.get_allocator(),{
+            {"inds",4},
+            {"H",12*12},
+            {"grad",12}
+        },eles.size() + fp_buffer.size());
 
-        FEMDynamicSteppingSystem A{
-            verts,eles,
-            points,lines,tris,
-            (T)in_collisionEps,(T)out_collisionEps,
-            (*zsparticles)[driven_tag],zsbones->getParticles(),bone_driven_weight,
-            volf,dt,collisionStiffness};
 
 
-        // TILEVEC_OPS::fill<9>(cudaPol,etemp,"ActInv",zs::vec<T,9>{1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0});
+        // TILEVEC_OPS::fill<4>(cudaPol,etemp,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>())
+        // TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
+        TILEVEC_OPS::fill<9>(cudaPol,etemp,"ActInv",zs::vec<T,9>{1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0});
+        // TILEVEC_OPS::fill(cudaPol,vtemp,"inertia",(T)1.0);
+        // if(verts.hasProperty("inertia"))
+        //     TILEVEC_OPS::copy(cudaPol,verts,"inertia",vtemp,"inertia");
+        if(verts.hasProperty("k_thickness"))
+            TILEVEC_OPS::copy(cudaPol,verts,"k_thickness",vtemp,"k_thickness");
+        else
+            TILEVEC_OPS::fill(cudaPol,vtemp,"k_thickness",(T)1.0);
         // apply muscle activation
+
+        if(!eles.hasProperty("Act"))
+            eles.append_channels(cudaPol,{{"Act",1}});
+
+        if(!eles.hasProperty(muscle_id_tag) || !eles.hasProperty("fiber"))
+            fmt::print(fg(fmt::color::red),"the quadrature has no \"{}\" muscle_id_tag\n",muscle_id_tag);
+        if(nm_acts == 0)
+            fmt::print(fg(fmt::color::red),"no activation input\n");
+
         cudaPol(zs::range(eles.size()),
             [etemp = proxy<space>({},etemp),eles = proxy<space>({},eles),
-                act_buffer = proxy<space>({},act_buffer),muscle_id_tag = SmallString(muscle_id_tag),nm_acts,avgl] ZS_LAMBDA(int ei) mutable {
+                act_buffer = proxy<space>({},act_buffer),muscle_id_tag = SmallString(muscle_id_tag),nm_acts] ZS_LAMBDA(int ei) mutable {
                 // auto act = eles.template pack<3>("act",ei);
                 // auto fiber = etemp.template pack<3>("fiber",ei);
-                zs::vec<T,3> fiber{};
-                if(!eles.hasProperty("fiber"))
+
+                vec3 act{1.0,1.0,1.0};
+                vec3 fiber{};
+                // float a = 1.0f;
+                if(eles.hasProperty("fiber") && eles.hasProperty(muscle_id_tag) && nm_acts > 0 && (int)eles(muscle_id_tag,ei) >= 0 && fabs(eles.template pack<3>("fiber",ei).norm() - 1.0) < 0.001 && (int)eles(muscle_id_tag,ei) < act_buffer.size()){
                     fiber = eles.template pack<3>("fiber",ei);
-                else 
+                    auto ID = (int)eles(muscle_id_tag,ei);
+                    auto a = 1. - act_buffer("act",0,ID);
+                    auto b = 1. - act_buffer("act",1,ID);
+                    // act = vec3{zs::sqrt(a),zs::sqrt(1./a),zs::sqrt(1./a)};
+                    // auto aclamp = 
+                    // act = vec3{a < 0.7 ? 0.7 : a,zs::sqrt(1./a),zs::sqrt(1./a)};
+                    act = vec3{a,zs::sqrt(1./b),zs::sqrt(1./b)};
+                    eles("Act",ei) = act_buffer("act",0,ID) + 1e-6;
+                }else{
                     fiber = zs::vec<T,3>(1.0,0.0,0.0);
-                vec3 act{1.0,1.0,1.0};
-
-
-                auto nfiber = fiber.norm();
-                // auto ID = etemp("muscle_ID",ei);
-                int ID = -1;
-                if(eles.hasProperty(muscle_id_tag))
-                    ID = (int)eles(muscle_id_tag,ei);
-                
-                if(nm_acts > 0 && ID > -1){
-                    float a = 1. - act_buffer("act",ID);
-                    act = vec3{1,zs::sqrt(1./a),zs::sqrt(1./a)};
+                    act = vec3{1,1,1};
+                    eles("Act",ei) = (T)0.0;
+                }
+                if(fabs(fiber.norm() - 1.0) > 0.1) {
+                    printf("invalid fiber[%d] detected : %f %f %f\n",(int)ei,
+                        (float)fiber[0],(float)fiber[1],(float)fiber[2]);
                 }
 
                 vec3 dir[3];
@@ -769,147 +880,177 @@ struct FleshDynamicStepping : INode {
                 Act(2,2) = act[2];
 
                 Act = R * Act * R.transpose();
-
-                // if(ei == 0) {
-                //     printf("Act[0]:\n%f %f %f\n%f %f %f\n%f %f %f\n",
-                //         (float)Act(0,0),(float)Act(0,1),(float)Act(0,2),
-                //         (float)Act(1,0),(float)Act(1,1),(float)Act(1,2),
-                //         (float)Act(2,0),(float)Act(2,1),(float)Act(2,2));
-                // }
-
                 etemp.template tuple<9>("ActInv",ei) = zs::inverse(Act);
+                // if(a < 1.0f) {
+                //     auto ActInv = etemp.template pack<3,3>("ActInv",ei);
+                //     printf("ActInv[%d] : \n%f %f %f\n%f %f %f\n%f %f %f\n",ei,
+                //         (float)ActInv(0,0),(float)ActInv(0,1),(float)ActInv(0,2),
+                //         (float)ActInv(1,0),(float)ActInv(1,1),(float)ActInv(1,2),
+                //         (float)ActInv(2,0),(float)ActInv(2,1),(float)ActInv(2,2));
+                // }
         });
+        auto collisionStiffness = get_input2<float>("cstiffness");
+        auto kineCollisionStiffness = get_input2<float>("kineCstiffness");
+
+
+        // auto inset_ratio = get_input2<float>("collision_inset");
+        // auto outset_ratio = get_input2<float>("collision_outset");    
+
+        auto in_collisionEps = get_input2<float>("in_collisionEps");
+        auto out_collisionEps = get_input2<float>("out_collisionEps");
+
+        auto kine_in_collisionEps = get_input2<float>("kine_inCollisionEps");
+        auto kine_out_collisionEps = get_input2<float>("kine_outCollisionEps");
+
+        auto aniso_strength = get_input2<float>("aniso_strength");
+
+        FEMDynamicSteppingSystem A{
+            verts,eles,
+            points,lines,tris,
+            (T)in_collisionEps,(T)out_collisionEps,
+            bbw,bverts,bone_driven_weight,
+            volf,dt,collisionStiffness,
+            (T)kine_in_collisionEps,(T)kine_out_collisionEps,
+            (T)kineCollisionStiffness,(T)aniso_strength};
+
         // std::cout << "set initial guess" << std::endl;
         // setup initial guess
+        // if(verts.hasProperty("dt")) {
+        //     std::cout << "verts has property 'dt'" << std::endl;
+        // }
+
         TILEVEC_OPS::copy<3>(cudaPol,verts,"x",vtemp,"xp");
         TILEVEC_OPS::copy<3>(cudaPol,verts,"v",vtemp,"vp");
-        TILEVEC_OPS::copy(cudaPol,verts,"active",vtemp,"active");
-        if(verts.hasProperty("init_x"))
-            TILEVEC_OPS::copy<3>(cudaPol,verts,"init_x",vtemp,"xn");   
-        else {
-            // TILEVEC_OPS::add<3>(cudaPol,vtemp,"xp",1.0,"vp",dt,"xn");  
-            TILEVEC_OPS::add<3>(cudaPol,vtemp,"xp",1.0,"vp",(T)0.0,"xn");  
-        }
-        TILEVEC_OPS::fill(cudaPol,vtemp,"bou_tag",(T)0.0);
-
-
-        auto bvh_thickness = 5 * avgl;
+        if(verts.hasProperty("active"))
+            TILEVEC_OPS::copy(cudaPol,verts,"active",vtemp,"active");
+        else
+            TILEVEC_OPS::fill(cudaPol,vtemp,"active",(T)1.0);
+
+        if(verts.hasProperty("k_active"))
+            TILEVEC_OPS::copy(cudaPol,verts,"k_active",vtemp,"k_active");
+        else
+            TILEVEC_OPS::fill(cudaPol,vtemp,"k_active",(T)1.0);
+
+        // the init_x initial guess is disabled (kept commented out below); xn/vn are always initialized from the current verts "x"/"v"
+        // if(verts.hasProperty("init_x"))
+        //     TILEVEC_OPS::copy<3>(cudaPol,verts,"init_x",vtemp,"xn");   
+        // else {
+            // TILEVEC_OPS::add<3>(cudaPol,vtemp,"xp",1.0,"vp",dt,"xn");
+        TILEVEC_OPS::copy(cudaPol,verts,"v",vtemp,"vn");  
+        TILEVEC_OPS::copy(cudaPol,verts,"x",vtemp,"xn");
+            // TILEVEC_OPS::add<3>(cudaPol,verts,"x",1.0,"vp",(T)0.0,"xn");  
+        // }
+        if(verts.hasProperty("bou_tag") && verts.getPropertySize("bou_tag") == 1)
+            TILEVEC_OPS::copy(cudaPol,verts,"bou_tag",vtemp,"bou_tag");
+        else
+            TILEVEC_OPS::fill(cudaPol,vtemp,"bou_tag",(T)0.0);
 
-        int max_newton_iterations = 5;
+        int max_newton_iterations = get_param<int>("max_newton_iters");
         int nm_iters = 0;
-
         // make sure, at least one baraf simi-implicit step will be taken
         auto res0 = 1e10;
 
+        auto kd_alpha = get_input2<float>("kd_alpha");
+        auto kd_beta = get_input2<float>("kd_beta");
+        auto kd_theta = get_input2<float>("kd_theta");
+
+        auto max_cg_iters = get_param<int>("max_cg_iters");
+
         while(nm_iters < max_newton_iterations) {
+            // break;
+
+            TILEVEC_OPS::fill(cudaPol,gh_buffer,"grad",(T)0.0);
+            TILEVEC_OPS::fill(cudaPol,gh_buffer,"H",(T)0.0);  
+            TILEVEC_OPS::fill<4>(cudaPol,gh_buffer,"inds",zs::vec<int,4>::uniform(-1).reinterpret_bits(float_c)); 
+            A.findInversion(cudaPol,vtemp,etemp);  
+            // match([&](auto &elasticModel,auto &anisoModel) -> std::enable_if_t<zs::is_same_v<RM_CVREF_T(anisoModel),zs::AnisotropicArap<float>>> {...},[](...) {
+            //     A.computeGradientAndHessian(cudaPol, elasticModel,anisoModel,vtemp,etemp,gh_buffer,kd_alpha,kd_beta);
+            // })(models.getElasticModel(),models.getAnisoElasticModel());
+ 
+            match([&](auto &elasticModel,zs::AnisotropicArap<float> &anisoModel){
+                A.computeGradientAndHessian(cudaPol, elasticModel,anisoModel,vtemp,etemp,gh_buffer,kd_alpha,kd_beta);
+            },[](...) {
+                throw std::runtime_error("unsupported anisotropic elasticity model");
+            })(models.getElasticModel(),models.getAnisoElasticModel());
 
             match([&](auto &elasticModel) {
-                A.computeGradientAndHessian(cudaPol, elasticModel,vtemp,etemp);
+                A.computeCollisionGradientAndHessian(cudaPol,elasticModel,
+                    vtemp,
+                    etemp,
+                    sttemp,
+                    setemp,
+                    // ee_buffer,
+                    fp_buffer,
+                    kverts,
+                    kc_buffer,
+                    gh_buffer,kd_theta);
             })(models.getElasticModel());
 
-            bool include_collision = true;
-            if(include_collision) {
-
-                // if(!calculate_facet_normal(cudaPol,vtemp,"xn",tris,sttemp,"nrm")){
-                //         throw std::runtime_error("fail updating facet normal");
-                // }
-
-                // if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
-                //     vtemp,"xn",
-                //     lines,
-                //     tris,
-                //     sttemp,"nrm",
-                //     setemp,"nrm")){
-                //         throw std::runtime_error("fail calculate cell bisector normal");
-                // }
-
-                // auto stbvs = retrieve_bounding_volumes(cudaPol,vtemp,tris,wrapv<3>{},(T)0.0,"xn");
-                // auto sebvs = retrieve_bounding_volumes(cudaPol,vtemp,lines,wrapv<2>{},(T)0.0,"xn");
-                // stBvh.refit(cudaPol,stbvs);
-                // seBvh.refit(cudaPol,sebvs);
-
-                match([&](auto &elasticModel) {
-                    A.computeCollisionGradientAndHessian(cudaPol,elasticModel,
-                        vtemp,
-                        etemp,
-                        sttemp,
-                        setemp,
-                        cptemp,
-                        // stBvh,
-                        // seBvh,
-                        bvh_thickness);
-                })(models.getElasticModel());
-
-            }
-
-            PCG::prepare_block_diagonal_preconditioner<4,3>(cudaPol,"H",etemp,cptemp,"P",vtemp);
+            TILEVEC_OPS::fill(cudaPol,vtemp,"grad",(T)0.0); 
+            TILEVEC_OPS::assemble(cudaPol,gh_buffer,"grad","inds",vtemp,"grad");
+            // break;
+
+            PCG::prepare_block_diagonal_preconditioner<4,3>(cudaPol,"H",gh_buffer,"P",vtemp);
+            // PCG::precondition<3>(cudaPol,vtemp,"P","grad","q");
+            // T res = TILEVEC_OPS::inf_norm<3>(cudaPol, vtemp, "q");
+            // if(res < newton_res){
+            //     fmt::print(fg(fmt::color::cyan),"reach desire newton res {} : {}\n",newton_res,res);
+            //     break;
+            // }
+            // auto nP = TILEVEC_OPS::inf_norm<9>(cudaPol,vtemp,"P");
+            // std::cout << "nP : " << nP << std::endl;
             // PCG::prepare_block_diagonal_preconditioner<4,3>(cudaPol,"H",etemp,"P",vtemp);
             // if the grad is too small, return the result
             // Solve equation using PCG
-            TILEVEC_OPS::fill<3>(cudaPol,vtemp,"dir",zs::vec<T,3>::zeros());
+            TILEVEC_OPS::fill(cudaPol,vtemp,"dir",(T)0.0);
             // std::cout << "solve using pcg" << std::endl;
-            PCG::pcg_with_fixed_sol_solve<3,4>(cudaPol,vtemp,etemp,cptemp,"dir","bou_tag","grad","P","inds","H",cg_res,1000,50);
-            // PCG::pcg_with_fixed_sol_solve<3,4>(cudaPol,vtemp,etemp,"dir","bou_tag","grad","P","inds","H",cg_res,1000,50);
-            // std::cout << "finish solve pcg" << std::endl;
-            PCG::project<3>(cudaPol,vtemp,"dir","bou_tag");
+            auto nm_CG_iters = PCG::pcg_with_fixed_sol_solve<3,4>(cudaPol,vtemp,gh_buffer,"dir","bou_tag","grad","P","inds","H",cg_res,max_cg_iters,100);
+            fmt::print(fg(fmt::color::cyan),"nm_cg_iters : {}\n",nm_CG_iters);
             T alpha = 1.;
-            cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),alpha] __device__(int i) mutable {
-                vtemp.tuple<3>("xn", i) =
-                    vtemp.pack<3>("xn", i) + alpha * vtemp.pack<3>("dir", i);
+
+            auto nxn = TILEVEC_OPS::inf_norm<3>(cudaPol,vtemp,"xn");
+            auto ndir = TILEVEC_OPS::dot<3>(cudaPol,vtemp,"dir","dir");
+            auto nP = TILEVEC_OPS::dot<9>(cudaPol,vtemp,"P","P");
+
+            // std::cout << "vtemp's xn : " << nxn << std::endl;
+            // std::cout << "vtemp's dir : " << ndir << std::endl;
+            // std::cout << "vtemp's P : " << nP << std::endl;
+
+            cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),alpha,dt] __device__(int i) mutable {
+                vtemp.template tuple<3>("xn", i) =
+                    vtemp.template pack<3>("xn", i) + alpha * vtemp.template pack<3>("dir", i);
+                vtemp.template tuple<3>("vn",i) = 
+                    (vtemp.template pack<3>("xn",i) - vtemp.template pack<3>("xp",i))/dt; 
             });
 
-            T res = TILEVEC_OPS::inf_norm<3>(cudaPol, vtemp, "dir");// this norm is independent of descriterization
-            std::cout << "res[" << nm_iters << "] : " << res << std::endl;
-            if(res < 1e-3)
-                break;
-
-            // keep dropping, to avoid explosion
-            if(res < res0)
-                res0 = res;
-            else {
-                // reverse 
-                cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),alpha] __device__(int i) mutable {
-                    vtemp.tuple<3>("xn", i) =
-                        vtemp.pack<3>("xn", i) - alpha * vtemp.pack<3>("dir", i);
-                });
+            // nxn = TILEVEC_OPS::inf_norm<3>(cudaPol,vtemp,"xn");
+            // std::cout << "new vtemp's xn : " << nxn << std::endl;
+
 
-                break;
-            }
+            // res = TILEVEC_OPS::inf_norm<3>(cudaPol, vtemp, "dir");// this norm is independent of discretization
+            // std::cout << "res[" << nm_iters << "] : " << res << std::endl;
+            // if(res < newton_res){
+            //     fmt::print(fg(fmt::color::cyan),"reach desire newton res {} : {}\n",newton_res,res);
+            //     break;
+            // }
             nm_iters++;
         }
 
 
-
         cudaPol(zs::range(verts.size()),
-                [vtemp = proxy<space>({}, vtemp), verts = proxy<space>({}, verts),dt] __device__(int vi) mutable {
-                    auto newX = vtemp.pack<3>("xn", vi);
-                    verts.tuple<3>("x", vi) = newX;
-                    verts.tuple<3>("v",vi) = (vtemp.pack<3>("xn",vi) - vtemp.pack<3>("xp",vi))/dt;
+                [vtemp = proxy<space>({}, vtemp), verts = proxy<space>({}, verts),dt = dt] __device__(int vi) mutable {
+                    // auto newX = vtemp.pack(dim_c<3>,"xn", vi);
+                    verts.tuple<3>("x", vi) = vtemp.pack(dim_c<3>,"xn", vi);
+                    // if(verts.hasProperty("dt"))
+                    //     dt = verts("dt",vi);
+                    verts.tuple<3>("v",vi) = vtemp.pack<3>("vn",vi);
                 });
 
-        dtiles_t nodalForceVis(verts.get_allocator(),
-            {
-                {"x",3},
-                {"dir",3},
-            },verts.size());
-
-
-
-        // TILEVEC_OPS::copy<3>(cudaPol,vtemp,"xn",nodalForceVis,"x");
-        // TILEVEC_OPS::fill<3>(cudaPol,nodalForceVis,"dir",zs::vec<T,3>::zeros());
-        // TILEVEC_OPS::assemble<3,4>(cudaPol,cptemp,"grad",nodalForceVis,"dir");
-
-
-
-
-
         set_output("ZSParticles", zsparticles);
     }
-
-
 };
 
-ZENDEFNODE(FleshDynamicStepping, {{"ZSParticles",
+ZENDEFNODE(FleshDynamicStepping, {{"ZSParticles","kinematic_boundary",
                                     "gravity","Acts",
                                     "driven_boudary",
                                     {"string","driven_tag","bone_bw"},
@@ -918,13 +1059,21 @@ ZENDEFNODE(FleshDynamicStepping, {{"ZSParticles",
                                     {"float","cstiffness","0.0"},
                                     {"float","in_collisionEps","0.01"},
                                     {"float","out_collisionEps","0.01"},
-                                    {"float","dt","0.5"}
+                                    {"float","kineCstiffness","1"},
+                                    {"float","kine_inCollisionEps","0.01"},
+                                    {"float","kine_outCollisionEps","0.02"},
+                                    {"float","dt","0.5"},
+                                    {"float","newton_res","0.001"},
+                                    {"float","kd_alpha","0.01"},
+                                    {"float","kd_beta","0.01"},
+                                    {"float","kd_theta","0.01"},
+                                    {"float","aniso_strength","1.0"},
                                     },
                                   {"ZSParticles"},
                                   {
+                                    {"int","max_cg_iters","1000"},
+                                    {"int","max_newton_iters","5"},
+                                    {"float","cg_res","0.0001"}
                                   },
                                   {"FEM"}});
-
-
-
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/FleshQuasiStepping.cu b/projects/CuLagrange/fem/FleshQuasiStepping.cu
deleted file mode 100644
index 8ee94fb5d5..0000000000
--- a/projects/CuLagrange/fem/FleshQuasiStepping.cu
+++ /dev/null
@@ -1,588 +0,0 @@
-#include "Structures.hpp"
-#include "zensim/Logger.hpp"
-#include "zensim/cuda/execution/ExecutionPolicy.cuh"
-#include "zensim/omp/execution/ExecutionPolicy.hpp"
-#include "zensim/geometry/PoissonDisk.hpp"
-#include "zensim/geometry/VdbLevelSet.h"
-#include "zensim/geometry/VdbSampler.h"
-#include "zensim/io/MeshIO.hpp"
-#include "zensim/math/bit/Bits.h"
-#include "zensim/types/Property.h"
-#include <atomic>
-#include <zeno/VDBGrid.h>
-#include <zeno/types/ListObject.h>
-#include <zeno/types/NumericObject.h>
-#include <zeno/types/PrimitiveObject.h>
-#include <zeno/types/StringObject.h>
-
-#include "../geometry/linear_system/mfcg.hpp"
-
-namespace zeno {
-
-
-struct FleshQuasiStaticStepping : INode {
-  using T = float;
-  using dtiles_t = zs::TileVector<T,32>;
-  using tiles_t = typename ZenoParticles::particles_t;
-  using vec3 = zs::vec<T, 3>;
-  using mat3 = zs::vec<T, 3, 3>;
-  struct FEMQuasiStaticSystem {
-
-    constexpr auto dFAdF(const mat3& A) {
-        zs::vec<T,9,9> M{};
-        M(0,0) = M(1,1) = M(2,2) = A(0,0);
-        M(3,0) = M(4,1) = M(5,2) = A(0,1);
-        M(6,0) = M(7,1) = M(8,2) = A(0,2);
-
-        M(0,3) = M(1,4) = M(2,5) = A(1,0);
-        M(3,3) = M(4,4) = M(5,5) = A(1,1);
-        M(6,3) = M(7,4) = M(8,5) = A(1,2);
-
-        M(0,6) = M(1,7) = M(2,8) = A(2,0);
-        M(3,6) = M(4,7) = M(5,8) = A(2,1);
-        M(6,6) = M(7,7) = M(8,8) = A(2,2);
-
-        return M;        
-    }
-
-
-    template <typename Pol, typename Model>
-    T energy(Pol &pol, const Model &model, const zs::SmallString tag, dtiles_t& vtemp,dtiles_t& etemp) {
-      using namespace zs;
-      constexpr auto space = execspace_e::cuda;
-      Vector<T> res{verts.get_allocator(), 1};
-      res.setVal(0);
-      bool shouldSync = pol.shouldSync();
-      pol.sync(true);
-    //   elastic potential
-      pol(range(eles.size()), [verts = proxy<space>({}, verts),
-                               eles = proxy<space>({}, eles),
-                               vtemp = proxy<space>({}, vtemp),
-                               etemp = proxy<space>({},etemp),
-                               res = proxy<space>(res), tag, model = model,volf = volf] 
-                               ZS_LAMBDA (int ei) mutable {
-        auto DmInv = eles.template pack<3, 3>("IB", ei);
-        auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits<int>();
-        vec3 xs[4] = {vtemp.pack<3>(tag, inds[0]), vtemp.pack<3>(tag, inds[1]),
-                      vtemp.pack<3>(tag, inds[2]), vtemp.pack<3>(tag, inds[3])};
-        mat3 FAct{};
-        {
-          auto x1x0 = xs[1] - xs[0];
-          auto x2x0 = xs[2] - xs[0];
-          auto x3x0 = xs[3] - xs[0];
-          auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                         x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-          FAct = Ds * DmInv;
-
-          FAct = FAct * etemp.template pack<3,3>("ActInv",ei);
-
-        //   if(ei == 0) {
-        //     printf("FAct in energy : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",
-        //         (float)FAct(0,0),(float)FAct(0,1),(float)FAct(0,2),
-        //         (float)FAct(1,0),(float)FAct(1,1),(float)FAct(1,2),
-        //         (float)FAct(2,0),(float)FAct(2,1),(float)FAct(2,2));
-        //   }
-        }
-
-        auto psi = model.psi(FAct);
-        auto vole = eles("vol", ei);
-
-        T gpsi = 0;
-        for(int i = 0;i != 4;++i)
-            gpsi += (-volf.dot(xs[i])/4); 
-
-        atomic_add(exec_cuda, &res[0], (T)(vole * (psi + gpsi)));
-      });
-// Bone Driven Potential Energy
-      T lambda = model.lam;
-      T mu = model.mu;
-      auto nmEmbedVerts = b_verts.size();
-      if(b_bcws.size() != b_verts.size()){
-          fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-          throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-      }
-      pol(range(nmEmbedVerts), [vtemp = proxy<space>({},vtemp),
-          eles = proxy<space>({},eles),
-          b_verts = proxy<space>({},b_verts),
-          bcws = proxy<space>({},b_bcws),lambda,mu,tag,res = proxy<space>(res),bone_driven_weight = bone_driven_weight]
-          ZS_LAMBDA(int vi) mutable {
-              auto ei = reinterpret_bits<int>(bcws("inds",vi));
-              if(ei < 0)
-                  return;
-              auto inds = eles.pack(dim_c<4>, "inds", ei).reinterpret_bits<int>();
-              auto w = bcws.pack(dim_c<4>, "w",vi);
-
-              auto tpos = vec3::zeros();
-              for(size_t i = 0;i != 4;++i)
-                  tpos += w[i] * vtemp.pack<3>(tag,inds[i]);
-              auto pdiff = tpos - b_verts.pack<3>("x",vi);
-
-              T stiffness = 2.0066 * mu + 1.0122 * lambda;
-              // if(eles("vol",ei) < 0)
-              //     printf("WARNING INVERT TET DETECTED<%d> %f\n",ei,(float)eles("vol",ei));
-              T bpsi = (0.5 * bcws("cnorm",vi) * stiffness * bone_driven_weight * eles("vol",ei)) * pdiff.l2NormSqr();
-                // bpsi = (0.5 * bcws("cnorm",vi) * lambda * bone_driven_weight) * pdiff.dot(pdiff);
-                // the cnorm here should be the allocated volume of point in embeded tet 
-              atomic_add(exec_cuda, &res[0], (T)bpsi);
-      });
-      pol.sync(shouldSync);
-      return res.getVal();
-    }
-
-    template <typename Model>
-    void computeGradientAndHessian(zs::CudaExecutionPolicy& cudaPol,
-                                            const Model& model,
-                                            const zs::SmallString tag, 
-                                            dtiles_t& vtemp,
-                                            dtiles_t& etemp) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        // fmt::print("check here 0");
-        TILEVEC_OPS::fill<3>(cudaPol,vtemp,"grad",zs::vec<T,3>::zeros());
-        TILEVEC_OPS::fill<144>(cudaPol,etemp,"He",zs::vec<T,144>::zeros());
-        cudaPol(zs::range(eles.size()), [this,
-                                        vtemp = proxy<space>({}, vtemp),
-                                        etemp = proxy<space>({}, etemp),
-                                        bcws = proxy<space>({},b_bcws),
-                                        b_verts = proxy<space>({},b_verts),
-                                        verts = proxy<space>({}, verts),
-                                        eles = proxy<space>({}, eles),tag, model, volf = volf] ZS_LAMBDA (int ei) mutable {
-            auto DmInv = eles.template pack<3, 3>("IB", ei);
-            auto dFdX = dFdXMatrix(DmInv);
-            auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits<int>();
-            vec3 xs[4] = {vtemp.pack<3>(tag, inds[0]), vtemp.pack<3>(tag, inds[1]),
-                            vtemp.pack<3>(tag, inds[2]), vtemp.pack<3>(tag, inds[3])};
-            mat3 FAct{};
-            {
-                auto x1x0 = xs[1] - xs[0];
-                auto x2x0 = xs[2] - xs[0];
-                auto x3x0 = xs[3] - xs[0];
-                auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                            x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                FAct = Ds * DmInv;
-
-                FAct = FAct * etemp.template pack<3,3>("ActInv",ei);
-
-                // if(ei == 0) {
-                //     printf("FAct in gH : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",
-                //         (float)FAct(0,0),(float)FAct(0,1),(float)FAct(0,2),
-                //         (float)FAct(1,0),(float)FAct(1,1),(float)FAct(1,2),
-                //         (float)FAct(2,0),(float)FAct(2,1),(float)FAct(2,2));
-                    
-                //     auto Act =  etemp.template pack<3,3>("ActInv",ei);
-
-                //     printf("Act in gH : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",
-                //         (float)Act(0,0),(float)Act(0,1),(float)Act(0,2),
-                //         (float)Act(1,0),(float)Act(1,1),(float)Act(1,2),
-                //         (float)Act(2,0),(float)Act(2,1),(float)Act(2,2));                        
-                // }
-
-                // auto ActInv_check = etemp.template pack<3,3>("ActInv",ei);
-                // for(int i = 0;i != 3;++i)
-                //     ActInv_check(i,i) -= 1.0;
-                // if(ActInv_check.norm() > 1){
-                //     auto ActInv = etemp.template pack<3,3>("ActInv",ei);
-                //     printf("wierd ActInv<%d> in gH : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",ei,
-                //         (float)ActInv(0,0),(float)ActInv(0,1),(float)ActInv(0,2),
-                //         (float)ActInv(1,0),(float)ActInv(1,1),(float)ActInv(1,2),
-                //         (float)ActInv(2,0),(float)ActInv(2,1),(float)ActInv(2,2));  
-                // }
-
-            }
-
-            auto dFActdF = dFAdF(etemp.template pack<3,3>("ActInv",ei));
-
-            auto P = model.first_piola(FAct);
-            auto vole = eles("vol", ei);
-            auto vecP = flatten(P);
-            vecP = dFActdF.transpose() * vecP;
-            auto dFdXT = dFdX.transpose();
-            auto vf = -vole * (dFdXT * vecP);
-
-            auto mg = volf * vole / 4;
-            for (int i = 0; i != 4; ++i) {
-                auto vi = inds[i];
-                for (int d = 0; d != 3; ++d)
-                    atomic_add(exec_cuda, &vtemp("grad", d, vi), vf(i * 3 + d) + mg(d));
-            }
-
-            auto Hq = model.first_piola_derivative(FAct, true_c);
-            auto dFdAct_dFdX = dFActdF * dFdX; 
-            // dFdAct_dFdX = dFdX; 
-            auto H = dFdAct_dFdX.transpose() * Hq * dFdAct_dFdX * vole;
-
-            etemp.tuple<12 * 12>("He", ei) = H;
-
-
-            // auto Hn = H.norm();
-            // if(isnan(Hn)){
-            //     auto Hqn = Hq.norm();
-            //     auto dFdXn = dFdAct_dFdX.norm();
-            //     printf("elm<%d>_Hn : %f %f %f\n",ei,(float)Hn,(float)dFdXn,(float)Hqn);
-            //     printf("FAct<%d> in gH : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",ei,
-            //         (float)FAct(0,0),(float)FAct(0,1),(float)FAct(0,2),
-            //         (float)FAct(1,0),(float)FAct(1,1),(float)FAct(1,2),
-            //         (float)FAct(2,0),(float)FAct(2,1),(float)FAct(2,2));
-
-            //     auto Act = etemp.template pack<3,3>("ActInv",ei);
-            //     printf("Act<%d> in gH : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",ei,
-            //         (float)Act(0,0),(float)Act(0,1),(float)Act(0,2),
-            //         (float)Act(1,0),(float)Act(1,1),(float)Act(1,2),
-            //         (float)Act(2,0),(float)Act(2,1),(float)Act(2,2));                
-            // }
-
-        });
-
-
-        // fmt::print("check here 1\n");
-        T lambda = model.lam;
-        T mu = model.mu;
-        if(b_bcws.size() != b_verts.size()){
-            fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-            throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-        }
-
-        // fmt::print("check here 2\n");
-
-        auto nmEmbedVerts = b_verts.size();
-        cudaPol(zs::range(nmEmbedVerts),
-            [bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                eles = proxy<space>({},eles),lambda,mu,tag,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
-                    auto ei = reinterpret_bits<int>(bcws("inds",vi));
-                    if(ei < 0)
-                        return;
-                    auto inds = eles.pack<4>("inds",ei).reinterpret_bits<int>();
-                    auto w = bcws.pack(dim_c<4>, "w", vi);
-                    auto tpos = vec3::zeros();
-                    for(size_t i = 0;i != 4;++i)
-                        tpos += w[i] * vtemp.pack(dim_c<3>, tag, inds[i]);
-                    auto pdiff = tpos - b_verts.pack(dim_c<3>, "x", vi);
-
-                    T stiffness = 2.0066 * mu + 1.0122 * lambda;
-
-                    for(size_t i = 0;i != 4;++i){
-                        auto tmp = pdiff * (-stiffness * bcws("cnorm",vi) * bone_driven_weight * w[i] * eles("vol",ei)); 
-                        // tmp = pdiff * (-lambda * bcws("cnorm",vi) * bone_driven_weight * w[i]);
-                        for(size_t d = 0;d != 3;++d)
-                            atomic_add(exec_cuda,&vtemp("grad",d,inds[i]),(T)tmp[d]);
-                    }
-                    for(int i = 0;i != 4;++i)
-                        for(int j = 0;j != 4;++j){
-                            T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi) * eles("vol",ei);
-                            // alpha = lambda * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi);
-                            // if(ei == 11221)
-                            //   if(i == 3 && j == 3)
-                            //     printf("alpha : %f\n",alpha);
-                            for(int d = 0;d != 3;++d){
-                                // etemp("He",(i * 3 + d) * 12 + j * 3 + d,ei) += alpha;
-                                // if(isnan(alpha)){
-                                //     printf("nan alpha<%d,%d,%d> %f %f %f %f %f\n",vi,i,j,(float)lambda,(float)bone_driven_weight,(float)w[i],(float)w[j],(float)bcws("cnorm",vi));
-                                // }
-                                atomic_add(exec_cuda,&etemp("He",(i * 3 + d) * 12 + j * 3 + d,ei),alpha);
-                            }
-                        }
-
-        });
-
-    }
-
-    FEMQuasiStaticSystem(const tiles_t &verts, const tiles_t &eles, const tiles_t &b_bcws, const tiles_t& b_verts,T bone_driven_weight,vec3 volf)
-        : verts{verts}, eles{eles}, b_bcws{b_bcws}, b_verts{b_verts}, bone_driven_weight{bone_driven_weight},volf{volf}{}
-
-    const tiles_t &verts;
-    const tiles_t &eles;
-    const tiles_t &b_bcws;  // the barycentric interpolation of embeded bones 
-    const tiles_t &b_verts; // the position of embeded bones
-
-    T bone_driven_weight;
-    vec3 volf;
-  };
-
-  void apply() override {
-    using namespace zs;
-    auto zstets = get_input<ZenoParticles>("ZSParticles");
-    auto gravity = zeno::vec<3,T>(0);
-    if(has_input("gravity"))
-      gravity = get_input<zeno::NumericObject>("gravity")->get<zeno::vec<3,T>>();
-    auto armijo = get_param<float>("armijo");
-    auto curvature = get_param<float>("wolfe");
-    auto cg_res = get_param<float>("cg_res");
-    auto btl_res = get_param<float>("btl_res");
-    auto models = zstets->getModel();
-    auto& verts = zstets->getParticles();
-    auto& eles = zstets->getQuadraturePoints();
-    auto zsbones = get_input<ZenoParticles>("driven_bones");
-    auto tag = get_param<std::string>("driven_tag");
-    auto muscle_id_tag = get_param<std::string>("muscle_id_tag");
-    auto bone_driven_weight = get_param<float>("bone_driven_weight");
-    auto newton_res = get_param<float>("newton_res");
-
-    auto volf = vec3::from_array(gravity * models.density);
-
-    // auto nm_acts = get_input<zeno::ListObject>("Acts")->arr.size();
-    // fmt::print("number of activations : {}\n",nm_acts);
-
-    std::vector<float> act_;    
-    std::size_t nm_acts = 0;
-    // auto nm_acts_ = zstets->get().get("NM_MUSCLES");
-    // std::cout << "nm_acts_ : " << std::endl;
-
-    if(has_input("Acts")) {
-      act_ = get_input<zeno::ListObject>("Acts")->getLiterial<float>();
-      nm_acts = act_.size();
-    }
-    // auto act_ = get_input<zeno::ListObject>("Acts")->getLiterial<float>();
-    // initialize on host qs[i] = qs_[i]->get<zeno::vec4f>();
-
-    constexpr auto host_space = zs::execspace_e::openmp;
-    auto ompExec = zs::omp_exec();
-    auto act_buffer = dtiles_t{{{"act",1}},nm_acts,zs::memsrc_e::host};
-    ompExec(range(act_buffer.size()),
-        [act_buffer = proxy<host_space>({},act_buffer),act_] (int i) mutable{
-            act_buffer("act",i) = act_[i];
-            // fmt::print("act<{}> : {}\n",i,act_buffer("act",i));
-    });
-    act_buffer = act_buffer.clone({zs::memsrc_e::device, 0});
-
-    static dtiles_t vtemp{verts.get_allocator(),
-                          {{"grad", 3},
-                           {"P", 9},
-                           {"bou_tag",1},
-                           {"dir", 3},
-                           {"xn", 3},
-                           {"xn0", 3},
-                           {"temp", 3},
-                           {"r", 3},
-                           {"p", 3},
-                           {"q", 3}},
-                          verts.size()};
-    static dtiles_t etemp{eles.get_allocator(), {{"He", 12 * 12},{"inds",4},{"ActInv",3*3},{"muscle_ID",1},{"fiber",3}}, eles.size()};
-    vtemp.resize(verts.size());
-    etemp.resize(eles.size());
-
-    FEMQuasiStaticSystem A{verts,eles,(*zstets)[tag],zsbones->getParticles(),bone_driven_weight,volf};
-
-    constexpr auto space = execspace_e::cuda;
-    auto cudaPol = cuda_exec().sync(false);
-
-    TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
-
-
-    if(!eles.hasProperty("fiber")){
-        // fmt::print("The input flesh have no fiber orientations, use the default setting\n");
-        TILEVEC_OPS::fill<3>(cudaPol,etemp,"fiber",{1.,0.,0.});
-        // throw std::runtime_error("The input flesh should have fiber orientations");
-
-    }else {
-      if(eles.getPropertySize("fiber") != 3){
-          fmt::print("The input fiber  has wrong channel size\n");
-          throw std::runtime_error("The input fiber has wrong channel size");
-      }
-      TILEVEC_OPS::copy<3>(cudaPol,eles,"fiber",etemp,"fiber");
-    }
-    if(!eles.hasProperty(muscle_id_tag)) {
-      // if((!eles.hasProperty(muscle_id_tag)) || (eles.getPropertySize(muscle_id_tag) != 1)){
-      //     fmt::print("the quadrature has no muscle id tag : {} {}\n",muscle_id_tag,eles.getPropertySize(muscle_id_tag));
-      //     throw std::runtime_error("the quadrature has no muscle id tag");
-      // }
-      // fmt::print("The input flesh have no mosucle_id specified, use the default setting");
-      TILEVEC_OPS::fill(cudaPol,etemp,"muscle_ID",-1);
-    }else {
-      TILEVEC_OPS::copy(cudaPol,eles,muscle_id_tag,etemp,"muscle_ID");
-    }
-
-    // apply muscle activation
-    cudaPol(range(etemp.size()),
-        [etemp = proxy<space>({},etemp),act_buffer = proxy<space>({},act_buffer),muscle_id_tag = SmallString(muscle_id_tag),nm_acts] ZS_LAMBDA(int ei) mutable {
-            // auto act = eles.template pack<3>("act",ei);
-            auto fiber = etemp.template pack<3>("fiber",ei);
-              
-            vec3 act{0};
-
-            auto nfiber = fiber.norm();
-            auto ID = etemp("muscle_ID",ei);
-            if(nfiber < 0.5 || ID < -1e-6 || nm_acts == 0){ // if there is no local fiber orientaion, use the default act and fiber
-                fiber = vec3{1.0,0.0,0.0};
-                act = vec3{1.0,1.0,1.0};
-            }else{
-                // a test
-                int id = (int)ID;
-                float a = 1. - act_buffer("act",id);
-                act = vec3{1,zs::sqrt(1./a),zs::sqrt(1./a)};
-                fiber /= nfiber;// in case there is some floating-point error
-
-                // printf("use act[%d] : %f\n",id,(float)a);
-            }
-
-            vec3 dir[3];
-            dir[0] = fiber;
-            auto tmp = vec3{1.0,0.0,0.0};
-            dir[1] = dir[0].cross(tmp);
-            if(dir[1].length() < 1e-3) {
-                tmp = vec3{0.0,1.0,0.0};
-                dir[1] = dir[0].cross(tmp);
-            }
-
-            dir[1] = dir[1] / dir[1].length();
-            dir[2] = dir[0].cross(dir[1]);
-
-            auto R = mat3{};
-            for(int i = 0;i < 3;++i)
-                for(int j = 0;j < 3;++j)
-                    R(i,j) = dir[j][i];
-
-            auto Act = mat3::zeros();
-            Act(0,0) = act[0];
-            Act(1,1) = act[1];
-            Act(2,2) = act[2];
-
-            Act = R * Act * R.transpose();
-
-            // if(ei == 0) {
-            //     printf("Act : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",
-            //         (float)Act(0,0),(float)Act(0,1),(float)Act(0,2),
-            //         (float)Act(1,0),(float)Act(1,1),(float)Act(1,2),
-            //         (float)Act(2,0),(float)Act(2,1),(float)Act(2,2));                        
-            // }
-
-
-            etemp.template tuple<9>("ActInv",ei) = zs::inverse(Act);
-
-            // if(ei == 0) {
-            //     Act = etemp.template pack<3,3>("ActInv",ei);
-            //     printf("Act : \n%f\t%f\t%f\n%f\t%f\t%f\n%f\t%f\t%f\n",
-            //         (float)Act(0,0),(float)Act(0,1),(float)Act(0,2),
-            //         (float)Act(1,0),(float)Act(1,1),(float)Act(1,2),
-            //         (float)Act(2,0),(float)Act(2,1),(float)Act(2,2));  
-
-            //     // auto dFActdF = dFAdF(eles.template pack<3,3>("ActInv",ei));
-            //     // printf("dFActdF : \n%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\")
-
-            // }
-
-    });
-
-    // setup initial guess
-    TILEVEC_OPS::copy<3>(cudaPol,verts,verts.hasProperty("init_x") ? "init_x" : "x",vtemp,"xn");    
-    TILEVEC_OPS::fill<1>(cudaPol,vtemp,"bou_tag",zs::vec<T,1>::zeros());
-
-    for(int newtonIter = 0;newtonIter != 1000;++newtonIter){
-      match([&](auto &elasticModel) {
-        A.computeGradientAndHessian(cudaPol, elasticModel,"xn",vtemp,etemp);
-      })(models.getElasticModel());
-
-    // auto Hn = TILEVEC_OPS::dot<144>(cudaPol,etemp,"He","He");
-    // fmt::print("Hn : {}\n",(float)Hn);    
-
-    // break;
-
-    //  Prepare Preconditioning
-      PCG::prepare_block_diagonal_preconditioner<4,3>(cudaPol,"He",etemp,"P",vtemp);
-
-      // if the grad is too small, return the result
-      // Solve equation using PCG
-      TILEVEC_OPS::fill<3>(cudaPol,vtemp,"dir",zs::vec<T,3>::zeros());
-      PCG::pcg_with_fixed_sol_solve<3,4>(cudaPol,vtemp,etemp,"dir","bou_tag","grad","P","inds","He",cg_res,1000,50);
-      PCG::project<3>(cudaPol,vtemp,"dir","bou_tag");
-      PCG::project<3>(cudaPol,vtemp,"grad","bou_tag");
-      T res = TILEVEC_OPS::inf_norm<3>(cudaPol, vtemp, "dir");// this norm is independent of descriterization
-
-      if (res < newton_res) {
-        fmt::print("\t# newton optimizer reach desired resolution in {} iters with residual {}\n",
-                   newtonIter, res);
-        break;
-      }
-      T dg = TILEVEC_OPS::dot<3>(cudaPol,vtemp,"grad","dir");
-      if(fabs(dg) < btl_res){
-        // fmt::print("\t# newton optimizer reach stagnation point in {} iters with residual {}\n",newtonIter, res);
-        break;
-      }
-      if(dg < 0){
-          T gradn = std::sqrt(TILEVEC_OPS::dot<3>(cudaPol,vtemp,"grad","grad"));
-          T dirn = std::sqrt(TILEVEC_OPS::dot<3>(cudaPol,vtemp,"dir","dir"));
-          fmt::print("invalid dg = {} grad = {} dir = {}\n",dg,gradn,dirn);
-          throw std::runtime_error("INVALID DESCENT DIRECTION");
-      }
-      T alpha = 1.;
-      TILEVEC_OPS::copy<3>(cudaPol,vtemp,"xn",vtemp,"xn0");
-      T E0;
-      match([&](auto &elasticModel) {
-        E0 = A.energy(cudaPol, elasticModel, "xn0",vtemp,etemp);
-      })(models.getElasticModel());
-
-      dg = -dg;
-
-      T E{E0};
-    //   Backtracking Linesearch
-      int max_line_search = 10;
-      int line_search = 0;
-      std::vector<T> armijo_buffer(max_line_search);
-      do {
-        TILEVEC_OPS::add<3>(cudaPol,vtemp,"xn0",(T)1.0,"dir",alpha,"xn");
-        match([&](auto &elasticModel) {
-          E = A.energy(cudaPol, elasticModel, "xn",vtemp,etemp);
-        })(models.getElasticModel());
-        armijo_buffer[line_search] = (E - E0)/alpha;
-        // test Armojo condition
-        if (((double)E - (double)E0) < (double)armijo * (double)dg * (double)alpha)
-          break;
-        alpha /= 2;
-        ++line_search;
-      } while (line_search < max_line_search);
-      if(line_search == max_line_search){
-          fmt::print("LINE_SEARCH_EXCEED: %f\n",dg);
-          // for(size_t i = 0;i != max_line_search;++i)
-          //   fmt::print("AB[{}]\t = {} dg = {}\n",i,armijo_buffer[i],dg);
-      }
-
-      cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),
-                                        alpha] __device__(int i) mutable {
-        vtemp.tuple<3>("xn", i) =
-            vtemp.pack<3>("xn0", i) + alpha * vtemp.pack<3>("dir", i);
-      });
-    
-    }
-
-    cudaPol(zs::range(verts.size()),
-            [vtemp = proxy<space>({}, vtemp), verts = proxy<space>({}, verts)] __device__(int vi) mutable {
-              auto newX = vtemp.pack<3>("xn", vi);
-              verts.tuple<3>("x", vi) = newX;
-            });
-
-    cudaPol.syncCtx();
-
-    // write back muscle activation
-    auto output_act = get_param<int>("output_act");
-    if(output_act) {
-      auto ActTag = get_param<std::string>("actTag");
-      if(!eles.hasProperty(ActTag))
-        eles.append_channels(cudaPol,{{ActTag,1}});
-      TILEVEC_OPS::fill(cudaPol,eles,ActTag,0);
-      if(nm_acts > 0) {
-        cudaPol(zs::range(eles.size()),
-          [eles = proxy<space>({},eles),muscle_id_tag = zs::SmallString{muscle_id_tag},
-              act_buffer = proxy<space>({},act_buffer),ActTag = zs::SmallString{ActTag}] __device__(int ei) mutable {
-            auto ID = eles(muscle_id_tag,ei);
-            int id = (int)ID;
-            eles(ActTag,ei) = id > -1 ? act_buffer("act",id) : 0;
-            // eles(ActTag,ei) = id > -1 ? 0.5 : 0;
-        });
-      }
-    }
-
-    cudaPol.syncCtx();
-
-    set_output("ZSParticles", zstets);
-  }
-};
-
-ZENDEFNODE(FleshQuasiStaticStepping, {{"ZSParticles","driven_bones","gravity","Acts"},
-                                  {"ZSParticles"},
-                                  {{"float","armijo","0.1"},{"float","wolfe","0.9"},
-                                    {"float","cg_res","0.1"},{"float","btl_res","0.0001"},{"float","newton_res","0.001"},
-                                    {"string","driven_tag","bone_bw"},{"float","bone_driven_weight","0.0"},
-                                    {"string","muscle_id_tag","ms_id_tag"},{"int","output_act","0"},{"string","actTag","Act"}  
-                                  },
-                                  {"FEM"}});
-
-}
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/QuasiStaticStepping.cu b/projects/CuLagrange/fem/QuasiStaticStepping.cu
deleted file mode 100644
index 8521806090..0000000000
--- a/projects/CuLagrange/fem/QuasiStaticStepping.cu
+++ /dev/null
@@ -1,349 +0,0 @@
-#include "Structures.hpp"
-#include "zensim/Logger.hpp"
-#include "zensim/cuda/execution/ExecutionPolicy.cuh"
-#include "zensim/geometry/PoissonDisk.hpp"
-#include "zensim/geometry/VdbLevelSet.h"
-#include "zensim/geometry/VdbSampler.h"
-#include "zensim/io/MeshIO.hpp"
-#include "zensim/math/bit/Bits.h"
-#include "zensim/types/Property.h"
-#include <atomic>
-#include <zeno/VDBGrid.h>
-#include <zeno/types/ListObject.h>
-#include <zeno/types/NumericObject.h>
-#include <zeno/types/PrimitiveObject.h>
-#include <zeno/types/StringObject.h>
-
-#include "../geometry/linear_system/mfcg.hpp"
-
-namespace zeno {
-struct QuasiStaticStepping : INode {
-  using T = float;
-  using dtiles_t = zs::TileVector<T,32>;
-  using tiles_t = typename ZenoParticles::particles_t;
-  using vec3 = zs::vec<T, 3>;
-  using mat3 = zs::vec<T, 3, 3>;
-  struct FEMSystem {
-    template <typename Pol, typename Model>
-    T energy(Pol &pol, const Model &model, const zs::SmallString tag, dtiles_t& vtemp) {
-      using namespace zs;
-      constexpr auto space = execspace_e::cuda;
-      Vector<T> res{verts.get_allocator(), 1};
-      res.setVal(0);
-    //   elastic potential
-      pol(range(eles.size()), [verts = proxy<space>({}, verts),
-                               eles = proxy<space>({}, eles),
-                               vtemp = proxy<space>({}, vtemp),
-                               res = proxy<space>(res), tag, model = model,volf = volf] 
-                               ZS_LAMBDA (int ei) mutable {
-        auto DmInv = eles.template pack<3, 3>("IB", ei);
-        auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits<int>();
-        vec3 xs[4] = {vtemp.pack<3>(tag, inds[0]), vtemp.pack<3>(tag, inds[1]),
-                      vtemp.pack<3>(tag, inds[2]), vtemp.pack<3>(tag, inds[3])};
-        mat3 F{};
-        {
-          auto x1x0 = xs[1] - xs[0];
-          auto x2x0 = xs[2] - xs[0];
-          auto x3x0 = xs[3] - xs[0];
-          auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                         x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-          F = Ds * DmInv;
-        }
-        auto psi = model.psi(F);
-        auto vole = eles("vol", ei);
-
-        T gpsi = 0;
-        for(int i = 0;i != 4;++i)
-            gpsi += (-volf.dot(xs[i])/4); 
-
-        atomic_add(exec_cuda, &res[0], (T)(vole * (psi + gpsi)));
-      });
-// Bone Driven Potential Energy
-      T lambda = model.lam;
-      T mu = model.mu;
-      auto nmEmbedVerts = b_verts.size();
-      if(b_bcws.size() != b_verts.size()){
-          fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-          throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-      }
-      pol(range(nmEmbedVerts), [vtemp = proxy<space>({},vtemp),
-          eles = proxy<space>({},eles),
-          b_verts = proxy<space>({},b_verts),
-          bcws = proxy<space>({},b_bcws),lambda,mu,tag,res = proxy<space>(res),bone_driven_weight = bone_driven_weight]
-          ZS_LAMBDA(int vi) mutable {
-              auto ei = reinterpret_bits<int>(bcws("inds",vi));
-              if(ei < 0)
-                  return;
-              auto inds = eles.pack<4>("inds",ei).reinterpret_bits<int>();
-              auto w = bcws.pack<4>("w",vi);
-
-              auto tpos = vec3::zeros();
-              for(size_t i = 0;i != 4;++i)
-                  tpos += w[i] * vtemp.pack<3>(tag,inds[i]);
-              auto pdiff = tpos - b_verts.pack<3>("x",vi);
-
-              T stiffness = 2.0066 * mu + 1.0122 * lambda;
-              // if(eles("vol",ei) < 0)
-              //     printf("WARNING INVERT TET DETECTED<%d> %f\n",ei,(float)eles("vol",ei));
-              T bpsi = (0.5 * bcws("cnorm",vi) * stiffness * bone_driven_weight * eles("vol",ei)) * pdiff.l2NormSqr();
-                    // bpsi = (0.5 * bcws("cnorm",vi) * lambda * bone_driven_weight) * pdiff.dot(pdiff);
-// the cnorm here should be the allocated volume of point in embeded tet 
-              atomic_add(exec_cuda, &res[0], (T)bpsi);
-      });
-
-      return res.getVal();
-    }
-
-    template <typename Model>
-    void computeGradientAndHessian(zs::CudaExecutionPolicy& cudaPol,
-                                            const Model& model,
-                                            const zs::SmallString tag, 
-                                            dtiles_t& vtemp,
-                                            dtiles_t& etemp) {
-        using namespace zs;
-        constexpr auto space = execspace_e::cuda;
-        // fmt::print("check here 0");
-        TILEVEC_OPS::fill<3>(cudaPol,vtemp,"grad",zs::vec<T,3>::zeros());
-        TILEVEC_OPS::fill<144>(cudaPol,etemp,"He",zs::vec<T,144>::zeros());
-        cudaPol(zs::range(eles.size()), [vtemp = proxy<space>({}, vtemp),
-                                        etemp = proxy<space>({}, etemp),
-                                        bcws = proxy<space>({},b_bcws),
-                                        b_verts = proxy<space>({},b_verts),
-                                        verts = proxy<space>({}, verts),
-                                        eles = proxy<space>({}, eles),tag, model, volf = volf] ZS_LAMBDA (int ei) mutable {
-            auto DmInv = eles.template pack<3, 3>("IB", ei);
-            auto dFdX = dFdXMatrix(DmInv);
-            auto inds = eles.template pack<4>("inds", ei).template reinterpret_bits<int>();
-            vec3 xs[4] = {vtemp.pack<3>(tag, inds[0]), vtemp.pack<3>(tag, inds[1]),
-                            vtemp.pack<3>(tag, inds[2]), vtemp.pack<3>(tag, inds[3])};
-            mat3 F{};
-            {
-                auto x1x0 = xs[1] - xs[0];
-                auto x2x0 = xs[2] - xs[0];
-                auto x3x0 = xs[3] - xs[0];
-                auto Ds = mat3{x1x0[0], x2x0[0], x3x0[0], x1x0[1], x2x0[1],
-                            x3x0[1], x1x0[2], x2x0[2], x3x0[2]};
-                F = Ds * DmInv;
-            }
-            auto P = model.first_piola(F);
-            auto vole = eles("vol", ei);
-            auto vecP = flatten(P);
-            auto dFdXT = dFdX.transpose();
-            auto vf = -vole * (dFdXT * vecP);
-
-            auto mg = volf * vole / 4;
-            for (int i = 0; i != 4; ++i) {
-                auto vi = inds[i];
-                for (int d = 0; d != 3; ++d)
-                    atomic_add(exec_cuda, &vtemp("grad", d, vi), vf(i * 3 + d) + mg(d));
-            }
-
-            auto Hq = model.first_piola_derivative(F, true_c);
-            auto H = dFdXT * Hq * dFdX * vole;
-
-            etemp.tuple<12 * 12>("He", ei) = H;
-
-        });
-
-
-        // fmt::print("check here 1\n");
-        T lambda = model.lam;
-        T mu = model.mu;
-        if(b_bcws.size() != b_verts.size()){
-            fmt::print("B_BCWS_SIZE = {}\t B_VERTS_SIZE = {}\n",b_bcws.size(),b_verts.size());
-            throw std::runtime_error("B_BCWS SIZE AND B_VERTS SIZE NOT MATCH");
-        }
-
-        // fmt::print("check here 2\n");
-
-        auto nmEmbedVerts = b_verts.size();
-        cudaPol(zs::range(nmEmbedVerts),
-            [bcws = proxy<space>({},b_bcws),b_verts = proxy<space>({},b_verts),vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),
-                eles = proxy<space>({},eles),lambda,mu,tag,bone_driven_weight = bone_driven_weight] ZS_LAMBDA(int vi) mutable {
-                    auto ei = reinterpret_bits<int>(bcws("inds",vi));
-                    if(ei < 0)
-                        return;
-                    auto inds = eles.pack<4>("inds",ei).reinterpret_bits<int>();
-                    auto w = bcws.pack<4>("w",vi);
-                    auto tpos = vec3::zeros();
-                    for(size_t i = 0;i != 4;++i)
-                        tpos += w[i] * vtemp.pack<3>(tag,inds[i]);
-                    auto pdiff = tpos - b_verts.pack<3>("x",vi);
-
-                    T stiffness = 2.0066 * mu + 1.0122 * lambda;
-
-                    for(size_t i = 0;i != 4;++i){
-                        auto tmp = pdiff * (-stiffness * bcws("cnorm",vi) * bone_driven_weight * w[i] * eles("vol",ei)); 
-                        // tmp = pdiff * (-lambda * bcws("cnorm",vi) * bone_driven_weight * w[i]);
-                        for(size_t d = 0;d != 3;++d)
-                            atomic_add(exec_cuda,&vtemp("grad",d,inds[i]),(T)tmp[d]);
-                    }
-                    for(int i = 0;i != 4;++i)
-                        for(int j = 0;j != 4;++j){
-                            T alpha = stiffness * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi) * eles("vol",ei);
-                            // alpha = lambda * bone_driven_weight * w[i] * w[j] * bcws("cnorm",vi);
-                            // if(ei == 11221)
-                            //   if(i == 3 && j == 3)
-                            //     printf("alpha : %f\n",alpha);
-                            for(int d = 0;d != 3;++d){
-                                // etemp("He",(i * 3 + d) * 12 + j * 3 + d,ei) += alpha;
-                                if(isnan(alpha)){
-                                    printf("nan alpha<%d,%d,%d> %f %f %f %f %f\n",vi,i,j,(float)lambda,(float)bone_driven_weight,(float)w[i],(float)w[j],(float)bcws("cnorm",vi));
-                                }
-                                atomic_add(exec_cuda,&etemp("He",(i * 3 + d) * 12 + j * 3 + d,ei),alpha);
-                            }
-                        }
-
-        });
-
-    }
-
-    FEMSystem(const tiles_t &verts, const tiles_t &eles, const tiles_t &b_bcws, const tiles_t& b_verts,T bone_driven_weight,vec3 volf)
-        : verts{verts}, eles{eles}, b_bcws{b_bcws}, b_verts{b_verts}, bone_driven_weight{bone_driven_weight},volf{volf}{}
-
-    const tiles_t &verts;
-    const tiles_t &eles;
-    const tiles_t &b_bcws;  // the barycentric interpolation of embeded bones 
-    const tiles_t &b_verts; // the position of embeded bones
-
-    T bone_driven_weight;
-    vec3 volf;
-  };
-
-  void apply() override {
-    using namespace zs;
-    auto zstets = get_input<ZenoParticles>("ZSParticles");
-    auto gravity = get_input<zeno::NumericObject>("gravity")->get<zeno::vec<3,T>>();
-    auto armijo = get_param<float>("armijo");
-    auto curvature = get_param<float>("wolfe");
-    auto cg_res = get_param<float>("cg_res");
-    auto btl_res = get_param<float>("btl_res");
-    auto models = zstets->getModel();
-    auto& verts = zstets->getParticles();
-    auto& eles = zstets->getQuadraturePoints();
-    auto zsbones = get_input<ZenoParticles>("driven_bones");
-    auto tag = get_param<std::string>("driven_tag");
-    auto bone_driven_weight = get_param<float>("bone_driven_weight");
-    auto newton_res = get_param<float>("newton_res");
-
-    auto volf = vec3::from_array(gravity * models.density);
-
-    static dtiles_t vtemp{verts.get_allocator(),
-                          {{"grad", 3},
-                           {"P", 9},
-                           {"bou_tag",1},
-                           {"dir", 3},
-                           {"xn", 3},
-                           {"xn0", 3},
-                           {"temp", 3},
-                           {"r", 3},
-                           {"p", 3},
-                           {"q", 3}},
-                          verts.size()};
-    static dtiles_t etemp{eles.get_allocator(), {{"He", 12 * 12},{"inds",4}}, eles.size()};
-    vtemp.resize(verts.size());
-    etemp.resize(eles.size());
-
-    FEMSystem A{verts,eles,(*zstets)[tag],zsbones->getParticles(),bone_driven_weight,volf};
-
-    constexpr auto space = execspace_e::cuda;
-    auto cudaPol = cuda_exec();
-
-    TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
-
-    // setup initial guess
-    TILEVEC_OPS::copy<3>(cudaPol,verts,verts.hasProperty("init_x") ? "init_x" : "x",vtemp,"xn");    
-    TILEVEC_OPS::fill<1>(cudaPol,vtemp,"bou_tag",zs::vec<T,1>::zeros());
-
-    for(int newtonIter = 0;newtonIter != 1000;++newtonIter){
-      match([&](auto &elasticModel) {
-        A.computeGradientAndHessian(cudaPol, elasticModel,"xn",vtemp,etemp);
-      })(models.getElasticModel());
-
-    //  Prepare Preconditioning
-      PCG::prepare_block_diagonal_preconditioner<4,3>(cudaPol,"He",etemp,"P",vtemp);
-
-      // if the grad is too small, return the result
-      // Solve equation using PCG
-      TILEVEC_OPS::fill<3>(cudaPol,vtemp,"dir",zs::vec<T,3>::zeros());
-      PCG::pcg_with_fixed_sol_solve<3,4>(cudaPol,vtemp,etemp,"dir","bou_tag","grad","P","inds","He",cg_res,1000,50);
-      PCG::project<3>(cudaPol,vtemp,"dir","bou_tag");
-      PCG::project<3>(cudaPol,vtemp,"grad","bou_tag");
-      T res = TILEVEC_OPS::inf_norm<3>(cudaPol, vtemp, "dir");// this norm is independent of descriterization
-
-      if (res < newton_res) {
-        fmt::print("\t# newton optimizer reach desired resolution in {} iters with residual {}\n",
-                   newtonIter, res);
-        break;
-      }
-      T dg = TILEVEC_OPS::dot<3>(cudaPol,vtemp,"grad","dir");
-      if(fabs(dg) < btl_res){
-        fmt::print("\t# newton optimizer reach stagnation point in {} iters with residual {}\n",
-        newtonIter, res);
-        break;
-      }
-      if(dg < 0){
-          T gradn = std::sqrt(TILEVEC_OPS::dot<3>(cudaPol,vtemp,"grad","grad"));
-          T dirn = std::sqrt(TILEVEC_OPS::dot<3>(cudaPol,vtemp,"dir","dir"));
-          fmt::print("invalid dg = {} grad = {} dir = {}\n",dg,gradn,dirn);
-          throw std::runtime_error("INVALID DESCENT DIRECTION");
-      }
-      T alpha = 1.;
-      TILEVEC_OPS::copy<3>(cudaPol,vtemp,"xn",vtemp,"xn0");
-      T E0;
-      match([&](auto &elasticModel) {
-        E0 = A.energy(cudaPol, elasticModel, "xn0",vtemp);
-      })(models.getElasticModel());
-
-      dg = -dg;
-
-      T E{E0};
-    //   Backtracking Linesearch
-      int max_line_search = 10;
-      int line_search = 0;
-      std::vector<T> armijo_buffer(max_line_search);
-      do {
-        TILEVEC_OPS::add<3>(cudaPol,vtemp,"xn0",(T)1.0,"dir",alpha,"xn");
-        match([&](auto &elasticModel) {
-          E = A.energy(cudaPol, elasticModel, "xn",vtemp);
-        })(models.getElasticModel());
-        armijo_buffer[line_search] = (E - E0)/alpha;
-        // test Armojo condition
-        if (((double)E - (double)E0) < (double)armijo * (double)dg * (double)alpha)
-          break;
-        alpha /= 2;
-        ++line_search;
-      } while (line_search < max_line_search);
-      if(line_search == max_line_search){
-          fmt::print("LINE_SEARCH_EXCEED: %f\n",dg);
-          for(size_t i = 0;i != max_line_search;++i)
-            fmt::print("AB[{}]\t = {} dg = {}\n",i,armijo_buffer[i],dg);
-      }
-
-      cudaPol(zs::range(vtemp.size()), [vtemp = proxy<space>({}, vtemp),
-                                        alpha] __device__(int i) mutable {
-        vtemp.tuple<3>("xn", i) =
-            vtemp.pack<3>("xn0", i) + alpha * vtemp.pack<3>("dir", i);
-      });
-    
-    }
-
-    cudaPol(zs::range(verts.size()),
-            [vtemp = proxy<space>({}, vtemp), verts = proxy<space>({}, verts)] __device__(int vi) mutable {
-              auto newX = vtemp.pack<3>("xn", vi);
-              verts.tuple<3>("x", vi) = newX;
-            });
-
-
-    set_output("ZSParticles", zstets);
-  }
-};
-
-ZENDEFNODE(QuasiStaticStepping, {{"ZSParticles","driven_bones","gravity"},
-                                  {"ZSParticles"},
-                                  {{"float","armijo","0.1"},{"float","wolfe","0.9"},
-                                    {"float","cg_res","0.1"},{"float","btl_res","0.0001"},{"float","newton_res","0.001"},
-                                    {"string","driven_tag","bone_bw"},{"float","bone_driven_weight","0.0"}},
-                                  {"FEM"}});
-
-}
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/collision_energy/collision_utils.hpp b/projects/CuLagrange/fem/collision_energy/collision_utils.hpp
index 9756a527b5..df631817d7 100644
--- a/projects/CuLagrange/fem/collision_energy/collision_utils.hpp
+++ b/projects/CuLagrange/fem/collision_energy/collision_utils.hpp
@@ -5,6 +5,8 @@
 
 #include "zensim/math/VecInterface.hpp"
 
+#include "../../geometry/kernel/geo_math.hpp"
+
 
 namespace zeno {
 namespace COLLISION_UTILS {
@@ -449,134 +451,6 @@ namespace COLLISION_UTILS {
         return contracted + product + product.transpose();
     }
 
-    ///////////////////////////////////////////////////////////////////////
-    // get the linear interpolation coordinates from v0 to the line segment
-    // between v1 and v2
-    ///////////////////////////////////////////////////////////////////////
-    constexpr VECTOR2 getLerp(const VECTOR3 v0, const VECTOR3& v1, const VECTOR3& v2)
-    {
-        const VECTOR3 e0 = v0 - v1;
-        const VECTOR3 e1 = v2 - v1;
-        const VECTOR3 e1hat = e1 / e1.norm();
-        const REAL projection = e0.dot(e1hat);
-
-        if (projection < 0.0)
-            return VECTOR2(1.0, 0.0);
-
-        if (projection >= e1.norm())
-            return VECTOR2(0.0, 1.0);
-
-        const REAL ratio = projection / e1.norm();
-        return VECTOR2(1.0 - ratio, ratio);
-    }
-
-
-    ///////////////////////////////////////////////////////////////////////
-    // find the distance from a line segment (v1, v2) to a point (v0)
-    ///////////////////////////////////////////////////////////////////////
-    constexpr REAL pointLineDistance(const VECTOR3 v0, const VECTOR3& v1, const VECTOR3& v2)
-    {
-        const VECTOR3 e0 = v0 - v1;
-        const VECTOR3 e1 = v2 - v1;
-        const VECTOR3 e1hat = e1 / e1.norm();
-        const REAL projection = e0.dot(e1hat);
-
-        // if it projects onto the line segment, use that length
-        if (projection > 0.0 && projection < e1.norm())
-        {
-            const VECTOR3 normal = e0 - projection * e1hat;
-            return normal.norm();
-        }
-
-        // if it doesn't, find the point-point distances
-        const REAL diff01 = (v0 - v1).norm();
-        const REAL diff02 = (v0 - v2).norm();
-
-        return (diff01 < diff02) ? diff01 : diff02;
-    }
-
-
-    ///////////////////////////////////////////////////////////////////////
-    // get the barycentric coordinate of the projection of v[0] onto the triangle
-    // formed by v[1], v[2], v[3]
-    ///////////////////////////////////////////////////////////////////////
-    constexpr VECTOR3 getBarycentricCoordinates(const VECTOR3 vertices[4])
-    {
-        const VECTOR3 v0 = vertices[1];
-        const VECTOR3 v1 = vertices[2];
-        const VECTOR3 v2 = vertices[3];
-            
-        const VECTOR3 e1 = v1 - v0;
-        const VECTOR3 e2 = v2 - v0;
-        const VECTOR3 n = e1.cross(e2);
-        const VECTOR3 nHat = n / n.norm();
-        const VECTOR3 v = vertices[0] - (nHat.dot(vertices[0] - v0)) * nHat;
-
-        // get the barycentric coordinates
-        const VECTOR3 na = (v2 - v1).cross(v - v1);
-        const VECTOR3 nb = (v0 - v2).cross(v - v2);
-        const VECTOR3 nc = (v1 - v0).cross(v - v0);
-        const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
-                                    n.dot(nb) / n.l2NormSqr(),
-                                    n.dot(nc) / n.l2NormSqr());
-
-        return barycentric;
-    }
-
-
-    ///////////////////////////////////////////////////////////////////////
-    // get the barycentric coordinate of the projection of v[0] onto the triangle
-    // formed by v[1], v[2], v[3]
-    //
-    // but, if the projection is actually outside, project to all of the
-    // edges and find the closest point that's still inside the triangle
-    ///////////////////////////////////////////////////////////////////////
-    constexpr VECTOR3 getInsideBarycentricCoordinates(const VECTOR3 vertices[4])
-    {
-        VECTOR3 barycentric = getBarycentricCoordinates(vertices);
-
-        // if it's already inside, we're all done
-        if (barycentric[0] >= 0.0 &&
-            barycentric[1] >= 0.0 &&
-            barycentric[2] >= 0.0)
-            return barycentric;
-
-        // find distance to all the line segments
-        //
-        // there's lots of redundant computation between here and getLerp,
-        // but let's get it working and see if it fixes the actual
-        // artifact before optimizing
-        REAL distance12 = pointLineDistance(vertices[0], vertices[1], vertices[2]);
-        REAL distance23 = pointLineDistance(vertices[0], vertices[2], vertices[3]);
-        REAL distance31 = pointLineDistance(vertices[0], vertices[3], vertices[1]);
-
-        // less than or equal is important here, otherwise fallthrough breaks
-        if (distance12 <= distance23 && distance12 <= distance31)
-        {
-            VECTOR2 lerp = getLerp(vertices[0], vertices[1], vertices[2]);
-            barycentric[0] = lerp[0];
-            barycentric[1] = lerp[1];
-            barycentric[2] = 0.0;
-            return barycentric;
-        }
-        
-        // less than or equal is important here, otherwise fallthrough breaks
-        if (distance23 <= distance12 && distance23 <= distance31)
-        {
-            VECTOR2 lerp = getLerp(vertices[0], vertices[2], vertices[3]);
-            barycentric[0] = 0.0;
-            barycentric[1] = lerp[0];
-            barycentric[2] = lerp[1];
-            return barycentric;
-        }
-
-        // else it must be the 31 case
-        VECTOR2 lerp = getLerp(vertices[0], vertices[3], vertices[1]);
-        barycentric[0] = lerp[1];
-        barycentric[1] = 0.0;
-        barycentric[2] = lerp[0];
-        return barycentric;
-    }
 
     ///////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////
@@ -660,182 +534,452 @@ namespace COLLISION_UTILS {
     }
 
 
-///////////////////////////////////////////////////////////////////////
-// compute distance between a point and triangle
-///////////////////////////////////////////////////////////////////////
-    constexpr REAL pointTriangleDistance(const VECTOR3& v0, const VECTOR3& v1, 
-                                        const VECTOR3& v2, const VECTOR3& v)
+
+    #define FMAX(a,b) ((a) > (b) ? (a) : (b))
+    #define FMIN(a,b) ((a) > (b) ? (b) : (a))
+    #define FABS(a) ((a) < 0.0f ? -(a) : (a))
+    #define OUT_OF_RANGE(a) ((a) < 0.0f || (a) > 1.f)
+
+
+    /**************************************************************************
+    |
+    |     Method: FindNearestPointOnLineSegment
+    |
+    |    Purpose: Given a line (segment) and a point in 3-dimensional space,
+    |             find the point on the line (segment) that is closest to the
+    |             point.
+    |
+    | Parameters: Input:
+    |             ------
+    |             A1x, A1y, A1z   - Coordinates of first defining point of the line/segment
+    |             Lx, Ly, Lz      - Vector from (A1x, A1y, A1z) to the second defining point
+    |                               of the line/segment.
+    |             Bx, By, Bz      - Coordinates of the point
+    |             infinite_line   - set to true if the line is to be treated as infinite
+    |             epsilon_squared - squared-length tolerance used to detect a degenerate
+    |                               (near zero-length) line/segment.
+    |
+    |             Output:
+    |             -------
+    |             NearestPointX,  - Point on line/segment that is closest to (Bx, By, Bz)
+    |             NearestPointY,
+    |             NearestPointZ
+    |             parameter       - Parametric coordinate of the nearest point along the
+    |                               line/segment. parameter = 0 at (A1x, A1y, A1z) and
+    |                               parameter = 1 at the second defining point of the line/
+    |                               segment. Left unmodified if the line is degenerate.
+    **************************************************************************/
+    constexpr void FindNearestPointOnLineSegment(const REAL A1x, const REAL A1y, const REAL A1z,
+                                    const REAL Lx, const REAL Ly, const REAL Lz,
+                                    const REAL Bx, const REAL By, const REAL Bz,
+                                    bool infinite_line, REAL epsilon_squared, REAL &NearestPointX,
+                                    REAL &NearestPointY, REAL &NearestPointZ,
+                                    REAL &parameter)
     {
-        // get the barycentric coordinates
-        const VECTOR3 e1 = v1 - v0;
-        const VECTOR3 e2 = v2 - v0;
-        const VECTOR3 n = e1.cross(e2);
-        const VECTOR3 na = (v2 - v1).cross(v - v1);
-        const VECTOR3 nb = (v0 - v2).cross(v - v2);
-        const VECTOR3 nc = (v1 - v0).cross(v - v0);
-        const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
-                                    n.dot(nb) / n.l2NormSqr(),
-                                    n.dot(nc) / n.l2NormSqr());
-                                    
-        const REAL barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]);
-
-        // if the point projects to inside the triangle, it should sum to 1
-        if (zs::abs(barySum - 1.0) < 1e-6)
+        // Line/Segment is degenerate --- special case #1
+        REAL D = Lx * Lx + Ly * Ly + Lz * Lz;
+        if (D < epsilon_squared)
         {
-            const VECTOR3 nHat = n / n.norm();
-            const REAL normalDistance = (nHat.dot(v - v0));
-            return zs::abs(normalDistance);
+            NearestPointX = A1x;
+            NearestPointY = A1y;
+            NearestPointZ = A1z;
+            return;
         }
 
-        // project onto each edge, find the distance to each edge
-        const VECTOR3 e3 = v2 - v1;
-        const VECTOR3 ev = v - v0;
-        const VECTOR3 ev3 = v - v1;
-        const VECTOR3 e1Hat = e1 / e1.norm();
-        const VECTOR3 e2Hat = e2 / e2.norm();
-        const VECTOR3 e3Hat = e3 / e3.norm();
-        VECTOR3 edgeDistances(1e8, 1e8, 1e8);
-
-        // see if it projects onto the interval of the edge
-        // if it doesn't, then the vertex distance will be smaller,
-        // so we can skip computing anything
-        const REAL e1dot = e1Hat.dot(ev);
-        if (e1dot > 0.0 && e1dot < e1.norm())
+        REAL ABx = Bx - A1x;
+        REAL ABy = By - A1y;
+        REAL ABz = Bz - A1z;
+
+        // parameter is computed from Equation (20).
+        parameter = (Lx * ABx + Ly * ABy + Lz * ABz) / D;
+
+        if (false == infinite_line) parameter = (REAL)FMAX(0.0, FMIN(1.0, parameter));
+
+        NearestPointX = A1x + parameter * Lx;
+        NearestPointY = A1y + parameter * Ly;
+        NearestPointZ = A1z + parameter * Lz;
+        return;
+    }
+
+
+    /**************************************************************************
+    |
+    |     Method: AdjustNearestPoints
+    |
+    |    Purpose: Given nearest point information for two infinite lines, adjust
+    |             to model finite line segments.
+    |
+    | Parameters: Input:
+    |             ------
+    |             A1x, A1y, A1z   - Coordinates of first defining point of line/segment A
+    |             Lax, Lay, Laz   - Vector from (A1x, A1y, A1z) to the (A2x, A2y, A2z).
+    |             B1x, B1y, B1z   - Coordinates of first defining point of line/segment B
+    |             Lbx, Lby, Lbz   - Vector from (B1x, B1y, B1z) to the (B2x, B2y, B2z).
+    |             epsilon_squared - tolerance value to be used to check for degenerate
+    |                               and parallel lines, and to check for true intersection.
+    |             s               - parameter representing nearest point on infinite line A
+    |             t               - parameter representing nearest point on infinite line B
+    |
+    |             Output:
+    |             -------
+    |             PointOnSegAx,   - Coordinates of the point on segment A that are nearest
+    |             PointOnSegAy,     to segment B. This corresponds to point C in the text.
+    |             PointOnSegAz
+    |             PointOnSegBx,   - Coordinates of the point on segment B that are nearest
+    |             PointOnSegBy,     to segment A. This corresponds to point D in the text.
+    |             PointOnSegBz
+    **************************************************************************/
+    constexpr void AdjustNearestPoints(REAL A1x, REAL A1y, REAL A1z,
+                            REAL Lax, REAL Lay, REAL Laz,
+                            REAL B1x, REAL B1y, REAL B1z,
+                            REAL Lbx, REAL Lby, REAL Lbz,
+                            REAL epsilon_squared, REAL s, REAL t,
+                            REAL &PointOnSegAx, REAL &PointOnSegAy, REAL &PointOnSegAz,
+                            REAL &PointOnSegBx, REAL &PointOnSegBy, REAL &PointOnSegBz)
+    {
+        // handle the case where both parameters s and t are out of range
+        if (OUT_OF_RANGE(s) && OUT_OF_RANGE(t))
+        {
+            s = FMAX((REAL)0.0, FMIN((REAL)1.0, s));
+            PointOnSegAx = (A1x + s * Lax);
+            PointOnSegAy = (A1y + s * Lay);
+            PointOnSegAz = (A1z + s * Laz);
+            FindNearestPointOnLineSegment(B1x, B1y, B1z, Lbx, Lby, Lbz, PointOnSegAx,
+                                        PointOnSegAy, PointOnSegAz, true, epsilon_squared,
+                                        PointOnSegBx, PointOnSegBy, PointOnSegBz, t);
+            if (OUT_OF_RANGE(t))
+            {
+                t = FMAX((REAL)0.0, FMIN((REAL)1.0, t));
+                PointOnSegBx = (B1x + t * Lbx);
+                PointOnSegBy = (B1y + t * Lby);
+                PointOnSegBz = (B1z + t * Lbz);
+                FindNearestPointOnLineSegment(A1x, A1y, A1z, Lax, Lay, Laz, PointOnSegBx,
+                                                PointOnSegBy, PointOnSegBz, false, epsilon_squared,
+                                                PointOnSegAx, PointOnSegAy, PointOnSegAz, s);
+                FindNearestPointOnLineSegment(B1x, B1y, B1z, Lbx, Lby, Lbz, PointOnSegAx,
+                                                PointOnSegAy, PointOnSegAz, false, epsilon_squared,
+                                                PointOnSegBx, PointOnSegBy, PointOnSegBz, t);
+            }
+        }
+        // otherwise, handle the case where the parameter for only one segment is
+        // out of range
+        else if (OUT_OF_RANGE(s))
         {
-            const VECTOR3 projected = v0 + e1Hat * e1dot;
-            edgeDistances[0] = (v - projected).norm();
+            s = FMAX((REAL)0.0, FMIN((REAL)1.0, s));
+            PointOnSegAx = (A1x + s * Lax);
+            PointOnSegAy = (A1y + s * Lay);
+            PointOnSegAz = (A1z + s * Laz);
+            FindNearestPointOnLineSegment(B1x, B1y, B1z, Lbx, Lby, Lbz, PointOnSegAx,
+                                        PointOnSegAy, PointOnSegAz, false, epsilon_squared,
+                                        PointOnSegBx, PointOnSegBy, PointOnSegBz, t);
         }
-        const REAL e2dot = e2Hat.dot(ev);
-        if (e2dot > 0.0 && e2dot < e2.norm())
+        else if (OUT_OF_RANGE(t))
         {
-            const VECTOR3 projected = v0 + e2Hat * e2dot;
-            edgeDistances[1] = (v - projected).norm();
+            t = FMAX((REAL)0.0, FMIN((REAL)1.0, t));
+            PointOnSegBx = (B1x + t * Lbx);
+            PointOnSegBy = (B1y + t * Lby);
+            PointOnSegBz = (B1z + t * Lbz);
+            FindNearestPointOnLineSegment(A1x, A1y, A1z, Lax, Lay, Laz, PointOnSegBx,
+                                        PointOnSegBy, PointOnSegBz, false, epsilon_squared,
+                                        PointOnSegAx, PointOnSegAy, PointOnSegAz, s);
         }
-        const REAL e3dot = e3Hat.dot(ev3);
-        if (e3dot > 0.0 && e3dot < e3.norm())
+    }    
+
+
+    /**************************************************************************
+    |
+    |     Method: FindNearestPointOfParallelLineSegments
+    |
+    |    Purpose: Given two lines (segments) that are known to be parallel, find
+    |             a representative point on each that is nearest to the other. If
+    |             the lines are considered to be finite then it is possible that there
+    |             is one true point on each line that is nearest to the other. This
+    |             code properly handles this case.
+    |
+    |             This is the most difficult line intersection case to handle, since
+    |             there is potentially a family, or locus of points on each line/segment
+    |             that are nearest to the other.
+    | Parameters: Input:
+    |             ------
+    |             A1x, A1y, A1z   - Coordinates of first defining point of line/segment A
+    |             A2x, A2y, A2z   - Coordinates of second defining point of line/segment A
+    |             Lax, Lay, Laz   - Vector from (A1x, A1y, A1z) to the (A2x, A2y, A2z).
+    |             B1x, B1y, B1z   - Coordinates of first defining point of line/segment B
+    |             B2x, B2y, B2z   - Coordinates of second defining point of line/segment B
+    |             Lbx, Lby, Lbz   - Vector from (B1x, B1y, B1z) to the (B2x, B2y, B2z).
+    |             infinite_lines  - set to true if lines are to be treated as infinite
+    |             epsilon_squared - tolerance value to be used to check for degenerate
+    |                               and parallel lines, and to check for true intersection.
+    |
+    |             Output:
+    |             -------
+    |             PointOnSegAx,   - Coordinates of the point on segment A that are nearest
+    |             PointOnSegAy,     to segment B. This corresponds to point C in the text.
+    |             PointOnSegAz
+    |             PointOnSegBx,   - Coordinates of the point on segment B that are nearest
+    |             PointOnSegBy,     to segment A. This corresponds to point D in the text.
+    |             PointOnSegBz
+
+    **************************************************************************/
+    constexpr void FindNearestPointOfParallelLineSegments(REAL A1x, REAL A1y, REAL A1z,
+                                                REAL A2x, REAL A2y, REAL A2z,
+                                                REAL Lax, REAL Lay, REAL Laz,
+                                                REAL B1x, REAL B1y, REAL B1z,
+                                                REAL B2x, REAL B2y, REAL B2z,
+                                                REAL Lbx, REAL Lby, REAL Lbz,
+                                                bool infinite_lines, REAL epsilon_squared,
+                                                REAL &PointOnSegAx, REAL &PointOnSegAy, REAL &PointOnSegAz,
+                                                REAL &PointOnSegBx, REAL &PointOnSegBy, REAL &PointOnSegBz)
+    {
+        REAL s[2] = {0, 0};
+        REAL temp{};
+        FindNearestPointOnLineSegment(A1x, A1y, A1z, Lax, Lay, Laz, B1x, B1y, B1z,
+                                        true, epsilon_squared, PointOnSegAx, PointOnSegAy, PointOnSegAz, s[0]);
+        if (true == infinite_lines)
         {
-            const VECTOR3 projected = v1 + e3Hat * e3dot;
-            edgeDistances[2] = (v - projected).norm();
+            PointOnSegBx = B1x;
+            PointOnSegBy = B1y;
+            PointOnSegBz = B1z;
         }
-
-        // get the distance to each vertex
-        const VECTOR3 vertexDistances((v - v0).norm(), 
-                                        (v - v1).norm(), 
-                                        (v - v2).norm());
-
-        // get the smallest of both the edge and vertex distances
-        REAL vertexMin = 1e8;
-        REAL edgeMin = 1e8;
-        for(int i = 0;i < 3;++i){
-            vertexMin = vertexMin > vertexDistances[i] ? vertexDistances[i] : vertexMin;
-            edgeMin = edgeMin > edgeDistances[i] ? edgeDistances[i] : edgeMin;
+        else
+        {
+            REAL tp[3] = {};
+            FindNearestPointOnLineSegment(A1x, A1y, A1z, Lax, Lay, Laz, B2x, B2y, B2z,
+                                        true, epsilon_squared, tp[0], tp[1], tp[2], s[1]);
+            if (s[0] < 0.0 && s[1] < 0.0)
+            {
+                PointOnSegAx = A1x;
+                PointOnSegAy = A1y;
+                PointOnSegAz = A1z;
+                if (s[0] < s[1])
+                {
+                    PointOnSegBx = B2x;
+                    PointOnSegBy = B2y;
+                    PointOnSegBz = B2z;
+                }
+                else
+                {
+                    PointOnSegBx = B1x;
+                    PointOnSegBy = B1y;
+                    PointOnSegBz = B1z;
+                }
+            }
+            else if (s[0] > (REAL)1.0 && s[1] > (REAL)1.0)
+            {
+                PointOnSegAx = A2x;
+                PointOnSegAy = A2y;
+                PointOnSegAz = A2z;
+                if (s[0] < s[1])
+                {
+                    PointOnSegBx = B1x;
+                    PointOnSegBy = B1y;
+                    PointOnSegBz = B1z;
+                }
+                else
+                {
+                    PointOnSegBx = B2x;
+                    PointOnSegBy = B2y;
+                    PointOnSegBz = B2z;
+                }
+            }
+            else
+            {
+                temp = (REAL)0.5*(FMAX((REAL)0.0, FMIN((REAL)1.0, s[0])) + FMAX((REAL)0.0, FMIN((REAL)1.0, s[1])));
+                PointOnSegAx = (A1x + temp * Lax);
+                PointOnSegAy = (A1y + temp * Lay);
+                PointOnSegAz = (A1z + temp * Laz);
+                FindNearestPointOnLineSegment(B1x, B1y, B1z, Lbx, Lby, Lbz,
+                                                PointOnSegAx, PointOnSegAy, PointOnSegAz, true,
+                                                epsilon_squared, PointOnSegBx, PointOnSegBy, PointOnSegBz, temp);
+            }
         }
-        // return the smallest of those
-        return (vertexMin < edgeMin) ? vertexMin : edgeMin;
     }
 
 
-constexpr REAL pointTriangleDistance(const VECTOR3& v0, const VECTOR3& v1, 
-                                        const VECTOR3& v2, const VECTOR3& v,REAL& barySum)
-    {
-        // get the barycentric coordinates
-        const VECTOR3 e1 = v1 - v0;
-        const VECTOR3 e2 = v2 - v0;
-        const VECTOR3 n = e1.cross(e2);
-        const VECTOR3 na = (v2 - v1).cross(v - v1);
-        const VECTOR3 nb = (v0 - v2).cross(v - v2);
-        const VECTOR3 nc = (v1 - v0).cross(v - v0);
-        const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
-                                    n.dot(nb) / n.l2NormSqr(),
-                                    n.dot(nc) / n.l2NormSqr());
-                                    
-        barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]);
-
-        // if the point projects to inside the triangle, it should sum to 1
-        if (zs::abs(barySum - 1.0) < 1e-6)
-        {
-            const VECTOR3 nHat = n / n.norm();
-            const REAL normalDistance = (nHat.dot(v - v0));
-            return zs::abs(normalDistance);
-        }
 
-        // project onto each edge, find the distance to each edge
-        const VECTOR3 e3 = v2 - v1;
-        const VECTOR3 ev = v - v0;
-        const VECTOR3 ev3 = v - v1;
-        const VECTOR3 e1Hat = e1 / e1.norm();
-        const VECTOR3 e2Hat = e2 / e2.norm();
-        const VECTOR3 e3Hat = e3 / e3.norm();
-        VECTOR3 edgeDistances(1e8, 1e8, 1e8);
-
-        // see if it projects onto the interval of the edge
-        // if it doesn't, then the vertex distance will be smaller,
-        // so we can skip computing anything
-        const REAL e1dot = e1Hat.dot(ev);
-        if (e1dot > 0.0 && e1dot < e1.norm())
+    /**************************************************************************
+    |
+    |     Method: IntersectLineSegments
+    |
+    |    Purpose: Find the nearest point between two finite length line segments
+    |             or two infinite lines in 3-dimensional space. The function calculates
+    |             the point on each line/line segment that is closest to the other
+    |             line/line segment, the midpoint between the nearest points, and
+    |             the vector between these two points. If the two nearest points
+    |             are close within a tolerance, a flag is set indicating the lines
+    |             have a "true" intersection.
+    |
+    | Parameters: Input:
+    |             ------
+    |             A1x, A1y, A1z   - Coordinates of first defining point of line/segment A
+    |             A2x, A2y, A2z   - Coordinates of second defining point of line/segment A
+    |             B1x, B1y, B1z   - Coordinates of first defining point of line/segment B
+    |             B2x, B2y, B2z   - Coordinates of second defining point of line/segment B
+    |             infinite_lines  - set to true if lines are to be treated as infinite
+    |             epsilon         - tolerance value to be used to check for degenerate
+    |                               and parallel lines, and to check for true intersection.
+    |
+    |             Output:
+    |             -------
+    |             PointOnSegAx,   - Coordinates of the point on segment A that are nearest
+    |             PointOnSegAy,     to segment B. This corresponds to point C in the text.
+    |             PointOnSegAz
+    |             PointOnSegBx,   - Coordinates of the point on segment B that are nearest
+    |             PointOnSegBy,     to segment A. This corresponds to point D in the text.
+    |             PointOnSegBz
+    |             NearestPointX,  - Midpoint between the two nearest points. This can be
+    |             NearestPointY,    treated as *the* intersection point if nearest points
+    |             NearestPointZ     are sufficiently close. This corresponds to point P
+    |                               in the text.
+    |             NearestVectorX, - Vector between the nearest point on A and the nearest
+    |             NearestVectorY,   point on segment B. This vector is normal to both
+    |             NearestVectorZ    lines if the lines are infinite, but is not guaranteed
+    |                               to be normal to both lines if both lines are finite
+    |                               length.
+    |           true_intersection - true if the nearest points are close within a small
+    |                               tolerance.
+    **************************************************************************/
+    constexpr void IntersectLineSegments(const REAL A1x, const REAL A1y, const REAL A1z,
+                            const REAL A2x, const REAL A2y, const REAL A2z,
+                            const REAL B1x, const REAL B1y, const REAL B1z,
+                            const REAL B2x, const REAL B2y, const REAL B2z,
+                            bool infinite_lines, REAL epsilon, REAL &PointOnSegAx,
+                            REAL &PointOnSegAy, REAL &PointOnSegAz, REAL &PointOnSegBx,
+                            REAL &PointOnSegBy, REAL &PointOnSegBz, REAL &NearestPointX,
+                            REAL &NearestPointY, REAL &NearestPointZ, REAL &NearestVectorX,
+                            REAL &NearestVectorY, REAL &NearestVectorZ, bool &true_intersection)
+    {
+        REAL temp = (REAL)0.0;
+        REAL epsilon_squared = epsilon * epsilon;
+
+        // Compute parameters from Equations (1) and (2) in the text
+        REAL Lax = A2x - A1x;
+        REAL Lay = A2y - A1y;
+        REAL Laz = A2z - A1z;
+        REAL Lbx = B2x - B1x;
+        REAL Lby = B2y - B1y;
+        REAL Lbz = B2z - B1z;
+        // From Equation (15)
+        REAL L11 =  (Lax * Lax) + (Lay * Lay) + (Laz * Laz);
+        REAL L22 =  (Lbx * Lbx) + (Lby * Lby) + (Lbz * Lbz);
+
+        // Line/Segment A is degenerate ---- Special Case #1
+        if (L11 < epsilon_squared)
         {
-            const VECTOR3 projected = v0 + e1Hat * e1dot;
-            edgeDistances[0] = (v - projected).norm();
+            PointOnSegAx = A1x;
+            PointOnSegAy = A1y;
+            PointOnSegAz = A1z;
+            FindNearestPointOnLineSegment(B1x, B1y, B1z, Lbx, Lby, Lbz, A1x, A1y, A1z,
+                                        infinite_lines, epsilon, PointOnSegBx, PointOnSegBy,
+                                        PointOnSegBz, temp);
         }
-        const REAL e2dot = e2Hat.dot(ev);
-        if (e2dot > 0.0 && e2dot < e2.norm())
+        // Line/Segment B is degenerate ---- Special Case #1
+        else if (L22 < epsilon_squared)
         {
-            const VECTOR3 projected = v0 + e2Hat * e2dot;
-            edgeDistances[1] = (v - projected).norm();
+            PointOnSegBx = B1x;
+            PointOnSegBy = B1y;
+            PointOnSegBz = B1z;
+            FindNearestPointOnLineSegment(A1x, A1y, A1z, Lax, Lay, Laz, B1x, B1y, B1z,
+                                        infinite_lines, epsilon, PointOnSegAx, PointOnSegAy,
+                                        PointOnSegAz, temp);
         }
-        const REAL e3dot = e3Hat.dot(ev3);
-        if (e3dot > 0.0 && e3dot < e3.norm())
+        // Neither line/segment is degenerate
+        else
         {
-            const VECTOR3 projected = v1 + e3Hat * e3dot;
-            edgeDistances[2] = (v - projected).norm();
-        }
+            // Compute more parameters from Equation (3) in the text.
+            REAL ABx = B1x - A1x;
+            REAL ABy = B1y - A1y;
+            REAL ABz = B1z - A1z;
+
+            // and from Equation (15).
+            REAL L12 = -(Lax * Lbx) - (Lay * Lby) - (Laz * Lbz);
 
-        // get the distance to each vertex
-        const VECTOR3 vertexDistances((v - v0).norm(), 
-                                        (v - v1).norm(), 
-                                        (v - v2).norm());
-
-        // get the smallest of both the edge and vertex distances
-        REAL vertexMin = 1e8;
-        REAL edgeMin = 1e8;
-        for(int i = 0;i < 3;++i){
-            vertexMin = vertexMin > vertexDistances[i] ? vertexDistances[i] : vertexMin;
-            edgeMin = edgeMin > edgeDistances[i] ? edgeDistances[i] : edgeMin;
+            REAL DetL = L11 * L22 - L12 * L12;
+            // Lines/Segments A and B are parallel ---- special case #2.
+            if (FABS(DetL) < epsilon)
+            {
+                FindNearestPointOfParallelLineSegments(A1x, A1y, A1z, A2x, A2y, A2z,
+                                                        Lax, Lay, Laz,
+                                                        B1x, B1y, B1z, B2x, B2y, B2z,
+                                                        Lbx, Lby, Lbz,
+                                                        infinite_lines, epsilon,
+                                                        PointOnSegAx, PointOnSegAy, PointOnSegAz,
+                                                        PointOnSegBx, PointOnSegBy, PointOnSegBz);
+            }
+            // The general case
+            else
+            {
+                // from Equation (15)
+                REAL ra = Lax * ABx + Lay * ABy + Laz * ABz;
+                REAL rb = -Lbx * ABx - Lby * ABy - Lbz * ABz;
+
+                REAL t = (L11 * rb - ra * L12)/DetL; // Equation (12)
+
+            #ifdef USE_CRAMERS_RULE
+                REAL s = (L22 * ra - rb * L12)/DetL;
+            #else
+                REAL s = (ra-L12*t)/L11;             // Equation (13)
+            #endif // USE_CRAMERS_RULE
+
+            #ifdef CHECK_ANSWERS
+                REAL check_ra = s*L11 + t*L12;
+                REAL check_rb = s*L12 + t*L22;
+                // assert(FABS(check_ra-ra) < epsilon);
+                // assert(FABS(check_rb-rb) < epsilon);
+            #endif // CHECK_ANSWERS
+
+            // if we are dealing with infinite lines or if parameters s and t both
+            // lie in the range [0,1] then just compute the points using Equations
+            // (1) and (2) from the text.
+                PointOnSegAx = (A1x + s * Lax);
+                PointOnSegAy = (A1y + s * Lay);
+                PointOnSegAz = (A1z + s * Laz);
+                PointOnSegBx = (B1x + t * Lbx);
+                PointOnSegBy = (B1y + t * Lby);
+                PointOnSegBz = (B1z + t * Lbz);
+            // otherwise, at least one of s and t is outside of [0,1] and we have to
+            // handle this case.
+                if (false == infinite_lines && (OUT_OF_RANGE(s) || OUT_OF_RANGE(t)))
+                {
+                    AdjustNearestPoints(A1x, A1y, A1z, Lax, Lay, Laz,
+                                        B1x, B1y, B1z, Lbx, Lby, Lbz,
+                                        epsilon, s, t,
+                                        PointOnSegAx, PointOnSegAy, PointOnSegAz,
+                                        PointOnSegBx, PointOnSegBy, PointOnSegBz);
+                }
+            }
         }
-        // return the smallest of those
-        return (vertexMin < edgeMin) ? vertexMin : edgeMin;
-    }
 
+        NearestPointX = (REAL)0.5 * (PointOnSegAx + PointOnSegBx);
+        NearestPointY = (REAL)0.5 * (PointOnSegAy + PointOnSegBy);
+        NearestPointZ = (REAL)0.5 * (PointOnSegAz + PointOnSegBz);
 
-    ///////////////////////////////////////////////////////////////////////
-    // see if the projection of v onto the plane of v0,v1,v2 is inside 
-    // the triangle formed by v0,v1,v2
-    ///////////////////////////////////////////////////////////////////////
-    constexpr bool pointProjectsInsideTriangle(const VECTOR3& v0, const VECTOR3& v1, 
-                                            const VECTOR3& v2, const VECTOR3& v){
-        // get the barycentric coordinates
-        const VECTOR3 e1 = v1 - v0;
-        const VECTOR3 e2 = v2 - v0;
-        const VECTOR3 n = e1.cross(e2);
-        const VECTOR3 na = (v2 - v1).cross(v - v1);
-        const VECTOR3 nb = (v0 - v2).cross(v - v2);
-        const VECTOR3 nc = (v1 - v0).cross(v - v0);
-        const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
-                                    n.dot(nb) / n.l2NormSqr(),
-                                    n.dot(nc) / n.l2NormSqr());
-                                    
-        const REAL barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]);
-
-        // if the point projects to inside the triangle, it should sum to 1
-        if (zs::abs(barySum - 1.0) < 1e-6)
-            return true;
+        NearestVectorX = PointOnSegBx - PointOnSegAx;
+        NearestVectorY = PointOnSegBy - PointOnSegAy;
+        NearestVectorZ = PointOnSegBz - PointOnSegAz;
 
-        return false;
+        // optional check to indicate if the lines truly intersect
+        true_intersection = (FABS(NearestVectorX) +
+                            FABS(NearestVectorY) +
+                            FABS(NearestVectorZ)) < epsilon ? true : false;
     }
 
 
-
-
+    // Vector overload: unpacks the end points of segments a (a0,a1) and b (b0,b1), calls
+    // the scalar overload with the literal arguments false and 1e-6, and keeps only the
+    // points written to aPoint / bPoint; the three remaining outputs are discarded.
+    constexpr void IntersectLineSegments(const VECTOR3& a0, const VECTOR3& a1,
+                            const VECTOR3& b0, const VECTOR3& b1,
+                            VECTOR3& aPoint, VECTOR3& bPoint)
+    {
+        VECTOR3 unusedMid{}, unusedOffset{};
+        bool unusedHit{};
+        IntersectLineSegments(a0[0], a0[1], a0[2], a1[0], a1[1], a1[2],
+                                b0[0], b0[1], b0[2], b1[0], b1[1], b1[2], false, 1e-6,
+                                aPoint[0], aPoint[1], aPoint[2], bPoint[0], bPoint[1], bPoint[2],
+                                unusedMid[0], unusedMid[1], unusedMid[2],
+                                unusedOffset[0], unusedOffset[1], unusedOffset[2], unusedHit);
+    }
 
 };
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/collision_energy/edge_edge_collision.hpp b/projects/CuLagrange/fem/collision_energy/edge_edge_collision.hpp
index 7a0546606b..b661822939 100644
--- a/projects/CuLagrange/fem/collision_energy/edge_edge_collision.hpp
+++ b/projects/CuLagrange/fem/collision_energy/edge_edge_collision.hpp
@@ -231,8 +231,10 @@ namespace EDGE_EDGE_COLLISION {
         // ndotHessian = ndot_hessian(x);
         const MATRIX12 springLengthH = springLengthHessian(e,n,diff,a,b);
         
-        return (REAL)2.0 * _mu * (dyadic_prod(springLengthGrad,springLengthGrad) +
-                            springLength * springLengthH);
+        //return 2.0 * _mu * (springLengthGrad * springLengthGrad.transpose() +
+        //                    springLength * springLengthH);
+
+        return (REAL)2.0 * _mu * dyadic_prod(springLengthGrad,springLengthGrad);
     }
 
     ///////////////////////////////////////////////////////////////////////
@@ -288,8 +290,11 @@ namespace EDGE_EDGE_COLLISION {
         
         //return 2.0 * _mu * (springLengthGrad * springLengthGrad.transpose() +
         //                    springLength * springLengthH);
-        return (REAL)-2.0 * _mu * (springLength * springLengthH - 
-                            zs::dyadic_prod(springLengthGrad,springLengthGrad));
+        // return (REAL)-2.0 * _mu * (springLength * springLengthH - 
+        //                     zs::dyadic_prod(springLengthGrad,springLengthGrad));
+
+        return (REAL)2.0 * _mu * zs::dyadic_prod(springLengthGrad,springLengthGrad);
+
     }
 
 };
diff --git a/projects/CuLagrange/fem/collision_energy/edge_edge_sqrt_collision.hpp b/projects/CuLagrange/fem/collision_energy/edge_edge_sqrt_collision.hpp
deleted file mode 100644
index 359742184a..0000000000
--- a/projects/CuLagrange/fem/collision_energy/edge_edge_sqrt_collision.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-#pragma once
-
-#include "collision_utils.hpp"
-
-namespace zeno {
-namespace EDGE_EDGE_SQRT_COLLISION {
-    using namespace COLLISION_UTILS;
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr REAL psi(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    if ((vb - va).norm() < _tooSmall)
-        return 0.0;
-
-    // there is not sign switch operation
-    const REAL springLength = _eps - (vb - va).norm();
-    return _mu * springLength * springLength;
-}
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr REAL psiNegated(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    if ((vb - va).norm() < _tooSmall)
-        return 0.0;
-
-    const REAL springLength = _eps + (vb - va).norm();
-    return _mu * springLength * springLength;
-}
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr VECTOR12 gradient(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    const VECTOR3 diff = vb - va;
-
-    // if the two are co-linear, give up
-    // should probably fall back to cross-product formula here
-    // (see EDGE_HYBRID_COLLISION)
-    if (diff.norm() < _tooSmall)
-        return VECTOR12::zeros();
-
-    // get the normal
-    VECTOR3 n = diff;
-    n = n / n.norm();
-
-    const REAL springLength = _eps - diff.norm();
-    return (REAL)-2.0 * _mu * springLength * (vDiffPartial(a,b).transpose() * n);
-}
-
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr VECTOR12 gradientNegated(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    const VECTOR3 diff = vb - va;
-
-    // if the two are co-linear, give up
-    // should probably fall back to cross-product formula here
-    // (see EDGE_HYBRID_COLLISION)
-    if (diff.norm() < _tooSmall)
-        return VECTOR12::zeros();
-
-    // get the direction
-    VECTOR3 d = diff;
-    d = d / d.norm();
-
-    const REAL springLength = _eps + diff.norm();
-    const MATRIX3x12 vPartial = vDiffPartial(a,b);
-    
-    return (REAL)2.0 * _mu * springLength * (vPartial.transpose() * d);
-}
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr MATRIX12 hessian(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    const VECTOR3 diff = vb - va;
-    const REAL diffNorm = diff.norm();
-
-    // if the two are co-linear, give up
-    // should probably fall back to cross-product formula here
-    // (see EDGE_HYBRID_COLLISION)
-    if (diffNorm < _tooSmall)
-        return MATRIX12::zeros();
-
-    // get the normal
-    VECTOR3 d = diff;
-    d = d / d.norm();
-
-    const MATRIX3x12 vPartial = vDiffPartial(a,b);
-    const REAL invNorm = (diffNorm >= 1e-8) ? 1.0 / diffNorm : 1.0;
-    const REAL invNorm3 = invNorm * invNorm * invNorm;
-
-    const VECTOR12 normPartial = -invNorm * (vPartial.transpose() * diff);
-    const MATRIX3x12 dGrad = invNorm * vPartial -
-                            invNorm3 * zs::dyadic_prod(diff,(vPartial.transpose() * diff));
-
-    return (REAL)-2.0 * _mu * ((_eps - diffNorm) * (vPartial.transpose() * dGrad) +
-                        zs::dyadic_prod(normPartial,vPartial.transpose() * d));
-}
-
-///////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-constexpr MATRIX12 hessianNegated(const VECTOR3 v[4],
-                    const VECTOR2& a, 
-                    const VECTOR2& b,
-                    const REAL& _mu,
-                    const REAL& _nu,
-                    const REAL& _eps,
-                    const REAL& _tooSmall)
-{
-    // convert to vertices and edges
-    VECTOR3 e[3] = {};
-    e[0] = v[3] - v[2];
-    e[1] = v[0] - v[2];
-    e[2] = v[1] - v[2];
-
-    // get the interpolated vertices
-    const VECTOR3 va = (a[0] * v[0] + a[1] * v[1]);
-    const VECTOR3 vb = (b[0] * v[2] + b[1] * v[3]);
-    const VECTOR3 diff = vb - va;
-    const REAL diffNorm = diff.norm();
-    const REAL diffNorm3 = diffNorm * diffNorm * diffNorm;
-
-    // if the two are co-linear, give up
-    // should probably fall back to cross-product formula here
-    // (see EDGE_HYBRID_COLLISION)
-    if (diffNorm < _tooSmall)
-        return MATRIX12::zeros();
-
-    // get the normal
-    VECTOR3 n = diff;
-    n = n / n.norm();
-
-    const MATRIX3x12 vPartial = vDiffPartial(a,b);
-    const VECTOR12 normPartial = ((REAL)-1.0 / diffNorm) * (vPartial.transpose() * diff);
-
-    const MATRIX3x12 nGrad = ((REAL)1.0 / diffNorm) * vPartial -
-                            ((REAL)1.0 / diffNorm3) * zs::dyadic_prod(diff, (vPartial.transpose() * diff));
-
-    // this is the energetically consistent one
-    return (REAL)2.0 * _mu * ((_eps + diffNorm) * (vPartial.transpose() * nGrad) -
-                        zs::dyadic_prod(normPartial,vPartial.transpose() * n));
-}
-
-
-
-};
-};
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/collision_energy/evaluate_collision.hpp b/projects/CuLagrange/fem/collision_energy/evaluate_collision.hpp
index a59d0ea97f..ed1851af50 100644
--- a/projects/CuLagrange/fem/collision_energy/evaluate_collision.hpp
+++ b/projects/CuLagrange/fem/collision_energy/evaluate_collision.hpp
@@ -29,8 +29,8 @@
 
 #include "vertex_face_sqrt_collision.hpp"
 #include "vertex_face_collision.hpp"
-#include "edge_edge_sqrt_collision.hpp"
-#include "edge_edge_collision.hpp"
+// #include "edge_edge_sqrt_collision.hpp"
+// #include "edge_edge_collision.hpp"
 
 namespace zeno { namespace COLLISION_UTILS {
 
@@ -55,7 +55,7 @@ void do_facet_point_collision_detection(Pol& cudaPol,
     const SurfTriTileVec& tris,
     SurfTriNrmVec& sttemp,
     SurfLineNrmVec& setemp,
-    FPCollisionBuffer& cptemp,
+    FPCollisionBuffer& fp_collision_buffer,
     // const bvh_t& stBvh,
     T in_collisionEps,T out_collisionEps) {
         using namespace zs;
@@ -68,6 +68,7 @@ void do_facet_point_collision_detection(Pol& cudaPol,
         auto avgl = compute_average_edge_length(cudaPol,verts,xtag,tris);
         auto bvh_thickness = 5 * avgl;
 
+
         if(!calculate_facet_normal(cudaPol,verts,xtag,tris,sttemp,"nrm")){
             throw std::runtime_error("fail updating facet normal");
         }       
@@ -79,8 +80,8 @@ void do_facet_point_collision_detection(Pol& cudaPol,
             setemp,"nrm")){
                 throw std::runtime_error("fail calculate cell bisector normal");
         }       
-        TILEVEC_OPS::fill<4>(cudaPol,cptemp,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>());
-        TILEVEC_OPS::fill(cudaPol,cptemp,"inverted",reinterpret_bits<T>((int)0));
+        TILEVEC_OPS::fill<4>(cudaPol,fp_collision_buffer,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>());
+        TILEVEC_OPS::fill(cudaPol,fp_collision_buffer,"inverted",reinterpret_bits<T>((int)0));
         cudaPol(zs::range(points.size()),[in_collisionEps = in_collisionEps,
                         out_collisionEps = out_collisionEps,
                         verts = proxy<space>({},verts),xtag,
@@ -89,12 +90,21 @@ void do_facet_point_collision_detection(Pol& cudaPol,
                         points = proxy<space>({},points),
                         lines = proxy<space>({},lines),
                         tris = proxy<space>({},tris),
-                        cptemp = proxy<space>({},cptemp),
+                        fp_collision_buffer = proxy<space>({},fp_collision_buffer),
                         stbvh = proxy<space>(stBvh),thickness = bvh_thickness] ZS_LAMBDA(int svi) mutable {
             auto vi = reinterpret_bits<int>(points("inds",svi));
             auto active = verts("active",vi);
-            if(active < 1e-6)
+            bool is_active_vert = true;
+            if(active < 1e-6){
+                is_active_vert = false;
                 return;
+            }
+
+            // skip vertices flagged as inverted; the guard tests the channel that is read
+            if(verts.hasProperty("is_inverted")) {
+                if(reinterpret_bits<int>(verts("is_inverted",vi)))
+                    return;
+            }
 
             auto p = verts.template pack<3>(xtag,vi);
             auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
@@ -108,12 +118,19 @@ void do_facet_point_collision_detection(Pol& cudaPol,
                 if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
                     return;
 
+
+                // skip triangles touching any vertex flagged as inverted; the guard
+                // tests the same "is_inverted" channel that is read in the loop
+                if(verts.hasProperty("is_inverted")) {
+                    for(int i = 0;i != 3;++i)
+                        if(reinterpret_bits<int>(verts("is_inverted",tri[i])))
+                            return;
+                }
+
                 bool is_active_tri = true;
                 for(int i = 0;i != 3;++i)
                     if(verts("active",tri[i]) < 1e-6)
-                        is_active_tri = false;
-                if(!is_active_tri)
-                    return;
+                        return;
 
                 T dist = (T)0.0;
 
@@ -133,22 +150,6 @@ void do_facet_point_collision_detection(Pol& cudaPol,
                 if(areaDeform < 1e-1)
                     return;
 
-                // if(COLLISION_UTILS::is_inside_the_cell(verts,xtag,
-                //         lines,tris,
-                //         sttemp,"nrm",
-                //         setemp,"nrm",
-                //         stI,p,in_collisionEps,out_collisionEps,dist)) {
-                // //     cptemp.template tuple<4>("inds",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,4>(vi,tri[0],tri[1],tri[2]).template reinterpret_bits<T>();
-                // //     auto vertexFaceCollisionAreas = tris("area",stI) + points("area",svi); 
-                // //     cptemp("area",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = vertexFaceCollisionAreas;   
-                // //     if(vertexFaceCollisionAreas < 0)
-                // //         printf("negative face area detected\n");  
-                // //     int is_inverted = dist > (T)0.0 ? 1 : 0;  
-                // //     cptemp("inverted",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = reinterpret_bits<T>(is_inverted);            
-                // //     nm_collision_pairs++;  
-
-                // }
-
                 auto nrm = sttemp.template pack<3>("nrm",stI);
                 
                 auto seg = p - verts.template pack<3>(xtag,tri[0]);    
@@ -165,12 +166,12 @@ void do_facet_point_collision_detection(Pol& cudaPol,
                 // auto avge = (e01 + e02 + e12)/(T)3.0;
 
                 T barySum = (T)1.0;
-                T distance = COLLISION_UTILS::pointTriangleDistance(t0,t1,t2,p,barySum);
+                T distance = LSL_GEO::pointTriangleDistance(t0,t1,t2,p,barySum);
                 // auto max_ratio = inset_ratio > outset_ratio ? inset_ratio : outset_ratio;
                 // collisionEps = avge * max_ratio;
                 auto collisionEps = seg.dot(nrm) > 0 ? out_collisionEps : in_collisionEps;
 
-                if(barySum > 2)
+                if(barySum > 5)
                     return;
 
                 if(distance > collisionEps)
@@ -180,7 +181,7 @@ void do_facet_point_collision_detection(Pol& cudaPol,
                 //     return;
 
                 // if the triangle cell is too degenerate
-                if(!pointProjectsInsideTriangle(t0,t1,t2,p))
+                if(!LSL_GEO::pointProjectsInsideTriangle(t0,t1,t2,p))
                     for(int i = 0;i != 3;++i) {
                             auto bisector_normal = get_bisector_orient(lines,tris,setemp,"nrm",stI,i);
                             // auto test = bisector_normal.cross(nrm).norm() < 1e-2;
@@ -193,13 +194,13 @@ void do_facet_point_collision_detection(Pol& cudaPol,
         
                 // now the points is inside the cell
 
-                cptemp.template tuple<4>("inds",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,4>(vi,tri[0],tri[1],tri[2]).template reinterpret_bits<T>();
+                fp_collision_buffer.template tuple<4>("inds",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,4>(vi,tri[0],tri[1],tri[2]).template reinterpret_bits<T>();
                 auto vertexFaceCollisionAreas = tris("area",stI) + points("area",svi); 
-                cptemp("area",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = vertexFaceCollisionAreas;   
+                fp_collision_buffer("area",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = vertexFaceCollisionAreas;   
                 if(vertexFaceCollisionAreas < 0)
                     printf("negative face area detected\n");  
                 int is_inverted = dist > (T)0.0 ? 1 : 0;  
-                cptemp("inverted",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = reinterpret_bits<T>(is_inverted);            
+                fp_collision_buffer("inverted",svi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = reinterpret_bits<T>(is_inverted);            
                 nm_collision_pairs++;  
 
             };
@@ -207,258 +208,716 @@ void do_facet_point_collision_detection(Pol& cudaPol,
         });
 }
 
-// template<int MAX_EE_COLLISION_PAIRS,typename Pol,
+
+// Detect collisions between the kinematic points in kverts and the surface
+// triangles (tris) of the mesh stored in verts; results go to kc_buffer.
+//
+// One task runs per kinematic point kvi. It queries a BVH built from the triangle
+// bounding volumes with a box of half-width 5 * compute_average_edge_length(...)
+// around the point and records at most MAX_KINEMATIC_COLLISION_PAIRS hits in slots
+// kvi * MAX_KINEMATIC_COLLISION_PAIRS + k of kc_buffer:
+//   "inds"     : (kvi, stI) bit-cast to T; reset to (-1,-1) before detection
+//   "area"     : kverts("area",kvi)
+//   "inverted" : 1 when (kp - t0).dot(nrm) > 0, else 0; reset to 0 before detection
+//
+// update_normal : when true, recompute the "nrm" channels of nrmTris / nrmLines
+//                 first; std::runtime_error is thrown if either update fails.
+// points is not used by this function.
+// NOTE(review): kc_buffer presumably holds kverts.size() * MAX_KINEMATIC_COLLISION_PAIRS entries - confirm.
+template<int MAX_KINEMATIC_COLLISION_PAIRS,
+    typename Pol,
+    typename PosTileVec,
+    typename SurfPointTileVec,
+    typename SurfLineTileVec,
+    typename SurfTriTileVec,
+    typename SurfLineNrmTileVec,
+    typename SurfTriNrmTileVec,
+    typename KPosTileVec,
+    typename KCollisionBuffer>
+void do_kinematic_point_collision_detection(Pol& cudaPol,
+    PosTileVec& verts,const zs::SmallString& xtag,
+    const SurfPointTileVec& points,
+    SurfLineTileVec& lines,
+    SurfTriTileVec& tris,
+    SurfLineNrmTileVec& nrmLines,
+    SurfTriNrmTileVec& nrmTris,
+    const KPosTileVec& kverts,
+    KCollisionBuffer& kc_buffer,
+    T in_collisionEps,T out_collisionEps,bool update_normal = true) {
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+
+        // BVH over the surface triangles, queried below once per kinematic point
+        auto stBvh = bvh_t{};
+        auto bvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,xtag);
+        stBvh.build(cudaPol,bvs);
+
+        // half-width of the query box placed around every kinematic point
+        auto avgl = compute_average_edge_length(cudaPol,verts,xtag,tris);
+        auto bvh_thickness = 5 * avgl;
+
+        if(update_normal) {
+            if(!calculate_facet_normal(cudaPol,verts,xtag,tris,nrmTris,"nrm")){
+                throw std::runtime_error("fail updating kinematic facet normal");
+            }
+            if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
+                verts,xtag,
+                lines,
+                tris,
+                nrmTris,"nrm",
+                nrmLines,"nrm")){
+                    throw std::runtime_error("fail calculate cell bisector normal");
+            }
+        }
+
+        // mark every slot as empty / not inverted before the detection pass
+        TILEVEC_OPS::fill<2>(cudaPol,kc_buffer,"inds",zs::vec<int,2>::uniform(-1).template reinterpret_bits<T>());
+        TILEVEC_OPS::fill(cudaPol,kc_buffer,"inverted",reinterpret_bits<T>((int)0));
+
+        cudaPol(zs::range(kverts.size()),[in_collisionEps = in_collisionEps,
+                out_collisionEps = out_collisionEps,
+                verts = proxy<space>({},verts),xtag,
+                lines = proxy<space>({},lines),
+                tris = proxy<space>({},tris),
+                nrmTris = proxy<space>({},nrmTris),
+                nrmLines = proxy<space>({},nrmLines),
+                kverts = proxy<space>({},kverts),
+                kc_buffer = proxy<space>({},kc_buffer),
+                stBvh = proxy<space>(stBvh),thickness = bvh_thickness] ZS_LAMBDA(int kvi) mutable {
+                    auto kp = kverts.pack(dim_c<3>,"x",kvi);
+                    auto bv = bv_t{get_bounding_box(kp - thickness,kp + thickness)};
+
+                    int nm_collision_pairs = 0;
+                    auto process_kinematic_vertex_face_collision_pairs = [&](int stI) {
+                        // every slot reserved for this kinematic point is already used
+                        if(nm_collision_pairs >= MAX_KINEMATIC_COLLISION_PAIRS)
+                            return;
+
+                        auto tri = tris.pack(dim_c<3>,"inds",stI).reinterpret_bits(int_c);
+                        // all three corners must have "k_active" >= 1e-6
+                        for(int i = 0;i != 3;++i)
+                            if(verts("k_active",tri[i]) < 1e-6)
+                                return;
+
+                        // mean of the optional per-vertex "k_thickness"; stays 0 without the channel
+                        auto average_thickness = (T)0.0;
+                        if(verts.hasProperty("k_thickness")){
+                            for(int i = 0;i != 3;++i)
+                                average_thickness += verts("k_thickness",tri[i])/(T)3.0;
+                        }
+
+                        // skip triangles touching any vertex flagged as inverted; the guard
+                        // tests the same "is_inverted" channel that is read in the loop
+                        if(verts.hasProperty("is_inverted")) {
+                            for(int i = 0;i != 3;++i)
+                                if(reinterpret_bits<int>(verts("is_inverted",tri[i])))
+                                    return;
+                        }
+
+                        auto nrm = nrmTris.pack(dim_c<3>,"nrm",stI);
+                        auto t0 = verts.pack(dim_c<3>,xtag,tri[0]);
+                        auto t1 = verts.pack(dim_c<3>,xtag,tri[1]);
+                        auto t2 = verts.pack(dim_c<3>,xtag,tri[2]);
+
+                        // signed offset of the kinematic point from the triangle plane along nrm
+                        auto seg = kp - t0;
+                        const T dist = seg.dot(nrm);
+
+                        T barySum = (T)1.0;
+                        T distance = LSL_GEO::pointTriangleDistance(t0,t1,t2,kp,barySum);
+
+                        // negative side: out_collisionEps scaled by (1 + average_thickness); else in_collisionEps
+                        auto collisionEps = dist < 0 ? out_collisionEps * ((T)1.0 + average_thickness) : in_collisionEps;
+
+                        if(barySum > 1.1)
+                            return;
+
+                        if(distance > collisionEps)
+                            return;
+
+                        // when the point does not project inside the triangle, it must not lie on
+                        // the negative side of any of the three edge bisector orientations
+                        if(!LSL_GEO::pointProjectsInsideTriangle(t0,t1,t2,kp))
+                            for(int i = 0;i != 3;++i) {
+                                auto bisector_normal = get_bisector_orient(lines,tris,nrmLines,"nrm",stI,i);
+                                seg = kp - verts.pack(dim_c<3>,xtag,tri[i]);
+                                if(bisector_normal.dot(seg) < 0)
+                                    return;
+                            }
+
+                        // record the pair in the next free slot of this kinematic point
+                        kc_buffer.template tuple<2>("inds",kvi * MAX_KINEMATIC_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,2>(kvi,stI).template reinterpret_bits<T>();
+                        auto vertexFaceCollisionAreas = /*tris("area",stI) + */kverts("area",kvi);
+                        kc_buffer("area",kvi * MAX_KINEMATIC_COLLISION_PAIRS + nm_collision_pairs) = vertexFaceCollisionAreas;
+                        int is_inverted = dist > (T)0.0 ? 1 : 0;
+                        kc_buffer("inverted",kvi * MAX_KINEMATIC_COLLISION_PAIRS + nm_collision_pairs) = reinterpret_bits<T>(is_inverted);
+                        nm_collision_pairs++;
+                    };
+                    stBvh.iter_neighbors(bv,process_kinematic_vertex_face_collision_pairs);
+            });
+}
+
+
+
+// template<typename Pol,
 //     typename PosTileVec,
 //     typename SurfPointTileVec,
 //     typename SurfLineTileVec,
 //     typename SurfTriTileVec,
 //     typename SurfTriNrmVec,
 //     typename SurfLineNrmVec,
-//     typename PointNeighHash,
 //     typename EECollisionBuffer>
 // void do_edge_edge_collision_detection(Pol& cudaPol,
 //     const PosTileVec& verts,const zs::SmallString& xtag,
 //     const SurfPointTileVec& points,
 //     const SurfLineTileVec& lines,
 //     const SurfTriTileVec& tris,
-//     SurfTriNrmVec& sttemp,
-//     SurfLineNrmVec& setemp,
-//     EECollisionBuffer& eetemp,
-//     const PointNeighHash& pphash,
+//     SurfTriNrmVec& sttemp,SurfLineNrmVec& setemp,
+//     EECollisionBuffer& ee_collision_buffer,
+//     // const PointNeighHash& pphash,// we might need an one-ring neighbor removal tech
 //     T in_collisionEps,T out_collisionEps) {
 //         using namespace zs;
 //         constexpr auto space = execspace_e::cuda;
 
 //         auto seBvh = bvh_t{};
 //         auto bvs = retrieve_bounding_volumes(cudaPol,verts,lines,wrapv<2>{},(T)0.0,xtag);
+//         seBvh.build(cudaPol,bvs);
 
 //         auto avgl = compute_average_edge_length(cudaPol,verts,xtag,lines);
 //         auto bvh_thickness = 5 * avgl;
 
-//         if(!calculate_facet_normal(cudaPol,verts,xtag,sttemp,"nrm"))
-//             throw std::runtime_error("fail updating facet normal");
 
+//         if(!sttemp.hasProperty("nrm") || sttemp.getChannelSize("nrm") != 3)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid sttemp's \"nrm\" channel");
+
+//         if(!setemp.hasProperty("nrm") || setemp.getChannelSize("nrm") != 3)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid setemp's \"nrm\" channel");
+
+//         if(setemp.size() != lines.size())
+//             throw std::runtime_error("setemp.size() != lines.size()");
+//         if(sttemp.size() != tris.size())
+//             throw std::runtime_error("sttemp.size() != tris.size()");
+
+//         // std::cout << "do edge edge collision detection" << std::endl;
+//         if(!calculate_facet_normal(cudaPol,verts,xtag,tris,sttemp,"nrm"))
+//             throw std::runtime_error("do_edge_edge_collision_detection::fail updating facet normal");
+
+
+//         // std::cout << "calculate edge normal" << std::endl;
+
+//         if(!calculate_edge_normal_from_facet_normal(cudaPol,sttemp,"nrm",setemp,"nrm",lines))
+//             throw std::runtime_error("do_edge_edge_collision_detection::fail updating edge normal");
+
+//         if(ee_collision_buffer.size() != lines.size())
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer size");
+
+//         if(!ee_collision_buffer.hasProperty("inds") || ee_collision_buffer.getChannelSize("inds") != 4)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer's \"inds\" channel");
+
+//         if(!ee_collision_buffer.hasProperty("inverted") || ee_collision_buffer.getChannelSize("inverted") != 1)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer's \"inverted\" channel");
+
+//         if(!ee_collision_buffer.hasProperty("abary") || ee_collision_buffer.getChannelSize("abary") != 2)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer's \"abary\" channel");
+
+//         if(!ee_collision_buffer.hasProperty("bbary") || ee_collision_buffer.getChannelSize("bbary") != 2)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer's \"bbary\" channel");
+
+//         if(!ee_collision_buffer.hasProperty("area") || ee_collision_buffer.getChannelSize("area") != 1)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid ee_colliision_buffer's \"area\" channel");
+
+//         if(!lines.hasProperty("area") || lines.getChannelSize("area") != 1)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid lines's \"area\" channel");
+
+//         TILEVEC_OPS::fill<4>(cudaPol,ee_collision_buffer,"inds",zs::vec<int,4>::uniform(-1).template reinterpret_bits<T>());
+//         TILEVEC_OPS::fill(cudaPol,ee_collision_buffer,"inverted",reinterpret_bits<T>((int)0));
+//         // TILEVEC_OPS::fill(cudaPol,ee_collision_buffer,"abary",(T)0.0);
+//         // TILEVEC_OPS::fill(cudaPol,ee_collision_buffer,"bbary",(T)0.0);
+
+//         if(!verts.hasProperty("active") || verts.getChannelSize("active") != 1)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid verts' \"active\" channel");
+//         if(!verts.hasProperty(xtag) || verts.getChannelSize(xtag) != 3)
+//             throw std::runtime_error("do_edge_edge_collision_detection::invalid verts' \"xtag\" channel");
+
+//         cudaPol(zs::range(lines.size()),[in_collisionEps = in_collisionEps,
+//                 out_collisionEps = out_collisionEps,
+//                 verts = proxy<space>({},verts),xtag = xtag,
+//                 points = proxy<space>({},points),
+//                 lines = proxy<space>({},lines),
+//                 tris = proxy<space>({},tris),
+//                 sttemp = proxy<space>({},sttemp),
+//                 setemp = proxy<space>({},setemp),
+//                 ee_collision_buffer = proxy<space>({},ee_collision_buffer),
+//                 seBvh = proxy<space>(seBvh),
+//                 thickness = bvh_thickness] ZS_LAMBDA(int sei) mutable {
+//                     auto einds = lines.template pack<2>("inds",sei).reinterpret_bits(int_c);
+//                     auto id0 = einds[0];
+//                     auto id1 = einds[1];
+//                     auto is_active0 = verts("active",id0) > 1e-6;
+//                     auto is_active1 = verts("active",id1) > 1e-6;
+
+//                     if(!is_active0 || !is_active1){
+//                         // printf("skip inactive edge %d\n",sei);
+//                         return;
+//                     }
+
+//                     auto a0 = verts.template pack<3>(xtag,id0);
+//                     auto a1 = verts.template pack<3>(xtag,id1);
+
+//                     auto ac = (a0 + a1) / (T)2.0;
+//                     auto bv = bv_t{get_bounding_box(ac - thickness,ac + thickness)};
+
+//                     // int nm_collision_pairs = 0;
+//                     int closestEdge = -1;
+//                     T closestDistance = 1e8;
+
+//                     zs::vec<T,2> aClosest{};
+//                     zs::vec<T,2> bClosest{};
+//                     zs::vec<T,3> aClosestPoint{};
+//                     zs::vec<T,3> bClosestPoint{};
+
+//                     auto aNrm = setemp.template pack<3>("nrm",sei);
+
+//                     auto process_edge_edge_collision_pairs = [&](int nseI) {
+//                         // printf("check edge pairs : %d %d\n",sei,nseI);
+
+//                         zs::vec<T,3> aPoint{};
+//                         zs::vec<T,3> bPoint{};
+//                         // zs::vec<T,3> bNrm{};
+//                         zs::vec<T,2> a{},b{};
+
+//                         auto nedge = lines.pack(dim_c<2>,"inds",nseI).reinterpret_bits(int_c);
+
+//                         if(nedge[0] == id0 || nedge[1] == id0 || nedge[0] == id1 || nedge[1] == id1){
+//                             // printf("skip neighbor pairs : %d %d\n",sei,nseI);
+//                             return;
+//                         }
+
+//                         auto is_active0_nei = verts("active",nedge[0]) > 1e-6;
+//                         auto is_active1_nei = verts("active",nedge[1]) > 1e-6;
+
+//                         if(!is_active0_nei || !is_active1_nei){
+//                             // printf("skip inactive nedge %d\n",sei);
+//                             return;
+//                         }
+
+
+//                         // // the two edges should orient in different directions
+//                         auto bNrm = setemp.template pack<3>("nrm",nseI);
+//                         auto orient = bNrm.dot(aNrm);
+//                         if(orient > 0.2){
+//                             // printf("skip pairs : %d %d due to orient problem %f %f %f\n",sei,nseI,(float)orient,(float)bNrm.norm(),(float)aNrm.norm());
+//                             return;
+//                         }
+
+//                         auto nid0 = nedge[0];
+//                         auto nid1 = nedge[1];
+
+//                         auto b0 = verts.template pack<3>(xtag,nid0);
+//                         auto b1 = verts.template pack<3>(xtag,nid1);
+
+//                         COLLISION_UTILS::IntersectLineSegments(a0,a1,b0,b1,aPoint,bPoint);
+//                         auto distance = (aPoint - bPoint).norm();
+
+//                         if(distance > closestDistance){  
+//                             // printf("skip pairs : %d %d due to distance %f %f\n",sei,nseI,(float)distance,(float)closestDistance);
+//                             return;
+//                         }
+
+//                         zs::vec<T,3> ea = a1 - a0;
+//                         zs::vec<T,3> eb = b1 - b0;
+
+//                         a[1] = (aPoint - a0).norm() / ea.norm();
+//                         a[0] = (T)1.0 - a[1];
+
+//                         b[1] = (bPoint - b0).norm() / eb.norm();
+//                         b[0] = (T)1.0 - b[1];
+
+//                         T skipEps = 1e-4;
+//                         if ((a[0] < skipEps) || (a[0] > 1.0 - skipEps)) return;
+//                         if ((a[1] < skipEps) || (a[1] > 1.0 - skipEps)) return;
+//                         if ((b[0] < skipEps) || (b[0] > 1.0 - skipEps)) return;
+//                         if ((b[1] < skipEps) || (b[1] > 1.0 - skipEps)) return;
+
+//                         closestDistance = distance;
+//                         closestEdge = nseI;
+
+//                         aClosest = a;
+//                         bClosest = b;      
+//                         aClosestPoint = aPoint;
+//                         bClosestPoint = bPoint;                    
+//                 };
+//                 seBvh.iter_neighbors(bv,process_edge_edge_collision_pairs);
+
+
+
+//                 if(closestEdge == -1) return;
+
+//                 // printf("find closest pairs : %d -> %d\n",sei,closestEdge);
+
+
+//                 if(closestEdge >= lines.size()){
+//                     printf("closestEdge bigger than lines size\n");
+//                     return;
+//                 }
+
+//                 if(lines.size() != setemp.size()){
+//                     printf("lines size and setemp size not match\n");
+//                     return;
+//                 }
+//                 if(!setemp.hasProperty("nrm")){
+//                     printf("setemp has no nrm channel");
+//                     return;
+//                 }
+
+//                 auto innerEdge = lines.pack(dim_c<2>,"inds",closestEdge).reinterpret_bits(int_c);
+
+//                 // return;
+
+//                 // // skip the one-ring neighbor_check
+//                 // bool insideOneRing = false;
+
+//                 // for (int j = 0; j < 2; j++)
+//                 // {
+//                 // pair<int, int> lookup;
+//                 // lookup.first = outerEdge[j];
+//                 // for (int i = 0; i < 2; i++)
+//                 // {
+//                 //     lookup.second = innerEdge[i];
+//                 //     if (_insideSurfaceVertexOneRing.find(lookup) != _insideSurfaceVertexOneRing.end())
+//                 //     insideOneRing = true;
+//                 // }
+//                 // }
+//                 // if (insideOneRing) return;
+//                 auto a2b = bClosestPoint - aClosestPoint;
+//                 auto bNrm = setemp.template pack<3>("nrm",closestEdge);
+
+
+//                 // auto avgNrm = (bNrm - aNrm).normalized();
+//                 bool is_penertrating = a2b.dot(aNrm) < 0 && a2b.dot(bNrm) > 0;
+
+//                 auto collisionEps = is_penertrating ? in_collisionEps : out_collisionEps;
+
+//                 // then there is edge edge collision
+//                 if(closestDistance > collisionEps)  
+//                     return;
+
+//                 // if(is_penertrating)
+//                 //     printf("find penertrating pair %d %d %d %d\n",einds[0],einds[1],innerEdge[0],innerEdge[1]);
+
+
+//                 ee_collision_buffer.template tuple<4>("inds",sei) = zs::vec<int,4>(einds[0],einds[1],innerEdge[0],innerEdge[1]).template reinterpret_bits<T>();
+//                 auto edgeEdgeCollsionAreas = lines("area",sei) + lines("area",closestEdge);
+//                 ee_collision_buffer("area",sei) = edgeEdgeCollsionAreas;
+
+//                 int is_inverted = is_penertrating ? 1 : 0;  
+//                 ee_collision_buffer("inverted",sei) = reinterpret_bits<T>(is_inverted);   
+
+//                 ee_collision_buffer.template tuple<4>("bary",sei) = zs::vec<T,4>(aClosest[0],aClosest[1],bClosest[0],bClosest[1]);
+                
+//                                 // return;
+
+//                 ee_collision_buffer.template tuple<2>("abary",sei) = aClosest;
+
+//                 // ee_collision_buffer("abary",0,sei) = (T)0.0;
+//                 // ee_collision_buffer("abary",1,sei) = (T)0.0;
+//                 // // return;
+//                 ee_collision_buffer.template tuple<2>("bbary",sei) = bClosest;
+//                 // ee_collision_buffer("bbary",0,sei) = (T)0.0;
+//                 // ee_collision_buffer("bbary",1,sei) = (T)0.0;
+//         });
         
 // }
 
 
-template<int MAX_FP_COLLISION_PAIRS,
-    typename Pol,
+template<typename Pol,
     typename PosTileVec,
-    typename FPCollisionBuffer>
-void evaluate_collision_grad_and_hessian(Pol& cudaPol,
-    const PosTileVec& verts,const zs::SmallString& xtag,
-    FPCollisionBuffer& cptemp,
+    typename FPCollisionBuffer,
+    typename GradHessianTileVec>
+void evaluate_fp_collision_grad_and_hessian( // per point-triangle pair: collision force and Hessian plus a kd_theta damping term, written into gh_buffer
+    Pol& cudaPol,
+    const PosTileVec& verts,const zs::SmallString& xtag,const zs::SmallString& vtag,T dt, // xtag / vtag: position / velocity channels of verts read below; dt divides the damping part of H
+    const FPCollisionBuffer& fp_collision_buffer,// all the fp collision pairs; "inds" (4 ints stored as T bits) and "area" are read
+    GradHessianTileVec& gh_buffer,int offset, // output: pair cpi is written to gh_buffer entry offset + cpi
     T in_collisionEps,T out_collisionEps,
     T collisionStiffness,
-    T mu,T lambda) {
+    T mu,T lambda,T kd_theta) { // kd_theta: damping factor, C = K * kd_theta
         using namespace zs;
         constexpr auto space = execspace_e::cuda;
-        TILEVEC_OPS::fill<12*12>(cudaPol,cptemp,"H",zs::vec<T,12*12>::zeros());
-        TILEVEC_OPS::fill<12>(cudaPol,cptemp,"grad",zs::vec<T,12>::zeros());  
-        // TILEVEC_OPS::fill(cudaPol,cptemp,"area",(T)0.0);
-
-#if 0
-        int nm_points = cptemp.size() / MAX_FP_COLLISION_PAIRS;
-        cudaPol(zs::range(nm_points),
-            [verts = proxy<space>({},verts),xtag,
-                cptemp = proxy<space>({},cptemp),
+ 
+        int start = offset; // first gh_buffer entry addressed by this call
+        int fp_size = fp_collision_buffer.size(); // number of candidate pairs
+
+        TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"H",(T)0.0,start,fp_size); // NOTE(review): presumably zeroes fp_size entries beginning at start - confirm fill_range's argument meaning
+        TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"grad",(T)0.0,start,fp_size); // pairs skipped by the kernel below are never written there and keep this fill value
+
+        // std::cout << "inds size compair : " << fp_collision_buffer.getChannelSize("inds") << "\t" << gh_buffer.getChannelSize("inds") << std::endl;
+
+        TILEVEC_OPS::copy(cudaPol,fp_collision_buffer,"inds",gh_buffer,"inds",start); // carry the pair indices over to gh_buffer; NOTE(review): last argument looks like the destination offset - confirm
+
+        cudaPol(zs::range(fp_size),
+            [verts = proxy<space>({},verts),xtag,vtag,dt,kd_theta,
+                fp_collision_buffer = proxy<space>({},fp_collision_buffer),
+                gh_buffer = proxy<space>({},gh_buffer),
                 in_collisionEps = in_collisionEps,
                 out_collisionEps = out_collisionEps,
                 stiffness = collisionStiffness,
-                mu = mu,lam = lambda] ZS_LAMBDA(int pi) mutable {
-            for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i)  {
-                auto inds = cptemp.template pack<4>("inds",pi * MAX_FP_COLLISION_PAIRS + i).reinterpret_bits(int_c);
+                mu = mu,lam = lambda,start = start] ZS_LAMBDA(int cpi) mutable {
+                auto inds = fp_collision_buffer.template pack<4>("inds",cpi).reinterpret_bits(int_c); // indices are stored bit-cast in T; any negative index makes the pair return early below
                 for(int j = 0;j != 4;++j)
                     if(inds[j] < 0)
                         return;
-
-                for(int j = 0;j != 4;++j){
-                    auto active = verts("active",inds[j]);
-                    if(active < 1e-6)
-                        return;
-                }
                 vec3 cv[4] = {};
                 for(int j = 0;j != 4;++j)
                     cv[j] = verts.template pack<3>(xtag,inds[j]);
             
-                
+                // auto is_inverted = reinterpret_bits<int>(fp_collision_buffer("inverted",cpi));
+                // auto ceps = is_inverted ? in_collisionEps : out_collisionEps;
 
-                auto is_inverted = reinterpret_bits<int>(cptemp("inverted",pi * MAX_FP_COLLISION_PAIRS + i));
-                auto ceps = is_inverted ? in_collisionEps : out_collisionEps;
+                auto ceps = out_collisionEps; // in_collisionEps is unused while the inverted-dependent choice above stays commented out
+                // ceps += (T)1e-2 * ceps;
 
                 auto alpha = stiffness;
-                auto beta = cptemp("area",pi * MAX_FP_COLLISION_PAIRS + i);
-                cptemp.template tuple<12>("grad",pi * MAX_FP_COLLISION_PAIRS + i) = alpha * beta * VERTEX_FACE_SQRT_COLLISION::gradient(cv,mu,lam,ceps);
-                cptemp.template tuple<12*12>("H",pi * MAX_FP_COLLISION_PAIRS + i) = alpha * beta * VERTEX_FACE_SQRT_COLLISION::hessian(cv,mu,lam,ceps);
-            }
+                auto beta = fp_collision_buffer("area",cpi); // per-pair "area" value used as a weight
+          
+                auto cforce = -alpha * beta * VERTEX_FACE_SQRT_COLLISION::gradient(cv,mu,lam,ceps); // negated, scaled energy gradient (a force)
+                auto K = alpha * beta * VERTEX_FACE_SQRT_COLLISION::hessian(cv,mu,lam,ceps); // scaled energy Hessian
+
+                // gh_buffer.template tuple<12>("grad",cpi + start) = -alpha * beta * VERTEX_FACE_SQRT_COLLISION::gradient(cv,mu,lam,ceps);
+                // gh_buffer.template tuple<12*12>("H",cpi + start) =  alpha * beta * VERTEX_FACE_SQRT_COLLISION::hessian(cv,mu,lam,ceps); 
+                
+                
+                // Rayleigh damping term: C = K * kd_theta adds the force -C * vel and the Hessian contribution C / dt
+                vec3 v0[4] = {verts.pack(dim_c<3>,vtag, inds[0]),
+                verts.pack(dim_c<3>,vtag, inds[1]),
+                verts.pack(dim_c<3>,vtag, inds[2]),
+                verts.pack(dim_c<3>,vtag, inds[3])}; 
+                auto vel = COLLISION_UTILS::flatten(v0); 
+
+                auto C = K * kd_theta;
+                auto dforce = -C * vel;
+                gh_buffer.template tuple<12>("grad",cpi + start) = cforce + dforce; // note: the "grad" channel receives force (negated gradient) plus damping force
+                gh_buffer.template tuple<12*12>("H",cpi + start) = K + C/dt;
         });
-#else
-        cudaPol(zs::range(cptemp.size()),
-            [verts = proxy<space>({},verts),xtag,
-                cptemp = proxy<space>({},cptemp),
+
+}
+
+// Collision of kinematic points (kverts) against surface triangles of verts: force/Hessian entries of the 3 triangle vertices are atomically added to the gh_buffer entry of the tet given by the triangle's "ft_inds". NOTE(review): the former TODO "add damping collision term" looks addressed by the kd_theta term below - confirm.
+template<typename Pol,
+    typename TetTileVec,
+    typename PosTileVec,
+    typename SurfTriTileVec,
+    typename FPCollisionBuffer,
+    typename GradHessianTileVec>
+void evaluate_kinematic_fp_collision_grad_and_hessian(
+    Pol& cudaPol,
+    const TetTileVec& eles, // "inds" is read as the 4 vertex ids of each element
+    const PosTileVec& verts,const zs::SmallString& xtag,const zs::SmallString& vtag,T dt, // xtag / vtag: position / velocity channels of verts; dt divides the damping part of the Hessian
+    const SurfTriTileVec& tris, // "inds" (3 vertex ids) and "ft_inds" (index into eles) are read
+    const PosTileVec& kverts, // kinematic points; only channel "x" is read
+    const FPCollisionBuffer& kc_buffer, // per pair: "inds" = (kverts index, tris index) and "area"
+    GradHessianTileVec& gh_buffer,int offset, // accumulated in place; offset is copied to start but is not used for addressing by the live code
+    T in_collisionEps,T out_collisionEps,
+    T collisionStiffness,
+    T mu,T lambda,T kd_theta) { // kd_theta: damping factor, C = cH * kd_theta
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+
+        int start = offset;
+        int fp_size = kc_buffer.size(); // number of candidate pairs
+
+        // TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"H",(T)0.0,start,fp_size);
+        // TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"grad",(T)0.0,start,fp_size);
+
+        // get only the dynamic object's dofs
+        // TILEVEC_OPS::copy(cudaPol,kc_buffer,"inds",gh_buffer,"inds",start);
+        // cudaPol(zs::range(fp_size),
+        //     [gh_buffer = proxy<space>({},gh_buffer),start = start] ZS_LAMBDA(int fpi) mutable {
+        //         gh_buffer("inds",0,start + fpi) = gh_buffer("inds",1,start + fpi);
+        //         auto tmp = gh_buffer("inds",2,start + fpi);
+        //         gh_buffer("inds",2,start + fpi) = gh_buffer("inds",3,start + fpi);
+        //         gh_buffer("inds",3,start + fpi) = tmp;
+        // });
+
+
+        cudaPol(zs::range(fp_size),
+            [verts = proxy<space>({},verts),xtag,vtag,dt,kd_theta,
+                eles = proxy<space>({},eles),
+                tris = proxy<space>({},tris),
+                kverts = proxy<space>({},kverts),
+                kc_buffer = proxy<space>({},kc_buffer),
+                gh_buffer = proxy<space>({},gh_buffer),start,
                 in_collisionEps = in_collisionEps,
                 out_collisionEps = out_collisionEps,
                 stiffness = collisionStiffness,
                 mu = mu,lam = lambda] ZS_LAMBDA(int cpi) mutable {
-                auto inds = cptemp.template pack<4>("inds",cpi).reinterpret_bits(int_c);
-                for(int j = 0;j != 4;++j)
-                    if(inds[j] < 0)
+                auto inds = kc_buffer.pack(dim_c<2>,"inds",cpi).reinterpret_bits(int_c); // inds[0] indexes kverts, inds[1] indexes tris; stored bit-cast in T
+                // auto oinds = kc_buffer.pack(dim_c<4>,"inds",cpi).reinterpret_bits(int_c);
+                for(int i = 0;i != 2;++i)
+                    if(inds[i] < 0) // a negative index makes this pair a no-op
                         return;
                 vec3 cv[4] = {};
-                for(int j = 0;j != 4;++j)
-                    cv[j] = verts.template pack<3>(xtag,inds[j]);
-            
-                auto is_inverted = reinterpret_bits<int>(cptemp("inverted",cpi));
-                // auto ceps = is_inverted ? in_collisionEps : out_collisionEps;
+                cv[0] = kverts.pack(dim_c<3>,"x",inds[0]); // slot 0: the kinematic point
+                auto tri = tris.pack(dim_c<3>,"inds",inds[1]).reinterpret_bits(int_c);
+                for(int j = 1;j != 4;++j)
+                    cv[j] = verts.template pack<3>(xtag,tri[j-1]); // slots 1..3: the triangle's vertices
+                
+                // vec3 cvel[4] = {};
+                // cvel[0] = vec3::zeros();
+                // for(int j = 1;j != 4;++j)
+                //     cvel[j] = verts.template pack<3>(vel_tag,inds[j]);
+
+                // auto is_inverted = reinterpret_bits<int>(kc_buffer("inverted",cpi));
+                auto average_thickness = (T)0.0;
+                if(verts.hasProperty("k_thickness")){ // optional channel; without it the thickness stays 0
+                    // average_thickness = (T)0.0;
+                    for(int i = 0;i != 3;++i)
+                        average_thickness += verts("k_thickness",tri[i])/(T)3.0; // mean over the three triangle vertices
+                }
 
-                auto ceps = out_collisionEps;
-                // ceps += (T)1e-2 * ceps;
 
+                auto ceps = out_collisionEps * ((T)1.0 + average_thickness); // in_collisionEps is captured but not used in this kernel
                 auto alpha = stiffness;
-                auto beta = cptemp("area",cpi);
-          
-#if 0
-                cptemp.template tuple<12>("grad",cpi) = alpha * beta * VERTEX_FACE_COLLISION::gradient(cv,mu,lam,out_collisionEps);
-                cptemp.template tuple<12*12>("H",cpi) = alpha * beta * VERTEX_FACE_COLLISION::hessian(cv,mu,lam,out_collisionEps);
-#else
-                cptemp.template tuple<12>("grad",cpi) = -alpha * beta * VERTEX_FACE_SQRT_COLLISION::gradient(cv,mu,lam,ceps);
-                cptemp.template tuple<12*12>("H",cpi) = alpha * beta * VERTEX_FACE_SQRT_COLLISION::hessian(cv,mu,lam,ceps); 
-#endif
+                auto beta = kc_buffer("area",cpi); // per-pair "area" value used as a weight
+
+                // the trailing `true` is gradient()'s collide_from_inside flag; NOTE(review): hessian() presumably takes the same flag - confirm
+
+                auto cgrad = -alpha * beta * VERTEX_FACE_SQRT_COLLISION::gradient(cv,mu,lam,ceps,true); // negated, scaled energy gradient (a force)
+                auto cH = alpha * beta * VERTEX_FACE_SQRT_COLLISION::hessian(cv,mu,lam,ceps,true);
+
+                auto ei = reinterpret_bits<int>(tris("ft_inds",inds[1])); // element index taken from the triangle's "ft_inds"
+                // auto cp = gh_buffer.pack(dim_c<2>,"inds",ei).reinterpret_bits(int_c);
+                // auto pidx = cp[0];
+                // auto tri = tris.pack(dim_c<3>,"inds",cp[1]).reinterpret_bits(int_c);
+                auto tet = eles.pack(dim_c<4>,"inds",ei).reinterpret_bits(int_c);
+                auto inds_reorder = zs::vec<int,3>::zeros(); // local slot (0..3) inside tet of each triangle vertex; stays 0 if a vertex is not found in tet
+                for(int i = 0;i != 3;++i){
+                    auto idx = tri[i];
+                    for(int j = 0;j != 4;++j)
+                        if(idx == tet[j])
+                            inds_reorder[i] = j;
+                }
 
-                // printf("cpi[%d] : %f %f %f\n",cpi,(float)alpha,(float)beta,(float)cptemp.template pack<12>("grad",cpi).norm());   
+                vec3 v0[4] = {zs::vec<T,3>::zeros(), // the kinematic point enters the damping term with zero velocity
+                verts.pack(dim_c<3>,vtag, tri[0]),
+                verts.pack(dim_c<3>,vtag, tri[1]),
+                verts.pack(dim_c<3>,vtag, tri[2])}; 
+                auto vel = COLLISION_UTILS::flatten(v0);
+
+                auto C = cH * kd_theta; // damping matrix proportional to the collision Hessian
+                auto dforce = -C * vel;
+
+                cgrad += dforce;
+                cH += C/dt;
+
+                // gh_buffer.template tuple<12>("grad",cpi + start) = cforce + dforce;
+                // gh_buffer.template tuple<12*12>("H",cpi + start) = K + C/dt;
+
+                for(int i = 3;i != 12;++i){ // entries 0..2 belong to the kinematic point and are not scattered
+                    int d0 = i % 3;
+                    int row = inds_reorder[i/3 - 1]*3 + d0; // row inside the tet's 12-entry layout
+                    atomic_add(exec_cuda,&gh_buffer("grad",row,ei),cgrad[i]); // accumulates into entry ei, not into offset + cpi
+                    for(int j = 3;j != 12;++j){
+                        int d1 = j % 3;
+                        int col = inds_reorder[j/3 - 1]*3 + d1;
+                        if(row >= 12 || col >= 12){ // diagnostic only: the atomic_add below still executes
+                            printf("invalid row = %d and col = %d %d %d detected %d %d %d\n",row,col,i/3,j/3,
+                                inds_reorder[0],
+                                inds_reorder[1],
+                                inds_reorder[2]);
+                        }
+                        atomic_add(exec_cuda,&gh_buffer("H",row*12 + col,ei),cH(i,j)); // H is addressed as a flattened 12x12 block, row*12 + col
+                    }                    
+                }
+                // for(int i = 1;i != 4;++i){ 
+                //     auto idx = inds[i];
+                //     for(int j = 0;j != 4;++j){
+                //         if(idx == tet[j]) {
+                //             for(int d = 0;d != 3;++d)
+                //                 atomic_add(exec_cuda,&gh_buffer("grad",j*3 + d,ei),cgrad[i * 3 + d]);
+                //         }
+                //     }
+                    
+                //     gh_buffer("grad",i,cpi + start) = cgrad[i];
 
+                // }
+                // for(int i = 3;i != 12;++i)
+                //     for(int j = 3;j != 12;++j)
+                //         gh_buffer("H",i * 12 + j,cpi + start) = cH(i,j);
+                // auto test_ind = gh_buffer.pack(dim_c<4>,"inds",start + cpi).reinterpret_bits(int_c);
+                // auto cgrad_norm = cgrad.norm();
+                // auto cH_norm = cH.norm();
+                // printf("find_kinematic_collision[%d %d %d %d] : %f %f\n",inds[0],inds[1],inds[2],inds[3],(float)alpha,(float)beta);
         });
+}
 
-#endif
-
-    }
-
-
-// template<int MAX_FP_COLLISION_PAIRS,
-//             typename Pol,
-//             typename SurfPointTileVec,
-//             typename SurfLineTileVec,
-//             typename SurfTriTileVec,
-//             typename PosTileVec,
-//             typename CellPointTileVec,
-//             typename CellBisectorTileVec,
-//             typename CellTriTileVec,
-//             typename FPCollisionBuffer>
-// void evaluate_collision_grad_and_hessian(Pol& cudaPol,
-//     const PosTileVec& verts,
-//     const zs::SmallString& xtag,
-//     const SurfPointTileVec& points,
-//     const SurfLineTileVec& lines,
-//     const SurfTriTileVec& tris,
-//     CellPointTileVec& sptemp,
-//     CellBisectorTileVec& setemp,
-//     CellTriTileVec& sttemp,
-//     FPCollisionBuffer& cptemp,
-//     T cellBvhThickness,
-//     T collisionEps,
+// template<typename Pol,
+//     typename PosTileVec,
+//     typename EECollisionBuffer,
+//     typename GradHessianTileVec>
+// void evaluate_ee_collision_grad_and_hessian(Pol& cudaPol,
+//     const PosTileVec& verts,const zs::SmallString& xtag,
+//     const EECollisionBuffer& ee_collision_buffer,
+//     GradHessianTileVec& gh_buffer,int offset,
+//     T in_collisionEps,T out_collisionEps,
 //     T collisionStiffness,
 //     T mu,T lambda) {
 //         using namespace zs;
 //         constexpr auto space = execspace_e::cuda;
-//         TILEVEC_OPS::fill<12*12>(cudaPol,cptemp,"H",zs::vec<T,12*12>::zeros());
-//         TILEVEC_OPS::fill<3>(cudaPol,sttemp,"grad",zs::vec<T,3>::zeros());
-//         TILEVEC_OPS::fill<3>(cudaPol,sptemp,"grad",zs::vec<T,3>::zeros());
-
-//         cudaPol(zs::range(points.size()),
-//             [   collisionEps = collisionEps,
-//                 cellBvhThickness = cellBvhThickness,
-//                 verts = proxy<space>({},verts),
-//                 sttemp = proxy<space>({},sttemp),
-//                 setemp = proxy<space>({},setemp),
-//                 sptemp = proxy<space>({},sptemp),
-//                 cptemp = proxy<space>({},cptemp),
-//                 points = proxy<space>({},points),
-//                 lines = proxy<space>({},lines),
-//                 tris = proxy<space>({},tris),
-//                 stbvh = proxy<space>(stbvh),xtag,
-//                 collisionStiffness = collisionStiffness,
-//                 mu = mu,lambda = lambda] ZS_LAMBDA(int pi) mutable {
-
-//             auto vi = reinterpret_bits<int>(points("inds",pi));
-//             auto p = verts.template pack<3>(xtag,vi);
-//             auto bv = bv_t{get_bounding_box(p - cellBvhThickness, p + cellBvhThickness)};
-
-//             vec3 collision_verts[4] = {};
-//             collision_verts[0] = p;
-
-//             int nm_collision_pairs = 0;
-//             auto process_vertex_face_collision_pairs = [&](int stI) {
-//                 if(nm_collision_pairs >= MAX_FP_COLLISION_PAIRS)     
-//                     return;   
-
-//                 auto tri = tris.pack(dim_c<3>, "inds",stI).reinterpret_bits(int_c);
-//                 if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
-//                     return;
-
-//                 collision_verts[1] = verts.template pack<3>(xtag,tri[0]);
-//                 collision_verts[2] = verts.template pack<3>(xtag,tri[1]);
-//                 collision_verts[3] = verts.template pack<3>(xtag,tri[2]);
-
-//                 // check whether the triangle is degenerate
-//                 auto restArea = tris("area",stI);
 
-//                 const auto e10 = collision_verts[2] - collision_verts[1];
-//                 const auto e20 = collision_verts[3] - collision_verts[1];
-//                 auto deformedArea = (T)0.5 * e10.cross(e20).norm();
-//                 const T degeneracyEps = 1e-4;
-//                 // skip the degenerate triangles
-//                 const T relativeArea = deformedArea / (restArea + (T)1e-6);
-//                 if(relativeArea < degeneracyEps)
-//                     return;
-
-//                 bool collide = false;
-
-//                 if(COLLISION_UTILS::is_inside_the_cell(verts,xtag,
-//                         lines,tris,
-//                         sttemp,"nrm",
-//                         setemp,"nrm",
-//                         stI,p,collisionEps)) {
-//                     collide = true;
+//         int start = offset;
+//         int ee_size = ee_collision_buffer.size();
+
+//         TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"H",(T)0.0,start,ee_size);
+//         TILEVEC_OPS::fill_range(cudaPol,gh_buffer,"grad",(T)0.0,start,ee_size);
+//         TILEVEC_OPS::copy(cudaPol,ee_collision_buffer,"inds",gh_buffer,"inds",start);
+
+//         cudaPol(zs::range(ee_size),[
+//             verts = proxy<space>({},verts),xtag,
+//             in_collisionEps,out_collisionEps,
+//             ee_collision_buffer = proxy<space>({},ee_collision_buffer),
+//             gh_buffer = proxy<space>({},gh_buffer),
+//             start = start,
+//             stiffness = collisionStiffness,mu = mu,lam = lambda] ZS_LAMBDA(int eei) mutable {
+//                 auto inds = ee_collision_buffer.template pack<4>("inds",eei).reinterpret_bits(int_c);
+//                 for(int i = 0;i != 4;++i)
+//                     if(inds[i] < 0)
+//                         return;
+//                 for(int j = 0;j != 4;++j){
+//                     auto active = verts("active",inds[j]);
+//                     if(active < 1e-6)
+//                         return;
+//                 }  
+//                 vec3 cv[4] = {};
+//                 for(int j = 0;j != 4;++j)
+//                     cv[j] = verts.template pack<3>(xtag,inds[j]);       
+
+//                 auto is_inverted = reinterpret_bits<int>(ee_collision_buffer("inverted",eei));
+//                 auto ceps = is_inverted ? in_collisionEps : out_collisionEps;
+
+//                 auto alpha = stiffness;
+//                 auto beta = ee_collision_buffer("area",eei);
+
+//                 auto a = ee_collision_buffer.template pack<2>("abary",eei);
+//                 auto b = ee_collision_buffer.template pack<2>("bbary",eei);
+
+//                 const T tooSmall = (T)1e-6;
+
+//                 if(is_inverted) {
+//                     gh_buffer.template tuple<12>("grad",eei + start) = -alpha * beta * EDGE_EDGE_SQRT_COLLISION::gradientNegated(cv,a,b,mu,lam,ceps,tooSmall);
+//                     gh_buffer.template tuple<12*12>("H",eei + start) = alpha * beta * EDGE_EDGE_SQRT_COLLISION::hessianNegated(cv,a,b,mu,lam,ceps,tooSmall);
+//                     // gh_buffer.template tuple<12>("grad",eei + start) = -alpha * beta * EDGE_EDGE_COLLISION::gradientNegated(cv,a,b,mu,lam,ceps);
+//                     // gh_buffer.template tuple<12*12>("H",eei + start) = alpha * beta * EDGE_EDGE_COLLISION::hessianNegated(cv,a,b,mu,lam,ceps);
+//                 }else {
+//                     gh_buffer.template tuple<12>("grad",eei + start) = -alpha * beta * EDGE_EDGE_SQRT_COLLISION::gradient(cv,a,b,mu,lam,ceps,tooSmall);
+//                     gh_buffer.template tuple<12*12>("H",eei + start) = alpha * beta * EDGE_EDGE_SQRT_COLLISION::hessian(cv,a,b,mu,lam,ceps,tooSmall);  
+//                     // gh_buffer.template tuple<12>("grad",eei + start) = -alpha * beta * EDGE_EDGE_COLLISION::gradient(cv,a,b,mu,lam,ceps);
+//                     // gh_buffer.template tuple<12*12>("H",eei + start) = alpha * beta * EDGE_EDGE_COLLISION::hessian(cv,a,b,mu,lam,ceps);                  
 //                 }
-
-//                 if(!collide)
-//                     return;
-
-//                 auto vertexFaceCollisionAreas = tris("area",stI) + points("area",pi);
-
-//                 auto grad = collisionStiffness * VERTEX_FACE_SQRT_COLLISION::gradient(collision_verts,mu,lambda,collisionEps) * vertexFaceCollisionAreas;
-//                 auto hessian = collisionStiffness * VERTEX_FACE_SQRT_COLLISION::hessian(collision_verts,mu,lambda,collisionEps) * vertexFaceCollisionAreas;
-                
-
-//                 cptemp.template tuple<4>("inds",pi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = zs::vec<int,4>(vi,tri[0],tri[1],tri[2]).template reinterpret_bits<T>();      
-//                 cptemp.template tuple<12*12>("H",pi * MAX_FP_COLLISION_PAIRS + nm_collision_pairs) = hessian;
-//                 // auto pf = zs::vec<T,3>{grad[0],grad[1],grad[2]};    
-//                 zs::vec<T,3> tf[3] = {};
-//                 for(int j = 0;j != 3;++j)
-//                     tf[j] = zs::vec<T,3>{grad[j * 3 + 3 + 0],grad[j * 3 + 3 + 1],grad[j * 3 + 3 + 2]};     
-
-//                 // auto avgtf = (tf[0] + tf[1] + tf[2])/(T)3.0;
-//                 auto avgtf = (tf[0] + tf[1] + tf[2]);
-//                 for(int j = 0;j != 3;++j)
-//                     atomic_add(exec_cuda,&sttemp("grad",j,stI),avgtf[j]);
-
-
-//                 auto fp_inds = tris.template pack<3>("fp_inds",stI).reinterpret_bits(int_c);
-//                 for(int j = 0;j != 3;++j){
-//                     atomic_add(exec_cuda,&sptemp("grad",j,pi),grad[j]);
-//                     for(int k = 0;k != 3;++k)   {
-//                         auto fp_idx = fp_inds[k];
-//                         atomic_add(exec_cuda,&sptemp("grad",j,fp_idx),tf[k][j]);
-//                     }
-//                 }   
-
-//                 nm_collision_pairs++;                   
-//             };
-//             stbvh.iter_neighbors(bv,process_vertex_face_collision_pairs);                
 //         });
 //     }
 
+
 };
 
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/fem/collision_energy/vertex_face_sqrt_collision.hpp b/projects/CuLagrange/fem/collision_energy/vertex_face_sqrt_collision.hpp
index 6e77d354cf..cf398350fc 100644
--- a/projects/CuLagrange/fem/collision_energy/vertex_face_sqrt_collision.hpp
+++ b/projects/CuLagrange/fem/collision_energy/vertex_face_sqrt_collision.hpp
@@ -35,14 +35,14 @@ namespace VERTEX_FACE_SQRT_COLLISION {
     ///////////////////////////////////////////////////////////////////////
     constexpr REAL psi(const VECTOR3 v[4],const REAL& _mu,const REAL& _nu,const REAL& _eps)
     {
-        const VECTOR3 bary = getInsideBarycentricCoordinates(v);
+        const VECTOR3 bary = LSL_GEO::getInsideBarycentricCoordinates(v); // barycentric coordinates derived from v, then handed to the bary-taking psi overload
         return psi(v,bary,_mu,_nu,_eps);
     }
 
 ///////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 //  our normal pointing outward
-    constexpr VECTOR12 gradient(const VECTOR3 v[4], const VECTOR3& bary,const REAL& _mu,const REAL& _nu,const REAL& _eps)
+    constexpr VECTOR12 gradient(const VECTOR3 v[4], const VECTOR3& bary,const REAL& _mu,const REAL& _nu,const REAL& _eps,bool collide_from_inside = false)
     {
         // REAL _inverseEps = 1e-6;
         using DREAL = double;
@@ -51,7 +51,9 @@ namespace VERTEX_FACE_SQRT_COLLISION {
         e[0] = v[3] - v[2];
         e[1] = v[0] - v[2];
         e[2] = v[1] - v[2]; 
-        const bool reversal = !reverse(v,e);
+        bool reversal = !reverse(v,e);
+        if(collide_from_inside)
+            reversal = !reversal;
         
         // remember we had to reorder vertices in a wonky way
         const VECTOR3 xs = bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3];
@@ -95,15 +97,15 @@ namespace VERTEX_FACE_SQRT_COLLISION {
 
     ///////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////
-    constexpr VECTOR12 gradient(const VECTOR3 v[4],const REAL& _mu,const REAL& _nu,const REAL& _eps)
+    constexpr VECTOR12 gradient(const VECTOR3 v[4],const REAL& _mu,const REAL& _nu,const REAL& _eps,bool collide_from_inside = false)
     {
-        const VECTOR3 bary = getInsideBarycentricCoordinates(v);
-        return gradient(v, bary, _mu, _nu, _eps);
+        const VECTOR3 bary = LSL_GEO::getInsideBarycentricCoordinates(v); // barycentric coordinates derived from v itself
+        return gradient(v, bary, _mu, _nu, _eps,collide_from_inside); // delegate to the bary-taking overload; collide_from_inside is passed through unchanged
     }
 
     ///////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////
-    constexpr MATRIX12 hessian(const VECTOR3 v[4], const VECTOR3& bary,const REAL& _mu,const REAL& _nu,const REAL& _eps)
+    constexpr MATRIX12 hessian(const VECTOR3 v[4], const VECTOR3& bary,const REAL& _mu,const REAL& _nu,const REAL& _eps,bool collide_from_inside = false)
     {
         // REAL _inverseEps = 1e-6;
 
@@ -112,8 +114,12 @@ namespace VERTEX_FACE_SQRT_COLLISION {
         e[0] = v[3] - v[2];
         e[1] = v[0] - v[2];
         e[2] = v[1] - v[2]; 
-        const bool reversal = !reverse(v,e);
-        
+        bool reversal = !reverse(v,e);
+        if(collide_from_inside)
+            reversal = !reversal;
+
+#if 0
+
         using DREAL = double;
 
         // remember we had to reorder vertices in a wonky way
@@ -142,14 +148,17 @@ namespace VERTEX_FACE_SQRT_COLLISION {
 
         alpha = alpha > 0 ? alpha : 0;
         beta = beta > 0 ? beta : 0;
+        return (REAL)2.0 * _mu * ((REAL)alpha * (zs::dyadic_prod(productn,productn)) + (REAL)beta * tDiff.transpose() * tDiff);
+        // auto H = (REAL)2.0 * _mu * ((REAL)alpha * (zs::dyadic_prod(productn,productn)) + (REAL)beta * tDiff.transpose() * tDiff);
+        // make_pd(H);
+        // return H; 
+
 
         // return (REAL)2.0 * _mu * (((REAL)1.0 / tDott - springDiff / (tDott * tMagnitude)) * (zs::dyadic_prod(product,product)) +
         //                     (springDiff / tMagnitude) * tDiff.transpose() * tDiff); 
 
         // return (REAL)2.0 * _mu * (alpha * (zs::dyadic_prod(product,product)) + beta * tDiff.transpose() * tDiff); 
 
-        return (REAL)2.0 * _mu * ((REAL)alpha * (zs::dyadic_prod(productn,productn)) + (REAL)beta * tDiff.transpose() * tDiff);
-
         // could instead try to trap all the inverses and hand back something fixed up,
         // but consistency is not guaranteed, so let's just zero it out at the first
         // sign of trouble
@@ -157,16 +166,85 @@ namespace VERTEX_FACE_SQRT_COLLISION {
         //const REAL tDottInv = (zs::abs(tDott) > _inverseEps) ? 1.0 / tDott : 1.0;
         //return 2.0 * _mu * ((tDottInv - springDiff / (tDott * tMagnitude)) * (product * product.transpose()) +
         //                    (springDiff * tMagnitudeInv) * tDiff.transpose() * tDiff); 
+
+#else
+        const VECTOR3 xs = bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3];
+        const VECTOR3 t = v[0] - xs;  
+
+        const REAL tDott = t.dot(t);
+        const REAL tMagnitude = zs::sqrt(tDott);
+
+
+        const REAL springDiff = (reversal) ? tMagnitude + _eps : tMagnitude - _eps;
+        const MATRIX3x12 tDiff = tDiffPartial(bary); 
+
+        // get the spring length, non-zero rest-length
+        const VECTOR12 product = tDiff.transpose() * t;
+
+        auto res =  (REAL)2.0 * _mu * (((REAL)1.0 / tDott - springDiff / (tDott * tMagnitude)) * (zs::dyadic_prod(product,product)) + (springDiff / tMagnitude) * tDiff.transpose() * tDiff);   
+        make_pd(res);
+        return res;    
+
+#endif
     }
 
     ///////////////////////////////////////////////////////////////////////
     ///////////////////////////////////////////////////////////////////////
-    constexpr MATRIX12 hessian(const VECTOR3 v[4],const REAL& _mu,const REAL& _nu,const REAL& _eps)
+    constexpr MATRIX12 hessian(const VECTOR3 v[4],const REAL& _mu,const REAL& _nu,const REAL& _eps,bool collide_from_inside = false)
     {
-        const VECTOR3 bary = getInsideBarycentricCoordinates(v);
-        return hessian(v, bary,_mu,_nu,_eps);
+        const VECTOR3 bary = LSL_GEO::getInsideBarycentricCoordinates(v); // barycentric coordinates derived from v itself
+        return hessian(v, bary,_mu,_nu,_eps,collide_from_inside); // delegate to the bary-taking overload; collide_from_inside is passed through unchanged
     }
 
+    // constexpr VECTOR12 damp_gradient(const VECTOR v[4],const VECTOR vp[4],const REAL& _dt, const VECTOR3& bary,const REAL& _kd,const REAL& _mu,const REAL& _nu,const REAL& eps){
+    //     using DREAL = double;
+
+    //     // const VECTOR3 vs = bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3];
+    //     const VECTOR3 t = v[0] - (bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3]);// relative position
+    //     // const VECTOR3 vps = bary[0] * vp[1] + bary[1] * vp[2] + bary[2] * vp[3];
+    //     const VECTOR3 tp = vp[0] - (bary[0] * vp[1] + bary[1] * vp[2] + bary[2] * vp[3]);// previous relative position
+    //     const VECTOR3 vel_t = (t - tp) / _dt;// relative velocity
+
+    //     const MATRIX3x12 tDiff = tDiffPartial(bary); 
+    //     const auto tn = t.template cast<DREAL>().normalized().template cast<REAL>();
+
+    //     const DREAL project_vel_t = vel_t.dot(tn);
+    //     return (REAL)2.0 * _mu * _kd * (REAL)project_vel_t * tDiff.transpose() * tn;
+    // }
+
+    // constexpr VECTOR12 damp_gradient(const VECTOR v[4],const VECTOR vp[4],const REAL& _dt,const REAL& _kd,const REAL& _mu,const REAL& _nu,const REAL& eps)
+    // {
+    //     const VECTOR3 bary = LSL_GEO::getInsideBarycentricCoordinates(v);
+    //     return damp_gradient(v, vp,_dt,bary,_kd, _mu,_nu,eps);
+    // }
+
+    // const MATRIX12 damp_hessian(const VECTOR v[4],const VECTOR vp[4],const REAL& _dt, const VECTOR3& bary,const REAL& _kd,const REAL& _mu,const REAL& _nu,const REAL& eps) {
+    //     using DREAL = double;
+
+    //     // const VECTOR3 vs = bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3];
+    //     const VECTOR3 t = v[0] - (bary[0] * v[1] + bary[1] * v[2] + bary[2] * v[3]);// relative position
+    //     // const VECTOR3 vps = bary[0] * vp[1] + bary[1] * vp[2] + bary[2] * vp[3];
+    //     const VECTOR3 tp = vp[0] - (bary[0] * vp[1] + bary[1] * vp[2] + bary[2] * vp[3]);// previous relative position
+    //     const VECTOR3 vel_t = (t - tp) / _dt;// relative velocity
+
+    //     const MATRIX3x12 tDiff = tDiffPartial(bary); 
+    //     const REAL tDott = t.dot(t);
+    //     const REAL tMagnitude = zs::sqrt(tDott);
+
+    //     const VECTOR12 product = tDiff.transpose() * t;
+    //     const VECTOR12 vproduct = tDiff.transpose() * vel_t;
+
+    //     const DREAL project_vel_t = vel_t.dot(tn);
+
+    //     return (T)2.0 * mu * kd * (
+    //         ((T)1.0/_dt/tDott - (T)2.0*project_vel_t/tMagnitude/tDott)*zs::dyadic_prod(product,product) +
+    //             project_vel_t/tDott * zs::dyadic_prod(tDiff) +
+    //             (T)1.0/tDott * zs::dyadic_prod(product,product)
+    //     );
+
+    // }
+
+
 
 
 };
diff --git a/projects/CuLagrange/geometry/BaryCentricInterpolator.cu b/projects/CuLagrange/geometry/BaryCentricInterpolator.cu
index 68b240ff9c..c930cb7c2b 100644
--- a/projects/CuLagrange/geometry/BaryCentricInterpolator.cu
+++ b/projects/CuLagrange/geometry/BaryCentricInterpolator.cu
@@ -9,6 +9,9 @@
 #include <zeno/types/PrimitiveObject.h>
 #include <zeno/types/StringObject.h>
 
+#include "zensim/container/Bcht.hpp"
+#include "kernel/tiled_vector_ops.hpp"
+
 #include <iostream>
 
 namespace zeno{
@@ -19,6 +22,18 @@ using vec4 = zs::vec<T,4>;
 using mat3 = zs::vec<T,3,3>;
 using mat4 = zs::vec<T,4,4>;
 
+
+// Given a tetrahedral mesh and a set of points, compute each point's barycentric coordinates within the tetrahedral mesh elements
+struct ZSComputeBaryCentricWeights2 : INode {
+    void apply() override {
+        using namespace zs;
+        // TODO: placeholder - apply() currently performs no work
+
+    }
+};
+
+
+
 struct ZSComputeBaryCentricWeights : INode {
     void apply() override {
         using namespace zs;
@@ -28,48 +43,50 @@ struct ZSComputeBaryCentricWeights : INode {
 
         auto zsvolume = get_input<ZenoParticles>("zsvolume");
         auto zssurf = get_input<ZenoParticles>("zssurf");
+        auto mark_embed_elm = get_input2<int>("mark_elm");
         // the bvh of zstets
         // auto lbvh = get_input<zeno::LBvh>("lbvh");
         auto thickness = get_param<float>("bvh_thickness");
         auto fitting_in = get_param<int>("fitting_in");
 
         auto bvh_channel = get_param<std::string>("bvh_channel");
-        auto tag = get_param<std::string>("tag");
+        auto tag = get_input2<std::string>("tag");
 
-        const auto& verts = zsvolume->getParticles();
-        const auto& eles = zsvolume->getQuadraturePoints();
+        auto& verts = zsvolume->getParticles();
+        auto& eles = zsvolume->getQuadraturePoints();
 
         const auto& everts = zssurf->getParticles();
-        const auto& e_eles = zssurf->getQuadraturePoints();
+        // const auto& e_eles = zssurf->getQuadraturePoints();
 
         auto &bcw = (*zsvolume)[tag];
-        bcw = typename ZenoParticles::particles_t({{"inds",1},{"w",4},{"cnorm",1}},everts.size(),zs::memsrc_e::device,0);
+
+        bcw = typename ZenoParticles::particles_t({
+            {"X",3},
+            {"inds",1},
+            {"w",4},
+            {"strength",1},
+            {"cnorm",1}},everts.size(),zs::memsrc_e::device,0);
+        
+
+
+
+        // auto topo_tag = tag + std::string("_topo");
+        // auto &bcw_topo = (*zsvolume)[topo_tag];
+
+        // auto e_dim = e_eles.getPropertySize("inds");
+        // bcw_topo = typename ZenoParticles::particles_t({{"inds",e_dim}},e_eles.size(),zs::memsrc_e::device,0);
+
 
         auto cudaExec = zs::cuda_exec();
         const auto numFEMVerts = verts.size();
         const auto numFEMEles = eles.size();
         const auto numEmbedVerts = bcw.size();
-        const auto numEmbedEles = e_eles.size();
+        // const auto numEmbedEles = e_eles.size();
         constexpr auto space = zs::execspace_e::cuda;
 
-        // fmt::print("TRY COMPUTE BARYCENTRIC WEIGHTS\n");
-
-        // std::cout << "TRY COMPUTE BARYCENTRIC WEIGHTS" << std::endl;
-
-
-        // cudaExec(zs::range(eles.size()),
-        //     [eles = proxy<space>({},eles)] __device__(int ei) mutable {
-        //         auto quad = eles.template pack<4>("inds", ei).template reinterpret_bits<int>();
-        //         if(quad[0] < 0 || quad[1] < 0 || quad[2] < 0 || quad[3] < 0)
-        //             printf("invalid quad : %d %d %d %d\n",quad[0],quad[1],quad[2],quad[3]);
-        //         if(quad[0] > 13572 || quad[1] > 13572 || quad[2] > 13572 || quad[3] > 13572)
-        //             printf("invalid quad : %d %d %d %d\n",quad[0],quad[1],quad[2],quad[3]);
-        // });
+        TILEVEC_OPS::copy<3>(cudaExec,everts,"x",bcw,"X");
 
         compute_barycentric_weights(cudaExec,verts,eles,everts,"x",bcw,"inds","w",thickness,fitting_in);
-        // set_output("zsvolume", zsvolume);return;
-
-        // fmt::print("FINISH COMPUTING BARYCENTRIC WEIGHTS\n");
 
         cudaExec(zs::range(numEmbedVerts),
             [bcw = proxy<space>({},bcw),fitting_in] ZS_LAMBDA(int vi) mutable {
@@ -79,7 +96,13 @@ struct ZSComputeBaryCentricWeights : INode {
             }
         );
 
-        auto e_dim = e_eles.getPropertySize("inds");
+
+        // cudaExec(zs::range(e_eles.size()),[e_dim = e_dim,
+        //     e_eles = proxy<space>({},e_eles),bcw_topo = proxy<space>({},bcw_topo)] ZS_LAMBDA(int ei) mutable {
+        //         for(int i = 0;i != e_dim;++i)
+        //             bcw_topo("inds",i,ei) = e_eles("inds",i,ei);
+        // });
+
 
         cudaExec(zs::range(numEmbedVerts),
             [bcw = proxy<space>({},bcw)] ZS_LAMBDA (int vi) mutable {
@@ -94,41 +117,143 @@ struct ZSComputeBaryCentricWeights : INode {
                 nmEmbedVerts[ei] = (T)0.;
         });
 
-        if(e_dim !=3 && e_dim !=4) {
-            throw std::runtime_error("INVALID EMBEDDED PRIM TOPO");
+        // if(e_dim !=3 && e_dim !=4) {
+        //     throw std::runtime_error("INVALID EMBEDDED PRIM TOPO");
+        // }  
+
+        if(mark_embed_elm && everts.hasProperty("tag")){
+            eles.append_channels(cudaExec,{{"nmBones",1},{"bdw",1}});
+
+            cudaExec(zs::range(eles.size()),
+                [eles = proxy<space>({},eles)] ZS_LAMBDA(int elm_id) mutable{
+                    eles("nmBones",elm_id) = (T)0.0;
+                    eles("bdw",elm_id) = (T)1.0;
+            });  
+
+
+            auto nmBones = get_input2<int>("nmCpns");
+            using vec2i = zs::vec<int,2>;
+            using vec3i = zs::vec<int,3>;
+            bcht<vec2i, int, true, universal_hash<vec2i>, 32> ebtab{eles.get_allocator(), eles.size() * nmBones};
+            cudaExec(zs::range(bcw.size()),
+                [bcw = proxy<space>({},bcw),ebtab = proxy<space>(ebtab),everts = proxy<space>({},everts)] 
+                    ZS_LAMBDA(int vi) mutable{
+                        auto ei = reinterpret_bits<int>(bcw("inds",vi));
+                        if(ei < 0)
+                            return;
+                        else{
+                            int tag = (int)everts("tag",vi);
+                            ebtab.insert(vec2i{ei,tag});
+                        }
+            });
+
+            cudaExec(zs::range(eles.size()),
+                [eles = proxy<space>({},eles),ebtab = proxy<space>(ebtab),nmBones] ZS_LAMBDA(int ei) mutable {
+                    for(int i = 0;i != nmBones;++i) {
+                        auto res = ebtab.query(vec2i{ei,i});
+                        if(res < 0)
+                            continue;
+                        eles("nmBones",ei) += (T)1.0;
+                    }
+                    // if(eles("nmBones",ei) > 0)
+                        // printf("nmEmbedCmps[%d] : [%d]\n",ei,(int)eles("nmBones",ei));
+            });
+        }else {
+            eles.append_channels(cudaExec,{{"nmBones",1},{"bdw",1}});
+            cudaExec(zs::range(eles.size()),[
+                eles = proxy<space>({},eles)] ZS_LAMBDA(int ei) mutable {
+                    eles("bdw",ei) = (T)1.0;
+                    eles("nmBones",ei) = (T)1.0;
+            });
         }
 
         cudaExec(zs::range(bcw.size()),
-            [everts = proxy<space>({},everts),bcw = proxy<space>({},bcw),execTag = wrapv<space>{},nmEmbedVerts = proxy<space>(nmEmbedVerts)]
+            [everts = proxy<space>({},everts),
+                    bcw = proxy<space>({},bcw),
+                    execTag = wrapv<space>{},
+                    nmEmbedVerts = proxy<space>(nmEmbedVerts),
+                    eles = proxy<space>({},eles),
+                    verts = proxy<space>({},verts)]
                 ZS_LAMBDA (int vi) mutable {
                     using T = typename RM_CVREF_T(bcw)::value_type;
                     auto ei = reinterpret_bits<int>(bcw("inds",vi));
                     if(ei < 0)
                         return;
+                    auto tet = eles.pack(dim_c<3>,"inds",ei).reinterpret_bits(int_c);
                     atomic_add(execTag,&nmEmbedVerts[ei],(T)1.0);                  
         });
 
         cudaExec(zs::range(bcw.size()),
-            [bcw = proxy<space>({},bcw),nmEmbedVerts = proxy<space>(nmEmbedVerts)] 
+            [bcw = proxy<space>({},bcw),nmEmbedVerts = proxy<space>(nmEmbedVerts),eles = proxy<space>({},eles),everts = proxy<space>({},everts)] 
                 ZS_LAMBDA(int vi) mutable{
                     auto ei = reinterpret_bits<int>(bcw("inds",vi));
-                    if(ei < 0)
-                        bcw("cnorm",vi) = (T)0.0;
+                    if(everts.hasProperty("strength"))
+                        bcw("strength",vi) = everts("strength",vi);
                     else
-                        bcw("cnorm",vi) = (T)1.0/(T)nmEmbedVerts[ei];
+                        bcw("strength",vi) = (T)1.0;
+                    if(ei >= 0){
+                        auto alpha = (T)1.0/(T)nmEmbedVerts[ei];
+                        bcw("cnorm",vi) = (T)alpha;
+                        if(eles("nmBones",ei) > (T)1.5)
+                            eles("bdw",ei) = (T)0.0;
+                    }
+
+                    // if(ei < 0 || eles("nmBones",ei) > (T)1.5){
+                    //     // bcw("strength",vi) = (T)0.0;
+                    //     bcw("cnorm",vi) = (T)0.0;
+                    //     if(ei >= 0)
+                    //         eles("bdw",ei) = (T)0.0;
+                    // }
+                    // else{
+
+                    //     // bcw("cnorm",vi) = (T)1.0;
+                    // }
         });
 
+        
+        // we might also do some smoothing on cnorm
 
         set_output("zsvolume", zsvolume);
     }
 };
 
-ZENDEFNODE(ZSComputeBaryCentricWeights, {{{"interpolator","zsvolume"}, {"embed surf", "zssurf"}},
+ZENDEFNODE(ZSComputeBaryCentricWeights, {{{"interpolator","zsvolume"}, {"embed surf", "zssurf"},{"int","mark_elm","0"},{"int","nmCpns","1"},{"string","tag","skin"}},
                             {{"interpolator on gpu", "zsvolume"}},
-                            {{"float","bvh_thickness","0"},{"int","fitting_in","1"},{"string","tag","skin_bw"},{"string","bvh_channel","x"}},
+                            {{"float","bvh_thickness","0"},{"int","fitting_in","1"},{"string","bvh_channel","x"}},
                             {"ZSGeometry"}});
 
+// Copies the "X", "cnorm" and "strength" channels of the interpolator named by "interpolator_name" into a PrimitiveObject for viewing.
+struct VisualizeInterpolator : zeno::INode {
+    void apply() override {
+        using namespace zs;
+        auto zsvolume = get_input<ZenoParticles>("zsvolume");
+        auto tag = get_input2<std::string>("interpolator_name");
+        // Only the interpolator buffer is read. The former "<tag>_topo" lookup was unused, and the code that would create that buffer is commented out.
+        const auto& bcw = (*zsvolume)[tag].clone({zs::memsrc_e::host}); // host-side copy; the temporary lives as long as this const reference
+
+        auto bcw_vis = std::make_shared<zeno::PrimitiveObject>();
+        bcw_vis->resize(bcw.size());
+        auto& bcw_X = bcw_vis->verts;
+        auto& bcw_cnorm = bcw_vis->add_attr<float>("cnorm");
+        auto& bcw_strength = bcw_vis->add_attr<float>("strength");
+
+        auto ompPol = omp_exec();
+        constexpr auto omp_space = execspace_e::openmp;
+        ompPol(zs::range(bcw.size()),
+            [&bcw_X,&bcw_cnorm,&bcw_strength,bcw = proxy<omp_space>({},bcw)] (int vi) mutable {
+                bcw_X[vi] = bcw.pack(dim_c<3>,"X",vi).to_array(); // one output vertex per interpolator entry
+                bcw_cnorm[vi] = bcw("cnorm",vi);
+                bcw_strength[vi] = bcw("strength",vi);
+        });
 
+        set_output("bcw_vis",std::move(bcw_vis));
+    }
+};
+
+ZENDEFNODE(VisualizeInterpolator, {{{"interpolator","zsvolume"},{"string","interpolator_name","skin"}},
+                            {{"visual bcw", "bcw_vis"}},
+                            {},
+                            {"ZSGeometry"}});
 
 struct ZSSampleEmbedVectorField : zeno::INode {
     void apply() override {
@@ -322,7 +447,7 @@ struct ZSInterpolateEmbedAttr : zeno::INode {
 
         auto srcAttr = get_param<std::string>("srcAttr");
         auto dstAttr = get_param<std::string>("dstAttr");
-        auto bcw_tag = get_param<std::string>("bcw_tag");
+        auto bcw_tag = get_input2<std::string>("bcw_tag");
         auto strategy = get_param<std::string>("strategy");
         const auto& bcw = (*source)[bcw_tag];
         auto& dest_pars = dest->getParticles();
@@ -340,20 +465,20 @@ struct ZSInterpolateEmbedAttr : zeno::INode {
                 fmt::print("the source have no {} channel\n",srcAttr);
                 throw std::runtime_error("the source have no specified channel");
             }           
-            if(topo.getPropertySize("inds") != 4) {
+            if(topo.getChannelSize("inds") != 4) {
                 fmt::print("only support tetrahedra mesh as source\n");
                 throw std::runtime_error("only support tetrahedra mesh as source");
             }
-            if(dest_pars.hasProperty(dstAttr) && dest_pars.getPropertySize(dstAttr) != source_pars.getPropertySize(srcAttr)){
+            if(dest_pars.hasProperty(dstAttr) && dest_pars.getChannelSize(dstAttr) != source_pars.getChannelSize(srcAttr)){
                 fmt::print("the dest attr_{} and source attr_{} not match in size\n",dstAttr,srcAttr);
                 throw std::runtime_error("the dest attr and source attr not match in size");
             }
 
-            if(source_pars.getPropertySize(srcAttr) == 1)
+            if(source_pars.getChannelSize(srcAttr) == 1)
                 interpolate_p2p_imp<1>(srcAttr,dstAttr,source_pars,dest_pars,topo,bcw);
-            if(source_pars.getPropertySize(srcAttr) == 2)
+            if(source_pars.getChannelSize(srcAttr) == 2)
                 interpolate_p2p_imp<2>(srcAttr,dstAttr,source_pars,dest_pars,topo,bcw);
-            if(source_pars.getPropertySize(srcAttr) == 3)
+            if(source_pars.getChannelSize(srcAttr) == 3)
                 interpolate_p2p_imp<3>(srcAttr,dstAttr,source_pars,dest_pars,topo,bcw);
         }else if(strategy == "q2p") {
             const auto& source_quads = source->getQuadraturePoints();
@@ -361,16 +486,16 @@ struct ZSInterpolateEmbedAttr : zeno::INode {
                 fmt::print("the source have no {} channel\n",srcAttr);
                 throw std::runtime_error("the source have no specified channel");
             }    
-            if(dest_pars.hasProperty(dstAttr) && dest_pars.getPropertySize(dstAttr) != source_quads.getPropertySize(srcAttr)){
+            if(dest_pars.hasProperty(dstAttr) && dest_pars.getChannelSize(dstAttr) != source_quads.getChannelSize(srcAttr)){
                 fmt::print("the dest attr_{} and source attr_{} not match in size\n",dstAttr,srcAttr);
                 throw std::runtime_error("the dest attr and source attr not match in size");
             }
 
-            if(source_quads.getPropertySize(srcAttr) == 1)
+            if(source_quads.getChannelSize(srcAttr) == 1)
                 interpolate_q2p_imp<1>(srcAttr,dstAttr,source_quads,dest_pars,bcw);
-            if(source_quads.getPropertySize(srcAttr) == 2)
+            if(source_quads.getChannelSize(srcAttr) == 2)
                 interpolate_q2p_imp<2>(srcAttr,dstAttr,source_quads,dest_pars,bcw);
-            if(source_quads.getPropertySize(srcAttr) == 3)
+            if(source_quads.getChannelSize(srcAttr) == 3)
                 interpolate_q2p_imp<3>(srcAttr,dstAttr,source_quads,dest_pars,bcw);
         }
         set_output("dest",dest);
@@ -378,12 +503,11 @@ struct ZSInterpolateEmbedAttr : zeno::INode {
 };
 
 
-ZENDEFNODE(ZSInterpolateEmbedAttr, {{{"source"}, {"dest"}},
+ZENDEFNODE(ZSInterpolateEmbedAttr, {{{"source"}, {"dest"},{"string","bcw_tag","skin_bw"}},
                             {{"dest"}},
                             {
                                 {"string","srcAttr","x"},
                                 {"string","dstAttr","x"},
-                                {"string","bcw_tag","skin_bw"},
                                 {"enum p2p q2p","strategy","p2p"}
 
                             },
@@ -484,12 +608,12 @@ struct ZSInterpolateEmbedPrim : zeno::INode {
                         auto idx = inds[i];
                         everts.tuple<3>(outAttr,vi) = everts.pack<3>(outAttr,vi) + w[i] * verts.pack<3>(inAttr, idx);
                     }
-#if 0
-                    if(vi == 100){
-                        auto vert = everts.pack<3>(outAttr,vi);
-                        printf("V<%d>->E<%d>(%f,%f,%f,%f) :\t%f\t%f\t%f\n",vi,ei,w[0],w[1],w[2],w[3],vert[0],vert[1],vert[2]);
-                    }
-#endif
+// #if 0
+//                     if(vi == 100){
+//                         auto vert = everts.pack<3>(outAttr,vi);
+//                         printf("V<%d>->E<%d>(%f,%f,%f,%f) :\t%f\t%f\t%f\n",vi,ei,w[0],w[1],w[2],w[3],vert[0],vert[1],vert[2]);
+//                     }
+// #endif
 
                 // }
         });
diff --git a/projects/CuLagrange/geometry/BiharmonicBoundedWeight.cu b/projects/CuLagrange/geometry/BiharmonicBoundedWeight.cu
index 47d30a4a98..2b95c3f952 100644
--- a/projects/CuLagrange/geometry/BiharmonicBoundedWeight.cu
+++ b/projects/CuLagrange/geometry/BiharmonicBoundedWeight.cu
@@ -13,7 +13,7 @@
 #include <zeno/types/PrimitiveObject.h>
 #include <zeno/types/StringObject.h>
 
-#include "kernel/laplace_matrix.hpp"
+#include "kernel/laplacian.hpp"
 #include "linear_system/active_set.hpp"
 
 namespace zeno {
diff --git a/projects/CuLagrange/geometry/CollisionVis.cu b/projects/CuLagrange/geometry/CollisionVis.cu
index b56478057a..dffb0801e4 100644
--- a/projects/CuLagrange/geometry/CollisionVis.cu
+++ b/projects/CuLagrange/geometry/CollisionVis.cu
@@ -7,7 +7,6 @@
 #include <zeno/types/NumericObject.h>
 #include <zeno/types/StringObject.h>
 
-#include "TopoUtils.hpp"
 
 #include "zensim/omp/execution/ExecutionPolicy.hpp"
 #include "kernel/calculate_facet_normal.hpp"
@@ -15,6 +14,7 @@
 #include "kernel/compute_characteristic_length.hpp"
 #include "kernel/calculate_bisector_normal.hpp"
 #include "kernel/tiled_vector_ops.hpp"
+#include "kernel/calculate_edge_normal.hpp"
 
 #include "../fem/collision_energy/evaluate_collision.hpp"
 
@@ -34,6 +34,7 @@ namespace zeno {
     using mat3 = zs::vec<T,3,3>;
     using mat4 = zs::vec<T,4,4>;
     // using vec2i = zs::vec<int,2>;
+    // using vec2i = zs::vec<int,2>;
     // using vec3i = zs::vec<int,3>;
     // using vec4i = zs::vec<int,4>;
 
@@ -42,6 +43,365 @@ namespace zeno {
     // TODO: build a half edge structure
     struct ZSInitSurfaceTopoConnect : INode {
 
+        // void compute_surface_neighbors(zs::CudaExecutionPolicy &pol, typename ZenoParticles::particles_t &sfs,
+        //                             typename ZenoParticles::particles_t &ses, typename ZenoParticles::particles_t &svs) {
+        //     using namespace zs;
+        //     constexpr auto space = execspace_e::cuda;
+        //     using vec2i = zs::vec<int, 2>;
+        //     using vec3i = zs::vec<int, 3>;
+        //     sfs.append_channels(pol, {{"ff_inds", 3}, {"fe_inds", 3}, {"fp_inds", 3}});
+        //     ses.append_channels(pol, {{"fe_inds", 2},{"ep_inds",2}});
+
+        //     fmt::print("sfs size: {}, ses size: {}, svs size: {}\n", sfs.size(), ses.size(), svs.size());
+
+        //     bcht<vec2i, int, true, universal_hash<vec2i>, 32> etab{sfs.get_allocator(), sfs.size() * 3};
+        //     Vector<int> sfi{sfs.get_allocator(), sfs.size() * 3}; // surftri indices corresponding to edges in the table
+
+        //     bcht<int,int,true, universal_hash<int>,32> ptab(svs.get_allocator(),svs.size());
+        //     Vector<int> spi{svs.get_allocator(),svs.size()};
+
+        //     /// @brief compute hash table
+        //     {
+        //         // compute directed edge to triangle idx hash table
+        //         pol(range(sfs.size()), [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs),
+        //                                 sfi = proxy<space>(sfi)] __device__(int ti) mutable {
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = etab.insert(vec2i{tri[i], tri[(i + 1) % 3]}); no >= 0) {
+        //                     sfi[no] = ti;
+        //                 } else {
+        //                     auto oti = sfi[etab.query(vec2i{tri[i], tri[(i + 1) % 3]})];
+        //                     auto otri = sfs.pack(dim_c<3>, "inds", oti).reinterpret_bits(int_c);
+        //                     printf("the same directed edge <%d, %d> has been inserted twice! original sfi %d <%d, %d, %d>, cur "
+        //                         "%d <%d, %d, %d>\n",
+        //                         tri[i], tri[(i + 1) % 3], oti, otri[0], otri[1], otri[2], ti, tri[0], tri[1], tri[2]);
+        //                 }
+        //         });
+        //         // // compute surface point to vert hash table
+        //         // pol(range(svs.size()),[ptab = proxy<space>(ptab),svs = proxy<space>({},svs),
+        //         //     spi = proxy<space>(spi)] __device__(int pi) mutable {
+        //         //         auto pidx = reinterpret_bits<int>(svs("inds",pi));
+        //         //         if(auto no = ptab.insert(pidx); no >= 0)
+        //         //             spi[no] = pi;
+        //         //         else {
+        //         //             auto opi = spi[ptab.query(pidx)];
+        //         //             auto opidx = reinterpret_bits<int>(svs("inds",opi));
+        //         //             printf("the same surface point <%d> has been inserted twice! origin svi %d <%d>, cur "
+        //         //                 "%d <%d>\n",
+        //         //                 pidx,opi,opidx,pi,pidx);
+        //         //         }
+        //         // });
+        //     }
+        //     /// @brief compute ep neighbors
+        //     // {
+        //     //     pol(range(ses.size()),[ptab = proxy<space>(ptab),ses = proxy<space>({},ses),
+        //     //         svs = proxy<space>({},svs),spi = proxy<space>(spi)] __device__(int ei) mutable {
+        //     //             auto neighpIds = vec2i::uniform(-1);
+        //     //             auto edge = ses.pack(dim_c<2>,"inds",ei).reinterpret_bits(int_c);
+        //     //             for(int i = 0;i != 2;++i)
+        //     //                 if(auto no = ptab.query(edge[i]);no >= 0) {
+        //     //                     neighpIds[i] = spi[no];
+        //     //                 }
+        //     //             ses.tuple(dim_c<2>,"ep_inds",ei) = neighpIds.reinterpret_bits(float_c);
+        //     //     });
+        //     // } 
+
+        //     /// @brief compute ff neighbors
+        //     {
+        //         pol(range(sfs.size()), [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs),
+        //                                 sfi = proxy<space>(sfi)] __device__(int ti) mutable {
+        //             auto neighborIds = vec3i::uniform(-1);
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = etab.query(vec2i{tri[(i + 1) % 3], tri[i]}); no >= 0) {
+        //                     neighborIds[i] = sfi[no];
+        //                 }
+        //             sfs.tuple(dim_c<3>, "ff_inds", ti) = neighborIds.reinterpret_bits(float_c);
+        //         });
+        //     }
+        //     /// @brief compute fe neighbors
+        //     {
+        //         auto sfindsOffset = sfs.getPropertyOffset("inds");
+        //         auto sfFeIndsOffset = sfs.getPropertyOffset("fe_inds");
+        //         auto seFeIndsOffset = ses.getPropertyOffset("fe_inds");
+        //         pol(range(ses.size()),
+        //             [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs), ses = proxy<space>({}, ses),
+        //             sfi = proxy<space>(sfi), sfindsOffset, sfFeIndsOffset, seFeIndsOffset] __device__(int li) mutable {
+        //                 auto findLineIdInTri = [](const auto &tri, int v0, int v1) -> int {
+        //                     for (int loc = 0; loc < 3; ++loc)
+        //                         if (tri[loc] == v0 && tri[(loc + 1) % 3] == v1)
+        //                             return loc;
+        //                     return -1;
+        //                 };
+        //                 auto neighborTris = vec2i::uniform(-1);
+        //                 auto line = ses.pack(dim_c<2>, "inds", li).reinterpret_bits(int_c);
+
+        //                 {
+        //                     if (auto no = etab.query(line); no >= 0) {
+        //                         // tri
+        //                         auto triNo = sfi[no];
+        //                         auto tri = sfs.pack(dim_c<3>, sfindsOffset, triNo).reinterpret_bits(int_c);
+        //                         auto loc = findLineIdInTri(tri, line[0], line[1]);
+        //                         if (loc == -1) {
+        //                             printf("ridiculous, this edge <%d, %d> does not belong to tri <%d, %d, %d>\n", line[0],
+        //                                 line[1], tri[0], tri[1], tri[2]);
+        //                             return;
+        //                         }
+        //                         sfs(sfFeIndsOffset + loc, triNo) = li;
+        //                         // edge
+        //                         neighborTris[0] = triNo;
+        //                     }
+        //                 }
+        //                 vec2i rline{line[1], line[0]};
+        //                 {
+        //                     if (auto no = etab.query(rline); no >= 0) {
+        //                         // tri
+        //                         auto triNo = sfi[no];
+        //                         auto tri = sfs.pack(dim_c<3>, sfindsOffset, triNo).reinterpret_bits(int_c);
+        //                         auto loc = findLineIdInTri(tri, rline[0], rline[1]);
+        //                         if (loc == -1) {
+        //                             printf("ridiculous, this edge <%d, %d> does not belong to tri <%d, %d, %d>\n", rline[0],
+        //                                 rline[1], tri[0], tri[1], tri[2]);
+        //                             return;
+        //                         }
+        //                         sfs(sfFeIndsOffset + loc, triNo) = li;
+        //                         // edge
+        //                         neighborTris[1] = triNo;
+        //                     }
+        //                 }
+        //                 ses.tuple(dim_c<2>, seFeIndsOffset, li) = neighborTris.reinterpret_bits(float_c);
+        //             });
+        //     }
+        //     /// @brief compute fp neighbors
+        //     /// @note  surface vertex index is not necessarily consecutive, thus hashing
+        //     {
+        //         bcht<int, int, true, universal_hash<int>, 32> vtab{svs.get_allocator(), svs.size()};
+        //         Vector<int> svi{etab.get_allocator(), svs.size()}; // svs row indices corresponding to the vertex ids stored in vtab
+        //         // svs
+        //         pol(range(svs.size()), [vtab = proxy<space>(vtab), svs = proxy<space>({}, svs),
+        //                                 svi = proxy<space>(svi)] __device__(int vi) mutable {
+        //             int vert = reinterpret_bits<int>(svs("inds", vi));
+        //             if (auto no = vtab.insert(vert); no >= 0)
+        //                 svi[no] = vi;
+        //         });
+        //         //
+        //         pol(range(sfs.size()), [vtab = proxy<space>(vtab), sfs = proxy<space>({}, sfs),
+        //                                 svi = proxy<space>(svi)] __device__(int ti) mutable {
+        //             auto neighborIds = vec3i::uniform(-1);
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = vtab.query(tri[i]); no >= 0) {
+        //                     neighborIds[i] = svi[no];
+        //                 }
+        //             sfs.tuple(dim_c<3>, "fp_inds", ti) = neighborIds.reinterpret_bits(float_c);
+        //         });
+        //     }
+        // }
+
+
+        // void compute_surface_neighbors(zs::CudaExecutionPolicy &pol, ZenoParticles::particles_t &sfs,
+        //                             ZenoParticles::particles_t &ses, ZenoParticles::particles_t &svs) {
+        //     using namespace zs;
+        //     constexpr auto space = execspace_e::cuda;
+        //     using vec2i = zs::vec<int, 2>;
+        //     using vec3i = zs::vec<int, 3>;
+        //     sfs.append_channels(pol, {{"ff_inds", 3}, {"fe_inds", 3}, {"fp_inds", 3}});
+        //     ses.append_channels(pol, {{"fe_inds", 2},{"ep_inds",2}});
+
+        //     // fmt::print("sfs size: {}, ses size: {}, svs size: {}\n", sfs.size(), ses.size(), svs.size());
+
+        //     bcht<vec2i, int, true, universal_hash<vec2i>, 32> etab{sfs.get_allocator(), sfs.size() * 3};
+        //     Vector<int> sfi{sfs.get_allocator(), sfs.size() * 3}; // surftri indices corresponding to edges in the table
+        //     bcht<int,int,true, universal_hash<int>,32> ptab(svs.get_allocator(),svs.size());
+        //     Vector<int> spi{svs.get_allocator(),svs.size()};
+
+        //     pol(range(sfi.size()),
+        //         [sfi = proxy<space>(sfi)] __device__(int i) mutable {
+        //             sfi[i] = -1;
+        //     });
+
+        //     /// @brief build the hash tables (directed edge -> triangle index, surface point id -> svs row index)
+        //     {
+        //         pol(range(sfs.size()), [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs),
+        //                                 sfi = proxy<space>(sfi)] __device__(int ti) mutable {
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = etab.insert(vec2i{tri[i], tri[(i + 1) % 3]}); no >= 0) {
+        //                     sfi[no] = ti;
+        //                 } else {
+        //                     int pid = etab.query(vec2i{tri[i], tri[(i + 1) % 3]});
+        //                     int oti = sfi[pid];
+        //                     // auto otri = sfs.pack(dim_c<3>, "inds", oti).reinterpret_bits(int_c);
+        //                     printf("the same directed edge <%d, %d> has been inserted twice! original sfi[%d,%d]= %d, cur "
+        //                         "%d <%d, %d, %d>\n",
+        //                         tri[i], tri[(i + 1) % 3],no , pid, oti, ti, tri[0], tri[1], tri[2]);
+        //                 }
+        //         });
+
+        //         std::cout << "output svs's channel and channel size :" << std::endl;
+        //         for(auto tag : svs.getPropertyTags()) {
+        //             std::cout << tag.name << "\t:\t" << tag.numChannels << std::endl; 
+        //         }
+
+        //         // if(svs.hasProperty("inds"))
+        //         //     fmt::print(fg(fmt::color::red),"svs has \"inds\" channel\n");
+        //         auto svsIndsOffset = svs.getPropertyOffset("inds");
+        //         // std::cout << "svdIndsOffset : " << svsIndsOffset << std::endl;
+        //         pol(range(spi.size()),
+        //             [spi = proxy<space>(spi)] ZS_LAMBDA(int pi) mutable {
+        //                 spi[pi] = -1;
+        //         });
+        //         pol(range(svs.size()),[ptab = proxy<space>(ptab),svs = proxy<space>({},svs,"filling_in_ptab"),
+        //             spi = proxy<space>(spi),svsIndsOffset] __device__(int pi) mutable {
+        //                 // auto numChannels = svs.propertySize("inds");
+        //                 // if(pi == 0){
+        //                     // printf("svdInds[\"inds\"][%d] : %d %d\n",(int)numChannels,(int)svsIndsOffset,(int)pi);
+
+        //                 auto pidx = reinterpret_bits<int>(svs("inds",pi));
+
+        //                 // }
+        //                 // auto no = ptab.insert(pidx);
+        //                 // if(no >=0 && no >= spi.size())
+        //                 //     printf("ptab overflow %d %d %d\n",(int)pidx,(int)no,(int)spi.size());
+        //                 // if(no < 0)
+        //                 //     printf("negative ptab : %d\n",(int)no);
+        //                 // auto no = ptab.insert(pidx);
+        //                 // duplicate of pi and inds
+        //                 if(auto no = ptab.insert(pidx);no >= 0)
+        //                     spi[no] = pi;
+        //                 else {
+        //                     // printf("invalid ptab insertion\n");
+        //                     auto opi = spi[ptab.query(pidx)];
+        //                     auto opidx = reinterpret_bits<int>(svs(svsIndsOffset,opi));
+        //                     printf("the same surface point <%d> has been inserted twice! origin svi %d <%d>, cur "
+        //                         "%d <%d>\n",
+        //                         pidx,opi,opidx,pi,pidx);
+        //                 }
+        //         });
+
+        //         pol(range(spi.size()),
+        //             [spi = proxy<space>(spi)] ZS_LAMBDA(int pi) mutable {
+        //                 if(spi[pi] < 0)
+        //                     printf("invalid spi[%d] = %d\n",pi,spi[pi]);
+        //         });
+
+
+        //     }
+        //     /// @brief compute ep neighbors
+        //     {
+        //         if(!ses.hasProperty("inds") || ses.getChannelSize("inds") != 2)
+        //             throw std::runtime_error("ses has no valid inds");
+
+        //         if(!ses.hasProperty("ep_inds") || ses.getChannelSize("ep_inds") != 2)
+        //             throw std::runtime_error("ses has no valid ep_inds");
+        //         pol(range(ses.size()),[ptab = proxy<space>(ptab),ses = proxy<space>({},ses,"ses:retrieve_inds_set_ep_inds"),
+        //             svs = proxy<space>({},svs),spi = proxy<space>(spi)] __device__(int ei) mutable {
+        //                 auto neighpIds = vec2i::uniform(-1);
+        //                 auto edge = ses.pack(dim_c<2>,"inds",ei).reinterpret_bits(int_c);
+        //                 for(int i = 0;i != 2;++i){
+        //                     if(auto no = ptab.query(edge[i]);no >= 0) {
+        //                         neighpIds[i] = spi[no];
+        //                     }
+        //                 }
+        //                 ses.tuple(dim_c<2>,"ep_inds",ei) = neighpIds.reinterpret_bits(float_c);
+        //         });
+        //     }
+
+        //     /// @brief compute ff neighbors
+        //     {
+
+        //         pol(range(sfs.size()), [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs),
+        //                                 sfi = proxy<space>(sfi)] __device__(int ti) mutable {
+        //             auto neighborIds = vec3i::uniform(-1);
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = etab.query(vec2i{tri[(i + 1) % 3], tri[i]}); no >= 0) {
+        //                     neighborIds[i] = sfi[no];
+        //                 }
+        //             sfs.tuple(dim_c<3>, "ff_inds", ti) = neighborIds.reinterpret_bits(float_c);
+        //             sfs.tuple(dim_c<3>, "fe_inds", ti) = vec3i::uniform(-1); // default initialization
+        //         });
+        //     }
+        //     /// @brief compute fe neighbors
+        //     {
+        //         auto sfindsOffset = sfs.getPropertyOffset("inds");
+        //         auto sfFeIndsOffset = sfs.getPropertyOffset("fe_inds");
+        //         auto seFeIndsOffset = ses.getPropertyOffset("fe_inds");
+        //         pol(range(ses.size()),
+        //             [etab = proxy<space>(etab), sfs = proxy<space>({}, sfs), ses = proxy<space>({}, ses),
+        //             sfi = proxy<space>(sfi), sfindsOffset, sfFeIndsOffset, seFeIndsOffset] __device__(int li) mutable {
+        //                 auto findLineIdInTri = [](const auto &tri, int v0, int v1) -> int {
+        //                     for (int loc = 0; loc < 3; ++loc)
+        //                         if (tri[loc] == v0 && tri[(loc + 1) % 3] == v1)
+        //                             return loc;
+        //                     return -1;
+        //                 };
+        //                 auto neighborTris = vec2i::uniform(-1);
+        //                 auto line = ses.pack(dim_c<2>, "inds", li).reinterpret_bits(int_c);
+
+        //                 {
+        //                     if (auto no = etab.query(line); no >= 0) {
+        //                         // tri
+        //                         auto triNo = sfi[no];
+        //                         auto tri = sfs.pack(dim_c<3>, sfindsOffset, triNo).reinterpret_bits(int_c);
+        //                         auto loc = findLineIdInTri(tri, line[0], line[1]);
+        //                         if (loc == -1) {
+        //                             printf("ridiculous, this edge <%d, %d> does not belong to tri <%d, %d, %d>\n", line[0],
+        //                                 line[1], tri[0], tri[1], tri[2]);
+        //                         } else {
+        //                             sfs(sfFeIndsOffset + loc, triNo) = reinterpret_bits<float>(li);
+        //                             // edge
+        //                             neighborTris[0] = triNo;
+        //                         }
+        //                     }
+        //                 }
+        //                 vec2i rline{line[1], line[0]};
+        //                 {
+        //                     if (auto no = etab.query(rline); no >= 0) {
+        //                         // tri
+        //                         auto triNo = sfi[no];
+        //                         auto tri = sfs.pack(dim_c<3>, sfindsOffset, triNo).reinterpret_bits(int_c);
+        //                         auto loc = findLineIdInTri(tri, rline[0], rline[1]);
+        //                         if (loc == -1) {
+        //                             printf("ridiculous, this edge <%d, %d> does not belong to tri <%d, %d, %d>\n", rline[0],
+        //                                 rline[1], tri[0], tri[1], tri[2]);
+        //                         } else {
+        //                             sfs(sfFeIndsOffset + loc, triNo) = reinterpret_bits<float>(li);
+        //                             // edge
+        //                             neighborTris[1] = triNo;
+        //                         }
+        //                     }
+        //                 }
+        //                 ses.tuple(dim_c<2>, seFeIndsOffset, li) = neighborTris.reinterpret_bits(float_c);
+        //             });
+        //     }
+        //     /// @brief compute fp neighbors
+        //     /// @note  surface vertex index is not necessarily consecutive, thus hashing
+        //     {
+        //         bcht<int, int, true, universal_hash<int>, 32> vtab{svs.get_allocator(), svs.size()};
+        //         Vector<int> svi{etab.get_allocator(), svs.size()}; // svs row indices corresponding to the vertex ids stored in vtab
+        //         // svs
+        //         pol(range(svs.size()), [vtab = proxy<space>(vtab), svs = proxy<space>({}, svs),
+        //                                 svi = proxy<space>(svi)] __device__(int vi) mutable {
+        //             int vert = reinterpret_bits<int>(svs("inds", vi));
+        //             if (auto no = vtab.insert(vert); no >= 0)
+        //                 svi[no] = vi;
+        //         });
+        //         //
+        //         pol(range(sfs.size()), [vtab = proxy<space>(vtab), sfs = proxy<space>({}, sfs),
+        //                                 svi = proxy<space>(svi)] __device__(int ti) mutable {
+        //             auto neighborIds = vec3i::uniform(-1);
+        //             auto tri = sfs.pack(dim_c<3>, "inds", ti).reinterpret_bits(int_c);
+        //             for (int i = 0; i != 3; ++i)
+        //                 if (auto no = vtab.query(tri[i]); no >= 0) {
+        //                     neighborIds[i] = svi[no];
+        //                 }
+        //             sfs.tuple(dim_c<3>, "fp_inds", ti) = neighborIds.reinterpret_bits(float_c);
+        //         });
+        //     }
+        // }
+
+
         void apply() override {
             using namespace zs;
 
@@ -56,15 +416,16 @@ namespace zeno {
             auto& tris  = (*surf)[ZenoParticles::s_surfTriTag];
             auto& lines = (*surf)[ZenoParticles::s_surfEdgeTag];
             auto& points = (*surf)[ZenoParticles::s_surfVertTag];
+            auto& tets = surf->getQuadraturePoints();
 
-            if(!tris.hasProperty("inds") || tris.getPropertySize("inds") != 3){
+            if(!tris.hasProperty("inds") || tris.getChannelSize("inds") != 3){
                 throw std::runtime_error("the tris has no inds channel");
             }
 
-            if(!lines.hasProperty("inds") || lines.getPropertySize("inds") != 2) {
+            if(!lines.hasProperty("inds") || lines.getChannelSize("inds") != 2) {
                 throw std::runtime_error("the line has no inds channel");
             }
-            if(!points.hasProperty("inds") || points.getPropertySize("inds") != 1) {
+            if(!points.hasProperty("inds") || points.getChannelSize("inds") != 1) {
                 throw std::runtime_error("the point has no inds channel");
             }
 
@@ -79,28 +440,64 @@ namespace zeno {
             //             printf("line[%d] : %d %d\n",(int)li,(int)inds[0],(int)inds[1]);
             // });
 
-            auto bvh_thickness = (T)3 * compute_average_edge_length(cudaExec,verts,"x",tris);
+#if 1
 
-            // std::cout << "bvh_thickness : " << bvh_thickness << std::endl;
+            auto bvh_thickness = (T)2 * compute_average_edge_length(cudaExec,verts,"x",tris);
+
+            // std::cout << "bvh_thickness : " << bvh_thickness << std::endl;
 
-            // compute_surface_neighbors(cudaExec, tris, lines, points);
-#if 1
             tris.append_channels(cudaExec,{{"ff_inds",3},{"fe_inds",3},{"fp_inds",3}});
-            lines.append_channels(cudaExec,{{"fe_inds",2}});
+            lines.append_channels(cudaExec,{{"fe_inds",2},{"ep_inds",2}});
+            if(tets.getChannelSize("inds") == 4){
+                tris.append_channels(cudaExec,{{"ft_inds",1}});
+                if(!compute_ft_neigh_topo(cudaExec,verts,tris,tets,"ft_inds",bvh_thickness))
+                    throw std::runtime_error("ZSInitTopoConnect::compute_face_tet_neigh_topo fail");
+            }
             if(!compute_ff_neigh_topo(cudaExec,verts,tris,"ff_inds",bvh_thickness))
                 throw std::runtime_error("ZSInitTopoConnect::compute_face_neigh_topo fail");
             if(!compute_fe_neigh_topo(cudaExec,verts,lines,tris,"fe_inds",bvh_thickness))
                 throw std::runtime_error("ZSInitTopoConnect::compute_face_neigh_topo fail");
             if(!compute_fp_neigh_topo(cudaExec,verts,points,tris,"fp_inds",bvh_thickness))
                 throw std::runtime_error("ZSInitTopoConnect::compute_face_point_neigh_topo fail");
+#else
+            compute_surface_neighbors(cudaExec,tris,lines,points);
 #endif
+            auto fbuffer = typename ZenoParticles::particles_t({{"non_manifold",1},{"inds",3}},tris.size(),zs::memsrc_e::device,0);
+            auto vbuffer = typename ZenoParticles::particles_t({{"x",3}},verts.size(),zs::memsrc_e::device,0); 
+            TILEVEC_OPS::copy(cudaExec,tris,"non_manifold",fbuffer,"non_manifold");
+            TILEVEC_OPS::copy(cudaExec,tris,"inds",fbuffer,"inds");
+            TILEVEC_OPS::copy(cudaExec,verts,"x",vbuffer,"x");
+
+            fbuffer = fbuffer.clone({zs::memsrc_e::host});
+            vbuffer = vbuffer.clone({zs::memsrc_e::host});
+
+            constexpr auto omp_space = execspace_e::openmp;
+            auto ompPol = omp_exec();
+
+            auto nmf_prim = std::make_shared<zeno::PrimitiveObject>();
+            auto& nmf_verts = nmf_prim->verts;
+            nmf_verts.resize(tris.size() * 3);
+            auto& nmf_tris = nmf_prim->tris;
+            nmf_tris.resize(tris.size());
+            ompPol(range(nmf_tris.size()),
+                [&nmf_tris,&nmf_verts,fbuffer = proxy<omp_space>({},fbuffer),vbuffer = proxy<omp_space>({},vbuffer)] (int ti) mutable {
+                    auto inds = fbuffer.pack(dim_c<3>,"inds",ti).reinterpret_bits(int_c);
+                    for(int i = 0;i != 3;++i)
+                        nmf_verts[ti * 3 + i] = vbuffer.pack(dim_c<3>,"x",inds[i]).to_array();
+                    if(fbuffer("non_manifold",ti) > 0) {
+                        nmf_tris[ti] = zeno::vec3i(ti * 3 + 0,ti * 3 + 1,ti * 3 + 2);
+                    }else{
+                        nmf_tris[ti] = zeno::vec3i(0,0,0);
+                    }
+            });
+            set_output("non_manifold_facets",std::move(nmf_prim));
 
             set_output("zssurf",surf);
         }
     };
 
     ZENDEFNODE(ZSInitSurfaceTopoConnect, {{{"zssurf"}},
-                                {{"zssurf"}},
+                                {{"zssurf"},{"non_manifold_facets"}},
                                 {},
                                 {"ZSGeometry"}});
 
@@ -108,7 +505,7 @@ namespace zeno {
     template<typename VTILEVEC> 
     constexpr vec3 eval_center(const VTILEVEC& verts,const zs::vec<int,4>& tet) {
         auto res = vec3::zeros();
-        for(int i = 0;i < 4;++i)
+        for(int i = 0;i != 4;++i)
             res += verts.template pack<3>("x",tet[i]) / (T)4.0;
         return res;
     } 
@@ -144,6 +541,7 @@ namespace zeno {
                 throw std::runtime_error("the input zsparticles has no surface points");
             }
 
+            const auto& tets = zsparticles->getQuadraturePoints();
             auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
             auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
             auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
@@ -164,11 +562,14 @@ namespace zeno {
             std::vector<zs::PropertyTag> tags{{"x",3}};
 
             int nm_tris = tris.size();
+            int nm_lines = lines.size();
 
             // output ff topo first
             auto ff_topo = typename ZenoParticles::particles_t(tags,nm_tris * 4,zs::memsrc_e::device,0);
             auto fe_topo = typename ZenoParticles::particles_t(tags,nm_tris * 4,zs::memsrc_e::device,0);
             auto fp_topo = typename ZenoParticles::particles_t(tags,nm_tris * 4,zs::memsrc_e::device,0);
+            auto ep_topo = typename ZenoParticles::particles_t(tags,nm_lines * 2,zs::memsrc_e::device,0);
+            auto ft_topo = typename ZenoParticles::particles_t(tags,nm_tris * 2,zs::memsrc_e::device,0);
 
             // transfer the data from gpu to cpu
             constexpr auto cuda_space = execspace_e::cuda;
@@ -177,11 +578,15 @@ namespace zeno {
                 [ff_topo = proxy<cuda_space>({},ff_topo),
                     fe_topo = proxy<cuda_space>({},fe_topo),
                     fp_topo = proxy<cuda_space>({},fp_topo),
+                    ft_topo = proxy<cuda_space>({},ft_topo),
+                    tets = proxy<cuda_space>({},tets),
                     tris = proxy<cuda_space>({},tris),
                     lines = proxy<cuda_space>({},lines),
                     points = proxy<cuda_space>({},points),
                     verts = proxy<cuda_space>({},verts)] ZS_LAMBDA(int ti) mutable {
                         auto tri = tris.template pack<3>("inds",ti).reinterpret_bits(int_c);
+                        auto tet_id = reinterpret_bits<int>(tris("ft_inds",ti));
+                        auto tet = tets.template pack<4>("inds",tet_id).reinterpret_bits(int_c);
                         auto ff_inds = tris.template pack<3>("ff_inds",ti).reinterpret_bits(int_c);
                         auto fe_inds = tris.template pack<3>("fe_inds",ti).reinterpret_bits(int_c);
                         auto fp_inds = tris.template pack<3>("fp_inds",ti).reinterpret_bits(int_c);
@@ -190,6 +595,11 @@ namespace zeno {
                         ff_topo.template tuple<3>("x",ti * 4 + 0) = center;
                         fe_topo.template tuple<3>("x",ti * 4 + 0) = center;
                         fp_topo.template tuple<3>("x",ti * 4 + 0) = center;
+                        auto tcenter = eval_center(verts,tet);
+
+                        ft_topo.template tuple<3>("x",ti * 2 + 0) = center;
+                        ft_topo.template tuple<3>("x",ti * 2 + 1) = tcenter;
+
                         for(int i = 0;i != 3;++i) {
                             auto nti = ff_inds[i];
                             auto ntri = tris.template pack<3>("inds",nti).reinterpret_bits(int_c);
@@ -208,13 +618,30 @@ namespace zeno {
 
             });   
 
+            cudaPol(zs::range(nm_lines),
+                [ep_topo = proxy<cuda_space>({},ep_topo),
+                    verts = proxy<cuda_space>({},verts),
+                    points = proxy<cuda_space>({},points),
+                    lines = proxy<cuda_space>({},lines)] ZS_LAMBDA(int li) mutable {
+                        auto ep_inds = lines.template pack<2>("ep_inds",li).reinterpret_bits(int_c);
+                        for(int i = 0;i != 2;++i) {
+                            auto pidx = ep_inds[i];
+                            auto vidx = reinterpret_bits<int>(points("inds",pidx));
+                            ep_topo.template tuple<3>("x",li * 2 + i) = verts.template pack<3>("x",vidx);
+                        }
+            });
+
             ff_topo = ff_topo.clone({zs::memsrc_e::host});
             fe_topo = fe_topo.clone({zs::memsrc_e::host});
             fp_topo = fp_topo.clone({zs::memsrc_e::host});
+            ep_topo = ep_topo.clone({zs::memsrc_e::host});
+            ft_topo = ft_topo.clone({zs::memsrc_e::host});
 
             int ff_size = ff_topo.size();
             int fe_size = fe_topo.size();
             int fp_size = fp_topo.size();
+            int ep_size = ep_topo.size();
+            int ft_size = ft_topo.size();
 
             constexpr auto omp_space = execspace_e::openmp;
             auto ompPol = omp_exec();
@@ -222,6 +649,8 @@ namespace zeno {
             auto ff_prim = std::make_shared<zeno::PrimitiveObject>();
             auto fe_prim = std::make_shared<zeno::PrimitiveObject>();
             auto fp_prim = std::make_shared<zeno::PrimitiveObject>();
+            auto ep_prim = std::make_shared<zeno::PrimitiveObject>();
+            auto ft_prim = std::make_shared<zeno::PrimitiveObject>();
 
             auto& ff_verts = ff_prim->verts;
             auto& ff_lines = ff_prim->lines;
@@ -232,9 +661,17 @@ namespace zeno {
             auto& fp_verts = fp_prim->verts;
             auto& fp_lines = fp_prim->lines;
 
+            auto& ep_verts = ep_prim->verts;
+            auto& ep_lines = ep_prim->lines;
+
+            auto& ft_verts = ft_prim->verts;
+            auto& ft_lines = ft_prim->lines;
+
             int ff_pair_count = nm_tris * 3;
             int fe_pair_count = nm_tris * 3;
             int fp_pair_count = nm_tris * 3;
+            int ep_pair_count = nm_lines * 1;
+            int ft_pair_count = nm_tris;
 
             ff_verts.resize(ff_size);
             ff_lines.resize(ff_pair_count);
@@ -242,6 +679,18 @@ namespace zeno {
             fe_lines.resize(fe_pair_count);
             fp_verts.resize(fp_size);
             fp_lines.resize(fp_pair_count);
+            ep_verts.resize(ep_size);
+            ep_lines.resize(ep_pair_count);
+            ft_verts.resize(ft_size);
+            ft_lines.resize(ft_pair_count);
+
+            ompPol(zs::range(nm_tris),
+                [&ft_verts,&ft_lines,ft_topo = proxy<omp_space>({},ft_topo)] (int fi) mutable {
+                    ft_verts[fi * 2 + 0] = ft_topo.template pack<3>("x",fi * 2 + 0).to_array();
+                    ft_verts[fi * 2 + 1] = ft_topo.template pack<3>("x",fi * 2 + 1).to_array();
+                    // ft_verts[fi * 2 + 1] = zeno::vec3f(0.0,0.0,0.0);
+                    ft_lines[fi] = zeno::vec2i(fi * 2 + 0,fi * 2 + 1);
+            });
 
             ompPol(zs::range(nm_tris),
                 [&ff_verts,&ff_lines,ff_topo = proxy<omp_space>({},ff_topo)] (int fi) mutable {
@@ -276,17 +725,42 @@ namespace zeno {
                     }
             });
 
+            ompPol(zs::range(nm_lines),
+                [&ep_verts,&ep_lines,ep_topo = proxy<omp_space>({},ep_topo)] (int li) mutable {
+                    for(int i = 0;i != 2;++i)
+                        ep_verts[li * 2 + i] = ep_topo.template pack<3>("x",li * 2 + i).to_array();
+                    ep_lines[li] = zeno::vec2i(li * 2 + 0,li * 2 + 1);
+            });
+
             // for(int i = 0;i < fe_lines.size();++i)
             //     std::cout << "fe_line<" << i << "> : \t" << fe_lines[i][0] << "\t" << fe_lines[i][1] << std::endl;
+            set_output("ft_topo",std::move(ft_prim));
             set_output("fp_topo",std::move(fp_prim));
             set_output("ff_topo",std::move(ff_prim));
             set_output("fe_topo",std::move(fe_prim));
+            set_output("ep_topo",std::move(ep_prim));
         }
     };
 
 
     ZENDEFNODE(VisualizeTopology, {{{"ZSParticles"}},
-                                {{"ff_topo"},{"fe_topo"},{"fp_topo"}},
+                                {{"ft_topo"},{"ff_topo"},{"fe_topo"},{"fp_topo"},{"ep_topo"}}, // output sockets; names must match the set_output(...) calls in apply()
+                                {},
+                                {"ZSGeometry"}});
+
+
+    struct CopyShape : INode { // stores prim2's vertex positions in prim1's "npos" attribute and outputs prim1
+        virtual void apply() override {
+            auto prim1 = get_input<zeno::PrimitiveObject>("prim1");
+            auto prim2 = get_input<zeno::PrimitiveObject>("prim2");
+            if(prim2->verts.size() < prim1->size()) throw std::runtime_error("CopyShape: prim2 has fewer verts than prim1"); // the copy reads prim2->verts[i] for every i < prim1->size()
+            auto& nx = prim1->add_attr<zeno::vec3f>("npos");
+            for(size_t i = 0;i != prim1->size();++i) nx[i] = prim2->verts[i]; // index-wise copy; assumes both prims share vertex ordering - TODO confirm with callers
+            set_output("prim1",prim1);
+        }
+    };
+    ZENDEFNODE(CopyShape, {{{"prim1"},{"prim2"}}, // inputs: prim1 receives the "npos" attribute, prim2 supplies the positions
+                                {{"prim1"}}, // output: the same prim1 object passed to set_output in apply()
                                 {},
                                 {"ZSGeometry"}});
 
@@ -309,7 +783,7 @@ namespace zeno {
             const auto& points  = (*zsparticles)[ZenoParticles::s_surfVertTag];
             const auto& verts = zsparticles->getParticles();
 
-            if(!tris.hasProperty("fp_inds") || tris.getPropertySize("fp_inds") != 3) {
+            if(!tris.hasProperty("fp_inds") || tris.getChannelSize("fp_inds") != 3) {
                 throw std::runtime_error("call ZSInitSurfaceTopology first before VisualizeSurfaceMesh");
             }
 
@@ -466,10 +940,13 @@ namespace zeno {
             // std::cout << "CALCULATE SURFACE NORMAL" << std::endl;
 
             if(!calculate_facet_normal(cudaExec,verts,"x",tris,tris,"nrm"))
-                throw std::runtime_error("ZSCalNormal::calculate_facet_normal fail"); 
+                throw std::runtime_error("VisualizeSurfaceEdgeNormal::calculate_facet_normal fail"); 
 
 
             auto buffer = typename ZenoParticles::particles_t({{"nrm",3},{"x",3}},lines.size(),zs::memsrc_e::device,0);  
+            if(!calculate_edge_normal_from_facet_normal(cudaExec,tris,"nrm",buffer,"nrm",lines))
+                throw std::runtime_error("VisualizeSurfaceEdgeNormal::calculate_edge_normal_from_facet_normal fail");
+
 
             cudaExec(zs::range(lines.size()),[
                     buffer = proxy<space>({},buffer),
@@ -485,7 +962,7 @@ namespace zeno {
                         auto v0 = verts.template pack<3>("x",linds[0]);
                         auto v1 = verts.template pack<3>("x",linds[1]);
 
-                        buffer.template tuple<3>("nrm",ei) = (n0 + n1).normalized();
+                        // buffer.template tuple<3>("nrm",ei) = (n0 + n1).normalized();
                         // buffer.template tuple<3>("nrm",ei) = lines.template pack<3>("nrm",ei);
                         buffer.template tuple<3>("x",ei) = (v0 + v1) / (T)2.0;
             }); 
@@ -579,16 +1056,13 @@ namespace zeno {
 
             //             lines.template tuple<3>(ceNrmTag,ei) = e10.cross(ne).normalized();
             // });
-#if 1
-            update_surface_cell_normals(cudaExec, const_cast<ZenoParticles::particles_t&>(verts), "x", 0, const_cast<ZenoParticles::particles_t&>(tris), "nrm", lines, ceNrmTag);
-#else
+
             COLLISION_UTILS::calculate_cell_bisector_normal(cudaExec,
                 verts,"x",
                 lines,
                 tris,
                 tris,"nrm",
                 lines,ceNrmTag);
-#endif
 
 
             set_output("ZSParticles",zsparticles);
@@ -903,310 +1377,463 @@ namespace zeno {
 
 
 
-    struct VisualizeFacetPointIntersection : zeno::INode {
-        using T = float;
-        using Ti = int;
-        using dtiles_t = zs::TileVector<T,32>;
-        using tiles_t = typename ZenoParticles::particles_t;
-        using bvh_t = zs::LBvh<3,int,T>;
-        using bv_t = zs::AABBBox<3, T>;
-        using vec3 = zs::vec<T, 3>;
-
-        virtual void apply() override {
-            using namespace zs;
-
-            auto zsparticles = get_input<ZenoParticles>("ZSParticles");
-
-            if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
-                throw std::runtime_error("the input zsparticles has no surface tris");
-            if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
-                throw std::runtime_error("the input zsparticles has no surface lines");
-            if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
-                throw std::runtime_error("the input zsparticles has no surface points");
-            // if(!zsparticles->hasBvh(ZenoParticles::s_surfTriTag)) {
-            //     throw std::runtime_error("the input zsparticles has no surface tris's spacial structure");
-            // }
-            // if(!zsparticles->hasBvh(ZenoParticles::s_surfEdgeTag)) {
-            //     throw std::runtime_error("the input zsparticles has no surface edge's spacial structure");
-            // }
-            // if(!zsparticles->hasBvh(ZenoParticles::s_surfVertTag))  {
-            //     throw std::runtime_error("the input zsparticles has no surface vert's spacial structure");
-            // }
-
-            const auto& verts = zsparticles->getParticles();
+//     struct VisualizeFacetPointIntersection : zeno::INode {
+//         using T = float;
+//         using Ti = int;
+//         using dtiles_t = zs::TileVector<T,32>;
+//         using tiles_t = typename ZenoParticles::particles_t;
+//         using bvh_t = zs::LBvh<3,int,T>;
+//         using bv_t = zs::AABBBox<3, T>;
+//         using vec3 = zs::vec<T, 3>;
+
+//         virtual void apply() override {
+//             using namespace zs;
+
+//             auto zsparticles = get_input<ZenoParticles>("ZSParticles");
+
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
+//                 throw std::runtime_error("the input zsparticles has no surface tris");
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
+//                 throw std::runtime_error("the input zsparticles has no surface lines");
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
+//                 throw std::runtime_error("the input zsparticles has no surface points");
+//             // if(!zsparticles->hasBvh(ZenoParticles::s_surfTriTag)) {
+//             //     throw std::runtime_error("the input zsparticles has no surface tris's spacial structure");
+//             // }
+//             // if(!zsparticles->hasBvh(ZenoParticles::s_surfEdgeTag)) {
+//             //     throw std::runtime_error("the input zsparticles has no surface edge's spacial structure");
+//             // }
+//             // if(!zsparticles->hasBvh(ZenoParticles::s_surfVertTag))  {
+//             //     throw std::runtime_error("the input zsparticles has no surface vert's spacial structure");
+//             // }
+
+//             const auto& verts = zsparticles->getParticles();
+
+//             auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
+//             auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
+//             auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
+
+//             // auto& stBvh = zsparticles->bvh(ZenoParticles::s_surfTriTag);
+//             // auto& seBvh = zsparticles->bvh(ZenoParticles::s_surfEdgeTag);
+
+//             auto in_collisionEps = get_input2<float>("in_collisionEps");
+//             auto out_collisionEps = get_input2<float>("out_collisionEps");
+
+//             dtiles_t sttemp(tris.get_allocator(),
+//                 {
+//                     {"nrm",3}
+//                 },tris.size()
+//             );
+//             dtiles_t setemp(lines.get_allocator(),
+//                 {
+//                     {"nrm",3}
+//                 },lines.size()
+//             );
+            
+//             dtiles_t cptemp(points.get_allocator(),
+//                 {
+//                     {"inds",4},
+//                     {"area",1},
+//                     {"inverted",1}
+//                 },points.size() * MAX_FP_COLLISION_PAIRS);
 
-            auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
-            auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
-            auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
 
-            // auto& stBvh = zsparticles->bvh(ZenoParticles::s_surfTriTag);
-            // auto& seBvh = zsparticles->bvh(ZenoParticles::s_surfEdgeTag);
-
-            auto in_collisionEps = get_input2<float>("in_collisionEps");
-            auto out_collisionEps = get_input2<float>("out_collisionEps");
-
-            dtiles_t sttemp(tris.get_allocator(),
-                {
-                    {"nrm",3}
-                },tris.size()
-            );
-            dtiles_t setemp(lines.get_allocator(),
-                {
-                    {"nrm",3}
-                },lines.size()
-            );
+//             constexpr auto space = execspace_e::cuda;
+//             auto cudaPol = cuda_exec();
+
+//             std::vector<zs::PropertyTag> cv_tags{{"xs",3},{"xe",3}};
+//             auto cv_buffer = typename ZenoParticles::particles_t(cv_tags,points.size() * MAX_FP_COLLISION_PAIRS,zs::memsrc_e::device,0);
+//             std::vector<zs::PropertyTag> cv_pt_tags{{"p",3},{"t0",3},{"t1",3},{"t2",3}};
+//             auto cv_pt_buffer = typename ZenoParticles::particles_t(cv_pt_tags,points.size() * MAX_FP_COLLISION_PAIRS,zs::memsrc_e::device,0);
+
+// #if 0
+
+//             if(!calculate_facet_normal(cudaPol,verts,"x",tris,sttemp,"nrm")){
+//                     throw std::runtime_error("fail updating facet normal");
+//             }
+
+
+//             // TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
+
+
+
+//             if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
+//                 verts,"x",
+//                 lines,
+//                 tris,
+//                 sttemp,"nrm",
+//                 setemp,"nrm")){
+//                     throw std::runtime_error("fail calculate cell bisector normal");
+//             } 
+
+//             auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
+//             auto sebvs = retrieve_bounding_volumes(cudaPol,verts,lines,wrapv<2>{},(T)0.0,"x");
+//             stBvh.refit(cudaPol,stbvs);
+//             seBvh.refit(cudaPol,sebvs);
+
+//             auto avgl = compute_average_edge_length(cudaPol,verts,"x",tris);
+//             auto bvh_thickness = 5 * avgl;
+
+//             TILEVEC_OPS::fill<MAX_FP_COLLISION_PAIRS>(cudaPol,sptemp,"fp_collision_pairs",zs::vec<int,MAX_FP_COLLISION_PAIRS>::uniform(-1).template reinterpret_bits<T>());
+//             cudaPol(zs::range(points.size()),[collisionEps = collisionEps,
+//                             verts = proxy<space>({},verts),
+//                             sttemp = proxy<space>({},sttemp),
+//                             setemp = proxy<space>({},setemp),
+//                             sptemp = proxy<space>({},sptemp),
+//                             points = proxy<space>({},points),
+//                             lines = proxy<space>({},lines),
+//                             tris = proxy<space>({},tris),
+//                             stbvh = proxy<space>(stBvh),thickness = bvh_thickness] ZS_LAMBDA(int svi) mutable {
+
+
+//                 auto vi = reinterpret_bits<int>(points("inds",svi));
+//                 // auto is_vertex_inverted = reinterpret_bits<int>(verts("is_inverted",vi));
+//                 // if(is_vertex_inverted)
+//                 //     return;
+
+//                 auto p = verts.template pack<3>("x",vi);
+//                 auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
+
+//                 int nm_collision_pairs = 0;
+//                 auto process_vertex_face_collision_pairs = [&](int stI) {
+//                     auto tri = tris.pack(dim_c<3>, "inds",stI).reinterpret_bits(int_c);
+//                     if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
+//                         return;
+
+//                     zs::vec<T,3> t[3] = {};
+//                     t[0] = verts.template pack<3>("x",tri[0]);
+//                     t[1] = verts.template pack<3>("x",tri[1]);
+//                     t[2] = verts.template pack<3>("x",tri[2]);
+
+//                     bool collide = false;
+
+//                     if(COLLISION_UTILS::is_inside_the_cell(verts,"x",
+//                             lines,tris,
+//                             sttemp,"nrm",
+//                             setemp,"nrm",
+//                             stI,p,collisionEps)) {
+//                         collide = true;
+//                     }
+
+
+//                     if(!collide)
+//                         return;
+
+//                     if(nm_collision_pairs  < MAX_FP_COLLISION_PAIRS) {
+//                         sptemp("fp_collision_pairs",nm_collision_pairs++,svi) = reinterpret_bits<T>(stI);
+//                     }
+//                 };
+//                 stbvh.iter_neighbors(bv,process_vertex_face_collision_pairs);
+//             });
+
+
+//            cudaPol(zs::range(points.size()),
+//                 [cv_buffer = proxy<space>({},cv_buffer),cv_pt_buffer = proxy<space>({},cv_pt_buffer),
+//                         sptemp = proxy<space>({},sptemp),verts = proxy<space>({},verts),points = proxy<space>({},points),tris = proxy<space>({},tris)] ZS_LAMBDA(int pi) mutable {
+//                     auto collision_pairs = sptemp.template pack<MAX_FP_COLLISION_PAIRS>("fp_collision_pairs",pi).reinterpret_bits(int_c);
+//                     auto vi = reinterpret_bits<int>(points("inds",pi));
+//                     auto pvert = verts.template pack<3>("x",vi);
+
+//                     for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i){
+//                         auto sti = collision_pairs[i];
+//                         if(sti < 0){
+//                             cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+                            
+//                             cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+
+//                         }else {
+//                             auto tri = tris.template pack<3>("inds",sti).reinterpret_bits(int_c);
+//                             auto t0 = verts.template pack<3>("x",tri[0]);
+//                             auto t1 = verts.template pack<3>("x",tri[1]);
+//                             auto t2 = verts.template pack<3>("x",tri[2]);
+//                             auto center = (t0 + t1 + t2) / (T)3.0;
+
+//                             cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = center;
+
+//                             cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                             cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = t0;
+//                             cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = t1;
+//                             cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = t2;
+
+//                         }
+//                     }
+//             });
+
+// #else
+//             // auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
+//             // stBvh.refit(cudaPol,stbvs);
+
+//             COLLISION_UTILS::do_facet_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
+//                 verts,"x",
+//                 points,
+//                 lines,
+//                 tris,
+//                 sttemp,
+//                 setemp,
+//                 cptemp,
+//                 // stBvh,
+//                 in_collisionEps,out_collisionEps);
+
+
+
+//             cudaPol(zs::range(points.size()),
+//                 [cptemp = proxy<space>({},cptemp),verts = proxy<space>({},verts),
+//                     cv_buffer = proxy<space>({},cv_buffer),
+//                     cv_pt_buffer = proxy<space>({},cv_pt_buffer),
+//                     points = proxy<space>({},points)] ZS_LAMBDA(int pi) mutable {
+//                         for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i) {
+//                             auto inds = cptemp.template pack<4>("inds",pi * MAX_FP_COLLISION_PAIRS + i).reinterpret_bits(int_c);
+//                             bool contact = true;
+//                             auto pvert = zs::vec<T,3>::zeros();
+//                             for(int j = 0;j != 4;++j)
+//                                 if(inds[j] < 0)
+//                                     contact = false;
+//                             if(contact) {
+//                                 pvert = verts.template pack<3>("x",inds[0]);
+//                                 auto t0 = verts.template pack<3>("x",inds[1]);
+//                                 auto t1 = verts.template pack<3>("x",inds[2]);
+//                                 auto t2 = verts.template pack<3>("x",inds[3]);
+//                                 auto center = (t0 + t1 + t2) / (T)3.0;
+
+//                                 cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = center;
+
+//                                 cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = t0;
+//                                 cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = t1;
+//                                 cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = t2;                                
+//                             }else{
+//                                 cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+                                
+//                                 cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
+//                                 cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;                                
+//                             }
+//                         }
+//             });
             
-            dtiles_t cptemp(points.get_allocator(),
-                {
-                    {"inds",4},
-                    {"area",1},
-                    {"inverted",1}
-                },points.size() * MAX_FP_COLLISION_PAIRS);
 
+// #endif
+//             // cudaPol.syncCtx();
 
-            constexpr auto space = execspace_e::cuda;
-            auto cudaPol = cuda_exec();
 
-            std::vector<zs::PropertyTag> cv_tags{{"xs",3},{"xe",3}};
-            auto cv_buffer = typename ZenoParticles::particles_t(cv_tags,points.size() * MAX_FP_COLLISION_PAIRS,zs::memsrc_e::device,0);
-            std::vector<zs::PropertyTag> cv_pt_tags{{"p",3},{"t0",3},{"t1",3},{"t2",3}};
-            auto cv_pt_buffer = typename ZenoParticles::particles_t(cv_pt_tags,points.size() * MAX_FP_COLLISION_PAIRS,zs::memsrc_e::device,0);
+//             cv_buffer = cv_buffer.clone({zs::memsrc_e::host});
+//             auto collisionFacetVis = std::make_shared<zeno::PrimitiveObject>();
+//             auto& cv_verts = collisionFacetVis->verts;
+//             auto& cv_lines = collisionFacetVis->lines;
+//             cv_verts.resize(points.size() * 2 * MAX_FP_COLLISION_PAIRS);
+//             cv_lines.resize(points.size() * MAX_FP_COLLISION_PAIRS);
 
-#if 0
+//             auto ompPol = omp_exec();  
+//             constexpr auto omp_space = execspace_e::openmp;
 
-            if(!calculate_facet_normal(cudaPol,verts,"x",tris,sttemp,"nrm")){
-                    throw std::runtime_error("fail updating facet normal");
-            }
+//             ompPol(zs::range(cv_buffer.size()),
+//                 [cv_buffer = proxy<omp_space>({},cv_buffer),&cv_verts,&cv_lines] (int pi) mutable {
+//                     auto xs = cv_buffer.template pack<3>("xs",pi);
+//                     auto xe = cv_buffer.template pack<3>("xe",pi);
+//                     cv_verts[pi * 2 + 0] = zeno::vec3f(xs[0],xs[1],xs[2]);
+//                     cv_verts[pi * 2 + 1] = zeno::vec3f(xe[0],xe[1],xe[2]);
+//                     cv_lines[pi] = zeno::vec2i(pi * 2 + 0,pi * 2 + 1);
+//             });
 
+//             set_output("collisionFacetVis",std::move(collisionFacetVis));
 
-            // TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",etemp,"inds");
 
 
+//             cv_pt_buffer = cv_pt_buffer.clone({zs::memsrc_e::host});
+//             auto colPointFacetPairVis = std::make_shared<zeno::PrimitiveObject>();
+//             auto& cv_pt_verts = colPointFacetPairVis->verts;
+//             auto& cv_pt_tris = colPointFacetPairVis->tris;
 
-            if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
-                verts,"x",
-                lines,
-                tris,
-                sttemp,"nrm",
-                setemp,"nrm")){
-                    throw std::runtime_error("fail calculate cell bisector normal");
-            } 
-
-            auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
-            auto sebvs = retrieve_bounding_volumes(cudaPol,verts,lines,wrapv<2>{},(T)0.0,"x");
-            stBvh.refit(cudaPol,stbvs);
-            seBvh.refit(cudaPol,sebvs);
-
-            auto avgl = compute_average_edge_length(cudaPol,verts,"x",tris);
-            auto bvh_thickness = 5 * avgl;
-
-            TILEVEC_OPS::fill<MAX_FP_COLLISION_PAIRS>(cudaPol,sptemp,"fp_collision_pairs",zs::vec<int,MAX_FP_COLLISION_PAIRS>::uniform(-1).template reinterpret_bits<T>());
-            cudaPol(zs::range(points.size()),[collisionEps = collisionEps,
-                            verts = proxy<space>({},verts),
-                            sttemp = proxy<space>({},sttemp),
-                            setemp = proxy<space>({},setemp),
-                            sptemp = proxy<space>({},sptemp),
-                            points = proxy<space>({},points),
-                            lines = proxy<space>({},lines),
-                            tris = proxy<space>({},tris),
-                            stbvh = proxy<space>(stBvh),thickness = bvh_thickness] ZS_LAMBDA(int svi) mutable {
-
-
-                auto vi = reinterpret_bits<int>(points("inds",svi));
-                // auto is_vertex_inverted = reinterpret_bits<int>(verts("is_inverted",vi));
-                // if(is_vertex_inverted)
-                //     return;
-
-                auto p = verts.template pack<3>("x",vi);
-                auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
-
-                int nm_collision_pairs = 0;
-                auto process_vertex_face_collision_pairs = [&](int stI) {
-                    auto tri = tris.pack(dim_c<3>, "inds",stI).reinterpret_bits(int_c);
-                    if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
-                        return;
-
-                    zs::vec<T,3> t[3] = {};
-                    t[0] = verts.template pack<3>("x",tri[0]);
-                    t[1] = verts.template pack<3>("x",tri[1]);
-                    t[2] = verts.template pack<3>("x",tri[2]);
-
-                    bool collide = false;
-
-                    if(COLLISION_UTILS::is_inside_the_cell(verts,"x",
-                            lines,tris,
-                            sttemp,"nrm",
-                            setemp,"nrm",
-                            stI,p,collisionEps)) {
-                        collide = true;
-                    }
+//             cv_pt_verts.resize(cv_pt_buffer.size() * 4);
+//             cv_pt_tris.resize(cv_pt_buffer.size());
 
+//             ompPol(zs::range(cv_pt_buffer.size()),
+//                 [&cv_pt_verts,&cv_pt_tris,cv_pt_buffer = proxy<omp_space>({},cv_pt_buffer)] (int pi) mutable {
+//                     cv_pt_verts[pi * 4 + 0] = cv_pt_buffer.template pack<3>("p",pi).to_array();
+//                     cv_pt_verts[pi * 4 + 1] = cv_pt_buffer.template pack<3>("t0",pi).to_array();
+//                     cv_pt_verts[pi * 4 + 2] = cv_pt_buffer.template pack<3>("t1",pi).to_array();
+//                     cv_pt_verts[pi * 4 + 3] = cv_pt_buffer.template pack<3>("t2",pi).to_array();
 
-                    if(!collide)
-                        return;
+//                     cv_pt_tris[pi] = zeno::vec3i(pi * 4 + 1,pi * 4 + 2,pi * 4 + 3);
+//             });
 
-                    if(nm_collision_pairs  < MAX_FP_COLLISION_PAIRS) {
-                        sptemp("fp_collision_pairs",nm_collision_pairs++,svi) = reinterpret_bits<T>(stI);
-                    }
-                };
-                stbvh.iter_neighbors(bv,process_vertex_face_collision_pairs);
-            });
 
+//             set_output("colPointFacetPairVis",std::move(colPointFacetPairVis));
 
-           cudaPol(zs::range(points.size()),
-                [cv_buffer = proxy<space>({},cv_buffer),cv_pt_buffer = proxy<space>({},cv_pt_buffer),
-                        sptemp = proxy<space>({},sptemp),verts = proxy<space>({},verts),points = proxy<space>({},points),tris = proxy<space>({},tris)] ZS_LAMBDA(int pi) mutable {
-                    auto collision_pairs = sptemp.template pack<MAX_FP_COLLISION_PAIRS>("fp_collision_pairs",pi).reinterpret_bits(int_c);
-                    auto vi = reinterpret_bits<int>(points("inds",pi));
-                    auto pvert = verts.template pack<3>("x",vi);
+//         }
+//     };
 
-                    for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i){
-                        auto sti = collision_pairs[i];
-                        if(sti < 0){
-                            cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            
-                            cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
 
-                        }else {
-                            auto tri = tris.template pack<3>("inds",sti).reinterpret_bits(int_c);
-                            auto t0 = verts.template pack<3>("x",tri[0]);
-                            auto t1 = verts.template pack<3>("x",tri[1]);
-                            auto t2 = verts.template pack<3>("x",tri[2]);
-                            auto center = (t0 + t1 + t2) / (T)3.0;
+//     ZENDEFNODE(VisualizeFacetPointIntersection, {{"ZSParticles",{"float","in_collisionEps","0.01"},{"float","out_collisionEps","0.01"}},
+//                                     {"collisionFacetVis","colPointFacetPairVis"},
+//                                     {
+//                                     },
+//                                     {"ZSGeometry"}});
 
-                            cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = center;
 
-                            cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                            cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = t0;
-                            cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = t1;
-                            cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = t2;
 
-                        }
-                    }
-            });
+//     struct VisualizeEdgeEdgeIntersection : zeno::INode {
+//         using T = float;
+//         using Ti = int;
+//         using dtiles_t = zs::TileVector<T,32>;
+//         using tiles_t = typename ZenoParticles::particles_t;
 
-#else
-            // auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
-            // stBvh.refit(cudaPol,stbvs);
+//         virtual void apply() override {
+//             using namespace zs;
+//             auto zsparticles = get_input<ZenoParticles>("ZSParticles");
 
-            COLLISION_UTILS::do_facet_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
-                verts,"x",
-                points,
-                lines,
-                tris,
-                sttemp,
-                setemp,
-                cptemp,
-                // stBvh,
-                in_collisionEps,out_collisionEps);
-
-
-
-            cudaPol(zs::range(points.size()),
-                [cptemp = proxy<space>({},cptemp),verts = proxy<space>({},verts),
-                    cv_buffer = proxy<space>({},cv_buffer),
-                    cv_pt_buffer = proxy<space>({},cv_pt_buffer),
-                    points = proxy<space>({},points)] ZS_LAMBDA(int pi) mutable {
-                        for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i) {
-                            auto inds = cptemp.template pack<4>("inds",pi * MAX_FP_COLLISION_PAIRS + i).reinterpret_bits(int_c);
-                            bool contact = true;
-                            auto pvert = zs::vec<T,3>::zeros();
-                            for(int j = 0;j != 4;++j)
-                                if(inds[j] < 0)
-                                    contact = false;
-                            if(contact) {
-                                pvert = verts.template pack<3>("x",inds[0]);
-                                auto t0 = verts.template pack<3>("x",inds[1]);
-                                auto t1 = verts.template pack<3>("x",inds[2]);
-                                auto t2 = verts.template pack<3>("x",inds[3]);
-                                auto center = (t0 + t1 + t2) / (T)3.0;
-
-                                cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = center;
-
-                                cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = t0;
-                                cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = t1;
-                                cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = t2;                                
-                            }else{
-                                cv_buffer.template tuple<3>("xs",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_buffer.template tuple<3>("xe",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                
-                                cv_pt_buffer.template tuple<3>("p",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_pt_buffer.template tuple<3>("t0",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_pt_buffer.template tuple<3>("t1",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;
-                                cv_pt_buffer.template tuple<3>("t2",MAX_FP_COLLISION_PAIRS * pi + i) = pvert;                                
-                            }
-                        }
-            });
-            
-
-#endif
-            // cudaPol.syncCtx();
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
+//                 throw std::runtime_error("the input zsparticles has no surface tris");
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
+//                 throw std::runtime_error("the input zsparticles has no surface lines");
+//             if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
+//                 throw std::runtime_error("the input zsparticles has no surface points");
 
+//             const auto& verts = zsparticles->getParticles();
+//             auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
+//             auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
+//             auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];        
 
-            cv_buffer = cv_buffer.clone({zs::memsrc_e::host});
-            auto collisionFacetVis = std::make_shared<zeno::PrimitiveObject>();
-            auto& cv_verts = collisionFacetVis->verts;
-            auto& cv_lines = collisionFacetVis->lines;
-            cv_verts.resize(points.size() * 2 * MAX_FP_COLLISION_PAIRS);
-            cv_lines.resize(points.size() * MAX_FP_COLLISION_PAIRS);
+//             auto in_collisionEps = get_input2<float>("in_collisionEps");
+//             auto out_collisionEps = get_input2<float>("out_collisionEps");  
 
-            auto ompPol = omp_exec();  
-            constexpr auto omp_space = execspace_e::openmp;
 
-            ompPol(zs::range(cv_buffer.size()),
-                [cv_buffer = proxy<omp_space>({},cv_buffer),&cv_verts,&cv_lines] (int pi) mutable {
-                    auto xs = cv_buffer.template pack<3>("xs",pi);
-                    auto xe = cv_buffer.template pack<3>("xe",pi);
-                    cv_verts[pi * 2 + 0] = zeno::vec3f(xs[0],xs[1],xs[2]);
-                    cv_verts[pi * 2 + 1] = zeno::vec3f(xe[0],xe[1],xe[2]);
-                    cv_lines[pi] = zeno::vec2i(pi * 2 + 0,pi * 2 + 1);
-            });
+//             dtiles_t sttemp(tris.get_allocator(),
+//                 {
+//                     {"nrm",3}
+//                 },tris.size()
+//             );
+//             dtiles_t setemp(lines.get_allocator(),
+//                 {
+//                     {"nrm",3},
+//                     {"inds",4},
+//                     {"area",1},
+//                     {"inverted",1},
+//                     {"abary",2},
+//                     {"bbary",2}
+//                 },lines.size()
+//             );
+            
+//             constexpr auto space = execspace_e::cuda;
+//             auto cudaPol = cuda_exec();
 
-            set_output("collisionFacetVis",std::move(collisionFacetVis));
+//             std::cout << "before do edge edge collision detection" << std::endl;
 
+//             COLLISION_UTILS::do_edge_edge_collision_detection(cudaPol,
+//                 verts,"x",
+//                 points,lines,tris,
+//                 sttemp,setemp,
+//                 setemp,
+//                 in_collisionEps,out_collisionEps);
+            
+//             // std::vector<zs::PropertyTag> cv_tags{{"xs",3},{"xe",3}};
+//             // auto cv_buffer = typename ZenoParticles::particles_t(cv_tags,setemp.size(),zs::memsrc_e::device,0);
+//             std::vector<zs::PropertyTag> cv_ee_tags{{"a0",3},{"a1",3},{"b0",3},{"b1",3},{"abary",2},{"bbary",2}};
+//             auto cv_ee_buffer = typename ZenoParticles::particles_t(cv_ee_tags,setemp.size(),zs::memsrc_e::device,0);
+
+//             cudaPol(zs::range(setemp.size()),
+//                 [setemp = proxy<space>({},setemp),verts = proxy<space>({},verts),
+//                     cv_ee_buffer = proxy<space>({},cv_ee_buffer)] ZS_LAMBDA(int ei) mutable {
+//                         auto inds = setemp.template pack<4>("inds",ei).reinterpret_bits(int_c);
+//                         bool collide = true;
+//                         if(inds[0] < 0 || inds[1] < 0 || inds[2] < 0 || inds[3] < 0)
+//                             collide = false;
+//                         if(collide) {
+//                             auto abary = setemp.template pack<2>("abary",ei);
+//                             auto bbary = setemp.template pack<2>("bbary",ei);
+//                             printf("find collision pairs : %d %d %d %d with bary %f %f %f %f\n",inds[0],inds[1],inds[2],inds[3],
+//                                 (float)abary[0],(float)abary[1],(float)bbary[0],(float)bbary[1]);
+//                             cv_ee_buffer.template tuple<3>("a0",ei) = verts.template pack<3>("x",inds[0]);
+//                             cv_ee_buffer.template tuple<3>("a1",ei) = verts.template pack<3>("x",inds[1]);
+//                             cv_ee_buffer.template tuple<3>("b0",ei) = verts.template pack<3>("x",inds[2]);
+//                             cv_ee_buffer.template tuple<3>("b1",ei) = verts.template pack<3>("x",inds[3]);
+//                             cv_ee_buffer.template tuple<2>("abary",ei) = abary;
+//                             cv_ee_buffer.template tuple<2>("bbary",ei) = bbary;
+//                         }else {
+//                             cv_ee_buffer.template tuple<3>("a0",ei) = zs::vec<T,3>::zeros();
+//                             cv_ee_buffer.template tuple<3>("a1",ei) = zs::vec<T,3>::zeros();
+//                             cv_ee_buffer.template tuple<3>("b0",ei) = zs::vec<T,3>::zeros();
+//                             cv_ee_buffer.template tuple<3>("b1",ei) = zs::vec<T,3>::zeros();
+//                             cv_ee_buffer.template tuple<2>("abary",ei) = zs::vec<T,2>((T)1.0,0.0);
+//                             cv_ee_buffer.template tuple<2>("bbary",ei) = zs::vec<T,2>((T)1.0,0.0);
+//                         }
+//                 });
+
+//             cv_ee_buffer = cv_ee_buffer.clone({zs::memsrc_e::host});
+
+
+//             auto ompPol = omp_exec();  
+//             constexpr auto omp_space = execspace_e::openmp;
+
+//             auto collisionEdgeVis = std::make_shared<zeno::PrimitiveObject>();
+//             auto& ee_verts = collisionEdgeVis->verts;
+//             auto& ee_lines = collisionEdgeVis->lines;
+//             ee_verts.resize(cv_ee_buffer.size() * 2);
+//             ee_lines.resize(cv_ee_buffer.size());
+
+
+//             ompPol(zs::range(cv_ee_buffer.size()),
+//                 [cv_ee_buffer = proxy<omp_space>({},cv_ee_buffer),&ee_verts,&ee_lines] (int eei) mutable {
+//                     auto a0 = cv_ee_buffer.template pack<3>("a0",eei);
+//                     auto a1 = cv_ee_buffer.template pack<3>("a1",eei);
+//                     auto b0 = cv_ee_buffer.template pack<3>("b0",eei);
+//                     auto b1 = cv_ee_buffer.template pack<3>("b1",eei);     
+                    
+//                     auto abary = cv_ee_buffer.template pack<2>("abary",eei);
+//                     auto bbary = cv_ee_buffer.template pack<2>("bbary",eei);
 
+//                     // auto ac = (a0 + a1) / (T)2.0;
+//                     // auto bc = (b0 + b1) / (T)2.0;
 
-            cv_pt_buffer = cv_pt_buffer.clone({zs::memsrc_e::host});
-            auto colPointFacetPairVis = std::make_shared<zeno::PrimitiveObject>();
-            auto& cv_pt_verts = colPointFacetPairVis->verts;
-            auto& cv_pt_tris = colPointFacetPairVis->tris;
+//                     auto ac = abary[0] * a0 + abary[1] * a1;
+//                     auto bc = bbary[0] * b0 + bbary[1] * b1;
 
-            cv_pt_verts.resize(cv_pt_buffer.size() * 4);
-            cv_pt_tris.resize(cv_pt_buffer.size());
+//                     ee_verts[eei * 2 + 0] = zeno::vec3f(ac[0],ac[1],ac[2]);
+//                     ee_verts[eei * 2 + 1] = zeno::vec3f(bc[0],bc[1],bc[2]);
+//                     ee_lines[eei] = zeno::vec2i(eei * 2 + 0,eei * 2 + 1);
+//             });
 
-            ompPol(zs::range(cv_pt_buffer.size()),
-                [&cv_pt_verts,&cv_pt_tris,cv_pt_buffer = proxy<omp_space>({},cv_pt_buffer)] (int pi) mutable {
-                    cv_pt_verts[pi * 4 + 0] = cv_pt_buffer.template pack<3>("p",pi).to_array();
-                    cv_pt_verts[pi * 4 + 1] = cv_pt_buffer.template pack<3>("t0",pi).to_array();
-                    cv_pt_verts[pi * 4 + 2] = cv_pt_buffer.template pack<3>("t1",pi).to_array();
-                    cv_pt_verts[pi * 4 + 3] = cv_pt_buffer.template pack<3>("t2",pi).to_array();
+//             set_output("collisionEdgeVis",std::move(collisionEdgeVis));
 
-                    cv_pt_tris[pi] = zeno::vec3i(pi * 4 + 1,pi * 4 + 2,pi * 4 + 3);
-            });
+//             auto colEdgetPairVis = std::make_shared<zeno::PrimitiveObject>();
+//             auto& cv_ee_verts = colEdgetPairVis->verts;
+//             auto& cv_ee_lines = colEdgetPairVis->lines;
 
+//             cv_ee_verts.resize(cv_ee_buffer.size() * 4);
+//             cv_ee_lines.resize(cv_ee_buffer.size() * 2);
 
-            set_output("colPointFacetPairVis",std::move(colPointFacetPairVis));
+//             ompPol(zs::range(cv_ee_buffer.size()),
+//                 [&cv_ee_verts,&cv_ee_lines,cv_ee_buffer = proxy<omp_space>({},cv_ee_buffer)] (int eei) mutable {
+//                     cv_ee_verts[eei * 4 + 0] = cv_ee_buffer.template pack<3>("a0",eei).to_array();
+//                     cv_ee_verts[eei * 4 + 1] = cv_ee_buffer.template pack<3>("a1",eei).to_array();
+//                     cv_ee_verts[eei * 4 + 2] = cv_ee_buffer.template pack<3>("b0",eei).to_array();
+//                     cv_ee_verts[eei * 4 + 3] = cv_ee_buffer.template pack<3>("b1",eei).to_array();
 
-        }
-    };
+//                     cv_ee_lines[eei * 2 + 0] = zeno::vec2i(eei * 4 + 0,eei * 4 + 1);
+//                     cv_ee_lines[eei * 2 + 1] = zeno::vec2i(eei * 4 + 2,eei * 4 + 3);
+//             });
 
 
-ZENDEFNODE(VisualizeFacetPointIntersection, {{"ZSParticles",{"float","in_collisionEps","0.01"},{"float","out_collisionEps","0.01"}},
-                                  {"collisionFacetVis","colPointFacetPairVis"},
-                                  {
-                                  },
-                                  {"ZSGeometry"}});
+//             set_output("colEdgetPairVis",std::move(colEdgetPairVis));            
+//         }
+//     };
 
+//     ZENDEFNODE(VisualizeEdgeEdgeIntersection, {{"ZSParticles",{"float","in_collisionEps","0.01"},{"float","out_collisionEps","0.01"}},
+//                                     {"collisionEdgeVis","colEdgetPairVis"},
+//                                     {
+//                                     },
+//                                     {"ZSGeometry"}});
 
-struct VisualizeCollisionForce : zeno::INode {
 
+struct VisualizeKineCollision : zeno::INode {
     using T = float;
     using Ti = int;
     using dtiles_t = zs::TileVector<T,32>;
@@ -1215,71 +1842,67 @@ struct VisualizeCollisionForce : zeno::INode {
     using bv_t = zs::AABBBox<3, T>;
     using vec3 = zs::vec<T, 3>;
 
-
     virtual void apply() override {
         using namespace zs;
-
         auto zsparticles = get_input<ZenoParticles>("ZSParticles");
-
         if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
             throw std::runtime_error("the input zsparticles has no surface tris");
         if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
             throw std::runtime_error("the input zsparticles has no surface lines");
         if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
             throw std::runtime_error("the input zsparticles has no surface points");
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfTriTag)) {
-        //     throw std::runtime_error("the input zsparticles has no surface tris's spacial structure");
-        // }
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfEdgeTag)) {
-        //     throw std::runtime_error("the input zsparticles has no surface edge's spacial structure");
-        // }
-        // if(!zsparticles->hasBvh(ZenoParticles::s_surfVertTag))  {
-        //     throw std::runtime_error("the input zsparticles has no surface vert's spacial structure");
-        // }
-
+        
+        const auto& eles = zsparticles->getQuadraturePoints();
         const auto& verts = zsparticles->getParticles();
-
-        auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
+        auto& tris = (*zsparticles)[ZenoParticles::s_surfTriTag];
         auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
         auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
 
-        // auto& stBvh = zsparticles->bvh(ZenoParticles::s_surfTriTag);
-        // auto& seBvh = zsparticles->bvh(ZenoParticles::s_surfEdgeTag);
-
+        // KinematicSurf is the kinematic collider (expected to be a triangle surface); below only its vertex buffer is read
+        auto ksurf = get_input<ZenoParticles>("KinematicSurf");
+        const auto& kverts = ksurf->getParticles(); // bind by const reference like `verts`: kverts is never modified, so no copy of the buffer is needed
+        // if(!kverts.hasProperty("nrm")) {
+        //     fmt::print(fg(fmt::color::red),"KinematicSurf has no surface normal\n");
+        //     throw std::runtime_error("the Kinematic surf has no surface normal");
+        // }
+        
         dtiles_t sttemp(tris.get_allocator(),
             {
-                {"nrm",3},
-                {"x",3}
+                {"nrm",3}
             },tris.size()
         );
         dtiles_t setemp(lines.get_allocator(),
             {
+                // {"inds",4},
+                // {"area",1},
+                // {"inverted",1},
+                // {"abary",2},
+                // {"bbary",2},
                 {"nrm",3}
+                // {"grad",12},
+                // {"H",12*12}
             },lines.size()
         );
-        
         dtiles_t sptemp(points.get_allocator(),
             {
-                {"nrm",3},
-                {"x",3}
+                {"nrm",3}
             },points.size()
         );
 
-        dtiles_t cptemp(points.get_allocator(),
+    
+        dtiles_t fp_buffer(kverts.get_allocator(),
             {
-                {"inds",4},
+                {"inds",2},
                 {"area",1},
-                {"grad",12},
-                {"H",12 * 12},
                 {"inverted",1}
-            },points.size() * MAX_FP_COLLISION_PAIRS);
-
+            },kverts.size() * MAX_FP_COLLISION_PAIRS);
         
-        dtiles_t vtemp(verts.get_allocator(),
+        dtiles_t gh_buffer(points.get_allocator(),
             {
-                {"x",3},
-                {"dir",3},
-            },verts.size());
+                {"inds",4},
+                {"H",12*12},
+                {"grad",12}
+            },eles.size());
 
 
         auto in_collisionEps = get_input2<float>("in_collisionEps");
@@ -1287,193 +1910,272 @@ struct VisualizeCollisionForce : zeno::INode {
 
         constexpr auto space = execspace_e::cuda;
         auto cudaPol = cuda_exec();
-    
 
-        // if(!calculate_facet_normal(cudaPol,verts,"x",tris,sttemp,"nrm")){
-        //         throw std::runtime_error("fail updating facet normal");
-        // }
 
+        auto kverts_ = typename ZenoParticles::particles_t({
+            {"x",3},
+            {"area",1}},kverts.size(),zs::memsrc_e::device,0);  
+        TILEVEC_OPS::copy<3>(cudaPol,kverts,"x",kverts_,"x");
+        TILEVEC_OPS::fill(cudaPol,kverts_,"area",(T)1.0);
+        TILEVEC_OPS::copy<4>(cudaPol,eles,"inds",gh_buffer,"inds");              
 
-        // auto avgl = compute_average_edge_length(cudaPol,verts,"x",tris);
-        // auto bvh_thickness = 5 * avgl;
-
-#if 0
-        if(!COLLISION_UTILS::calculate_cell_bisector_normal(cudaPol,
+        COLLISION_UTILS::do_kinematic_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
             verts,"x",
+            points,
             lines,
             tris,
-            sttemp,"nrm",
-            setemp,"nrm")){
-                throw std::runtime_error("fail calculate cell bisector normal");
-        } 
-
-        auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
-        auto sebvs = retrieve_bounding_volumes(cudaPol,verts,lines,wrapv<2>{},(T)0.0,"x");
-        stBvh.refit(cudaPol,stbvs);
-        seBvh.refit(cudaPol,sebvs);
-
-
-        if(!calculate_facet_normal(cudaPol,verts,"x",tris,sttemp,"nrm")){
-                throw std::runtime_error("fail updating facet normal");
-        }
-
-        auto collisionEps = get_input2<float>("collisionEps");
-
-        TILEVEC_OPS::fill<3>(cudaPol,sttemp,"cf",zs::vec<T,3>::zeros());
-        TILEVEC_OPS::fill<3>(cudaPol,sptemp,"cf",zs::vec<T,3>::zeros());
-        // TILEVEC_OPS::copy<3>(cudaPol,verts,"x",sptemp,"x");
-        cudaPol(zs::range(sptemp.size()),
-            [sptemp = proxy<space>({},sptemp),verts = proxy<space>({},verts),points = proxy<space>({},points)] ZS_LAMBDA(int pi) mutable {
-                auto pidx = reinterpret_bits<int>(points("inds",pi));
-                sptemp.template tuple<3>("x",pi) = verts.template pack<3>("x",pidx);
-        });
-
-        // evaluate the center of tris
-        cudaPol(zs::range(tris.size()),
-            [verts = proxy<space>({},verts),tris = proxy<space>({},tris),sttemp = proxy<space>({},sttemp)] ZS_LAMBDA(int ti) mutable {
-                sttemp.template tuple<3>("x",ti) = zs::vec<T,3>::zeros();
-                auto inds = tris.template pack<3>("inds",ti).reinterpret_bits(int_c);
-                for(int i = 0;i != 3;++i)
-                    sttemp.template tuple<3>("x",ti) = sttemp.template pack<3>("x",ti) + verts.template pack<3>("x",inds[i]) / (T)3.0;
+            setemp,
+            sttemp,
+            kverts_,
+            fp_buffer,
+            in_collisionEps,out_collisionEps);
+        
+        std::vector<zs::PropertyTag> cv_tags{{"xp",3},{"xt",3},{"t0",3},{"t1",3},{"t2",3}};
+        auto cv_buffer = typename ZenoParticles::particles_t(cv_tags,fp_buffer.size(),zs::memsrc_e::device,0);
+
+        cudaPol(zs::range(fp_buffer.size()),
+            [fp_buffer = proxy<space>({},fp_buffer),
+                verts = proxy<space>({},verts),
+                tris = proxy<space>({},tris),
+                kverts = proxy<space>({},kverts),
+                cv_buffer = proxy<space>({},cv_buffer)] ZS_LAMBDA(int ci) mutable {
+                    auto cp = fp_buffer.pack(dim_c<2>,"inds",ci).reinterpret_bits(int_c);
+
+                    auto contact = true;
+                    for(int i = 0;i != 2;++i)
+                        if(cp[i] < 0){
+                            contact = false;
+                            break;
+                        }
+                    auto pvert = zs::vec<T,3>::zeros();
+                    if(contact) {
+                        // auto pidx = cp[0];
+                        auto tri = tris.pack(dim_c<3>,"inds",cp[1]).reinterpret_bits(int_c);
+                        pvert = kverts.pack(dim_c<3>,"x",cp[0]);
+                        auto t0 = verts.pack(dim_c<3>,"x",tri[0]);
+                        auto t1 = verts.pack(dim_c<3>,"x",tri[1]);
+                        auto t2 = verts.pack(dim_c<3>,"x",tri[2]);
+
+                        auto tc = (t0 + t1 + t2)/(T)3.0;
+
+                        cv_buffer.template tuple<3>("xp",ci) = pvert;
+                        cv_buffer.template tuple<3>("xt",ci) = tc;
+                        cv_buffer.template tuple<3>("t0",ci) = t0;
+                        cv_buffer.template tuple<3>("t1",ci) = t1;
+                        cv_buffer.template tuple<3>("t2",ci) = t2;
+                    } else {
+                        cv_buffer.template tuple<3>("xp",ci) = pvert;
+                        cv_buffer.template tuple<3>("xt",ci) = pvert;
+                        cv_buffer.template tuple<3>("t0",ci) = pvert;
+                        cv_buffer.template tuple<3>("t1",ci) = pvert;
+                        cv_buffer.template tuple<3>("t2",ci) = pvert;
+                    }
+                    
         });
 
-        // evaluate the collision force
-        cudaPol(zs::range(points.size()),[collisionEps = collisionEps,
-                        verts = proxy<space>({},verts),
-                        sttemp = proxy<space>({},sttemp),
-                        setemp = proxy<space>({},setemp),
-                        sptemp = proxy<space>({},sptemp),
-                        points = proxy<space>({},points),
-                        lines = proxy<space>({},lines),
-                        tris = proxy<space>({},tris),
-                        stbvh = proxy<space>(stBvh),thickness = bvh_thickness] ZS_LAMBDA(int svi) mutable {
+        auto ompPol = omp_exec();  
+        constexpr auto omp_space = execspace_e::openmp;
 
+        cv_buffer = cv_buffer.clone({zs::memsrc_e::host});
+        auto colPointTriPairVis = std::make_shared<zeno::PrimitiveObject>();
+        auto& cv_pt_verts = colPointTriPairVis->verts;
+        auto& cv_pt_tris = colPointTriPairVis->tris;
 
-            auto vi = reinterpret_bits<int>(points("inds",svi));
-            // auto is_vertex_inverted = reinterpret_bits<int>(verts("is_inverted",vi));
-            // if(is_vertex_inverted)
-            //     return;
+        cv_pt_verts.resize(cv_buffer.size() * 4);
+        cv_pt_tris.resize(cv_buffer.size());
 
-            auto p = verts.template pack<3>("x",vi);
-            auto bv = bv_t{get_bounding_box(p - thickness, p + thickness)};
+        ompPol(zs::range(cv_buffer.size()),
+            [&cv_pt_verts,&cv_pt_tris,cv_buffer = proxy<omp_space>({},cv_buffer)] (int ci) mutable {
+                cv_pt_verts[ci * 4 + 0] = cv_buffer.pack(dim_c<3>,"xp",ci).to_array();
+                cv_pt_verts[ci * 4 + 1] = cv_buffer.pack(dim_c<3>,"t0",ci).to_array();
+                cv_pt_verts[ci * 4 + 2] = cv_buffer.pack(dim_c<3>,"t1",ci).to_array();
+                cv_pt_verts[ci * 4 + 3] = cv_buffer.pack(dim_c<3>,"t2",ci).to_array();
+                
+                cv_pt_tris[ci] = zeno::vec3i(ci * 4 + 1,ci * 4 + 2,ci * 4 + 3);
+        });
 
-            vec3 collision_verts[4] = {};
-            collision_verts[0] = p;
+        set_output("colPointFacePairVis",std::move(colPointTriPairVis));
 
+        auto colCenterLineVis = std::make_shared<zeno::PrimitiveObject>();
+        auto& cv_cl_verts = colCenterLineVis->verts;
+        auto& cv_cl_lines = colCenterLineVis->lines;
+        
+        cv_cl_verts.resize(cv_buffer.size() * 2);
+        cv_cl_lines.resize(cv_buffer.size());
 
-            auto process_vertex_face_collision_pairs = [&](int stI) {
-                auto tri = tris.pack(dim_c<3>, "inds",stI).reinterpret_bits(int_c);
-                if(tri[0] == vi || tri[1] == vi || tri[2] == vi)
-                    return;
+        ompPol(zs::range(cv_buffer.size()),
+            [cv_buffer = proxy<omp_space>({},cv_buffer),&cv_cl_verts,&cv_cl_lines] (int ci) mutable {
+                cv_cl_verts[ci * 2 + 0] = cv_buffer.pack(dim_c<3>,"xp",ci).to_array();
+                cv_cl_verts[ci * 2 + 1] = cv_buffer.pack(dim_c<3>,"xt",ci).to_array();
+                cv_cl_lines[ci] = zeno::vec2i(ci * 2 + 0,ci * 2 + 1);
+        });
 
-                bool collide = false;
+        set_output("colConnVis",std::move(colCenterLineVis));
 
-                if(COLLISION_UTILS::is_inside_the_cell(verts,"x",
-                        lines,tris,
-                        sttemp,"nrm",
-                        setemp,"nrm",
-                        stI,p,collisionEps)) {
-                    collide = true;
-                }
 
+        COLLISION_UTILS::evaluate_kinematic_fp_collision_grad_and_hessian(
+            cudaPol,
+            eles,
+            verts,"x","v",(T)1.0,
+            tris,
+            kverts_,
+            fp_buffer,
+            gh_buffer,0,
+            in_collisionEps,out_collisionEps,
+            (T)1.0,
+            (T)1.0,(T)1.0,(T)0.01);
 
-                if(!collide)
-                    return;
+        dtiles_t vtemp(verts.get_allocator(),
+            {
+                {"x",3},
+                {"dir",3},
+            },verts.size());
+        TILEVEC_OPS::copy<3>(cudaPol,verts,"x",vtemp,"x");
+        TILEVEC_OPS::fill<3>(cudaPol,vtemp,"dir",zs::vec<T,3>::zeros());
 
-                collision_verts[1] = verts.template pack<3>("x",tri[0]);
-                collision_verts[2] = verts.template pack<3>("x",tri[1]);
-                collision_verts[3] = verts.template pack<3>("x",tri[2]);
+        TILEVEC_OPS::assemble_range(cudaPol,gh_buffer,"grad","inds",vtemp,"dir",0,gh_buffer.size());        
+        vtemp = vtemp.clone({zs::memsrc_e::host}); 
 
-                auto vertexFaceCollisionAreas = tris("area",stI) + points("area",svi);
+        auto nodalForceVis = std::make_shared<zeno::PrimitiveObject>();       
+        auto& spverts = nodalForceVis->verts;
+        spverts.resize(vtemp.size() * 2);
+        auto& splines = nodalForceVis->lines;
+        splines.resize(vtemp.size());
 
-                auto grad = (T)1.0 * VERTEX_FACE_SQRT_COLLISION::gradient(collision_verts,1,1,collisionEps) * vertexFaceCollisionAreas;
+        auto scale = get_input2<float>("scale"); // multiplier applied to each per-vertex "dir" vector when building the line end point
+        ompPol(zs::range(vtemp.size()), // vtemp was cloned to host memory just above, so this loop must view it through an OpenMP-space proxy
+            [vtemp = proxy<omp_space>({},vtemp),&spverts,&splines,scale] (int vi) mutable {
+                auto xs = vtemp.template pack<3>("x",vi);
+                auto dir = vtemp.template pack<3>("dir",vi);
 
-                // auto pf = zs::vec<T,3>{grad[0],grad[1],grad[2]};    
-                zs::vec<T,3> tf[3] = {};
-                for(int j = 0;j != 3;++j)
-                    tf[j] = zs::vec<T,3>{grad[j * 3 + 3 + 0],grad[j * 3 + 3 + 1],grad[j * 3 + 3 + 2]};     
+                auto xe = xs + scale * dir;
 
-                // auto avgtf = (tf[0] + tf[1] + tf[2])/(T)3.0;
-                auto avgtf = (tf[0] + tf[1] + tf[2]);
-                for(int j = 0;j != 3;++j)
-                    atomic_add(exec_cuda,&sttemp("cf",j,stI),avgtf[j]);
+                spverts[vi * 2 + 0] = xs.to_array();
+                spverts[vi * 2 + 1] = xe.to_array();
+                splines[vi] = zeno::vec2i(vi * 2 + 0,vi * 2 + 1);               
+        });
 
+        set_output("FPNodalForceVis",std::move(nodalForceVis));
 
-                auto fp_inds = tris.template pack<3>("fp_inds",stI).reinterpret_bits(int_c);
-                for(int j = 0;j != 3;++j){
-                    atomic_add(exec_cuda,&sptemp("cf",j,svi),grad[j]);
-                    // for(int k = 0;k != 3;++k)   {
-                    //     auto fp_idx = fp_inds[k];
-                    //     atomic_add(exec_cuda,&sptemp("cf",j,fp_idx),tf[k][j]);
-                    // }
-                }                      
 
-            };
-            stbvh.iter_neighbors(bv,process_vertex_face_collision_pairs);
-        });
+    }
+};
 
-        cudaPol.syncCtx();
 
+ZENDEFNODE(VisualizeKineCollision, {{"ZSParticles","KinematicSurf",{"float","in_collisionEps"},{"float","out_collisionEps"},{"float","scale"}}, // input sockets read by apply()
+                                  { // output sockets, each set via set_output() in apply()
+                                        "colPointFacePairVis", // per pair: the kinematic point plus the three triangle corners, with one triangle primitive
+                                        "colConnVis", // per pair: a line from the kinematic point to the triangle centroid
+                                        "FPNodalForceVis" // per vertex: a line from "x" to "x" + scale * "dir"
+                                    },
+                                  { // NOTE(review): presumably the node parameter list (empty here) -- confirm against the ZENDEFNODE descriptor layout
+                                  },
+                                  {"ZSGeometry"}}); // NOTE(review): presumably the node category -- confirm
 
-        auto scale = get_input2<float>("scale");
 
-        auto ompPol = omp_exec();  
-        constexpr auto omp_space = execspace_e::openmp;
-        
-        sptemp = sptemp.clone({zs::memsrc_e::host});
-        // sttemp = sttemp.clone({zs::memsrc_e::host});
+struct VisualizeCollision : zeno::INode {
 
-        auto nodalForceVis = std::make_shared<zeno::PrimitiveObject>();
-        auto& spverts = nodalForceVis->verts;
-        spverts.resize(sptemp.size() * 2);
-        auto& splines = nodalForceVis->lines;
-        splines.resize(sptemp.size());
+    using T = float;
+    using Ti = int;
+    using dtiles_t = zs::TileVector<T,32>;
+    using tiles_t = typename ZenoParticles::particles_t;
+    using bvh_t = zs::LBvh<3,int,T>;
+    using bv_t = zs::AABBBox<3, T>;
+    using vec3 = zs::vec<T, 3>;
 
 
-        ompPol(zs::range(sptemp.size()),
-            [sptemp = proxy<omp_space>({},sptemp),&spverts,&splines,scale] (int pi) mutable {
-                auto xs = sptemp.template pack<3>("x",pi);
-                auto dir = sptemp.template pack<3>("cf",pi);
-                // auto dir = zs::vec<T,3>{1.0,0.0,0.0};
-                auto xe = xs + dir * scale;
+    virtual void apply() override {
+        using namespace zs;
 
-                spverts[pi * 2 + 0] = xs.to_array();
-                spverts[pi * 2 + 1] = xe.to_array();
-                splines[pi] = zeno::vec2i(pi * 2 + 0,pi * 2 + 1);
-        });
+        auto zsparticles = get_input<ZenoParticles>("ZSParticles");
 
-        set_output("nodalForceVis",std::move(nodalForceVis));
+        if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
+            throw std::runtime_error("the input zsparticles has no surface tris");
+        if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
+            throw std::runtime_error("the input zsparticles has no surface lines");
+        if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag)) 
+            throw std::runtime_error("the input zsparticles has no surface points");
+        // if(!zsparticles->hasBvh(ZenoParticles::s_surfTriTag)) {
+        //     throw std::runtime_error("the input zsparticles has no surface tris's spacial structure");
+        // }
+        // if(!zsparticles->hasBvh(ZenoParticles::s_surfEdgeTag)) {
+        //     throw std::runtime_error("the input zsparticles has no surface edge's spacial structure");
+        // }
+        // if(!zsparticles->hasBvh(ZenoParticles::s_surfVertTag))  {
+        //     throw std::runtime_error("the input zsparticles has no surface vert's spacial structure");
+        // }
 
+        const auto& verts = zsparticles->getParticles();
 
+        auto& tris  = (*zsparticles)[ZenoParticles::s_surfTriTag];
+        auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
+        auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
 
-        sttemp = sttemp.clone({zs::memsrc_e::host});
-        auto facetForceVis = std::make_shared<zeno::PrimitiveObject>();
-        auto& stverts = facetForceVis->verts;
-        auto& stlines = facetForceVis->lines;
+        // auto& stBvh = zsparticles->bvh(ZenoParticles::s_surfTriTag);
+        // auto& seBvh = zsparticles->bvh(ZenoParticles::s_surfEdgeTag);
 
-        stverts.resize(sttemp.size() * 2);
-        stlines.resize(sttemp.size());
+        dtiles_t sttemp(tris.get_allocator(),
+            {
+                {"nrm",3},
+                {"x",3}
+            },tris.size()
+        );
+        dtiles_t setemp(lines.get_allocator(),
+            {
+                // {"inds",4},
+                // {"area",1},
+                // {"inverted",1},
+                // {"abary",2},
+                // {"bbary",2},
+                {"nrm",3}
+                // {"grad",12},
+                // {"H",12*12}
+            },lines.size()
+        );
+        dtiles_t sptemp(points.get_allocator(),
+            {
+                {"nrm",3},
+                {"x",3}
+            },points.size()
+        );
 
-        ompPol(zs::range(sttemp.size()),
-            [&stverts,&stlines,sttemp = proxy<omp_space>({},sttemp),scale] (int ti) mutable {
-                auto xs = sttemp.template pack<3>("x",ti);
-                auto dir = sttemp.template pack<3>("cf",ti);
-                auto xe = xs + dir * scale;
+        dtiles_t fp_buffer(points.get_allocator(),
+            {
+                {"inds",4},
+                {"area",1},
+                {"inverted",1}
+            },points.size() * MAX_FP_COLLISION_PAIRS);
+        dtiles_t ee_buffer(lines.get_allocator(),
+            {
+                {"inds",4},
+                {"area",1},
+                {"inverted",1},
+                {"abary",2},
+                {"bbary",2}
+            },lines.size());
 
-                stverts[ti * 2 + 0] = xs.to_array();
-                stverts[ti * 2 + 1] = xe.to_array();
+        dtiles_t gh_buffer(points.get_allocator(),
+            {
+                {"inds",4},
+                {"H",12*12},
+                {"grad",12}
+            },points.size() * MAX_FP_COLLISION_PAIRS + lines.size());
 
-                stlines[ti] = zeno::vec2i(ti * 2 + 0,ti * 2 + 1);
-        });
 
+        dtiles_t vtemp(verts.get_allocator(),
+            {
+                {"x",3},
+                {"dir",3},
+            },verts.size());
 
-        set_output("facetForceVis",std::move(facetForceVis));
 
-#else
+        auto in_collisionEps = get_input2<float>("in_collisionEps");
+        auto out_collisionEps = get_input2<float>("out_collisionEps");
 
-        // auto stbvs = retrieve_bounding_volumes(cudaPol,verts,tris,wrapv<3>{},(T)0.0,"x");
-        // stBvh.refit(cudaPol,stbvs);
+        constexpr auto space = execspace_e::cuda;
+        auto cudaPol = cuda_exec();
 
+        // calculate facet-point collision pairs and force
 
         COLLISION_UTILS::do_facet_point_collision_detection<MAX_FP_COLLISION_PAIRS>(cudaPol,
             verts,"x",
@@ -1482,12 +2184,10 @@ struct VisualizeCollisionForce : zeno::INode {
             tris,
             sttemp,
             setemp,
-            cptemp,
-            // stBvh,
+            fp_buffer,
             in_collisionEps,out_collisionEps);
 
 
-
         std::vector<zs::PropertyTag> cv_tags{{"xs",3},{"xe",3}};
         auto cv_buffer = typename ZenoParticles::particles_t(cv_tags,points.size() * MAX_FP_COLLISION_PAIRS,zs::memsrc_e::device,0);
         std::vector<zs::PropertyTag> cv_pt_tags{{"p",3},{"t0",3},{"t1",3},{"t2",3}};
@@ -1495,12 +2195,12 @@ struct VisualizeCollisionForce : zeno::INode {
 
 
         cudaPol(zs::range(points.size()),
-            [cptemp = proxy<space>({},cptemp),verts = proxy<space>({},verts),
+            [fp_buffer = proxy<space>({},fp_buffer),verts = proxy<space>({},verts),
                 cv_buffer = proxy<space>({},cv_buffer),
                 cv_pt_buffer = proxy<space>({},cv_pt_buffer),
                 points = proxy<space>({},points)] ZS_LAMBDA(int pi) mutable {
                     for(int i = 0;i != MAX_FP_COLLISION_PAIRS;++i) {
-                        auto inds = cptemp.template pack<4>("inds",pi * MAX_FP_COLLISION_PAIRS + i).reinterpret_bits(int_c);
+                        auto inds = fp_buffer.template pack<4>("inds",pi * MAX_FP_COLLISION_PAIRS + i).reinterpret_bits(int_c);
                         bool contact = true;
                         auto pvert = zs::vec<T,3>::zeros();
                         for(int j = 0;j != 4;++j)
@@ -1575,56 +2275,21 @@ struct VisualizeCollisionForce : zeno::INode {
         set_output("colPointFacetPairVis",std::move(colPointFacetPairVis));
 
 
-
-
-        COLLISION_UTILS::evaluate_collision_grad_and_hessian<MAX_FP_COLLISION_PAIRS>(cudaPol,
-            verts,"x",
-            cptemp,
-            in_collisionEps,
-            out_collisionEps,
+        COLLISION_UTILS::evaluate_fp_collision_grad_and_hessian(
+            cudaPol,
+            verts,"x","v",(T)1.0,
+            fp_buffer,
+            gh_buffer,0,
+            in_collisionEps,out_collisionEps,
             (T)1.0,
-            (T)1.0,(T)1.0);
+            (T)1.0,(T)1.0,(T)0.0);
 
         TILEVEC_OPS::copy<3>(cudaPol,verts,"x",vtemp,"x");
         TILEVEC_OPS::fill<3>(cudaPol,vtemp,"dir",zs::vec<T,3>::zeros());
-        
-        // TILEVEC_OPS::fill<12>(cudaPol,cptemp,"grad",zs::vec<T,12>{1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0});
-        // auto gradN = TILEVEC_OPS::inf_norm<12>(cudaPol,cptemp,"grad");
-
-        // cudaPol(zs::range(cptemp.size()),
-        //     [cptemp = proxy<space>({},cptemp),verts = proxy<space>({},verts)] ZS_LAMBDA(int cpi) mutable {
-        //         auto inds = cptemp.template pack<4>("inds",cpi).reinterpret_bits(int_c);
-        //         bool in_active = false;
-        //         for(int i = 0;i != 4;++i) {
-        //             if(inds[i] < 0)
-        //                 in_active = false;
-        //             else{
-        //                 auto active = verts("active",inds[i]);
-        //                 if(active < 1e-6)
-        //                     in_active = false;
-        //             }
-        //         }
-        //         if(in_active)
-        //             cptemp.template tuple<12>("grad",cpi) = zs::vec<T,12>::zeros();
-        // });
-
-
-        TILEVEC_OPS::assemble<3,4>(cudaPol,cptemp,"grad",vtemp,"dir");
-
-        // cudaPol(zs::range(verts.size()),
-        //     [verts = proxy<space>({},verts),vtemp = proxy<space>({},vtemp)] ZS_LAMBDA(int vi) mutable {
-        //         auto active = verts("active",vi);
-        //         if(active < 1e-6)
-        //             vtemp.template tuple<3>("dir",vi) = zs::vec<T,3>::zeros();
-        // });
-
-        // cudaPol.syncCtx();
 
-        // fmt::print(fg(fmt::color::dark_cyan),
-        //     "gradN = {}\n",gradN);
+        TILEVEC_OPS::assemble_range(cudaPol,gh_buffer,"grad","inds",vtemp,"dir",0,fp_buffer.size());
 
-
-        auto scale = get_input2<float>("scale");
+        auto scale = get_input2<float>("fp_scale");
 
         // auto ompPol = omp_exec();  
         // constexpr auto omp_space = execspace_e::openmp;
@@ -1650,29 +2315,166 @@ struct VisualizeCollisionForce : zeno::INode {
                 splines[vi] = zeno::vec2i(vi * 2 + 0,vi * 2 + 1);               
         });
 
-        set_output("nodalForceVis",std::move(nodalForceVis));
+        set_output("FPNodalForceVis",std::move(nodalForceVis));
+
+        // calculate edge-edge collision pairs and force
+        // COLLISION_UTILS::do_edge_edge_collision_detection(cudaPol,
+        //     verts,"x",
+        //     points,lines,tris,
+        //     sttemp,setemp,
+        //     ee_buffer,
+        //     in_collisionEps,out_collisionEps);        
+        // std::vector<zs::PropertyTag> cv_ee_tags{{"a0",3},{"a1",3},{"b0",3},{"b1",3},{"abary",2},{"bbary",2}};
+        // auto cv_ee_buffer = typename ZenoParticles::particles_t(cv_ee_tags,setemp.size(),zs::memsrc_e::device,0);
+
+        // cudaPol(zs::range(ee_buffer.size()),
+        //         [ee_buffer = proxy<space>({},ee_buffer),verts = proxy<space>({},verts),
+        //     cv_ee_buffer = proxy<space>({},cv_ee_buffer)] ZS_LAMBDA(int ei) mutable {
+        //         auto inds = ee_buffer.template pack<4>("inds",ei).reinterpret_bits(int_c);
+        //         bool collide = true;
+        //         if(inds[0] < 0 || inds[1] < 0 || inds[2] < 0 || inds[3] < 0)
+        //             collide = false;
+        //         if(collide) {
+        //             auto abary = ee_buffer.template pack<2>("abary",ei);
+        //             auto bbary = ee_buffer.template pack<2>("bbary",ei);
+
+        //             // printf("Found edge collision pair %d %d %d %d %f %f %f %f\n",inds[0],inds[1],inds[2],inds[3],
+        //             //     (float)abary[0],(float)abary[1],(float)bbary[0],(float)bbary[1]);
+
+        //             // printf("find collision pairs : %d %d %d %d with bary %f %f %f %f\n",inds[0],inds[1],inds[2],inds[3],
+        //             //     (float)abary[0],(float)abary[1],(float)bbary[0],(float)bbary[1]);
+        //             cv_ee_buffer.template tuple<3>("a0",ei) = verts.template pack<3>("x",inds[0]);
+        //             cv_ee_buffer.template tuple<3>("a1",ei) = verts.template pack<3>("x",inds[1]);
+        //             cv_ee_buffer.template tuple<3>("b0",ei) = verts.template pack<3>("x",inds[2]);
+        //             cv_ee_buffer.template tuple<3>("b1",ei) = verts.template pack<3>("x",inds[3]);
+        //             cv_ee_buffer.template tuple<2>("abary",ei) = abary;
+        //             cv_ee_buffer.template tuple<2>("bbary",ei) = bbary;
+        //         }else {
+        //             cv_ee_buffer.template tuple<3>("a0",ei) = zs::vec<T,3>::zeros();
+        //             cv_ee_buffer.template tuple<3>("a1",ei) = zs::vec<T,3>::zeros();
+        //             cv_ee_buffer.template tuple<3>("b0",ei) = zs::vec<T,3>::zeros();
+        //             cv_ee_buffer.template tuple<3>("b1",ei) = zs::vec<T,3>::zeros();
+        //             cv_ee_buffer.template tuple<2>("abary",ei) = zs::vec<T,2>((T)1.0,0.0);
+        //             cv_ee_buffer.template tuple<2>("bbary",ei) = zs::vec<T,2>((T)1.0,0.0);
+        //         }
+        // });
+
+        // cv_ee_buffer = cv_ee_buffer.clone({zs::memsrc_e::host});
 
-        auto facetForceVis = std::make_shared<zeno::PrimitiveObject>();
-        auto& stverts = facetForceVis->verts;
-        auto& stlines = facetForceVis->lines;
+        // // auto ompPol = omp_exec();  
+        // // constexpr auto omp_space = execspace_e::openmp;
 
-        stverts.resize(0);
-        stlines.resize(0);
+        // auto collisionEdgeVis = std::make_shared<zeno::PrimitiveObject>();
+        // auto& ee_verts = collisionEdgeVis->verts;
+        // auto& ee_lines = collisionEdgeVis->lines;
+        // ee_verts.resize(cv_ee_buffer.size() * 2);
+        // ee_lines.resize(cv_ee_buffer.size());
 
-        set_output("facetForceVis",std::move(facetForceVis));
 
-#endif 
+        // ompPol(zs::range(cv_ee_buffer.size()),
+        //     [cv_ee_buffer = proxy<omp_space>({},cv_ee_buffer),&ee_verts,&ee_lines] (int eei) mutable {
+        //         auto a0 = cv_ee_buffer.template pack<3>("a0",eei);
+        //         auto a1 = cv_ee_buffer.template pack<3>("a1",eei);
+        //         auto b0 = cv_ee_buffer.template pack<3>("b0",eei);
+        //         auto b1 = cv_ee_buffer.template pack<3>("b1",eei);     
+                
+        //         auto abary = cv_ee_buffer.template pack<2>("abary",eei);
+        //         auto bbary = cv_ee_buffer.template pack<2>("bbary",eei);
+
+        //         // auto ac = (a0 + a1) / (T)2.0;
+        //         // auto bc = (b0 + b1) / (T)2.0;
+
+        //         auto ac = abary[0] * a0 + abary[1] * a1;
+        //         auto bc = bbary[0] * b0 + bbary[1] * b1;
+
+        //         ee_verts[eei * 2 + 0] = zeno::vec3f(ac[0],ac[1],ac[2]);
+        //         ee_verts[eei * 2 + 1] = zeno::vec3f(bc[0],bc[1],bc[2]);
+        //         ee_lines[eei] = zeno::vec2i(eei * 2 + 0,eei * 2 + 1);
+        // });
 
+        // set_output("collisionEdgeVis",std::move(collisionEdgeVis));
+
+        // auto colEdgetPairVis = std::make_shared<zeno::PrimitiveObject>();
+        // auto& cv_ee_verts = colEdgetPairVis->verts;
+        // auto& cv_ee_lines = colEdgetPairVis->lines;
+
+        // cv_ee_verts.resize(cv_ee_buffer.size() * 4);
+        // cv_ee_lines.resize(cv_ee_buffer.size() * 2);
+
+        // ompPol(zs::range(cv_ee_buffer.size()),
+        //     [&cv_ee_verts,&cv_ee_lines,cv_ee_buffer = proxy<omp_space>({},cv_ee_buffer)] (int eei) mutable {
+        //         cv_ee_verts[eei * 4 + 0] = cv_ee_buffer.template pack<3>("a0",eei).to_array();
+        //         cv_ee_verts[eei * 4 + 1] = cv_ee_buffer.template pack<3>("a1",eei).to_array();
+        //         cv_ee_verts[eei * 4 + 2] = cv_ee_buffer.template pack<3>("b0",eei).to_array();
+        //         cv_ee_verts[eei * 4 + 3] = cv_ee_buffer.template pack<3>("b1",eei).to_array();
+
+        //         cv_ee_lines[eei * 2 + 0] = zeno::vec2i(eei * 4 + 0,eei * 4 + 1);
+        //         cv_ee_lines[eei * 2 + 1] = zeno::vec2i(eei * 4 + 2,eei * 4 + 3);
+        // });
+
+
+        // set_output("colEdgePairVis",std::move(colEdgetPairVis)); 
+
+
+        // dtiles_t ee_vtemp(verts.get_allocator(),
+        //     {
+        //         {"x",3},
+        //         {"dir",3},
+        //     },verts.size());
+
+        // COLLISION_UTILS::evaluate_ee_collision_grad_and_hessian(cudaPol,
+        //     verts,"x",
+        //     ee_buffer,
+        //     gh_buffer,fp_buffer.size(),
+        //     in_collisionEps,out_collisionEps,
+        //     1.0,
+        //     1.0,1.0);
+
+        // TILEVEC_OPS::copy<3>(cudaPol,verts,"x",ee_vtemp,"x");
+        // TILEVEC_OPS::fill(cudaPol,ee_vtemp,"dir",(T)0.0);
+        // TILEVEC_OPS::assemble_range(cudaPol,gh_buffer,"grad","inds",ee_vtemp,"dir",fp_buffer.size(),ee_buffer.size());
+
+        // auto EENodalForceVis = std::make_shared<zeno::PrimitiveObject>();
+        // auto& ee_spverts = EENodalForceVis->verts;
+        // ee_spverts.resize(ee_vtemp.size() * 2);
+        // auto& ee_splines = EENodalForceVis->lines;
+        // ee_splines.resize(ee_vtemp.size());
+
+        // scale = get_input2<float>("ee_scale");
+
+        // ee_vtemp = ee_vtemp.clone({zs::memsrc_e::host});   
+        // ompPol(zs::range(ee_vtemp.size()),
+        //     [ee_vtemp = proxy<space>({},ee_vtemp),&ee_spverts,&ee_splines,scale] (int vi) mutable {
+        //         auto xs = ee_vtemp.template pack<3>("x",vi);
+        //         auto dir = ee_vtemp.template pack<3>("dir",vi);
+
+        //         auto xe = xs + scale * dir;
+
+        //         ee_spverts[vi * 2 + 0] = xs.to_array();
+        //         ee_spverts[vi * 2 + 1] = xe.to_array();
+        //         ee_splines[vi] = zeno::vec2i(vi * 2 + 0,vi * 2 + 1);               
+        // });             
+
+        // set_output("EENodalForceVis",std::move(EENodalForceVis));     
     }
 
 };
 
-ZENDEFNODE(VisualizeCollisionForce, {{"ZSParticles",{"float","scale","1.0"},{"float","in_collisionEps"},{"float","out_collisionEps"}},
-                                  {"nodalForceVis","facetForceVis","collisionFacetVis","colPointFacetPairVis"},
+ZENDEFNODE(VisualizeCollision, {/* inputs: */ {"ZSParticles",{"float","fp_scale","1.0"},{"float","ee_scale","1.0"},{"float","in_collisionEps"},{"float","out_collisionEps"}},
+                                  /* outputs: */ {
+                                        "collisionFacetVis",
+                                        "colPointFacetPairVis",
+                                        "FPNodalForceVis",
+                                        // "collisionEdgeVis",
+                                        // "colEdgePairVis",
+                                        // "EENodalForceVis",
+                                    },
                                   {
                                   },
                                   {"ZSGeometry"}});
 
 
 
+
+
 }
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/DeformationField.cu b/projects/CuLagrange/geometry/DeformationField.cu
index 93aa101d75..18f1d3da6d 100644
--- a/projects/CuLagrange/geometry/DeformationField.cu
+++ b/projects/CuLagrange/geometry/DeformationField.cu
@@ -20,8 +20,8 @@ struct ZSIsotropicTensionField : INode {
         auto& verts = zssurf->getParticles();
         auto& tris = zssurf->getQuadraturePoints();
 
-        if(tris.getPropertySize("inds") != 3) {
-            fmt::print("ZSCalcSurfaceTenssionField only supports triangle surface mesh {}\n",tris.getPropertySize("inds"));
+        if(tris.getChannelSize("inds") != 3) {
+            fmt::print("ZSCalcSurfaceTenssionField only supports triangle surface mesh {}\n",tris.getChannelSize("inds"));
             throw std::runtime_error("ZSCalcSurfaceTenssionField only supports triangle surface mesh");
         }
         if(!verts.hasProperty(ref_channel)){
@@ -158,7 +158,7 @@ struct ZSEvalDeformationGradient : zeno::INode {
         }
 
         auto& quads = zsvolume->getQuadraturePoints();
-        if(quads.getPropertySize("inds") != 4) {
+        if(quads.getChannelSize("inds") != 4) {
             fmt::print("the input zsvolume should be a tetrahedra mesh\n");
             throw std::runtime_error("the input zsvolume should be a tetrahedra mesh");
         }
@@ -173,7 +173,7 @@ struct ZSEvalDeformationGradient : zeno::INode {
 
         if(!quads.hasProperty(gradientTag)) {
             quads.append_channels(cudaExec,{{gradientTag,9}});
-        }else if(quads.getPropertySize(gradientTag) != 9) {
+        }else if(quads.getChannelSize(gradientTag) != 9) {
             fmt::print("the size of F channel {} is not 9\n",gradientTag);
             throw std::runtime_error("the size of F channel is not 9");
         }
diff --git a/projects/CuLagrange/geometry/SolveLaplacian.cu b/projects/CuLagrange/geometry/SolveLaplacian.cu
index 15facc5009..22c41ff33f 100644
--- a/projects/CuLagrange/geometry/SolveLaplacian.cu
+++ b/projects/CuLagrange/geometry/SolveLaplacian.cu
@@ -13,7 +13,7 @@
 #include <zeno/types/PrimitiveObject.h>
 #include <zeno/types/StringObject.h>
 
-#include "kernel/laplace_matrix.hpp"
+#include "kernel/laplacian.hpp"
 #include "linear_system/mfcg.hpp"
 
 namespace zeno {
diff --git a/projects/CuLagrange/geometry/Topology.cu b/projects/CuLagrange/geometry/Topology.cu
index b317c2f996..dbdd2e4bc0 100644
--- a/projects/CuLagrange/geometry/Topology.cu
+++ b/projects/CuLagrange/geometry/Topology.cu
@@ -7,43 +7,316 @@
 #include <zeno/types/PrimitiveObject.h>
 #include <zeno/types/StringObject.h>
 
+#include "kernel/tiled_vector_ops.hpp"
+#include "zensim/container/Bcht.hpp"
+
+#include "zensim/cuda/execution/ExecutionPolicy.cuh"
+#include "zensim/omp/execution/ExecutionPolicy.hpp"
+
+#include "kernel/topology.hpp"
+
 namespace zeno {
 
-struct FilterTopology : INode {
-    void apply() override {
-        auto prim = get_input<zeno::PrimitiveObject>("prim");
-        auto filTopo = get_param<std::string>("topo");
-
-        auto primOut = std::static_pointer_cast<zeno::PrimitiveObject>(prim->clone());
-        if (filTopo == "lines") {
-            primOut->tris.resize(0);
-            primOut->quads.resize(0);
-        }
-        if (filTopo == "tris") {
-            primOut->lines.resize(0);
-            primOut->quads.resize(0);
-        }
-        if (filTopo == "quads") {
-            primOut->lines.resize(0);
-            primOut->tris.resize(0);
-        }
-
-        set_output("primOut", std::move(primOut));
-    }
+struct BuildSurfaceHalfEdgeStructure : zeno::INode {
+	using T = float;
+	// Allocates the surface half-edge buffer of the input ZenoParticles and fills it through build_surf_half_edge().
+	virtual void apply() override {
+		using namespace zs;
+		using vec2i = zs::vec<int, 2>;
+		using vec3i = zs::vec<int, 3>;
+		// the surface tris/edges/points must already be attached as aux data, otherwise we throw
+		auto zsparticles = get_input<ZenoParticles>("zsparticles");
+            if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
+                throw std::runtime_error("the input zsparticles has no surface tris");
+            if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
+                throw std::runtime_error("the input zsparticles has no surface lines");
+            if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag))
+                throw std::runtime_error("the input zsparticles has no surface points");
+
+			auto& tris = (*zsparticles)[ZenoParticles::s_surfTriTag];
+			auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
+			auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
+			// (re)create the half-edge buffer on device, 3 entries per surface triangle
+			auto& halfEdge = (*zsparticles)[ZenoParticles::s_surfHalfEdgeTag];
+			halfEdge = typename ZenoParticles::particles_t({{"to_vertex",1},{"face",1},{"edge",1},{"opposite_he",1},{"next_he",1}},
+				tris.size() * 3,zs::memsrc_e::device,0);
+
+			auto cudaPol = zs::cuda_exec();
+
+			// one "he_inds" value per surface point/edge/triangle; presumably filled by build_surf_half_edge -- TODO confirm
+			points.append_channels(cudaPol,{{"he_inds",1}});
+			lines.append_channels(cudaPol,{{"he_inds",1}});
+			tris.append_channels(cudaPol,{{"he_inds",1}});
+
+#if 0
+			// NOTE: disabled (#if 0) in-place implementation; only the #else branch below is compiled
+			constexpr auto space = zs::execspace_e::cuda;
+
+			TILEVEC_OPS::fill(cudaPol,halfEdge,"to_vertex",reinterpret_bits<T>((int)-1));
+			TILEVEC_OPS::fill(cudaPol,halfEdge,"face",reinterpret_bits<T>((int)-1));
+			TILEVEC_OPS::fill(cudaPol,halfEdge,"edge",reinterpret_bits<T>((int)-1));
+			TILEVEC_OPS::fill(cudaPol,halfEdge,"opposite_he",reinterpret_bits<T>((int)-1));
+			TILEVEC_OPS::fill(cudaPol,halfEdge,"next_he",reinterpret_bits<T>((int)-1));
+
+			// we might also need a space hash structure here, map from [i1,i2]->[ej]
+			bcht<vec2i,int,true,universal_hash<vec2i>,32> de2fi{halfEdge.get_allocator(),halfEdge.size()};
+
+			cudaPol(zs::range(tris.size()), [
+				tris = proxy<space>({},tris),de2fi = proxy<space>(de2fi),halfEdge = proxy<space>({},halfEdge)] ZS_LAMBDA(int ti) mutable {
+					auto fe_inds = tris.pack(dim_c<3>,"fe_inds",ti).reinterpret_bits(int_c);
+					auto tri = tris.pack(dim_c<3>,"fp_inds",ti).reinterpret_bits(int_c);
+
+					vec3i nos{};
+					for(int i = 0;i != 3;++i) {
+						if(auto no = de2fi.insert(vec2i{tri[i],tri[(i+1) % 3]});no >= 0){
+							nos[i] = no;
+							halfEdge("to_vertex",no) = reinterpret_bits<T>(tri[i]);
+							halfEdge("face",no) = reinterpret_bits<T>(ti);
+							halfEdge("edge",no) = reinterpret_bits<T>(fe_inds[i]);
+							// halfEdge("next_he",no) = ti * 3 + (i+1) % 3;
+						} else {
+							// some error happen
+
+						}						
+					}
+					for(int i = 0;i != 3;++i)
+						halfEdge("next_he",nos[i]) = reinterpret_bits<T>(nos[(i+1) % 3]);
+			});
+			cudaPol(zs::range(halfEdge.size()),
+				[halfEdge = proxy<space>({},halfEdge),de2fi = proxy<space>(de2fi)] ZS_LAMBDA(int hei) mutable {
+					auto idx0 = reinterpret_bits<int>(halfEdge("to_vertex",hei));
+					auto nexthei = reinterpret_bits<int>(halfEdge("next_he",hei));
+					auto idx1 = reinterpret_bits<int>(halfEdge("to_vertex",nexthei));
+					if(auto no = de2fi.query(vec2i{idx1,idx0});no >= 0)
+						halfEdge("opposite_he",hei) = reinterpret_bits<T>(no);
+					else	
+						halfEdge("opposite_he",hei) = reinterpret_bits<T>((int)-1);
+			});
+
+			points.append_channels(cudaPol,{{"he_inds",1}});
+			lines.append_channels(cudaPol,{{"he_inds",1}});
+			tris.append_channels(cudaPol,{{"he_inds",1}});
+
+			cudaPol(zs::range(lines.size()),[
+				lines = proxy<space>({},lines),de2fi = proxy<space>(de2fi)] ZS_LAMBDA(int li) mutable {
+					auto linds = lines.pack(dim_c<2>,"ep_inds",li).reinterpret_bits(int_c);
+					if(auto no = de2fi.query(vec2i{linds[0],linds[1]});no >= 0){
+						lines("he_inds",li) = reinterpret_bits<T>((int)no);
+					}else {
+						// some algorithm bug
+					}
+			});
+
+			cudaPol(zs::range(tris.size()),[
+				points = proxy<space>({},points),tris = proxy<space>({},tris),de2fi = proxy<space>(de2fi)] ZS_LAMBDA(int ti) mutable {
+					auto tinds = tris.pack(dim_c<3>,"fp_inds",ti).reinterpret_bits(int_c);
+					if(auto no = de2fi.query(vec2i{tinds[0],tinds[1]});no >= 0){
+						tris("he_inds",ti) = reinterpret_bits<T>((int)no);
+					}else {
+						// some algorithm bug
+					}
+
+					for(int i = 0;i != 3;++i) {
+						if(auto no = de2fi.query(vec2i{tinds[i],tinds[(i+1) % 3]});no >= 0){
+							points("he_inds",tinds[i]) = reinterpret_bits<T>((int)no);
+						}else {
+							// some algorithm bug
+						}						
+					}
+			});
+#else
+			if(!build_surf_half_edge(cudaPol,tris,lines,points,halfEdge))
+				throw std::runtime_error("fail building surf half edge");
+#endif
+
+			set_output("zsparticles",zsparticles);
+			// zsparticles->setMeta("de2fi",std::move())
+	}
+
+};
+
+
+ZENDEFNODE(BuildSurfaceHalfEdgeStructure, {/* inputs: */ {{"zsparticles"}},
+							/* outputs: */ {{"zsparticles"}},
+							/* params: */ {},
+							/* category: */ {"ZSGeometry"}});
+
+
+// visualize the one-ring points, lines, and tris
+struct VisualizeOneRingNeighbors : zeno::INode {
+	using T = float;
+	virtual void apply() override {
+		using namespace zs;
+		auto zsparticles = get_input<ZenoParticles>("zsparticles");
+		constexpr int MAX_NEIGHS = 8;
+		// all surface aux buffers, including the half-edge one, must already exist on the input
+		if(!zsparticles->hasAuxData(ZenoParticles::s_surfTriTag))
+			throw std::runtime_error("the input zsparticles has no surface tris");
+		if(!zsparticles->hasAuxData(ZenoParticles::s_surfEdgeTag))
+			throw std::runtime_error("the input zsparticles has no surface lines");
+		if(!zsparticles->hasAuxData(ZenoParticles::s_surfVertTag))
+			throw std::runtime_error("the input zsparticles has no surface points");
+		if(!zsparticles->hasAuxData(ZenoParticles::s_surfHalfEdgeTag))
+			throw std::runtime_error("the input zsparticles has no half edges");
+
+		const auto& verts = zsparticles->getParticles();
+		const auto& tris = (*zsparticles)[ZenoParticles::s_surfTriTag];
+		const auto& lines = (*zsparticles)[ZenoParticles::s_surfEdgeTag];
+		const auto& points = (*zsparticles)[ZenoParticles::s_surfVertTag];
+		const auto& half_edges = (*zsparticles)[ZenoParticles::s_surfHalfEdgeTag];
+
+
+		auto cudaPol = zs::cuda_exec();
+		constexpr auto space = zs::execspace_e::cuda;
+		// per surface point: MAX_NEIGHS + 1 slots, slot 0 is the point itself, slots 1.. its neighbors; "active" flags used slots
+		auto one_ring_points = typename ZenoParticles::particles_t({{"x",3},{"active",1}},points.size() * (MAX_NEIGHS + 1),zs::memsrc_e::device,0);
+		TILEVEC_OPS::fill(cudaPol,one_ring_points,"active",(T)0);
+		// per surface point: up to MAX_NEIGHS edges, two consecutive endpoint slots per edge
+		auto one_ring_lines = typename ZenoParticles::particles_t({{"x",3},{"active",1}},points.size() * (MAX_NEIGHS * 2),zs::memsrc_e::device,0);
+		TILEVEC_OPS::fill(cudaPol,one_ring_lines,"active",(T)0);
+
+		// auto one_ring_tris = typename ZenoParticles::particles_t({{"x",3},{"active",1}},points.size() * (MAX_NEIGHS + 1),zs::memsrc_e::device,0);
+
+
+		// auto one_ring_lines = typename ZenoParticles::particles_t({{"x",3},{"active",1}},points.size() * (MAX_NEIGHS + 1));
+		// auto one_ring_tris = typename ZenoParticles::particles_t({{"x",3},{"active",1}},points.size() * (MAX_NEIGHS + 1));
+
+		// cudaPol(zs::range(lines.size()),
+		// 	[lines = proxy<space>({},lines)] ZS_LAMBDA(int li) {
+		// 		auto ep_inds = lines.pack(dim_c<2>,"ep_inds",li).reinterpret_bits(int_c);
+		// 		printf("ep_inds[%d] : %d %d\n",li,ep_inds[0],ep_inds[1]);
+		// });
+
+		// cudaPol(zs::range(half_edges.size()),
+		// 	[half_edges = proxy<space>({},half_edges)] ZS_LAMBDA(int hei) {
+		// 		auto id0 = reinterpret_bits<int>(half_edges("to_vertex",hei));
+		// 		auto nhei = get_next_half_edge(hei,half_edges,1,false);
+		// 		auto id1 = reinterpret_bits<int>(half_edges("to_vertex",nhei));
+		// 		auto rhei = reinterpret_bits<int>(half_edges("opposite_he",hei));
+		// 		auto rid0 = reinterpret_bits<int>(half_edges("to_vertex",rhei));
+		// 		auto nrhei = get_next_half_edge(rhei,half_edges,1,false);
+		// 		auto rid1 = reinterpret_bits<int>(half_edges("to_vertex",nrhei));
+		// 		printf("half_edge[%d] : %d %d \t <-> half_edge[%d] : %d %d\n",hei,id0,id1,rhei,rid0,rid1);
+		// });
+		// pass 1: store each surface point's position followed by the positions of its one-ring neighbor points
+		cudaPol(zs::range(points.size()),[
+				verts = proxy<space>({},verts),
+				one_ring_points = proxy<space>({},one_ring_points),
+				// one_ring_lines = proxy<space>({},one_ring_lines),
+				// one_ring_tris = proxy<space>({},one_ring_tris),
+				points = proxy<space>({},points),
+				lines = proxy<space>({},lines),
+				tris = proxy<space>({},tris),
+				half_edges = proxy<space>({},half_edges)] ZS_LAMBDA(int pi) mutable {
+			// calculate one-ring neighbored points
+			one_ring_points("active",pi * (MAX_NEIGHS+1) + 0) = (T)1.0;
+			auto pidx = reinterpret_bits<int>(points("inds",pi));
+			one_ring_points.tuple(dim_c<3>,"x",pi * (MAX_NEIGHS+1) + 0) = verts.pack(dim_c<3>,"x",pidx);
+
+			auto he_idx = reinterpret_bits<int>(points("he_inds",pi));
+
+			zs::vec<int,MAX_NEIGHS> pneighs = get_one_ring_neigh_points<MAX_NEIGHS>(he_idx,half_edges);
+			// printf("one_ring_neighbors[%d] : %d %d %d %d %d %d\n",(int)pi,
+			// 	(int)pneighs[0],(int)pneighs[1],(int)pneighs[2],(int)pneighs[3],(int)pneighs[4],(int)pneighs[5]);
+			for(int i = 0;i != MAX_NEIGHS;++i){
+				if(pneighs[i] < 0)
+					break;
+				auto npidx = reinterpret_bits<int>(points("inds",pneighs[i]));
+				one_ring_points("active",pi * (MAX_NEIGHS+1) + i + 1) = (T)1.0;
+				one_ring_points.tuple(dim_c<3>,"x",pi * (MAX_NEIGHS+1) + i + 1) = verts.pack(dim_c<3>,"x",npidx);
+			}
+
+		});
+		// pass 2: store both endpoint positions of every one-ring edge reported for each surface point
+		cudaPol(zs::range(points.size()),[
+				verts = proxy<space>({},verts),
+				one_ring_lines = proxy<space>({},one_ring_lines),
+				points = proxy<space>({},points),
+				lines = proxy<space>({},lines),
+				half_edges = proxy<space>({},half_edges)] ZS_LAMBDA(int pi) mutable {
+					auto he_idx = reinterpret_bits<int>(points("he_inds",pi));
+					zs::vec<int,MAX_NEIGHS> pneighs = get_one_ring_neigh_edges<MAX_NEIGHS>(he_idx,half_edges);
+					// printf("one_ring_line_neighbors[%d] : %d %d %d %d %d %d\n",(int)pi,
+					// 	(int)pneighs[0],(int)pneighs[1],(int)pneighs[2],(int)pneighs[3],(int)pneighs[4],(int)pneighs[5]);
+					for(int i = 0;i != MAX_NEIGHS;++i) {
+						if(pneighs[i] < 0)
+							break;
+						one_ring_lines("active",pi * (2 * MAX_NEIGHS) + 2 * i + 0) = (T)1.0;
+						one_ring_lines("active",pi * (2 * MAX_NEIGHS) + 2 * i + 1) = (T)1.0;
+						auto ne = lines.pack(dim_c<2>,"inds",pneighs[i]).reinterpret_bits(int_c);
+						one_ring_lines.tuple(dim_c<3>,"x",pi * (2 * MAX_NEIGHS) + 2 * i + 0) = verts.pack(dim_c<3>,"x",ne[0]);
+						one_ring_lines.tuple(dim_c<3>,"x",pi * (2 * MAX_NEIGHS) + 2 * i + 1) = verts.pack(dim_c<3>,"x",ne[1]);
+					}
+		});
+		// copy the neighbor points to host; pn_prim gets one segment from each point to every active neighbor
+		one_ring_points = one_ring_points.clone({zs::memsrc_e::host});
+		auto pn_prim = std::make_shared<zeno::PrimitiveObject>();
+		auto& pn_verts = pn_prim->verts;
+		auto& pn_lines = pn_prim->lines;
+
+		pn_verts.resize(points.size() * (MAX_NEIGHS + 1));
+		pn_lines.resize(points.size() * MAX_NEIGHS);
+		constexpr auto omp_space = execspace_e::openmp;
+		auto ompPol = omp_exec();    
+
+		ompPol(zs::range(points.size()),
+			[one_ring_points = proxy<omp_space>({},one_ring_points),&pn_verts,&pn_lines] (int pi) {
+				int nm_active = 0;
+				for(int i = 0;i != MAX_NEIGHS + 1;++i) {
+					if(one_ring_points("active",pi * (MAX_NEIGHS+1) + i) > 0)
+						nm_active++;
+					else
+						break;
+					pn_verts[pi * (MAX_NEIGHS+1) + i] = one_ring_points.pack(dim_c<3>,"x",pi * (MAX_NEIGHS+1) + i).to_array();
+					// if(i > 0) {
+					// 	auto diff = pn_verts[pi * (MAX_NEIGHS+1) + i] - pn_verts[pi * (MAX_NEIGHS+1) + 0];
+					// 	pn_verts[pi * (MAX_NEIGHS+1) + i] = pn_verts[pi * (MAX_NEIGHS+1) + 0] + diff * 0.9;
+					// }
+				}
+				for(int i = 0;i < nm_active-1;++i)
+					pn_lines[pi * MAX_NEIGHS + i] = zeno::vec2i(pi * (MAX_NEIGHS + 1) + 0,pi * (MAX_NEIGHS + 1) + i + 1);
+		});  
+
+		// for(int i = 0;i != pn_lines.size();++i)
+		// 	std::cout << "pn_lines[" << i << "] : " << pn_lines[i][0] << "\t" << pn_lines[i][1] << std::endl;
+
+		set_output("pn_prim",std::move(pn_prim));
+
+		// copy the edge endpoints to host; en_prim gets one segment per active endpoint pair
+		one_ring_lines = one_ring_lines.clone({zs::memsrc_e::host});
+		auto en_prim = std::make_shared<zeno::PrimitiveObject>();
+		auto& en_verts = en_prim->verts;
+		auto& en_lines = en_prim->lines;
+
+		en_verts.resize(points.size() * (MAX_NEIGHS * 2));
+		en_lines.resize(points.size() * MAX_NEIGHS);
+
+		ompPol(zs::range(points.size()),
+			[one_ring_lines = proxy<omp_space>({},one_ring_lines),&en_verts,&en_lines] (int pi) {
+				int nm_active = 0;
+				for(int i = 0;i != 2*MAX_NEIGHS;++i) {
+					if(one_ring_lines("active",pi * MAX_NEIGHS * 2 + i) > 0)
+						nm_active++;
+					else
+						 break;
+					en_verts[pi * MAX_NEIGHS * 2 + i] = one_ring_lines.pack(dim_c<3>,"x",pi * MAX_NEIGHS * 2 + i).to_array();
+				}
+				int nm_active_edges = nm_active / 2;
+				for(int i = 0;i != nm_active_edges;++i)
+					en_lines[pi * MAX_NEIGHS + i] = zeno::vec2i(pi * MAX_NEIGHS * 2 + i * 2 + 0,pi * MAX_NEIGHS * 2 + i * 2 + 1);
+		});
+
+		set_output("en_prim",std::move(en_prim));
+	}
 };
 
-ZENDEFNODE(FilterTopology, {/* inputs: */ {
-                                {"prim"},
-                            },
-                            /* outputs: */
-                            {
-                                {"primOut"},
-                            },
-                            /* params: */
-                            {{"enum lines tris quads", "topo", "tris"}},
-                            /* category: */
-                            {
-                                "ZSGEOMETRY",
-                            }});
-
-}; // namespace zeno
\ No newline at end of file
+
+ZENDEFNODE(VisualizeOneRingNeighbors, {/* inputs: */ {{"zsparticles"}},
+							/* outputs: */ {{"pn_prim"} // segments from each surface point to its one-ring neighbor points
+								,{"en_prim"} // segments for each surface point's one-ring edges
+							},
+							/* params: */ {},
+							/* category: */ {"ZSGeometry"}});
+
+
+};
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/VectorField.cu b/projects/CuLagrange/geometry/VectorField.cu
index 22d6f07fda..111cc1edb9 100644
--- a/projects/CuLagrange/geometry/VectorField.cu
+++ b/projects/CuLagrange/geometry/VectorField.cu
@@ -308,8 +308,8 @@ struct ZSSampleQuadratureAttr2Vert : zeno::INode {
         if(!verts.hasProperty(attr)) {
             fmt::print("append new nodal attribute {}[{}]\n",attr,attr_dim);
             verts.append_channels(cudaPol,{{attr,attr_dim}});
-        }else if(verts.getPropertySize(attr) != attr_dim){
-            fmt::print("the verts' {} attr[{}] and quads' {} attr[{}] not matched\n",attr,verts.getPropertySize(attr),attr,attr_dim);
+        }else if(verts.getChannelSize(attr) != attr_dim){
+            fmt::print("the verts' {} attr[{}] and quads' {} attr[{}] not matched\n",attr,verts.getChannelSize(attr),attr,attr_dim);
         }
         cudaPol(range(verts.size()),
             [verts = proxy<space>({},verts),attr_dim,attr = SmallString(attr)] 
@@ -335,16 +335,17 @@ struct ZSSampleQuadratureAttr2Vert : zeno::INode {
                     // if(ei == 0)
                     //     printf("w : %f\n",(float)w);
                     // w = 1.0;// cancel out the specified weight info
+                    // printf("quads[%s][%d] : %f\n",attr.asChars(),ei,(float)quads(attr,0,ei));
                     for(int i = 0;i != simplex_size;++i){
                         auto idx = reinterpret_bits<int>(quads("inds",i,ei));
                         if(skip_bou && verts(bou_tag,idx) > 1e-6)
                             continue;
+                        auto alpha = w;
                         for(int j = 0;j != attr_dim;++j) {
                             // verts(attr,j,idx) += w * quads(attr,j,ei) / (float)simplex_size;
-                            auto alpha = w / (float)simplex_size;
                             atomic_add(execTag,&verts(attr,j,idx),alpha * quads(attr,j,ei));
-                            atomic_add(execTag,&vtemp("wsum",idx),alpha);
                         }
+                        atomic_add(execTag,&vtemp("wsum",idx),alpha);
                     }   
         });
 
@@ -389,6 +390,17 @@ struct ZSSampleVertAttr2Quadrature : zeno::INode {
         auto& verts = field->getParticles();
         auto& quads = field->getQuadraturePoints();
 
+
+
+        // auto skip_bou = get_param<int>("skip_bou");
+        // auto bou_tag = get_param<std::string>("bou_tag");
+
+        // if(skip_bou && !quads.hasProperty(bou_tag)) {
+        //     fmt::print("the input vertices have no {} boudary tag when skip bou is on\n",bou_tag);
+        //     throw std::runtime_error("the input vertices have no boudary tag when skip bou is on");
+        // }
+
+
         auto attr = get_param<std::string>("attr");
         if(!verts.hasProperty(attr)){
             fmt::print("the input verts have no specified channel : {}\n",attr);
@@ -401,13 +413,14 @@ struct ZSSampleVertAttr2Quadrature : zeno::INode {
         //     throw std::runtime_error("the input vertices have no specified weight channel");
         // }
 
+
         int simplex_size = quads.getPropertySize("inds");
         int attr_dim = verts.getPropertySize(attr);
 
         if(!quads.hasProperty(attr))
             quads.append_channels(cudaPol,{{attr,attr_dim}});
-        else if(quads.getPropertySize(attr) != attr_dim) {
-            fmt::print("the size of channel {} V[{}] and Q[{}] not match\n",attr,attr_dim,quads.getPropertySize(attr));
+        else if(quads.getChannelSize(attr) != attr_dim) {
+            fmt::print("the size of channel {} V[{}] and Q[{}] not match\n",attr,attr_dim,quads.getChannelSize(attr));
             throw std::runtime_error("the size of channel does not match");
         }
 
@@ -566,6 +579,7 @@ struct ZSGaussianNeighborQuadatureSampler : zeno::INode {
         auto radius_shrink = get_input2<float>("radius");
         auto mark = get_input2<float>("mark");
         auto mark_tag = get_param<std::string>("mark_tag");
+        auto weight_tag = get_param<std::string>("weight_tag");
 
         // auto bvh_thickness = get_param<float>("bvh_thickness");
 
@@ -627,24 +641,24 @@ struct ZSGaussianNeighborQuadatureSampler : zeno::INode {
             [ dst_quads = proxy<space>({},dst_quads),src_quads = proxy<space>({},src_quads),
                 dst_verts = proxy<space>({},dst_verts),src_verts = proxy<space>({},src_verts),
                 src_centers = proxy<space>(src_centers),dst_centers = proxy<space>(dst_centers),
-                attr = SmallString(attr),xtag = SmallString(xtag),simplex_size,attr_dim,
+                attr = SmallString(attr),xtag = SmallString(xtag),simplex_size,attr_dim,weight_tag = zs::SmallString(weight_tag),
                 bvh = proxy<space>(quadsBvh),sigma,this,use_append,radius_shrink,mark_tag = SmallString(mark_tag),mark] __device__(int di) mutable {
-                    if(!use_append)
-                        for(int i = 0;i != attr_dim;++i)
-                            dst_quads(attr,i,di) = 0.0;
-                    else{
-                        float field_norm = 0.f;
-                        for(int i = 0;i != attr_dim;++i)
-                            field_norm += dst_quads(attr,i,di) * dst_quads(attr,i,di);
-                        field_norm = zs::sqrt(field_norm);
-                        if(field_norm > 1e-6)
-                            return;
-                    }
+                    // if(!use_append)
+                    //     for(int i = 0;i != attr_dim;++i)
+                    //         dst_quads(attr,i,di) = 0.0;
+                    // else{
+                    //     float field_norm = 0.f;
+                    //     for(int i = 0;i != attr_dim;++i)
+                    //         field_norm += dst_quads(attr,i,di) * dst_quads(attr,i,di);
+                    //     field_norm = zs::sqrt(field_norm);
+                    //     if(field_norm > 1e-6)
+                    //         return;
+                    // }
                     // compute the center of the src tet
                     auto dst_ct = dst_centers[di]; 
                     float radius = 0;
 
-                    float w_sum = 0;
+                    // float w_sum = 0;
 
                     // automatically detected the approapiate radius size
                     for(int i = 0; i != simplex_size;++i){
@@ -661,20 +675,38 @@ struct ZSGaussianNeighborQuadatureSampler : zeno::INode {
                     // }
 
                     auto dst_bv = bv_t{get_bounding_box(dst_ct - radius, dst_ct + radius)};
+                    bool first_iter = true;
+                    bool has_been_sampled = false;
                     bvh.iter_neighbors(dst_bv,[&](int si){
                         auto src_ct = src_centers[si];
                         auto dist = (src_ct - dst_ct).norm();
+                        if(dist > radius * 2)
+                            return;
+
                         auto w = gauss_kernel(dist,sigma);
+                        if(w < 1e-4)
+                            return;
+
+                        has_been_sampled = true;
+                        if(first_iter && !use_append){
+                            for(int i = 0;i != attr_dim;++i)
+                                dst_quads(attr,i,di) = 0.0;
+                            first_iter = false;
+                        }
+                
                         // float distds = dist/sigma;
 
 
                         // float beta = zs::exp(-0.5 * distds * distds);
                         // w = 1/(sigma /* zs::sqrt(2*zs::g_pi)*/) * zs::exp(-0.5 * distds * distds);
 
-                        w_sum += w;
+                        // w_sum += w;
+                        dst_quads(weight_tag,di) += w;
                         // printf("sample neighbor : %d->%d %f %f %f\n",si,di,(float)w,(float)alpha,(float)zs::g_pi);
                         for(int i = 0;i != attr_dim;++i)
                             dst_quads(attr,i,di) += w * src_quads(attr,i,si);
+                        // if(attr_dim == 1)
+                        //     printf("dst_quads[%s][%d] sample src_quads[%s][%d] : %f\n",attr.asChars(),di,attr.asChars(),si,src_quads(attr,0,si));
 
                         dst_quads(mark_tag,di) = mark;
                     });
@@ -682,8 +714,9 @@ struct ZSGaussianNeighborQuadatureSampler : zeno::INode {
                     // if(w_sum < 1e-6){
                     //     printf("lost element %d\n",di);
                     // }
-                    for(int i = 0;i != attr_dim;++i)
-                        dst_quads(attr,i,di) /= (w_sum + 1e-6);
+                    // if(has_been_sampled)
+                    //     for(int i = 0;i != attr_dim;++i)
+                    //         dst_quads(attr,i,di) /= (w_sum + 1e-6);
         });
 
 
@@ -697,6 +730,7 @@ ZENDEFNODE(ZSGaussianNeighborQuadatureSampler,{
     {"source","dest",{"int","use_append","0"},{"float","radius","1"},{"float","mark","-1.0"}},
     {"dest"},
     {
+        {"string","weight_tag","weight_tag"},
         {"string","mark_tag","mark_tag"},
         {"string","attr","attr"},
         {"string","xtag","x"},
diff --git a/projects/CuLagrange/geometry/file_parser/read_vtk_mesh.hpp b/projects/CuLagrange/geometry/file_parser/read_vtk_mesh.hpp
index 66dd49d938..85c96a40c1 100644
--- a/projects/CuLagrange/geometry/file_parser/read_vtk_mesh.hpp
+++ b/projects/CuLagrange/geometry/file_parser/read_vtk_mesh.hpp
@@ -329,6 +329,13 @@ namespace zeno {
                 for(int array_id = 0;array_id != nm_arrays;++array_id){
                     int nm_components,nm_tuples;
                     bufferp = readline(buffer,fp,&line_count);
+                    sscanf(bufferp,"%s",array_name);
+                    if(!strcmp(array_name,"METADATA")){
+                        printf("skip_line : %s\n",bufferp);
+                        bufferp = readline(buffer,fp,&line_count);
+                        printf("skip_line : %s\n",bufferp);
+                        bufferp = readline(buffer,fp,&line_count);
+                    }
                     sscanf(bufferp,"%s %d %d %s",array_name,&nm_components,&nm_tuples,dummy_str);
                     printf("array_name : %s | nm_components  %d | nm_tuples : %d | type : %s at %d\n",
                         array_name,nm_components,nm_tuples,dummy_str,line_count);
diff --git a/projects/CuLagrange/geometry/kernel/bary_centric_weights.hpp b/projects/CuLagrange/geometry/kernel/bary_centric_weights.hpp
index 20eacf57a8..a26cbe4059 100644
--- a/projects/CuLagrange/geometry/kernel/bary_centric_weights.hpp
+++ b/projects/CuLagrange/geometry/kernel/bary_centric_weights.hpp
@@ -10,6 +10,8 @@
 
 namespace zeno {
 
+
+
     template <typename TileVecT, int codim = 3>
     zs::Vector<zs::AABBBox<3, typename TileVecT::value_type>>
     get_bounding_volumes(zs::CudaExecutionPolicy &pol, const TileVecT &vtemp,
@@ -40,33 +42,6 @@ namespace zeno {
         return ret;
     }
 
-    template<typename T>
-    constexpr T compute_dist_2_facet(const zs::vec<T,3>& vp,const zs::vec<T,3>& v0,const zs::vec<T,3>& v1,const zs::vec<T,3>& v2){
-        auto v012 = (v0 + v1 + v2) / 3;
-        auto v01 = (v0 + v1) / 2;
-        auto v02 = (v0 + v2) / 2;
-        auto v12 = (v1 + v2) / 2;
-
-        T dist = 1e6;
-        T tdist = (v012 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-        tdist = (v01 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-        tdist = (v02 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-        tdist = (v12 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-
-        tdist = (v0 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-        tdist = (v1 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-        tdist = (v2 - vp).norm();
-        dist = tdist < dist ? tdist : dist;
-
-        return dist;        
-    }
-
     template<typename T>
     constexpr T volume(
         const zs::vec<T,3>& p0,
@@ -170,7 +145,15 @@ namespace zeno {
         // return;
 
         auto bvs = retrieve_bounding_volumes(pol,verts,quads,wrapv<4>{},bvh_thickness,x_tag);
-        // std::cout << "TRY BUILDING TETS BVH" << std::endl;
+        // std::cout << "sizeof bvs : " << bvs.size() << std::endl;
+        // // std::cout << "TRY BUILDING TETS BVH" << std::endl;
+        // pol(zs::range(bvs.size()),[
+        //         bvs = proxy<space>(bvs)] ZS_LAMBDA(int bi) mutable {
+        //             printf("bv[%d] : min(%f %f %f); max(%f %f %f)\n",bi,
+        //                 (float)bvs[bi]._min[0],(float)bvs[bi]._min[1],(float)bvs[bi]._min[2],
+        //                 (float)bvs[bi]._max[0],(float)bvs[bi]._max[1],(float)bvs[bi]._max[2]);
+        // });
+
 
         auto tetsBvh = LBvh<3, int,T>{};
 
@@ -192,14 +175,14 @@ namespace zeno {
                 T closest_dist = 1e6;
                 bool found = false;
                 // if(vi == 10820)
-                //     printf("check to locate vert %d using bvh\n",vi);
+                    // printf("check to locate vert %d using bvh with pos = %f %f %f\n",vi,(float)p[0],(float)p[1],(float)p[2]);
 
                 // auto dst_bv = bv_t{get_bounding_box(dst )}
                 tetsBvh.iter_neighbors(p,[&](int ei){
+                    // printf("test %d v's neighbor element %d ei\n",vi,ei);
                     if(found)
                         return;
                     // if(vi == 10820)
-                    //     printf("test neighbor element %d ei\n",ei);
                     auto inds = eles.template pack<4>(elm_tag, ei).template reinterpret_bits<int>();
                     auto p0 = verts.template pack<3>(x_tag,inds[0]);
                     auto p1 = verts.template pack<3>(x_tag,inds[1]);
@@ -217,9 +200,11 @@ namespace zeno {
                     }
                     if(!fitting_in)
                         return;
+                    zs::vec<T,3> bary{};
 
                     if(ws[0] < 0){
-                        T dist = compute_dist_2_facet(p,p1,p2,p3);
+                        // T dist = compute_dist_2_facet(p,p1,p2,p3);
+                        T dist = LSL_GEO::pointTriangleDistance(p1,p2,p3,p,bary);
                         if(dist < closest_dist){
                             closest_dist = dist;
                             bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
@@ -227,7 +212,7 @@ namespace zeno {
                         }
                     }
                     if(ws[1] < 0){
-                        T dist = compute_dist_2_facet(p,p0,p2,p3);
+                        T dist = LSL_GEO::pointTriangleDistance(p0,p2,p3,p,bary);
                         if(dist < closest_dist){
                             closest_dist = dist;
                             bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
@@ -235,7 +220,7 @@ namespace zeno {
                         }
                     }
                     if(ws[2] < 0){
-                        T dist = compute_dist_2_facet(p,p0,p1,p3);
+                        T dist = LSL_GEO::pointTriangleDistance(p0,p1,p3,p,bary);
                         if(dist < closest_dist){
                             closest_dist = dist;
                             bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
@@ -243,17 +228,58 @@ namespace zeno {
                         }
                     }
                     if(ws[3] < 0){
-                        T dist = compute_dist_2_facet(p,p0,p1,p2);
+                        T dist = LSL_GEO::pointTriangleDistance(p0,p1,p2,p,bary);
                         if(dist < closest_dist){
                             closest_dist = dist;
                             bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
                             bcw.template tuple<4>(weight_tag,vi) = ws;
                         }
                     }
+
+                    // if(ws[0] < 0){
+                    //     T dist = compute_dist_2_facet(p,p1,p2,p3);
+                    //     if(dist < closest_dist){
+                    //         closest_dist = dist;
+                    //         bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
+                    //         bcw.template tuple<4>(weight_tag,vi) = ws;
+                    //     }
+                    // }
+                    // if(ws[1] < 0){
+                    //     T dist = compute_dist_2_facet(p,p0,p2,p3);
+                    //     if(dist < closest_dist){
+                    //         closest_dist = dist;
+                    //         bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
+                    //         bcw.template tuple<4>(weight_tag,vi) = ws;
+                    //     }
+                    // }
+                    // if(ws[2] < 0){
+                    //     T dist = compute_dist_2_facet(p,p0,p1,p3);
+                    //     if(dist < closest_dist){
+                    //         closest_dist = dist;
+                    //         bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
+                    //         bcw.template tuple<4>(weight_tag,vi) = ws;
+                    //     }
+                    // }
+                    // if(ws[3] < 0){
+                    //     T dist = compute_dist_2_facet(p,p0,p1,p2);
+                    //     if(dist < closest_dist){
+                    //         closest_dist = dist;
+                    //         bcw(elm_tag,vi) = reinterpret_bits<T>(ei);
+                    //         bcw.template tuple<4>(weight_tag,vi) = ws;
+                    //     }
+                    // }
+
+                    if(!fitting_in){
+                        printf("bind vert %d to %d under non-fitting-in mode\n",vi,ei);
+                        // return;
+                    }
+
+
                 });// finish iter the neighbor tets
         });
     }
 
+
 };
 
 
diff --git a/projects/CuLagrange/geometry/kernel/calculate_bisector_normal.hpp b/projects/CuLagrange/geometry/kernel/calculate_bisector_normal.hpp
index 8ce2b8550f..038b06993d 100644
--- a/projects/CuLagrange/geometry/kernel/calculate_bisector_normal.hpp
+++ b/projects/CuLagrange/geometry/kernel/calculate_bisector_normal.hpp
@@ -101,7 +101,7 @@ namespace zeno { namespace COLLISION_UTILS {
         // // auto avge = (e01 + e02 + e12)/(T)3.0;
 
         // T barySum = (T)1.0;
-        // T distance = COLLISION_UTILS::pointTriangleDistance(t0,t1,t2,p,barySum);
+        // T distance = pointTriangleDistance(t0,t1,t2,p,barySum);
         // // auto max_ratio = inset_ratio > outset_ratio ? inset_ratio : outset_ratio;
         // // collisionEps = avge * max_ratio;
         // auto collisionEps = seg.dot(nrm) > 0 ? out_collisionEps : in_collisionEps;
diff --git a/projects/CuLagrange/geometry/kernel/calculate_edge_normal.hpp b/projects/CuLagrange/geometry/kernel/calculate_edge_normal.hpp
index 13ccd5dd86..310b15509b 100644
--- a/projects/CuLagrange/geometry/kernel/calculate_edge_normal.hpp
+++ b/projects/CuLagrange/geometry/kernel/calculate_edge_normal.hpp
@@ -10,22 +10,44 @@
 namespace zeno {
     using T = float;
 
-    template<typename Pol,typename LineTileVec,typename SurfTriNrmTileVec,typename SurfLineNrmTileVec>
-    bool calculate_edge_normal_from_facet_normal(Pol& pol,const LineTileVec& lines,
+    template<typename Pol,typename SurfTriNrmTileVec,typename SurfLineNrmTileVec,typename SurfTriTopoTileVec>
+    bool calculate_edge_normal_from_facet_normal(Pol& pol,
         const SurfTriNrmTileVec& ttemp,const zs::SmallString& srcTag,
-        SurfLineNrmTileVec& etemp,const zs::SmallString& dstTag) {
+        SurfLineNrmTileVec& etemp,const zs::SmallString& dstTag,
+        const SurfTriTopoTileVec& ltopo) {
             using namespace zs;
-            if(!ttemp.hasProperty(srcTag) || ttemp.getPropertySize(srcTag) != 3){
+            // each guard below reports a missing or wrongly sized channel and returns false
+            if(!ttemp.hasProperty(srcTag) || ttemp.getChannelSize(srcTag) != 3){
                 fmt::print(fg(fmt::color::red),"the input triNrmTileVec has no valid {} normal channel\n",srcTag);
                 return false;
             }
-            if(!etemp.hasProperty(dstTag) || etemp.getPropertySize(dstTag) != 3) {
+            if(!etemp.hasProperty(dstTag) || etemp.getChannelSize(dstTag) != 3) {
                 fmt::print(fg(fmt::color::red),"the input lineNrmTileVec has no valid {} normal channel\n",dstTag);
                 return false;
             }
+            if(!ltopo.hasProperty("fe_inds") || ltopo.getChannelSize("fe_inds") != 2){
+                fmt::print(fg(fmt::color::red),"the input ltopo has no \"fe_inds\" channel\n");
+                return false;
+            }
+
+            // std::cout << "doing assemble" << std::endl;
+
+            // constexpr auto space = execspace_e::cuda;
+            // auto cudaPol = cuda_exec();
+            // cudaPol(zs::range(ltopo.size()),
+            //     [ltopo = proxy<space>({},ltopo)] ZS_LAMBDA(int li) mutable {
+            //         auto inds = ltopo.template pack<2>("fe_inds",li).reinterpret_bits(int_c);
+            //         printf("ltopo<%d> : %d %d\n",li,inds[0],inds[1]);
+            // });
+            // NOTE(review): fill looks like it zeroes dstTag before assemble_from combines srcTag via "fe_inds" - confirm in TILEVEC_OPS
 
-            TILEVEC_OPS::assemble_from<3,2>(pol,ttemp,srcTag,etemp,dstTag,"fe_inds");
+            // TILEVEC_OPS::fill<3>(pol,etemp,dstTag,zs::vec<T,3>::zeros());
+            TILEVEC_OPS::fill(pol,etemp,dstTag,(T)0.0);
+            TILEVEC_OPS::assemble_from(pol,ttemp,srcTag,etemp,dstTag,ltopo,"fe_inds");
+            // std::cout << "finish assemble" << std::endl;
             TILEVEC_OPS::normalized_channel<3>(pol,etemp,dstTag);
+            // std::cout << "finish normalize" << std::endl;
+            return true;
     }
 
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/kernel/calculate_facet_center.hpp b/projects/CuLagrange/geometry/kernel/calculate_facet_center.hpp
index 3c00dfb21f..57fcfd806d 100644
--- a/projects/CuLagrange/geometry/kernel/calculate_facet_center.hpp
+++ b/projects/CuLagrange/geometry/kernel/calculate_facet_center.hpp
@@ -11,11 +11,11 @@ namespace zeno {
     template<typename Pol,typename PosTileVec,typename SurfTriTileVec,typename SurfCenterTileVec>
     bool calculate_facet_center(Pol& pol,const PosTileVec& verts,const zs::SmallString& xTag,SurfTriTileVec& tris,SurfCenterTileVec& tri_center_buffer,const zs::SmallString& centerTag) {
         using namespace zs;
-        if(!tris.hasProperty("inds") || tris.getPropertySize("inds") != 3) {
+        if(!tris.hasProperty("inds") || tris.getChannelSize("inds") != 3) {
             if(!tris.hasProperty("inds"))
                 fmt::print(fg(fmt::color::red),"the tris has no 'inds' channel\n");
-            else if(tris.getPropertySize("inds") != 3)
-                fmt::print(fg(fmt::color::red),"the tris has invalid 'inds' channel size {}\n",tris.getPropertySize("inds"));
+            else if(tris.getChannelSize("inds") != 3)
+                fmt::print(fg(fmt::color::red),"the tris has invalid 'inds' channel size {}\n",tris.getChannelSize("inds"));
             return false;
         }
         if(tris.size() != tri_center_buffer.size()) {
diff --git a/projects/CuLagrange/geometry/kernel/calculate_facet_normal.hpp b/projects/CuLagrange/geometry/kernel/calculate_facet_normal.hpp
index 121e6cb946..b9c654ac9c 100644
--- a/projects/CuLagrange/geometry/kernel/calculate_facet_normal.hpp
+++ b/projects/CuLagrange/geometry/kernel/calculate_facet_normal.hpp
@@ -9,21 +9,41 @@ namespace zeno {
     using T = float;
 
     template<typename Pol,typename PosTileVec,typename SurfTriTileVec,typename SurfNrmTileVec>
-    bool calculate_facet_normal(Pol& pol,const PosTileVec& verts,const zs::SmallString& xTag,SurfTriTileVec& tris,SurfNrmTileVec& tri_nrm_buffer,const zs::SmallString& nrmTag) {
+    bool calculate_facet_normal(Pol& pol,const PosTileVec& verts,const zs::SmallString& xTag,const SurfTriTileVec& tris,SurfNrmTileVec& tri_nrm_buffer,const zs::SmallString& nrmTag) {
+        // std::cout << "calculate facet normal" << std::endl;
+        
         using namespace zs;
-        if(!tris.hasProperty("inds") || tris.getPropertySize("inds") != 3) {
-            if(!tris.hasProperty("inds"))
-                fmt::print(fg(fmt::color::red),"the tris has no 'inds' channel\n");
-            else if(tris.getPropertySize("inds") != 3)
-                fmt::print(fg(fmt::color::red),"the tris has invalid 'inds' channel size {}\n",tris.getPropertySize("inds"));
+
+        if(!tris.hasProperty("inds")){
+            std::cout << "the tris has no 'inds' channel\n" << std::endl;
+            fmt::print(fg(fmt::color::red),"the tris has no 'inds' channel\n");
+            return false;
+        }
+        if(tris.getChannelSize("inds") != 3){
+            std::cout << "the tris has invalid 'inds' channel size {}\n" << std::endl;
+            fmt::print(fg(fmt::color::red),"the tris has invalid 'inds' channel size {}\n",tris.getChannelSize("inds"));
             return false;
         }
         if(tris.size() != tri_nrm_buffer.size()) {
+            std::cout << "invalid tris and triNrms" << std::endl;
             fmt::print(fg(fmt::color::red),"the tris's size {} does not match that of tri_nrm_buffer {}\n",
                 tris.size(),tri_nrm_buffer.size());
             return false;
         }
 
+        if(!tri_nrm_buffer.hasProperty(nrmTag)) {
+            // std::cout << "the tri_nrm_buffer has no " << nrmTag  << " channel" << std::endl;
+            fmt::print(fg(fmt::color::red),"the tri_nrm_buffer has no {} channel\n",nrmTag);
+            return false;
+        }
+
+        if(tri_nrm_buffer.getChannelSize(nrmTag) != 3) {
+            // std::cout << "the tri_nrm_buffer has no " << nrmTag  << " channel" << std::endl;
+            fmt::print(fg(fmt::color::red),"the tri_nrm_buffer has invalid {} channel, which should be vec3\n",nrmTag);
+            return false;
+        }
+
+
         constexpr auto space = execspace_e::cuda;
         pol(zs::range(tris.size()),
             [verts = proxy<space>({},verts),tris = proxy<space>({},tris),tri_nrm_buffer = proxy<space>({},tri_nrm_buffer),xTag,nrmTag] ZS_LAMBDA(int ti) mutable {
@@ -52,7 +72,7 @@ namespace zeno {
     // template<typename Pol,typename VTileVec,typename TTileVec>
     // constexpr bool calculate_point_normal(Pol& pol,const VTileVec& verts,const TTileVec& tris,const zs::SmallString& nrmTag) {
     //     using namespace zs;
-    //     if(!tris.hasProperty("inds") || tris.getPropertySize("inds") != 3) 
+    //     if(!tris.hasProperty("inds") || tris.getChannelSize("inds") != 3) 
     //         return false;
 
     //     constexpr auto space = execspace_e::cuda;
diff --git a/projects/CuLagrange/geometry/kernel/compute_characteristic_length.hpp b/projects/CuLagrange/geometry/kernel/compute_characteristic_length.hpp
index a63d31c637..f84038eec6 100644
--- a/projects/CuLagrange/geometry/kernel/compute_characteristic_length.hpp
+++ b/projects/CuLagrange/geometry/kernel/compute_characteristic_length.hpp
@@ -17,12 +17,14 @@ namespace zeno {
 
         if(!verts.hasProperty(xTag))
             throw std::runtime_error("compute_average_edge_length::verts contain no specified \"xTag\" channel");
+        if(!elms.hasProperty("inds"))
+            throw std::runtime_error("compute_average_edge_length::elms contain no \"inds\" channel");
 
         constexpr auto space = execspace_e::cuda;
         Vector<T> length_sum{verts.get_allocator(),1};
         length_sum.setVal((T)0);
         
-        auto elm_dim = elms.getPropertySize("inds");
+        auto elm_dim = elms.getChannelSize("inds");
         auto nm_elms = elms.size();
         auto nm_edges = (elm_dim * nm_elms); 
 
diff --git a/projects/CuLagrange/geometry/kernel/geo_math.hpp b/projects/CuLagrange/geometry/kernel/geo_math.hpp
index 7eebaf3a6d..5bf657dc2c 100644
--- a/projects/CuLagrange/geometry/kernel/geo_math.hpp
+++ b/projects/CuLagrange/geometry/kernel/geo_math.hpp
@@ -5,6 +5,14 @@
 
 namespace zeno { namespace LSL_GEO {
 
+    using REAL = float;
+    using VECTOR12 = typename zs::vec<REAL,12>;
+    using VECTOR4 = typename zs::vec<REAL,4>;
+    using VECTOR3 = typename zs::vec<REAL,3>;
+    using VECTOR2 = typename zs::vec<REAL,2>;
+    using MATRIX3x12 = typename zs::vec<REAL,3,12>;
+    using MATRIX12 = typename zs::vec<REAL,12,12>;
+
     template<int simplex_size,int ne = (simplex_size - 1) * simplex_size, zs::enable_if_t<(simplex_size >= 2 && simplex_size <= 4)> = 0>
     constexpr zs::vec<int,ne * 2> ordered_edges() {
         if constexpr (simplex_size == 4)    
@@ -188,5 +196,383 @@ namespace zeno { namespace LSL_GEO {
             b = x0 - F * X0;
     }
 
+
+    ///////////////////////////////////////////////////////////////////////
+    // get the linear interpolation coordinates from v0 to the line segment
+    // between v1 and v2
+    ///////////////////////////////////////////////////////////////////////
+    constexpr VECTOR2 getLerp(const VECTOR3 v0, const VECTOR3& v1, const VECTOR3& v2)
+    {
+        // Returns (w1, w2) with w1 + w2 = 1: the weights of v1 and v2 at the point of
+        // the segment [v1, v2] that is closest to v0 (clamped to the end points).
+        const VECTOR3 e0 = v0 - v1;
+        const VECTOR3 e1 = v2 - v1;
+        const REAL lengthSqr = e1.l2NormSqr();
+        // degenerate segment (v1 == v2): avoid 0/0 and put the whole weight on v1
+        if (lengthSqr <= (REAL)0)
+            return VECTOR2(1.0, 0.0);
+        // normalized position of the projection of v0 along the segment
+        const REAL ratio = e0.dot(e1) / lengthSqr;
+        if (ratio < (REAL)0) return VECTOR2(1.0, 0.0);
+        if (ratio >= (REAL)1) return VECTOR2(0.0, 1.0);
+        return VECTOR2(1.0 - ratio, ratio);
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////
+    // find the distance from a line segment (v1, v2) to a point (v0)
+    ///////////////////////////////////////////////////////////////////////
+    constexpr REAL pointLineDistance(const VECTOR3 v0, const VECTOR3& v1, const VECTOR3& v2)
+    {
+        const VECTOR3 toPoint = v0 - v1;
+        const VECTOR3 segment = v2 - v1;
+        const REAL segmentLength = segment.norm();
+        const VECTOR3 direction = segment / segmentLength;
+        const REAL along = toPoint.dot(direction);
+
+        // the foot of the perpendicular lies strictly inside the segment:
+        // the distance is the length of the part of toPoint orthogonal to the segment
+        if (along > 0.0 && along < segmentLength)
+            return (toPoint - along * direction).norm();
+
+        // otherwise the closest point is one of the two end points
+        const REAL toV1 = toPoint.norm();
+        const REAL toV2 = (v0 - v2).norm();
+        if (toV1 < toV2)
+            return toV1;
+        return toV2;
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////
+    // get the barycentric coordinate of the projection of v[0] onto the triangle
+    // formed by v[1], v[2], v[3]
+    ///////////////////////////////////////////////////////////////////////
+    constexpr VECTOR3 getBarycentricCoordinates(const VECTOR3 vertices[4])
+    {
+        // vertices[0] is the query point, vertices[1..3] are the triangle corners
+        const VECTOR3 a = vertices[1];
+        const VECTOR3 b = vertices[2];
+        const VECTOR3 c = vertices[3];
+
+        // unnormalized triangle normal and its unit-length version
+        const VECTOR3 normal = (b - a).cross(c - a);
+        const VECTOR3 unitNormal = normal / normal.norm();
+
+        // drop the query point onto the plane of the triangle
+        const VECTOR3 p = vertices[0] - (unitNormal.dot(vertices[0] - a)) * unitNormal;
+
+        // weight of each corner: normal . (opposite edge x (p - edge start)),
+        // divided by the squared length of the normal
+        const REAL wa = normal.dot((c - b).cross(p - b)) / normal.l2NormSqr();
+        const REAL wb = normal.dot((a - c).cross(p - c)) / normal.l2NormSqr();
+        const REAL wc = normal.dot((b - a).cross(p - a)) / normal.l2NormSqr();
+
+        return VECTOR3(wa, wb, wc);
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////
+    // get the barycentric coordinate of the projection of v[0] onto the triangle
+    // formed by v[1], v[2], v[3]
+    //
+    // but, if the projection is actually outside, project to all of the
+    // edges and find the closest point that's still inside the triangle
+    ///////////////////////////////////////////////////////////////////////
+    constexpr VECTOR3 getInsideBarycentricCoordinates(const VECTOR3 vertices[4])
+    {
+        // Barycentric weights of the point standing in for vertices[0] on the triangle,
+        // with respect to vertices[1], vertices[2], vertices[3]:
+        //  - the projection of vertices[0] onto the triangle plane when it falls inside,
+        //  - otherwise the clamped projection of vertices[0] onto the nearest edge.
+
+        // start from the unclamped coordinates of the plane projection
+        const VECTOR3 planar = getBarycentricCoordinates(vertices);
+        const bool inside = planar[0] >= 0.0 &&
+                            planar[1] >= 0.0 &&
+                            planar[2] >= 0.0;
+
+        if (inside)
+            return planar;
+
+        // shorthand for the query point and the triangle corners
+        const VECTOR3& query = vertices[0];
+        const VECTOR3& a = vertices[1];
+        const VECTOR3& b = vertices[2];
+        const VECTOR3& c = vertices[3];
+
+        // distance from the query point to each edge segment
+        // (getLerp repeats part of this work; simplicity is preferred over speed here)
+        const REAL distAB = pointLineDistance(query, a, b);
+        const REAL distBC = pointLineDistance(query, b, c);
+        const REAL distCA = pointLineDistance(query, c, a);
+
+        // edge a-b is (one of) the nearest; "<=" so that a tie still selects an edge
+        if (distAB <= distBC && distAB <= distCA)
+        {
+            const VECTOR2 w = getLerp(query, a, b);
+            return VECTOR3(w[0], w[1], (REAL)0);
+        }
+
+        // edge b-c is (one of) the nearest; "<=" for the same reason
+        if (distBC <= distAB && distBC <= distCA)
+        {
+            const VECTOR2 w = getLerp(query, b, c);
+            return VECTOR3((REAL)0, w[0], w[1]);
+        }
+
+        // only edge c-a remains; getLerp returns its weights in (c, a) order
+        const VECTOR2 w = getLerp(query, c, a);
+        return VECTOR3(w[1], (REAL)0, w[0]);
+    }
+
+
+///////////////////////////////////////////////////////////////////////
+// distance between point v and triangle (v0, v1, v2); 'barycentric' receives the coordinates of v w.r.t. (v0, v1, v2)
+///////////////////////////////////////////////////////////////////////
+    constexpr REAL pointTriangleDistance(const VECTOR3& v0, const VECTOR3& v1, 
+                                        const VECTOR3& v2, const VECTOR3& v,VECTOR3& barycentric)
+    {
+        // get the barycentric coordinates (n is the unnormalized triangle normal)
+        const VECTOR3 e1 = v1 - v0;
+        const VECTOR3 e2 = v2 - v0;
+        const VECTOR3 n = e1.cross(e2);
+        const VECTOR3 na = (v2 - v1).cross(v - v1);
+        const VECTOR3 nb = (v0 - v2).cross(v - v2);
+        const VECTOR3 nc = (v1 - v0).cross(v - v0);
+        barycentric = VECTOR3(n.dot(na) / n.l2NormSqr(),
+                                    n.dot(nb) / n.l2NormSqr(),
+                                    n.dot(nc) / n.l2NormSqr());
+                                    
+        const REAL barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]);
+
+        // if the point projects to inside the triangle, the absolute values should sum to 1
+        if (zs::abs(barySum - 1.0) < 1e-6) // NOTE(review): REAL is float; 1e-6 is only a few ulps around 1 - confirm the tolerance
+        {
+            const VECTOR3 nHat = n / n.norm();
+            const REAL normalDistance = (nHat.dot(v - v0));
+            return zs::abs(normalDistance);
+        }
+
+        // project onto each edge, find the distance to each edge
+        const VECTOR3 e3 = v2 - v1;
+        const VECTOR3 ev = v - v0;
+        const VECTOR3 ev3 = v - v1;
+        const VECTOR3 e1Hat = e1 / e1.norm();
+        const VECTOR3 e2Hat = e2 / e2.norm();
+        const VECTOR3 e3Hat = e3 / e3.norm();
+        VECTOR3 edgeDistances(1e8, 1e8, 1e8); // 1e8 marks "v does not project onto this edge"
+
+        // see if it projects onto the interval of the edge
+        // if it doesn't, then the vertex distance will be smaller,
+        // so we can skip computing anything
+        const REAL e1dot = e1Hat.dot(ev);
+        if (e1dot > 0.0 && e1dot < e1.norm())
+        {
+            const VECTOR3 projected = v0 + e1Hat * e1dot;
+            edgeDistances[0] = (v - projected).norm();
+        }
+        const REAL e2dot = e2Hat.dot(ev);
+        if (e2dot > 0.0 && e2dot < e2.norm())
+        {
+            const VECTOR3 projected = v0 + e2Hat * e2dot;
+            edgeDistances[1] = (v - projected).norm();
+        }
+        const REAL e3dot = e3Hat.dot(ev3);
+        if (e3dot > 0.0 && e3dot < e3.norm())
+        {
+            const VECTOR3 projected = v1 + e3Hat * e3dot;
+            edgeDistances[2] = (v - projected).norm();
+        }
+
+        // get the distance to each vertex
+        const VECTOR3 vertexDistances((v - v0).norm(), 
+                                        (v - v1).norm(), 
+                                        (v - v2).norm());
+
+        // get the smallest of both the edge and vertex distances
+        REAL vertexMin = 1e8; // both minima start at the 1e8 sentinel, so the result is capped at 1e8
+        REAL edgeMin = 1e8;
+        for(int i = 0;i < 3;++i){
+            vertexMin = vertexMin > vertexDistances[i] ? vertexDistances[i] : vertexMin;
+            edgeMin = edgeMin > edgeDistances[i] ? edgeDistances[i] : edgeMin;
+        }
+        // return the smallest of those
+        return (vertexMin < edgeMin) ? vertexMin : edgeMin;
+    }
+
+    constexpr REAL pointTriangleDistance(const VECTOR3& v0, const VECTOR3& v1, 
+                                        const VECTOR3& v2, const VECTOR3& v)
+    {
+        // (disabled) previous inline implementation, kept for reference; the live code is the delegation at the bottom
+        // const VECTOR3 e1 = v1 - v0;
+        // const VECTOR3 e2 = v2 - v0;
+        // const VECTOR3 n = e1.cross(e2);
+        // const VECTOR3 na = (v2 - v1).cross(v - v1);
+        // const VECTOR3 nb = (v0 - v2).cross(v - v2);
+        // const VECTOR3 nc = (v1 - v0).cross(v - v0);
+        // const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
+        //                             n.dot(nb) / n.l2NormSqr(),
+        //                             n.dot(nc) / n.l2NormSqr());
+                                    
+        // const REAL barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]);
+
+        // // if the point projects to inside the triangle, it should sum to 1
+        // if (zs::abs(barySum - 1.0) < 1e-6)
+        // {
+        //     const VECTOR3 nHat = n / n.norm();
+        //     const REAL normalDistance = (nHat.dot(v - v0));
+        //     return zs::abs(normalDistance);
+        // }
+
+        // // project onto each edge, find the distance to each edge
+        // const VECTOR3 e3 = v2 - v1;
+        // const VECTOR3 ev = v - v0;
+        // const VECTOR3 ev3 = v - v1;
+        // const VECTOR3 e1Hat = e1 / e1.norm();
+        // const VECTOR3 e2Hat = e2 / e2.norm();
+        // const VECTOR3 e3Hat = e3 / e3.norm();
+        // VECTOR3 edgeDistances(1e8, 1e8, 1e8);
+
+        // // see if it projects onto the interval of the edge
+        // // if it doesn't, then the vertex distance will be smaller,
+        // // so we can skip computing anything
+        // const REAL e1dot = e1Hat.dot(ev);
+        // if (e1dot > 0.0 && e1dot < e1.norm())
+        // {
+        //     const VECTOR3 projected = v0 + e1Hat * e1dot;
+        //     edgeDistances[0] = (v - projected).norm();
+        // }
+        // const REAL e2dot = e2Hat.dot(ev);
+        // if (e2dot > 0.0 && e2dot < e2.norm())
+        // {
+        //     const VECTOR3 projected = v0 + e2Hat * e2dot;
+        //     edgeDistances[1] = (v - projected).norm();
+        // }
+        // const REAL e3dot = e3Hat.dot(ev3);
+        // if (e3dot > 0.0 && e3dot < e3.norm())
+        // {
+        //     const VECTOR3 projected = v1 + e3Hat * e3dot;
+        //     edgeDistances[2] = (v - projected).norm();
+        // }
+
+        // // get the distance to each vertex
+        // const VECTOR3 vertexDistances((v - v0).norm(), 
+        //                                 (v - v1).norm(), 
+        //                                 (v - v2).norm());
+
+        // // get the smallest of both the edge and vertex distances
+        // REAL vertexMin = 1e8;
+        // REAL edgeMin = 1e8;
+        // for(int i = 0;i < 3;++i){
+        //     vertexMin = vertexMin > vertexDistances[i] ? vertexDistances[i] : vertexMin;
+        //     edgeMin = edgeMin > edgeDistances[i] ? edgeDistances[i] : edgeMin;
+        // }
+        // // return the smallest of those
+        // return (vertexMin < edgeMin) ? vertexMin : edgeMin;
+        VECTOR3 barycentric{}; // scratch output of the overload below; discarded
+        return pointTriangleDistance(v0,v1,v2,v,barycentric); // same distance, computed by the overload that also reports the coordinates
+    }
+
+
+
+    constexpr REAL pointTriangleDistance(const VECTOR3& v0, const VECTOR3& v1, 
+                                        const VECTOR3& v2, const VECTOR3& v,REAL& barySum)
+    {
+        // get the barycentric coordinates of v w.r.t. (v0, v1, v2); n is the unnormalized triangle normal
+        const VECTOR3 e1 = v1 - v0;
+        const VECTOR3 e2 = v2 - v0;
+        const VECTOR3 n = e1.cross(e2);
+        const VECTOR3 na = (v2 - v1).cross(v - v1);
+        const VECTOR3 nb = (v0 - v2).cross(v - v2);
+        const VECTOR3 nc = (v1 - v0).cross(v - v0);
+        const VECTOR3 barycentric(n.dot(na) / n.l2NormSqr(),
+                                    n.dot(nb) / n.l2NormSqr(),
+                                    n.dot(nc) / n.l2NormSqr());
+                                    
+        barySum = zs::abs(barycentric[0]) + zs::abs(barycentric[1]) + zs::abs(barycentric[2]); // output: sum of the absolute coordinates
+
+        // if the point projects to inside the triangle, the absolute values should sum to 1
+        if (zs::abs(barySum - 1.0) < 1e-6) // NOTE(review): REAL is float; 1e-6 is only a few ulps around 1 - confirm the tolerance
+        {
+            const VECTOR3 nHat = n / n.norm();
+            const REAL normalDistance = (nHat.dot(v - v0));
+            return zs::abs(normalDistance);
+        }
+
+        // project onto each edge, find the distance to each edge
+        const VECTOR3 e3 = v2 - v1;
+        const VECTOR3 ev = v - v0;
+        const VECTOR3 ev3 = v - v1;
+        const VECTOR3 e1Hat = e1 / e1.norm();
+        const VECTOR3 e2Hat = e2 / e2.norm();
+        const VECTOR3 e3Hat = e3 / e3.norm();
+        VECTOR3 edgeDistances(1e8, 1e8, 1e8); // 1e8 marks "v does not project onto this edge"
+
+        // see if it projects onto the interval of the edge
+        // if it doesn't, then the vertex distance will be smaller,
+        // so we can skip computing anything
+        const REAL e1dot = e1Hat.dot(ev);
+        if (e1dot > 0.0 && e1dot < e1.norm())
+        {
+            const VECTOR3 projected = v0 + e1Hat * e1dot;
+            edgeDistances[0] = (v - projected).norm();
+        }
+        const REAL e2dot = e2Hat.dot(ev);
+        if (e2dot > 0.0 && e2dot < e2.norm())
+        {
+            const VECTOR3 projected = v0 + e2Hat * e2dot;
+            edgeDistances[1] = (v - projected).norm();
+        }
+        const REAL e3dot = e3Hat.dot(ev3);
+        if (e3dot > 0.0 && e3dot < e3.norm())
+        {
+            const VECTOR3 projected = v1 + e3Hat * e3dot;
+            edgeDistances[2] = (v - projected).norm();
+        }
+
+        // get the distance to each vertex
+        const VECTOR3 vertexDistances((v - v0).norm(), 
+                                        (v - v1).norm(), 
+                                        (v - v2).norm());
+
+        // get the smallest of both the edge and vertex distances
+        REAL vertexMin = 1e8; // both minima start at the 1e8 sentinel, so the result is capped at 1e8
+        REAL edgeMin = 1e8;
+        for(int i = 0;i < 3;++i){
+            vertexMin = vertexMin > vertexDistances[i] ? vertexDistances[i] : vertexMin;
+            edgeMin = edgeMin > edgeDistances[i] ? edgeDistances[i] : edgeMin;
+        }
+        // return the smallest of those
+        return (vertexMin < edgeMin) ? vertexMin : edgeMin;
+    }
+
+
+    ///////////////////////////////////////////////////////////////////////
+    // see if the projection of v onto the plane of v0,v1,v2 is inside 
+    // the triangle formed by v0,v1,v2
+    ///////////////////////////////////////////////////////////////////////
+    constexpr bool pointProjectsInsideTriangle(const VECTOR3& v0, const VECTOR3& v1, 
+                                            const VECTOR3& v2, const VECTOR3& v){
+        // Decides from the barycentric weights of v whether v lands inside
+        // the triangle (v0, v1, v2).
+
+        // unnormalized triangle normal; its squared length scales the weights below
+        const VECTOR3 normal = (v1 - v0).cross(v2 - v0);
+        const REAL normalSqr = normal.l2NormSqr();
+
+        // barycentric weights of v with respect to v0, v1 and v2
+        const REAL w0 = normal.dot((v2 - v1).cross(v - v1)) / normalSqr;
+        const REAL w1 = normal.dot((v0 - v2).cross(v - v2)) / normalSqr;
+        const REAL w2 = normal.dot((v1 - v0).cross(v - v0)) / normalSqr;
+
+        // for a point that projects inside the triangle the absolute values
+        // of the weights should sum to 1
+        const REAL absSum = zs::abs(w0) + zs::abs(w1) + zs::abs(w2);
+
+        const bool inside = zs::abs(absSum - 1.0) < 1e-6;
+        return inside;
+    }
+
 };
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/kernel/laplacian.hpp b/projects/CuLagrange/geometry/kernel/laplacian.hpp
new file mode 100644
index 0000000000..433271f003
--- /dev/null
+++ b/projects/CuLagrange/geometry/kernel/laplacian.hpp
@@ -0,0 +1,351 @@
+#pragma once
+
+#include "Structures.hpp"
+#include "topology.hpp"
+
+namespace zeno {
+    template<typename T>
+    constexpr T doublearea(T a,T b,T c) {
+        const T half_perimeter = (a + b + c)/2; // Heron's formula on the three edge lengths
+        return 2*zs::sqrt(half_perimeter*(half_perimeter-a)*(half_perimeter-b)*(half_perimeter-c));
+    }
+
+    template<typename T>
+    constexpr T area(T a,T b,T c) {
+        return doublearea<T>(a,b,c)/2; // half of the doubled area of the triangle with edge lengths a, b, c
+    }
+
+    template<typename T>
+    constexpr T volume(zs::vec<T, 6> l) {
+        // volume from the six entries of l (edge lengths, judging by the callers' naming):
+        // (u,v,w) = l(0..2) and (U,V,W) = l(3..5)
+        const T u = l(0), v = l(1), w = l(2);
+        const T U = l(3), V = l(4), W = l(5);
+
+        // products of the sums and differences of the lengths
+        const T X = (w - U + v)*(U + v + w);
+        const T x = (U - v + w)*(v - w + U);
+        const T Y = (u - V + w)*(V + w + u);
+        const T y = (V - w + u)*(w - u + V);
+        const T Z = (v - W + u)*(W + u + v);
+        const T z = (W - u + v)*(u - v + W);
+
+        const T a = zs::sqrt(x*Y*Z);
+        const T b = zs::sqrt(y*Z*X);
+        const T c = zs::sqrt(z*X*Y);
+        const T d = zs::sqrt(x*y*z);
+
+        // product of the four sums of a, b, c, d with one sign flipped each
+        const T product = (-a + b + c + d)*
+                          ( a - b + c + d)*
+                          ( a + b - c + d)*
+                          ( a + b + c - d);
+        return zs::sqrt(product)/(192.*u*v*w);
+    }
+
+    template<typename T>
+    constexpr void dihedral_angle_intrinsic(const zs::vec<T, 6>& l,const zs::vec<T, 4>& s,zs::vec<T, 6>& theta,zs::vec<T, 6>& cos_theta) {
+        // l: six edge lengths, s: four face areas (this is what compute_cotmatrix passes in);
+        // outputs the six angles in theta and their cosines in cos_theta.
+        zs::vec<T, 6> H_sqr{};
+        H_sqr[0] = (1./16.) * (4.*l(3)*l(3)*l(0)*l(0) - zs::sqr((l(1)*l(1) + l(4)*l(4)) - (l(2)*l(2) + l(5)*l(5))));
+        H_sqr[1] = (1./16.) * (4.*l(4)*l(4)*l(1)*l(1) - zs::sqr((l(2)*l(2) + l(5)*l(5)) - (l(3)*l(3) + l(0)*l(0))));
+        H_sqr[2] = (1./16.) * (4.*l(5)*l(5)*l(2)*l(2) - zs::sqr((l(3)*l(3) + l(0)*l(0)) - (l(4)*l(4) + l(1)*l(1))));
+        H_sqr[3] = (1./16.) * (4.*l(0)*l(0)*l(3)*l(3) - zs::sqr((l(4)*l(4) + l(1)*l(1)) - (l(5)*l(5) + l(2)*l(2))));
+        H_sqr[4] = (1./16.) * (4.*l(1)*l(1)*l(4)*l(4) - zs::sqr((l(5)*l(5) + l(2)*l(2)) - (l(0)*l(0) + l(3)*l(3))));
+        H_sqr[5] = (1./16.) * (4.*l(2)*l(2)*l(5)*l(5) - zs::sqr((l(0)*l(0) + l(3)*l(3)) - (l(1)*l(1) + l(4)*l(4))));
+        cos_theta(0) = (H_sqr(0) - s(1)*s(1) - s(2)*s(2)) / (-2.*s(1) * s(2));
+        cos_theta(1) = (H_sqr(1) - s(2)*s(2) - s(0)*s(0)) / (-2.*s(2) * s(0));
+        cos_theta(2) = (H_sqr(2) - s(0)*s(0) - s(1)*s(1)) / (-2.*s(0) * s(1));
+        cos_theta(3) = (H_sqr(3) - s(3)*s(3) - s(0)*s(0)) / (-2.*s(3) * s(0));
+        cos_theta(4) = (H_sqr(4) - s(3)*s(3) - s(1)*s(1)) / (-2.*s(3) * s(1));
+        cos_theta(5) = (H_sqr(5) - s(3)*s(3) - s(2)*s(2)) / (-2.*s(3) * s(2));
+
+        // cos_theta is returned unclamped; only the acos argument is clamped to [-1, 1], so that
+        // rounding on near-degenerate input (|cos_theta| marginally above 1) cannot make theta NaN
+        for(int i = 0;i != 6;++i) {
+            const T c = cos_theta(i);
+            theta(i) = zs::acos(c > (T)1 ? (T)1 : (c < (T)-1 ? (T)-1 : c));
+        }
+    }
+
+
+    template<int MAX_NEIGHS,
+        typename Pol,
+        typename PosTileVec,
+        typename SrcTileVec,
+        typename HalfEdgeTileVec,
+        typename PointTileVec,
+        typename EdgeTileVec,
+        typename TriTileVec,
+        typename DstTileVec>
+    void compute_smooth_laplacian(Pol& pol,
+        const PosTileVec& verts,const zs::SmallString& xTag,
+        const SrcTileVec& src,const zs::SmallString& srcTag,
+        const HalfEdgeTileVec& halfEdges,
+        const PointTileVec& points,
+        const EdgeTileVec& edges,
+        const TriTileVec& tris,
+        DstTileVec& dst,const zs::SmallString& dstTag) {
+            // Cotangent-weighted average of srcTag over the one-ring of every point, accumulated
+            // into dstTag and then divided by the sum of the weights.
+            // NOTE(review): dst is accumulated with "+=", so it presumably has to be zeroed by the caller;
+            // src and dst are addressed by point index, verts by the ids stored in tris - confirm.
+            using T = typename SrcTileVec::value_type;
+            using namespace zs;
+            constexpr auto space = Pol::exec_tag::value;
+            int space_dim = src.getPropertySize(srcTag);
+            pol(range(points.size()),[
+                    verts = proxy<space>({},verts),xTag,
+                    src = proxy<space>({},src),srcTag,
+                    half_edges = proxy<space>({},halfEdges),
+                    points = proxy<space>({},points),
+                    edges = proxy<space>({},edges),
+                    tris = proxy<space>({},tris),
+                    dst = proxy<space>({},dst),dstTag,space_dim]
+                        ZS_LAMBDA(int pi) mutable {
+                auto he_idx = reinterpret_bits<int>(points("he_inds",pi));
+                zs::vec<int,MAX_NEIGHS> pneighs = get_one_ring_neigh_points<MAX_NEIGHS>(he_idx,half_edges);
+                zs::vec<int,MAX_NEIGHS> eneighs = get_one_ring_neigh_edges<MAX_NEIGHS>(he_idx,half_edges);
+                T ws = (T)0.0;
+                for(int i = 0;i != MAX_NEIGHS;++i) {
+                    auto npi = pneighs[i];
+                    if(npi < 0)
+                        break;
+                    auto w = (T)0.0;
+                    // cotangent weight of the edge towards neighbor i, summed over its incident triangles
+                    {
+                        auto li = eneighs[i];
+                        auto ne = edges.pack(dim_c<2>,"inds",li).reinterpret_bits(int_c);
+                        auto fe_inds = edges.pack(dim_c<2>,"fe_inds",li).reinterpret_bits(int_c);
+                        zs::vec<T,3> l{};   // edge lengths
+                        zs::vec<T,3> l2{};  // squared edge lengths
+                        zs::vec<T,3> vs[3] = {};
+
+                        for(int j = 0;j != 2;++j) {
+                            if(fe_inds[j] < 0)
+                                break;
+                            auto tri = tris.pack(dim_c<3>,"inds",fe_inds[j]).reinterpret_bits(int_c);
+                            int k = 0;
+                            for(k = 0;k != 3;++k) {
+                                if((tri[k] == ne[0] && tri[(k+1)%3] == ne[1]) || (tri[k] == ne[1] && tri[(k+1)%3] == ne[0]))
+                                    break;
+                            }
+                            if(k == 3) {
+                                printf("invalid fe_inds detected");
+                            }else{
+                                // rotate the triangle so that vs[0],vs[1] span the shared edge and vs[2] is opposite
+                                for(int d = 0;d != 3;++d)
+                                    vs[d] = verts.pack(dim_c<3>,xTag,tri[(k + d) % 3]);
+                                for(int d = 0;d != 3;++d){
+                                    l2[d] = (vs[d] - vs[(d+1) % 3]).l2NormSqr();
+                                    l[d] = zs::sqrt(l2[d]);
+                                }
+                                auto dblA = doublearea(l[0],l[1],l[2]);
+                                auto C = (l2[2] + l2[1] - l2[0])/dblA/(T)4.0;
+                                w += C;
+                            }
+                        }
+                    }
+                    ws += w;
+                    // accumulate the weighted value of the neighbor (not of the point itself)
+                    for(int d = 0;d != space_dim;++d)
+                        dst(dstTag,d,pi) += src(srcTag,d,npi) * w;
+                }
+                // no neighbor (or cancelling weights): leave dst untouched instead of dividing by zero
+                if(ws == (T)0)
+                    return;
+                for(int d = 0;d != space_dim;++d)
+                    dst(dstTag,d,pi) /= ws;
+            });            
+    }
+
+    template<int MAX_NEIGHS,
+        typename Pol,
+        typename SrcTileVec,
+        typename HalfEdgeTileVec,
+        typename PointTileVec,
+        typename EdgeTileVec,
+        typename TriTileVec,
+        typename DstTileVec>
+    void compute_smooth(Pol& pol,
+        const SrcTileVec& src,const zs::SmallString& srcTag,
+        const HalfEdgeTileVec& halfEdges,
+        const PointTileVec& points,
+        const EdgeTileVec& edges,
+        const TriTileVec& tris,
+        DstTileVec& dst,const zs::SmallString& dstTag) {
+            using T = typename SrcTileVec::value_type;
+            using namespace zs;
+            constexpr auto space = Pol::exec_tag::value;
+            int space_dim = src.getPropertySize(srcTag);
+            pol(range(points.size()),[
+                    src = proxy<space>({},src),srcTag,
+                    half_edges = proxy<space>({},halfEdges),
+                    points = proxy<space>({},points),
+                    edges = proxy<space>({},edges),
+                    tris = proxy<space>({},tris),
+                    dst = proxy<space>({},dst),dstTag,space_dim]
+                        ZS_LAMBDA(int pi) mutable {
+                // unit-weight average of srcTag over the one-ring of point pi, accumulated into dstTag ("+=")
+                auto he_idx = reinterpret_bits<int>(points("he_inds",pi));
+                zs::vec<int,MAX_NEIGHS> pneighs = get_one_ring_neigh_points<MAX_NEIGHS>(he_idx,half_edges);
+                T ws = (T)0.0;
+                for(int i = 0;i != MAX_NEIGHS;++i) {
+                    auto npi = pneighs[i];
+                    if(npi < 0)
+                        break;
+                    auto w = (T)1.0;
+                    ws += w;
+                    for(int d = 0;d != space_dim;++d)
+                        dst(dstTag,d,pi) += w * src(srcTag,d,npi);
+                }
+                // a point without neighbors has nothing to average: skip the 0/0 division
+                if(ws == (T)0) return;
+                for(int d = 0;d != space_dim;++d)
+                    dst(dstTag,d,pi) /= ws;
+            });
+    }
+
+    template<int MAX_NEIGHS,
+        typename Pol,
+        typename SrcTileVec,
+        typename HalfEdgeTileVec,
+        typename PointTileVec,
+        typename EdgeTileVec,
+        typename TriTileVec,
+        typename DstTileVec>
+    void compute_smooth_corrective(Pol& pol,
+            const SrcTileVec& src,const zs::SmallString& srcTag,
+            const HalfEdgeTileVec& halfEdges,
+            const PointTileVec& points,
+            const EdgeTileVec& edges,
+            const TriTileVec& tris,
+            DstTileVec& dst,const zs::SmallString& dstTag) {
+        // TODO: not implemented - the body is empty, so calling this leaves dst untouched
+    }
+
+
+    // Computes the local cotangent (Laplacian) matrix of every element.
+    //   simplex_size : 3 for triangles, 4 for tetrahedra
+    //   pol          : execution policy; must be a CUDA policy when compiled as CUDA, and must not be otherwise
+    //   eles         : element buffer; channel "inds" is read (and "vol" for tetrahedra)
+    //   verts, xTag  : vertex buffer and its position channel, read as 3 components
+    //   etemp, HTag  : output buffer and channel; HTag is appended with simplex_size^2 scalars per
+    //                  element, flattened as simplex_size*row + col
+    template <int simplex_size,typename Pol,typename ETileVec,typename VTileVec,typename ETmpTileVec>
+    void compute_cotmatrix(Pol &pol,const ETileVec &eles,
+        const VTileVec &verts, const zs::SmallString& xTag, 
+        ETmpTileVec& etemp, const zs::SmallString& HTag) {
+
+        static_assert(zs::is_same_v<typename ETileVec::value_type,typename VTileVec::value_type>,"precision not match");
+        static_assert(zs::is_same_v<typename ETileVec::value_type,typename ETmpTileVec::value_type>,"precision not match");   
+
+        using T = typename VTileVec::value_type;
+
+        using namespace zs;
+        static_assert(simplex_size >= 3 && simplex_size <=4, "invalid co-dimension!\n");
+        constexpr auto space = Pol::exec_tag::value;
+
+        #if ZS_ENABLE_CUDA && defined(__CUDACC__)
+            static_assert(space == execspace_e::cuda,
+                    "specified policy and compiler not match");
+        #else
+            static_assert(space != execspace_e::cuda,
+                    "specified policy and compiler not match");
+        #endif
+
+        // NOTE(review): a missing position channel is only reported, execution continues
+        if(!verts.hasProperty(xTag)){
+            printf("the verts buffer does not contain specified channel\n");
+        }   
+
+        // make sure the output channel exists: simplex_size^2 scalars per element
+        etemp.append_channels(pol,{{HTag,simplex_size*simplex_size}});
+
+        // one work item per entry of etemp; eles is indexed with the same element id
+        pol(zs::range(etemp.size()),
+            [eles = proxy<space>({},eles),verts = proxy<space>({},verts),
+            etemp = proxy<space>({},etemp),xTag,HTag] ZS_LAMBDA(int ei) mutable {
+                constexpr int ne = simplex_size*(simplex_size-1)/2;
+                auto inds = eles.template pack<simplex_size>("inds",ei).template reinterpret_bits<int>();
+
+                using IV = zs::vec<int,ne*2>;
+                using TV = zs::vec<T, ne>;
+
+                // C(i): cotangent weight attached to the local edge (edges(2i), edges(2i+1))
+                TV C;
+                IV edges;
+                // triangles: weights from the three edge lengths
+                if constexpr (simplex_size == 3){
+                    edges = IV{1,2,2,0,0,1};
+                    zs::vec<T,3> l;
+                    zs::vec<T,3> l2;
+                    for(size_t i = 0;i != ne;++i) {
+                        l[i] = (verts.template pack<3>(xTag,inds[edges[i*2+0]]) - verts.template pack<3>(xTag,inds[edges[i*2+1]])).norm();
+                        l2[i] = l[i] * l[i];
+                    }
+                    auto dblA = doublearea(l[0],l[1],l[2]);// twice the triangle area
+                    // half cotangent of the angle opposite to edge i, from the squared edge lengths
+                    for(size_t i = 0;i != ne;++i)
+                        C[i] = (l2[edges[2*i+0]] + l2[edges[2*i+1]] - l2[3 - edges[2*i+0] - edges[2*i+1]])/dblA/4.0;
+                }
+
+                // tetrahedra: weights from the edge lengths and the angles of dihedral_angle_intrinsic
+                if constexpr (simplex_size == 4){
+                    edges = IV{1,2,2,0,0,1,3,0,3,1,3,2};
+                    // edge lengths, ordered (3,0),(3,1),(3,2),(1,2),(2,0),(0,1)
+                    zs::vec<T,ne> l{};
+                    l[0] = (verts.template pack<3>(xTag,inds[3]) - verts.template pack<3>(xTag,inds[0])).length();
+                    l[1] = (verts.template pack<3>(xTag,inds[3]) - verts.template pack<3>(xTag,inds[1])).length();
+                    l[2] = (verts.template pack<3>(xTag,inds[3]) - verts.template pack<3>(xTag,inds[2])).length();
+                    l[3] = (verts.template pack<3>(xTag,inds[1]) - verts.template pack<3>(xTag,inds[2])).length();
+                    l[4] = (verts.template pack<3>(xTag,inds[2]) - verts.template pack<3>(xTag,inds[0])).length();
+                    l[5] = (verts.template pack<3>(xTag,inds[0]) - verts.template pack<3>(xTag,inds[1])).length();
+
+                    // areas of the four faces, from their edge lengths
+                    zs::vec<T, 4> s{ 
+                        area(l[1],l[2],l[3]),
+                        area(l[0],l[2],l[4]),
+                        area(l[0],l[1],l[5]),
+                        area(l[3],l[4],l[5])};
+                    zs::vec<T,ne> cos_theta{},theta{};
+                    dihedral_angle_intrinsic(l,s,theta,cos_theta);
+
+                    T vol = eles("vol",ei);
+                    // T vol_cmp = volume(l);
+                    // if(fabs(vol_cmp - vol) > 1e-6)
+                        // printf("VOL_ERROR<%d> : %f\n",ei,(float)fabs(vol_cmp - vol));
+                    zs::vec<T, 6> sin_theta{};
+                    // disabled alternative: sine from vol, the edge length and two face areas; the active path uses sin(theta)
+                    #if 0
+                    sin_theta(0) = vol / ((2./(3.*l(0))) * s(1) * s(2));
+                    sin_theta(1) = vol / ((2./(3.*l(1))) * s(2) * s(0));
+                    sin_theta(2) = vol / ((2./(3.*l(2))) * s(0) * s(1));
+                    sin_theta(3) = vol / ((2./(3.*l(3))) * s(3) * s(0));
+                    sin_theta(4) = vol / ((2./(3.*l(4))) * s(3) * s(1));
+                    sin_theta(5) = vol / ((2./(3.*l(5))) * s(3) * s(2));
+                    #else
+                    for(size_t i = 0;i !=ne; ++i)
+                        sin_theta(i) = zs::sin(theta(i));
+                    #endif
+                    // element-wise: (1/6) * length * cos(theta) / sin(theta)
+                    C = (1./6.) * l * cos_theta / sin_theta;
+                }
+
+                // reset the element block before accumulating into it
+                constexpr int simplex_size2 = simplex_size*simplex_size;
+                etemp.template tuple<simplex_size2>(HTag,ei) = zs::vec<T,simplex_size2>::zeros();
+
+                // scatter: -C on the two off-diagonal entries of each edge, +C on both diagonal entries
+                for(size_t i = 0;i != ne;++i){
+                    int source = edges(i*2 + 0);
+                    int dest = edges(i*2 + 1);
+                    etemp(HTag,simplex_size*source + dest,ei) -= C(i); 
+                    etemp(HTag,simplex_size*dest + source,ei) -= C(i); 
+                    etemp(HTag,simplex_size*source + source,ei) += C(i); 
+                    etemp(HTag,simplex_size*dest + dest,ei) += C(i); 
+                }
+        });
+    }
+};
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/kernel/tiled_vector_ops.hpp b/projects/CuLagrange/geometry/kernel/tiled_vector_ops.hpp
index 7c5d07fdba..2641ca9099 100644
--- a/projects/CuLagrange/geometry/kernel/tiled_vector_ops.hpp
+++ b/projects/CuLagrange/geometry/kernel/tiled_vector_ops.hpp
@@ -8,27 +8,41 @@ namespace zeno { namespace TILEVEC_OPS {
     using T = float;
 
     template<int width,typename Pol,typename SrcTileVec,typename DstTileVec>
-    void copy(Pol& pol,const SrcTileVec& src,const zs::SmallString& src_tag,DstTileVec& dst,const zs::SmallString& dst_tag) {
+    void copy(Pol& pol,const SrcTileVec& src,const zs::SmallString& src_tag,DstTileVec& dst,const zs::SmallString& dst_tag,int offset = 0) { // copies `width` channels of src[src_tag] row vi into dst[dst_tag] row vi + offset
         using namespace zs;
         constexpr auto space = execspace_e::cuda;
         // if(src.size() != dst.size())
         //     throw std::runtime_error("copy_ops_error::the size of src and dst not match");
 
         pol(zs::range(src.size()),
-            [src = proxy<space>({},src),src_tag,dst = proxy<space>({},dst),dst_tag] __device__(int vi) mutable {
-                dst.template tuple<width>(dst_tag,vi) = src.template pack<width>(src_tag,vi);
+            [src = proxy<space>({},src),src_tag,dst = proxy<space>({},dst),dst_tag,offset] __device__(int vi) mutable {
+                dst.template tuple<width>(dst_tag,vi + offset) = src.template pack<width>(src_tag,vi); // no size check is done here: the caller must make sure dst has at least src.size() + offset rows
         });
     }
 
     template<typename Pol,typename SrcTileVec,typename DstTileVec>
-    void copy(Pol& pol,const SrcTileVec& src,const zs::SmallString& src_tag,DstTileVec& dst,const zs::SmallString& dst_tag) {
+    void copy(Pol& pol,const SrcTileVec& src,const zs::SmallString& src_tag,DstTileVec& dst,const zs::SmallString& dst_tag,int offset = 0) { // run-time-width copy of src[src_tag] into dst[dst_tag]; dst rows are shifted by offset
         using namespace zs;
         constexpr auto space = execspace_e::cuda;
         // if(src.size() != dst.size())
         //     throw std::runtime_error("copy_ops_error::the size of src and dst not match");
+        if(!src.hasProperty(src_tag)){ // a missing channel is printed in red and then thrown as std::runtime_error
+            fmt::print(fg(fmt::color::red),"copy_ops_error::the src has no specified channel {}\n",src_tag);
+            throw std::runtime_error("copy_ops_error::the src has no specified channel");
+        }
+        if(!dst.hasProperty(dst_tag)){
+            fmt::print(fg(fmt::color::red),"copy_ops_error::the dst has no specified channel {}\n",dst_tag);
+            throw std::runtime_error("copy_ops_error::the dst has no specified channel");
+        }
+        auto space_dim = src.getChannelSize(src_tag);
+        if(dst.getChannelSize(dst_tag) != space_dim){
+            // the kernel below copies channel by channel, so both channels must have the same width
+            throw std::runtime_error("copy_ops_error::the channel size of src and dst not match");
+        }
         pol(zs::range(src.size()),
-            [src = proxy<space>({},src),src_tag,dst = proxy<space>({},dst),dst_tag] __device__(int vi) mutable {
-                dst(dst_tag,vi) = src(src_tag,vi);
+            [src = proxy<space>({},src),src_tag,dst = proxy<space>({},dst),dst_tag,offset,space_dim] __device__(int vi) mutable {
+                for(int i = 0;i != space_dim;++i)
+                    dst(dst_tag,i,vi + offset) = src(src_tag,i,vi);
         });
     }
 
@@ -60,49 +74,133 @@ namespace zeno { namespace TILEVEC_OPS {
         constexpr auto space = execspace_e::cuda;
         pol(range(vtemp.size()),
             [vtemp = proxy<space>({},vtemp),tag,value] __device__(int vi) mutable {
-                vtemp.template tuple<space_dim>(tag,vi) = value;
+                vtemp.tuple(dim_c<space_dim>,tag,vi) = value;
         });
     }
 
+    // Assign the scalar `value` to every channel of property `tag` on all rows of vtemp.
     template<typename T,typename Pol,typename VTileVec>
     void fill(Pol& pol,VTileVec& vtemp,const zs::SmallString& tag,const T& value) {
         using namespace zs;
         constexpr auto space = execspace_e::cuda;
+        int space_dim = vtemp.getChannelSize(tag); // channel count of `tag`, queried before the launch and captured by value
         pol(range(vtemp.size()),
-            [vtemp = proxy<space>({},vtemp),tag,value] __device__(int vi) mutable {
-                vtemp(tag,vi) = value;
+            [vtemp = proxy<space>({},vtemp),tag,value,space_dim] __device__(int vi) mutable {
+                for(int i= 0;i != space_dim;++i)
+                    vtemp(tag,i,vi) = value;
         });
     }
 
-    template<int space_dim,int simplex_size,typename Pol,typename SrcTileVec,typename DstTileVec>
+    // Assign the fixed-width vector `value` to property `tag` on rows [start, start + length) of vtemp.
+    template<int space_dim,typename Pol,typename VTileVec>
+    void fill_range(Pol& pol,VTileVec& vtemp,const zs::SmallString& tag,const zs::vec<T,space_dim>& value,int start,int length) {
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+        pol(range(length),
+            [vtemp = proxy<space>({},vtemp),tag,value,start] __device__(int row) mutable {
+                vtemp.tuple(dim_c<space_dim>,tag,start + row) = value; // rows outside the window are never touched
+        });
+    }
+
+    // Assign scalar `value` to every channel of `tag` on rows [start, start + length); throws std::runtime_error if vtemp lacks `tag`.
+    template<typename T,typename Pol,typename VTileVec>
+    void fill_range(Pol& pol,VTileVec& vtemp,const zs::SmallString& tag,const T& value,int start,int length) {
+        using namespace zs;
+        constexpr auto space = execspace_e::cuda;
+        if(!vtemp.hasProperty(tag)) // guard the channel-size query below, as the run-time copy() does
+            throw std::runtime_error("fill_range_ops_error::vtemp has no specified channel");
+        int space_dim = vtemp.getChannelSize(tag);
+        pol(range(length),
+            [vtemp = proxy<space>({},vtemp),tag,value,space_dim,start] __device__(int vi) mutable {
+                for(int i = 0;i != space_dim;++i) vtemp(tag,i,vi + start) = value;
+        });
+    }
+    // Scatter-add the per-element data in src[srcTag] onto the rows of dst[dstTag] that are named by src[srcTopoTag].
+    template<typename Pol,typename SrcTileVec,typename DstTileVec>
     void assemble(Pol& pol,
         const SrcTileVec& src,const zs::SmallString& srcTag,const zs::SmallString& srcTopoTag,
         DstTileVec& dst,const zs::SmallString& dstTag) {
             using namespace zs;
             constexpr auto space = execspace_e::cuda;
 
-            if(!src.hasProperty(srcTopoTag) || src.getPropertySize(srcTopoTag) != simplex_size)
+            if(!src.hasProperty(srcTopoTag))
                 throw std::runtime_error("tiledvec_ops::assemble::invalid src's topo channel");
             if(!src.hasProperty(srcTag))
                 throw std::runtime_error("tiledvec_ops::assemble::src has no 'srcTag' channel");
             if(!dst.hasProperty(dstTag))
                 throw std::runtime_error("tiledvec_ops::assemble::dst has no 'dstTag' channel");
 
+            int simplex_size = src.getChannelSize(srcTopoTag);
+            int src_space_dim = src.getChannelSize(srcTag);
+            int dst_space_dim = dst.getChannelSize(dstTag);
+
+            if(dst_space_dim * simplex_size != src_space_dim)
+                throw std::runtime_error("tiledvec_ops::assemble::src_space_dim and dst_space_dim not match");
+
+            // Layout read by the kernel below: row si of src[srcTopoTag] holds simplex_size
+            // bit-cast integer indices into dst, and row si of src[srcTag] holds simplex_size
+            // consecutive groups of dst_space_dim values, one group per index.
+            // An element with any negative index is skipped entirely.
+
+
             pol(range(src.size()),
-                [src = proxy<space>({},src),dst = proxy<space>({},dst),srcTag,srcTopoTag,dstTag] __device__(int si) mutable {
-                    auto inds = src.template pack<simplex_size>(srcTopoTag,si).reinterpret_bits(int_c);
-                    for(int i = 0;i != simplex_size;++i)
-                        if(inds[i] < 0)
+                [src = proxy<space>({},src),dst = proxy<space>({},dst),srcTag,srcTopoTag,dstTag,simplex_size,src_space_dim,dst_space_dim] __device__(int si) mutable {
+                    for(int i = 0;i != simplex_size;++i){
+                        auto idx = reinterpret_bits<int>(src(srcTopoTag,i,si));
+                        if(idx < 0)
                             return;
-                    auto data = src.template pack<space_dim * simplex_size>(srcTag,si);
-                    for(int i = 0;i != simplex_size;++i)
-                            for(int d = 0;d != space_dim;++d)
-                                atomic_add(exec_cuda,&dst(dstTag,d,inds[i]),data[i*space_dim + d]);
+                    }
+                    // every index is valid: accumulate this element's contributions with atomic adds
+                    for(int i = 0;i != simplex_size;++i){
+                        auto idx = reinterpret_bits<int>(src(srcTopoTag,i,si));
+                        for(int d = 0;d != dst_space_dim;++d){
+                            atomic_add(exec_cuda,&dst(dstTag,d,idx),src(srcTag,i * dst_space_dim + d,si));
+                        }
+                    }
             });
     }
 
+    // Scatter-add rows [start, start + alen) of src[srcTag] onto dst[dstTag], using the bit-cast
+    // indices stored in src[srcTopoTag]; an element with any negative index is skipped.
+    // Throws std::runtime_error on a missing channel, mismatched widths or an out-of-range window.
+    template<typename Pol,typename SrcTileVec,typename DstTileVec>
+    void assemble_range(Pol& pol,
+        const SrcTileVec& src,const zs::SmallString& srcTag,const zs::SmallString& srcTopoTag,
+        DstTileVec& dst,const zs::SmallString& dstTag,int start,int alen) {
+            using namespace zs;
+            constexpr auto space = execspace_e::cuda;
+
+            if(!src.hasProperty(srcTopoTag))
+                throw std::runtime_error("tiledvec_ops::assemble_range::invalid src's topo channel");
+            if(!src.hasProperty(srcTag))
+                throw std::runtime_error("tiledvec_ops::assemble_range::src has no 'srcTag' channel");
+            if(!dst.hasProperty(dstTag))
+                throw std::runtime_error("tiledvec_ops::assemble_range::dst has no 'dstTag' channel");
+            // the kernel reads src rows start .. start + alen - 1, so the window must lie inside src
+            if(start < 0 || alen < 0 || (size_t)start + (size_t)alen > (size_t)src.size())
+                throw std::runtime_error("tiledvec_ops::assemble_range::[start,start + alen) exceeds src");
+
+            int simplex_size = src.getChannelSize(srcTopoTag);
+            int src_space_dim = src.getChannelSize(srcTag);
+            int dst_space_dim = dst.getChannelSize(dstTag);
+            if(dst_space_dim * simplex_size != src_space_dim)
+                throw std::runtime_error("tiledvec_ops::assemble_range::src_space_dim and dst_space_dim not match");
+
+            pol(range(alen),
+                [src = proxy<space>({},src),dst = proxy<space>({},dst),srcTag,srcTopoTag,dstTag,start,simplex_size,space_dim = dst_space_dim] __device__(int si) mutable {
+                    for(int i = 0;i != simplex_size;++i){
+                        auto idx = reinterpret_bits<int>(src(srcTopoTag,i,si + start));
+                        if(idx < 0)
+                            return;
+                    }
+                    for(int i = 0;i != simplex_size;++i){
+                        auto idx = reinterpret_bits<int>(src(srcTopoTag,i,si + start));
+                        for(int d = 0;d != space_dim;++d)
+                            atomic_add(exec_cuda,&dst(dstTag,d,idx),src(srcTag,i * space_dim + d,si + start));
+                    }
+            });
+    }
+
+    template<typename Pol,typename SrcTileVec,typename DstTileVec>
     void assemble(Pol& pol,
         const SrcTileVec& src,const zs::SmallString& srcTag,
         DstTileVec& dst,const zs::SmallString& dstTag) {
@@ -111,7 +209,7 @@ namespace zeno { namespace TILEVEC_OPS {
 
             // TILEVEC_OPS::fill<space_dim>(pol,dst,"dir",zs::vec<T,space_dim>::uniform((T)0.0));
 
-            // if(!src.hasProperty("inds") || src.getPropertySize("inds") != simplex_size)
+            // if(!src.hasProperty("inds") || src.getChannelSize("inds") != simplex_size)
             //     throw std::runtime_error("tiledvec_ops::assemble::invalid src's topo channel inds");
 
             // pol(range(src.size()),
@@ -126,34 +224,97 @@ namespace zeno { namespace TILEVEC_OPS {
             //                     atomic_add(exec_cuda,&dst(dst_tag,d,inds[i]),data[i*space_dim + d]);
             // });
 
-            assemble<space_dim,simplex_size>(pol,src,srcTag,"inds",dst,dstTag);
+            assemble(pol,src,srcTag,"inds",dst,dstTag);
     }
 
 
-
-    template<int space_dim,int simplex_size,typename Pol,typename SrcTileVec,typename DstTileVec>
+    // For every row di of dst, add to dst[dstTag] the rows of src[srcTag] whose bit-cast indices are stored in topo[dstTopoTag] at row di.
+    template<typename Pol,typename SrcTileVec,typename DstTileVec,typename DstTopoTileVec>
     void assemble_from(Pol& pol,
         const SrcTileVec& src,const zs::SmallString& srcTag,
-        DstTileVec& dst,const zs::SmallString& dstTag,const zs::SmallString& dstTopoTag) {
+        DstTileVec& dst,const zs::SmallString& dstTag,const DstTopoTileVec& topo,const zs::SmallString& dstTopoTag) {
             using namespace zs;
             constexpr auto space = execspace_e::cuda;
 
-            if(!dst.hasProperty(dstTopoTag) || dst.getPropertySize(dstTopoTag) != simplex_size)
+            if(!topo.hasProperty(dstTopoTag))
                 throw std::runtime_error("tiledvec_ops::assemble_from::invalid dst's topo channel");
             if(!src.hasProperty(srcTag))
                 throw std::runtime_error("tiledvec_ops::assemble::src has no 'srcTag' channel");
             if(!dst.hasProperty(dstTag))
                 throw std::runtime_error("tiledvec_ops::assemble::dst has no 'dstTag' channel");
+            if(dst.size() != topo.size())
+                throw std::runtime_error("tiledvec_ops::assemble::dst and topo size not match");
+            int simplex_size = topo.getChannelSize(dstTopoTag);
+            int space_dim = src.getChannelSize(srcTag);
+            if((int)dst.getChannelSize(dstTag) < space_dim) // the kernel writes space_dim channels of dst[dstTag]; a narrower channel would be overrun
+                throw std::runtime_error("tiledvec_ops::assemble_from::dst channel narrower than src channel");
 
             pol(zs::range(dst.size()),
-                [dst = proxy<space>({},dst),src = proxy<space>({},src),srcTag,dstTag,dstTopoTag] __device__(int di) mutable {
-                    auto inds = dst.template pack<simplex_size>(dstTopoTag,di).reinterpret_bits(int_c);
-                    for(int i = 0;i != simplex_size;++i)
-                        dst.template tuple<space_dim>(dstTag,di) += src.template pack<space_dim>(srcTag,inds[i]);
+                [dst = proxy<space>({},dst),src = proxy<space>({},src),srcTag,dstTag,topo = proxy<space>({},topo),dstTopoTag,simplex_size,space_dim] __device__(int di) mutable {
+                    for(int i = 0;i != simplex_size;++i){
+                        auto idx = reinterpret_bits<int>(topo(dstTopoTag,i,di));
+                        for(int d = 0;d != space_dim;++d) dst(dstTag,d,di) += src(srcTag,d,idx);
+                    }
             });
 
     }
 
+    // Stack src0 on top of src1 into dst for every property listed in `tags`:
+    // dst rows [0, src0.size()) are copied from src0, the following src1.size() rows from src1.
+    // Throws std::runtime_error if a listed property is missing from, or has a different channel
+    // count in, any of the three containers, or if dst.size() != src0.size() + src1.size().
+    template<typename Pol,typename SrcTileVec0,typename SrcTileVec1,typename DstTileVec>
+    void concatenate_two_tiled_vecs(Pol& pol,
+        const SrcTileVec0& src0,
+        const SrcTileVec1& src1,
+        DstTileVec& dst,
+        const std::vector<zs::PropertyTag>& tags) {
+            using namespace zs;
+            // validate every tag before the first copy, so a failure leaves dst untouched
+            for(const auto& tag : tags){
+                const auto& name = tag.name;
+                auto numChannels = tag.numChannels;
+                // the channel count has to be queried for the property being checked
+                if(!src0.hasProperty(name) || src0.getChannelSize(name) != numChannels)
+                    throw std::runtime_error("concatenate_two_tiled_vecs::src0's channels not aligned with specified tags");
+                if(!src1.hasProperty(name) || src1.getChannelSize(name) != numChannels)
+                    throw std::runtime_error("concatenate_two_tiled_vecs::src1's channels not aligned with specified tags");
+                if(!dst.hasProperty(name) || dst.getChannelSize(name) != numChannels)
+                    throw std::runtime_error("concatenate_two_tiled_vecs::dst's channels not aligned with specified tags");
+                if(dst.size() != (src0.size() + src1.size()))
+                    throw std::runtime_error("concatenate_two_tiled_vecs::dst.size() != src0.size() + src1.size()");
+            }
+            for(const auto& tag : tags) {
+                copy(pol,src0,tag.name,dst,tag.name,0);
+                copy(pol,src1,tag.name,dst,tag.name,src0.size());
+            }
+    }
+
+
+    // Backward-compatible overload of assemble_from for callers whose topology
+    // channel lives in dst itself.
+    //
+    //   pol        : execution policy, forwarded unchanged
+    //   src/srcTag : container and property the values are read from
+    //   dst/dstTag : container and property the values are accumulated into
+    //   dstTopoTag : property of dst that is passed on as the topology channel
+    //
+    // The body only forwards to the seven-argument assemble_from overload, passing
+    // dst both as destination and as topology container. Validation and the
+    // actual kernel are whatever that overload provides.
+    // The template arguments space_dim and simplex_size are not referenced by the
+    // body.
+    // NOTE(review): presumably they are kept only so that existing call sites
+    // which spell them out keep compiling - confirm before removing them.
+    template<int space_dim,int simplex_size,typename Pol,typename SrcTileVec,typename DstTileVec>
+    void assemble_from(Pol& pol,
+        const SrcTileVec& src,const zs::SmallString& srcTag,
+        DstTileVec& dst,const zs::SmallString& dstTag,const zs::SmallString& dstTopoTag) {
+            // dst doubles as the topology container, so dstTopoTag is looked up in dst
+            assemble_from(pol,src,srcTag,dst,dstTag,dst,dstTopoTag);
+
+    }
+
     // maybe we also need a weighted assemble func
 
     template<int space_dim,typename Pol,typename VTileVec>
@@ -165,7 +326,7 @@ namespace zeno { namespace TILEVEC_OPS {
             [vtemp = proxy<space>({},vtemp),tag,eps] __device__(int vi) mutable {
                 auto d = vtemp.template pack<space_dim>(tag,vi);
                 auto dn = d.norm();
-                d = dn < eps ? d/dn : zs::vec<T,space_dim>::zeros();
+                d = dn > eps ? d/dn : zs::vec<T,space_dim>::zeros();
                 vtemp.template tuple<space_dim>(tag,vi) = d;
         });
     }
diff --git a/projects/CuLagrange/geometry/kernel/topology.hpp b/projects/CuLagrange/geometry/kernel/topology.hpp
index 20b46e5f22..9590d57eb1 100644
--- a/projects/CuLagrange/geometry/kernel/topology.hpp
+++ b/projects/CuLagrange/geometry/kernel/topology.hpp
@@ -5,7 +5,9 @@
 #include "zensim/cuda/execution/ExecutionPolicy.cuh"
 #include "zensim/omp/execution/ExecutionPolicy.hpp"
 #include "zensim/container/Bvh.hpp"
-
+#include "zensim/container/Bcht.hpp"
+#include "zensim/zpc_tpls/fmt/format.h"
+#include "tiled_vector_ops.hpp"
 
 namespace zeno {
 
@@ -38,13 +40,70 @@ namespace zeno {
         return -1;
     }
 
+    // For every triangle in tris, search for a tet that contains all three of its vertices and
+    // store that tet's index, bit-cast to float, in tris[neighTag]; the entry is left at
+    // bit-cast -1 when none is found. Candidate tets come from a BVH query at the triangle
+    // centroid. Returns false (nothing is written) unless tris has a 1-channel neighTag property.
+    template<typename Pol,typename VTileVec,typename TriTileVec,typename TetTileVec>
+    bool compute_ft_neigh_topo(Pol& pol,const VTileVec& verts,TriTileVec& tris,const TetTileVec& tets,const zs::SmallString& neighTag,float bvh_thickness) {
+        using namespace zs;
+        using T = typename VTileVec::value_type;
+        constexpr auto space = zs::execspace_e::cuda;
+
+        const bool has_neigh_channel = tris.hasProperty(neighTag) && tris.getChannelSize(neighTag) == 1;
+        if(!has_neigh_channel)
+            return false;
+
+        // BVH over the tets; bvh_thickness is forwarded to retrieve_bounding_volumes
+        auto tetsBvh = LBvh<3,int,T>{};
+        auto tetBvs = retrieve_bounding_volumes(pol,verts,tets,wrapv<4>{},bvh_thickness,"x");
+        tetsBvh.build(pol,tetBvs);
+
+        size_t numTris = tris.size();
+        pol(zs::range(numTris),
+            [tets = proxy<space>({},tets),
+                verts = proxy<space>({},verts),
+                tris = proxy<space>({},tris),
+                tetsBvh = proxy<space>(tetsBvh),
+                neighTag] ZS_LAMBDA(int ti) mutable {
+                    auto triInds = tris.pack(dim_c<3>,"inds",ti).reinterpret_bits(int_c);
+                    // default entry: no neighbouring tet
+                    tris(neighTag,ti) = zs::reinterpret_bits<float>((int)-1);
+                    // centroid of the triangle, used as the BVH query point
+                    auto centroid = zs::vec<T, 3>::zeros();
+                    for(int k = 0;k != 3;++k)
+                        centroid += verts.pack(dim_c<3>,"x",triInds[k])/(T)3.0;
+                    bool found = false;
+                    tetsBvh.iter_neighbors(centroid,[&](int ntet) {
+                        if(found)
+                            return;
+                        auto tetInds = tets.pack(dim_c<4>,"inds",ntet).reinterpret_bits(int_c);
+                        // the tet qualifies only if it contains every vertex of the triangle
+                        for(int k = 0;k != 3;++k){
+                            bool shared = false;
+                            for(int j = 0;j != 4 && !shared;++j)
+                                shared = (tetInds[j] == triInds[k]);
+                            if(!shared)
+                                return;
+                        }
+                        found = true;
+                        tris(neighTag,ti) = reinterpret_bits<float>(ntet);
+                    });
+
+                    if(!found)
+                        printf("found no neighbored tet for tri[%d]\n",ti);
+        });
+
+        return true;
+    }
+
     template<typename Pol,typename VTileVec,typename TTileVec>
     bool compute_ff_neigh_topo(Pol& pol,const VTileVec& verts,TTileVec& tris,const zs::SmallString neighTag,float bvh_thickness) {
         using namespace zs;
         using T = typename VTileVec::value_type;
         using bv_t = AABBBox<3,T>;
 
-        if(!tris.hasProperty(neighTag) || (tris.getPropertySize(neighTag) != 3)){
+        if(!tris.hasProperty(neighTag) || (tris.getChannelSize(neighTag) != 3)){
             return false;
         }
 
@@ -55,12 +114,16 @@ namespace zeno {
 
         size_t nmTris = tris.size();
         // std::cout << "CALCULATE INCIDENT TRIS " << nmTris << std::endl;
+        if(!tris.hasProperty("non_manifold")) // append the flag channel only when it is not there yet (the name must match the one appended below)
+            tris.append_channels(pol,{{"non_manifold",1}});
+        
 
         pol(zs::range(nmTris),
             [tris = proxy<space>({},tris),
                     verts = proxy<space>({},verts),
                     trisBvh = proxy<space>(trisBvh),
                     neighTag] ZS_LAMBDA(int ti) mutable {
+                tris("non_manifold",ti) = (T)0;
                 auto tri = tris.template pack<3>("inds",ti).template reinterpret_bits<int>();
                 tris.template tuple<3>(neighTag,ti) = zs::vec<int,3>{-1,-1,-1}.template reinterpret_bits<float>();
                 for(int i = 0;i < 3; ++i) {
@@ -98,6 +161,7 @@ namespace zeno {
                     });
                     if(nm_found > 1) {
                         printf("found a non-manifold facet %d %d\n",ti,nm_found);
+                        tris("non_manifold",ti) = (T)1.0;
                     }
                     if(nm_found == 0) {
                         printf("found boundary facet %d\n",ti);
@@ -115,7 +179,7 @@ namespace zeno {
         using T = typename VTileVec::value_type;
         using bv_t = AABBBox<3,T>;
 
-        if(!tris.hasProperty(neighTag) || tris.getPropertySize(neighTag) != 3) 
+        if(!tris.hasProperty(neighTag) || tris.getChannelSize(neighTag) != 3) 
             return false;
 
         constexpr auto space = zs::execspace_e::cuda;
@@ -146,6 +210,34 @@ namespace zeno {
         return true;
     }
 
+    // template<typename Pol,typename VTileVec,typename ETileVec,typename PTileVec>
+    // bool compute_ep_neigh_topo(Pol& pol,const VTileVec& verts,PTileVec& points,ETileVec& edges,const zs::SmallString& neighTag,float bvh_thickness) {
+    //     using namespace zs;
+    //     using T = typename VTileVec::value_type;
+    //     using bv_t = AABBBox<3,T>;
+
+    //     if(!edges.hasProperty(neighTag) || edges.getChannelSize(neighTag) != 2)
+    //         return false;
+
+    //     constexpr auto space = zs::execspace_e::cuda;
+    //     auto edgesBvh LBvh<3,int,T>{};
+    //     auto bvs = retrieve_bounding_volumes(pol,verts,tris,wrapv<2>{},bvh_thickness,"x");
+    //     edgesBvh.build(pol,bvs);
+
+    //     pol(range(points.size()),[
+    //             verts = proxy<space>({},verts),
+    //             points = proxy<space>({},points),
+    //             edges = proxy<space>({},edges),
+    //             edgesBvh = proxy<space>(edgesBvh),
+    //             neighTag,thickness = bvh_thickness] ZS_LAMBDA(int pi) mutable {
+    //                 auto pidx = reinterpret_bits<int>(points("inds",pi));
+    //                 auto v = verts.pack(dim_c<3>,"x",pidx);
+                    
+
+    //     });
+
+    // }
+
 
     template<typename Pol,typename VTileVec,typename ETileVec,typename TTileVec>
     bool compute_fe_neigh_topo(Pol& pol,const VTileVec& verts,ETileVec& edges,TTileVec& tris,const zs::SmallString& neighTag,float bvh_thickness) {
@@ -153,10 +245,10 @@ namespace zeno {
         using T = typename VTileVec::value_type;
         using bv_t = AABBBox<3,T>;
 
-        if(!edges.hasProperty(neighTag) || edges.getPropertySize(neighTag) != 2)
+        if(!edges.hasProperty(neighTag) || edges.getChannelSize(neighTag) != 2)
             return false;
 
-        if(!tris.hasProperty(neighTag) || tris.getPropertySize(neighTag) != 3)
+        if(!tris.hasProperty(neighTag) || tris.getChannelSize(neighTag) != 3)
             return false;
 
         constexpr auto space = zs::execspace_e::cuda;
@@ -231,4 +323,295 @@ namespace zeno {
         return true;
     }
 
+
+    // void c
+
+    // Build a half-edge structure for a triangle surface mesh.
+    // The input mesh should be a manifold: every directed edge <a,b> may appear in
+    // at most one triangle, otherwise a diagnostic is printed for the duplicate.
+    //
+    // Every index channel below is stored as an int bit-reinterpreted into T.
+    //
+    // Parameters:
+    //   cudaPol  - execution policy used to launch the kernels
+    //   tris     - surface triangles; reads "inds" (3 vertex ids), writes "he_inds"
+    //              (the half-edge created for the directed edge <inds[0],inds[1]>)
+    //   lines    - surface edges; not used by the current implementation
+    //   points   - surface points; reads "inds" (vertex id), writes "he_inds"
+    //              (one half-edge starting at that point)
+    //   halfEdge - output; "to_vertex", "to_face", "next_he" and "opposite_he" are
+    //              reset to -1 and then filled.
+    //              NOTE(review): presumably sized to 3 * tris.size() by the caller -- confirm.
+    //
+    // For the half-edge created from the directed edge <tri[i],tri[(i+1)%3]>:
+    //   "to_vertex"   = index into `points` of the point whose "inds" equals tri[i]
+    //   "to_face"     = index of the owning triangle
+    //   "next_he"     = half-edge of <tri[(i+1)%3],tri[(i+2)%3]> in the same triangle
+    //   "opposite_he" = half-edge of the reversed directed edge, or -1 on a boundary
+    //
+    // Returns true (no failure is reported through the return value).
+    template<typename Pol,typename SurfTriTileVec,typename SurfEdgeTileVec,typename SurfPointTileVec,typename HalfEdgeTileVec>
+    bool build_surf_half_edge(Pol& cudaPol,SurfTriTileVec& tris,SurfEdgeTileVec& lines,SurfPointTileVec& points,HalfEdgeTileVec& halfEdge) {
+        using namespace zs;
+        using vec2i = zs::vec<int, 2>;
+        using vec3i = zs::vec<int, 3>;
+        using T = typename SurfTriTileVec::value_type;
+
+        constexpr auto space = zs::execspace_e::cuda;
+
+        // `lines` is kept in the signature for callers but is not needed here
+        (void)lines;
+
+        TILEVEC_OPS::fill(cudaPol,halfEdge,"to_vertex",reinterpret_bits<T>((int)-1));
+        TILEVEC_OPS::fill(cudaPol,halfEdge,"to_face",reinterpret_bits<T>((int)-1));
+        TILEVEC_OPS::fill(cudaPol,halfEdge,"opposite_he",reinterpret_bits<T>((int)-1));
+        TILEVEC_OPS::fill(cudaPol,halfEdge,"next_he",reinterpret_bits<T>((int)-1));
+
+        // directed edge <vertex id,vertex id> -> half-edge index
+        bcht<vec2i,int,true,universal_hash<vec2i>,32> hetab{tris.get_allocator(),tris.size() * 3};
+        // half-edge index -> owning triangle, only used by the duplicate-edge diagnostic
+        Vector<int> sfi{tris.get_allocator(),tris.size() * 3};
+        // vertex id -> slot in spi, spi[slot] -> index into `points`
+        bcht<int,int,true,universal_hash<int>,32> ptab{points.get_allocator(),points.size()};
+        Vector<int> spi{points.get_allocator(),points.size()};
+
+        cudaPol(range(points.size()),
+            [ptab = proxy<space>(ptab),points = proxy<space>({},points),spi = proxy<space>(spi)] ZS_LAMBDA(int pi) mutable {
+                auto pidx = reinterpret_bits<int>(points("inds",pi));
+                if(int no = ptab.insert(pidx);no >= 0)
+                    spi[no] = pi;
+        });
+
+        // initialize surface tri <-> halfedge connectivity
+        cudaPol(range(tris.size()),
+            [hetab = proxy<space>(hetab),
+                ptab = proxy<space>(ptab),
+                spi = proxy<space>(spi),
+                points = proxy<space>({},points),
+                halfEdge = proxy<space>({},halfEdge),
+                sfi = proxy<space>(sfi),
+                tris = proxy<space>({},tris)] ZS_LAMBDA(int ti) mutable {
+                    auto tri = tris.pack(dim_c<3>,"inds",ti).reinterpret_bits(int_c);
+                    vec3i hinds{};
+                    for(int i = 0;i != 3;++i){
+                        if(hinds[i] = hetab.insert(vec2i{tri[i],tri[(i+1)%3]});hinds[i] >= 0){
+                            auto no = hinds[i];
+                            // remember the owner so a later duplicate can report it
+                            sfi[no] = ti;
+                            if(i == 0)
+                                tris("he_inds",ti) = reinterpret_bits<T>(no);
+                            halfEdge("to_face",no) = reinterpret_bits<T>(ti);
+                            auto pno = ptab.query(tri[i]);
+                            if(pno < 0) {
+                                // the triangle references a vertex that is not a surface point
+                                printf("the vertex %d of tri[%d] is not found in the surface points\n",tri[i],ti);
+                                continue;
+                            }
+                            halfEdge("to_vertex",no) = reinterpret_bits<T>(spi[pno]);
+                            points("he_inds",spi[pno]) = reinterpret_bits<T>(no);
+                        }else {
+                            auto no = hinds[i];
+                            int pid = hetab.query(vec2i{tri[i],tri[(i+1)%3]});
+                            // diagnostic only: the owning thread may not have written sfi[pid] yet
+                            int oti = sfi[pid];
+                            printf("the same directed edge <%d %d> has been inserted twice! original sfi[%d %d] = %d, cur: %d <%d %d %d>\n",
+                                tri[i],tri[(i+1)%3],no,pid,oti,ti,tri[0],tri[1],tri[2]);
+                        }
+                    }
+
+                    for(int i = 0;i != 3;++i) {
+                        // a failed insertion owns no half-edge slot, nothing to link from it
+                        if(hinds[i] < 0)
+                            continue;
+                        const int nxt = hinds[(i+1) % 3];
+                        // stored bit-wise like every other index channel; readers use reinterpret_bits<int>
+                        halfEdge("next_he",hinds[i]) = reinterpret_bits<T>(nxt >= 0 ? nxt : (int)-1);
+                    }
+        });
+
+        // link every half-edge with the half-edge of the reversed directed edge
+        cudaPol(range(halfEdge.size()),
+            [halfEdge = proxy<space>({},halfEdge),
+                points = proxy<space>({},points),
+                hetab = proxy<space>(hetab)] ZS_LAMBDA(int hi) mutable {
+                auto curPIdx = reinterpret_bits<int>(halfEdge("to_vertex",hi));
+                auto nxtHalfEdgeIdx = reinterpret_bits<int>(halfEdge("next_he",hi));
+                // unused or incompletely initialized slot: leave it marked as boundary
+                if(curPIdx < 0 || nxtHalfEdgeIdx < 0) {
+                    halfEdge("opposite_he",hi) = reinterpret_bits<T>((int)-1);
+                    return;
+                }
+                auto nxtPIdx = reinterpret_bits<int>(halfEdge("to_vertex",nxtHalfEdgeIdx));
+                if(nxtPIdx < 0) {
+                    halfEdge("opposite_he",hi) = reinterpret_bits<T>((int)-1);
+                    return;
+                }
+                // hetab is keyed by the vertex ids found in tris("inds"), while "to_vertex"
+                // holds indices into `points`; translate back before querying
+                auto curVIdx = reinterpret_bits<int>(points("inds",curPIdx));
+                auto nxtVIdx = reinterpret_bits<int>(points("inds",nxtPIdx));
+                auto key = vec2i{nxtVIdx,curVIdx};
+
+                if(auto hno = hetab.query(key);hno >= 0) {
+                    halfEdge("opposite_he",hi) = reinterpret_bits<T>((int)hno);
+                }else {
+                    halfEdge("opposite_he",hi) = reinterpret_bits<T>((int)-1);
+                }
+        });
+
+        return true;
+
+    }
+
+    // Walk `step` times along "next_he" starting at half-edge `hei` and, when
+    // `reverse` is set, finish by jumping to the "opposite_he" of the half-edge reached.
+    //
+    // Returns the resulting half-edge index. A negative index (the -1 sentinel used
+    // for "no half-edge") is never dereferenced: as soon as one is met it is returned
+    // unchanged, so callers can test the result with `< 0`.
+    template<typename HalfEdgeTileVec>
+    constexpr int get_next_half_edge(int hei,const HalfEdgeTileVec& half_edges,int step = 1,bool reverse = false) {
+        using namespace zs;
+        // `i < step` also terminates for a non-positive step
+        for(int i = 0;i < step;++i) {
+            if(hei < 0)
+                return hei;
+            hei = reinterpret_bits<int>(half_edges("next_he",hei));
+        }
+        if(reverse && hei >= 0)
+            hei = reinterpret_bits<int>(half_edges("opposite_he",hei));
+        return hei;
+    }
+
+    // Return the "to_vertex" entry of the half-edge that follows `hei`
+    // (one "next_he" step, no jump to the opposite half-edge).
+    template<typename HalfEdgeTileVec>
+    constexpr int half_edge_get_another_vertex(int hei,const HalfEdgeTileVec& half_edges) {
+        using namespace zs;
+        // the defaults of get_next_half_edge are step = 1, reverse = false
+        const int successor = get_next_half_edge(hei,half_edges);
+        return reinterpret_bits<int>(half_edges("to_vertex",successor));
+    }
+
+    // some operation with half edge structure
+    // Collect the one-ring neighbor points around the vertex that half-edge `hei`
+    // starts from.
+    //
+    // Starting at `hei`, the walk repeatedly rotates to the next half-edge around the
+    // same vertex (two "next_he" steps followed by "opposite_he") and records
+    // half_edge_get_another_vertex() of each half-edge visited. It stops when
+    //   - the walk returns to the starting half-edge (closed ring), or
+    //   - a boundary is met (no opposite half-edge); the neighbor on the far side of
+    //     the boundary triangle is then appended and the walk ends.
+    //
+    // Returns a vector of MAX_NEIGHS entries, unused entries are -1. A message is
+    // printed when MAX_NEIGHS entries are not enough.
+    // NOTE(review): the walk only rotates in one direction, so for a boundary vertex
+    // the full ring is presumably only obtained when `hei` is the first half-edge of
+    // the fan -- confirm against the callers.
+    template<int MAX_NEIGHS,typename HalfEdgeTileVec>
+    constexpr zs::vec<int,MAX_NEIGHS> get_one_ring_neigh_points(int hei,const HalfEdgeTileVec& half_edges) {
+        using namespace zs;
+        auto res = zs::vec<int,MAX_NEIGHS>::uniform(-1);
+        auto hei0 = hei;
+        int i = 0;
+        for(i = 0;i != MAX_NEIGHS;++i) {
+            res[i] = half_edge_get_another_vertex(hei,half_edges);
+            auto nhei = get_next_half_edge(hei,half_edges,2,true);
+            if(nhei == hei0)
+                break;
+            if(nhei < 0) {
+                // boundary reached: the walk cannot continue past this triangle
+                if((i+1) < MAX_NEIGHS) {
+                    auto phei = get_next_half_edge(hei,half_edges,2,false);
+                    // half-edge index 0 is a valid index as well
+                    if(phei >= 0)
+                        res[i + 1] = reinterpret_bits<int>(half_edges("to_vertex",phei));
+                    break;
+                }
+                // no room left for the last boundary neighbor, report the overflow below
+                i = MAX_NEIGHS;
+                break;
+            }
+            hei = nhei;
+        }
+        if(i == MAX_NEIGHS)
+            printf("the max_one_ring_neighbor limit exceeds");
+
+        return res;
+    }
+
+    // Collect the "edge" entries of the half-edges visited while rotating around the
+    // vertex that `hei` starts from (two "next_he" steps followed by "opposite_he").
+    //
+    // The rotation ends when it comes back to the starting half-edge, when it meets
+    // a half-edge without opposite (-1), or after MAX_NEIGHS entries. When it ended
+    // on a missing opposite and there is still room, the "edge" entry of the
+    // half-edge two "next_he" steps ahead is appended as well.
+    //
+    // Returns a vector of MAX_NEIGHS entries, unused entries are -1.
+    // NOTE(review): build_surf_half_edge in this file does not fill an "edge"
+    // channel; presumably it is provided elsewhere -- confirm.
+    template<int MAX_NEIGHS,typename HalfEdgeTileVec>
+    constexpr zs::vec<int,MAX_NEIGHS> get_one_ring_neigh_edges(int hei,const HalfEdgeTileVec& half_edges) {
+        using namespace zs;
+        auto ring = zs::vec<int,MAX_NEIGHS>::uniform(-1);
+        const int start = hei;
+        int rotated = hei;
+        int slot = 0;
+        while(slot != MAX_NEIGHS) {
+            ring[slot] = reinterpret_bits<int>(half_edges("edge",hei));
+            rotated = get_next_half_edge(hei,half_edges,2,true);
+            if(rotated == start || rotated == -1)
+                break;
+            hei = rotated;
+            ++slot;
+        }
+        const bool stoppedAtBoundary = (rotated == -1);
+        if(stoppedAtBoundary && slot < MAX_NEIGHS-1) {
+            // append the entry of the last half-edge of the boundary triangle
+            const int tail = get_next_half_edge(hei,half_edges,2,false);
+            ring[slot + 1] = reinterpret_bits<int>(half_edges("edge",tail));
+        }
+        return ring;
+    }
+
+    // Collect the triangles around the vertex that half-edge `hei` starts from.
+    //
+    // The triangle of `hei` is recorded first; the walk then rotates to the next
+    // half-edge around the same vertex (two "next_he" steps followed by
+    // "opposite_he", the same rotation used by get_one_ring_neigh_points/edges) and
+    // records its triangle, until it is back at the start or meets a boundary.
+    //
+    // The triangle is read from "to_face", the channel build_surf_half_edge fills.
+    //
+    // Returns a vector of MAX_NEIGHS entries, unused entries are -1. A message is
+    // printed when the ring holds more than MAX_NEIGHS triangles.
+    // NOTE(review): the walk only rotates in one direction, so a boundary vertex
+    // presumably needs `hei` to be the first half-edge of its fan -- confirm.
+    template<int MAX_NEIGHS,typename HalfEdgeTileVec>
+    constexpr zs::vec<int,MAX_NEIGHS> get_one_ring_neigh_tris(int hei,const HalfEdgeTileVec& half_edges) {
+        using namespace zs;
+        static_assert(MAX_NEIGHS > 0,"get_one_ring_neigh_tris needs room for at least one triangle");
+        auto res = zs::vec<int,MAX_NEIGHS>::uniform(-1);
+        auto hei0 = hei;
+        res[0] = reinterpret_bits<int>(half_edges("to_face",hei));
+        // a single counter: the former inner `int i` shadowed the one tested after the loop
+        for(int i = 1;;++i) {
+            // rotate around the start vertex of hei
+            hei = get_next_half_edge(hei,half_edges,2,true);
+            if(hei == hei0 || hei < 0)
+                break;
+            if(i == MAX_NEIGHS) {
+                // one more triangle exists but there is no slot left for it
+                printf("the max_one_ring_neighbor limit exceeds");
+                break;
+            }
+            res[i] = reinterpret_bits<int>(half_edges("to_face",hei));
+        }
+
+        return res;
+
+    }
+
 };
\ No newline at end of file
diff --git a/projects/CuLagrange/geometry/linear_system/mfcg.hpp b/projects/CuLagrange/geometry/linear_system/mfcg.hpp
index 6527176417..225d5b022d 100644
--- a/projects/CuLagrange/geometry/linear_system/mfcg.hpp
+++ b/projects/CuLagrange/geometry/linear_system/mfcg.hpp
@@ -105,16 +105,22 @@ namespace zeno { namespace PCG {
     void prepare_block_diagonal_preconditioner(Pol &pol,const zs::SmallString& HTag,const EBufTileVec& etemp,const zs::SmallString& PTag,VBufTileVec& vtemp,bool use_block = true) {
         using namespace zs;
         constexpr auto space = execspace_e::cuda;
-        pol(zs::range(vtemp.size()),
-            [vtemp = proxy<space>({}, vtemp),PTag] ZS_LAMBDA (int vi) mutable {
-                constexpr int block_size = space_dim * space_dim;
-                vtemp.template tuple<block_size>(PTag, vi) = zs::vec<T,space_dim,space_dim>::zeros();
-        });
+        // pol(zs::range(vtemp.size()),
+        //     [vtemp = proxy<space>({}, vtemp),PTag] ZS_LAMBDA (int vi) mutable {
+        //         constexpr int block_size = space_dim * space_dim;
+        //         vtemp.template tuple<block_size>(PTag, vi) = zs::vec<T,space_dim,space_dim>::zeros();
+        // });
+        TILEVEC_OPS::fill(pol,vtemp,PTag,(T)0.0);
+
         pol(zs::range(etemp.size()),
                     [vtemp = proxy<space>({},vtemp),etemp = proxy<space>({},etemp),HTag,PTag,use_block]
                         ZS_LAMBDA(int ei) mutable{
             constexpr int h_width = space_dim * simplex_dim;
             auto inds = etemp.template pack<simplex_dim>("inds",ei).template reinterpret_bits<int>();
+            for(int i = 0;i != simplex_dim;++i)
+                if(inds[i] < 0)
+                    return;
+
             auto H = etemp.template pack<h_width,h_width>(HTag,ei);
 
             for(int vi = 0;vi != simplex_dim;++vi)
@@ -660,6 +666,11 @@ namespace zeno { namespace PCG {
                 fmt::print(fg(fmt::color::dark_cyan),"negative zTrk detected = {}\n",zTrk);
                 throw std::runtime_error("negative zTrk detected");
             }
+            if(std::isnan(zTrk)) {
+                std::cout << "nan zTrk detected = " << zTrk << std::endl;
+                fmt::print(fg(fmt::color::dark_cyan),"nan zTrk detected = {}\n",zTrk);
+                throw std::runtime_error("nan zTrk detected");
+            }
             if(residualPreconditionedNorm < localTol)
                 break;
             // H * p -> tmp