From 696527f585292a2ca90b62d885fc532f1f7d95ec Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 3 Jan 2024 19:29:44 +0100 Subject: [PATCH] Update alpaka nbody to C++20 and simplify --- examples/alpaka/nbody/CMakeLists.txt | 2 +- examples/alpaka/nbody/nbody.cpp | 58 ++++++++++++---------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt index 85ea9ea1b2..6181dcad7d 100644 --- a/examples/alpaka/nbody/CMakeLists.txt +++ b/examples/alpaka/nbody/CMakeLists.txt @@ -15,7 +15,7 @@ if (NOT TARGET llama::llama) endif() find_package(alpaka 1.0 REQUIRED) alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp) -target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt alpaka::alpaka xsimd) if (MSVC) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 5380097e99..a01f2e226e 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -43,9 +43,9 @@ constexpr auto runUpdate = true; // run update step. Useful to disable for bench #endif #if ANY_CPU_ENABLED -constexpr auto elementsPerThread = xsimd::batch::size; constexpr auto threadsPerBlock = 1; constexpr auto sharedElementsPerBlock = 1; +constexpr auto elementsPerThread = xsimd::batch::size; constexpr auto aosoaLanes = elementsPerThread; #elif ANY_GPU_ENABLED constexpr auto threadsPerBlock = 256; @@ -101,9 +101,6 @@ struct llama::SimdTraits::value>> } }; -template -using MakeBatch = xsimd::batch; - template struct MakeSizedBatchImpl { @@ -166,66 +163,60 @@ LLAMA_FN_HOST_ACC_INLINE void pPInteraction(const Acc& acc, ParticleRefI& pis, P pis(tag::Vel{}) += dist * sts; } -template +template struct UpdateKernel { - template - ALPAKA_FN_HOST_ACC void operator()(const Acc& acc, View particles) const + ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const { auto sharedView = [&] { // if there is only 1 shared element per block, use just a variable (in registers) instead of shared memory - if constexpr(sharedElementsPerBlock == 1) + if constexpr(SharedElementsPerBlock == 1) { using Mapping = llama::mapping::MinAlignedOne, SharedMemoryParticle>; return allocViewUninitialized(Mapping{}, llama::bloballoc::Array{}); } else { - using ArrayExtents = llama::ArrayExtents; - using Mapping = typename QuotedSMMapping::template fn; - constexpr auto sharedMapping = Mapping{}; - - llama::Array sharedMems{}; - boost::mp11::mp_for_each>( - [&](auto i) - { - auto& sharedMem = alpaka::declareSharedVar(acc); - sharedMems[i] = &sharedMem[0]; - }); - return llama::View{sharedMapping, sharedMems}; + using Mapping = typename QuotedSMMapping:: + template fn, SharedMemoryParticle>; + return [&](std::index_sequence) + { + return llama::View{ + Mapping{}, + llama::Array{alpaka::declareSharedVar(acc)...}}; + }(std::make_index_sequence{}); } }(); const auto ti = alpaka::getIdx(acc)[0]; const auto tbi = alpaka::getIdx(acc)[0]; - auto pis = llama::SimdN{}; - llama::loadSimd(particles(ti * Elems), pis); + auto pis = llama::SimdN{}; + llama::loadSimd(particles(ti * ElementsPerThread), pis); - for(int blockOffset = 0; blockOffset < problemSize; blockOffset += sharedElementsPerBlock) + for(int blockOffset = 0; blockOffset < problemSize; blockOffset += SharedElementsPerBlock) { - for(int j = 0; j < sharedElementsPerBlock; j += threadsPerBlock) + for(int j = 0; j < SharedElementsPerBlock; j += ThreadsPerBlock) sharedView(j) = particles(blockOffset + tbi + j); alpaka::syncBlockThreads(acc); - for(int j = 0; j < sharedElementsPerBlock; ++j) + for(int j = 0; j < SharedElementsPerBlock; ++j) pPInteraction(acc, pis, sharedView(j)); alpaka::syncBlockThreads(acc); } - llama::storeSimd(pis(tag::Vel{}), particles(ti * Elems)(tag::Vel{})); + llama::storeSimd(pis(tag::Vel{}), particles(ti * ElementsPerThread)(tag::Vel{})); } }; -template +template struct MoveKernel { - template - ALPAKA_FN_HOST_ACC void operator()(const Acc& acc, View particles) const + ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const { const auto ti = alpaka::getIdx(acc)[0]; - const auto i = ti * Elems; - llama::SimdN pos; - llama::SimdN vel; + const auto i = ti * ElementsPerThread; + llama::SimdN pos; + llama::SimdN vel; llama::loadSimd(particles(i)(tag::Pos{}), pos); llama::loadSimd(particles(i)(tag::Vel{}), vel); llama::storeSimd(pos + vel * +timestep, particles(i)(tag::Pos{})); @@ -354,7 +345,8 @@ void run(std::ostream& plotFile) { if constexpr(runUpdate) { - auto updateKernel = UpdateKernel{}; + auto updateKernel + = UpdateKernel{}; alpaka::exec(queue, workdiv, updateKernel, llama::shallowCopy(accView)); statsUpdate(watch.printAndReset("update", '\t')); }