From 57671b07958e3c12f99caf3dbc27970ac6c4e07d Mon Sep 17 00:00:00 2001 From: Georgiana Mania Date: Wed, 21 Dec 2022 15:01:05 +0100 Subject: [PATCH 1/4] initial commit --- CMakeLists.txt | 2 +- src/main.cpp | 5 + src/vecpar/VecparStream.cpp | 275 ++++++++++++++++++++++++++++++++++++ src/vecpar/VecparStream.hpp | 170 ++++++++++++++++++++++ src/vecpar/model.cmake | 147 +++++++++++++++++++ 5 files changed, 598 insertions(+), 1 deletion(-) create mode 100644 src/vecpar/VecparStream.cpp create mode 100644 src/vecpar/VecparStream.hpp create mode 100644 src/vecpar/model.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 54034ee1..005f32c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,7 +90,7 @@ register_model(acc ACC ACCStream.cpp) register_model(raja USE_RAJA RAJAStream.cpp) register_model(tbb TBB TBBStream.cpp) register_model(thrust THRUST ThrustStream.cu) # Thrust uses cu, even for rocThrust - +register_model(vecpar VECPAR VecparStream.cpp) set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model") diff --git a/src/main.cpp b/src/main.cpp index 3035da0c..5e8f1583 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,6 +49,8 @@ #include "SYCLStream2020.h" #elif defined(OMP) #include "OMPStream.h" +#elif defined(VECPAR) +#include "VecparStream.hpp" #endif // Default size of 2^25 @@ -298,6 +300,9 @@ void run() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, deviceIndex); +#elif defined(VECPAR) + // Use vecpar implementation + stream = new VecparStream(ARRAY_SIZE, deviceIndex); #endif stream->init_arrays(startA, startB, startC); diff --git a/src/vecpar/VecparStream.cpp b/src/vecpar/VecparStream.cpp new file mode 100644 index 00000000..f9822e57 --- /dev/null +++ b/src/vecpar/VecparStream.cpp @@ -0,0 +1,275 @@ +#include "VecparStream.hpp" + +template +VecparStream::VecparStream(const int ARRAY_SIZE, int device) +{ + array_size = ARRAY_SIZE; + + // Allocate on the host + this->a = new vecmem::vector(array_size, &memoryResource); + this->b = new vecmem::vector(array_size, &memoryResource); + this->c = new vecmem::vector(array_size, &memoryResource); +} + +template +VecparStream::~VecparStream() +{ + free(a); + free(b); + free(c); +} + +template +void VecparStream::init_arrays(T initA, T initB, T initC) +{ + int array_size = this->array_size; + + for (int i = 0; i < array_size; i++) + { + a->at(i) = initA; + b->at(i) = initB; + c->at(i) = initC; + } + +#if defined(VECPAR_GPU) && defined(DEFAULT) + d_a = copy_tool.to(vecmem::get_data(*a), + dev_mem, + vecmem::copy::type::host_to_device); + d_b = copy_tool.to(vecmem::get_data(*b), + dev_mem, + vecmem::copy::type::host_to_device); + d_c = copy_tool.to(vecmem::get_data(*c), + dev_mem, + vecmem::copy::type::host_to_device); +#endif +} + +template +void VecparStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +{ + +#if defined(VECPAR_GPU) && defined(DEFAULT) + copy_tool(d_a, *a, vecmem::copy::type::device_to_host); + copy_tool(d_b, *b, vecmem::copy::type::device_to_host); + copy_tool(d_c, *c, vecmem::copy::type::device_to_host); +#endif + + for (int i = 0; i < array_size; i++) + { + h_a[i] = a->at(i); + h_b[i] = b->at(i); + h_c[i] = c->at(i); + } +} + +template +void VecparStream::copy() +{ +#if defined(SINGLE_SOURCE) // gpu+managed, cpu + vecpar_copy algorithm; + vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a); +#else + #ifdef VECPAR_GPU + vecpar::cuda::parallel_map( + array_size, + [=] __device__ (int idx, + vecmem::data::vector_view &c_view, + 
vecmem::data::vector_view<T> &a_view) {
+          vecmem::device_vector<T> dc(c_view);
+          vecmem::device_vector<T> da(a_view);
+          dc[idx] = da[idx];
+        },
+        vecmem::get_data(d_c),
+        vecmem::get_data(d_a));
+  #else // lambda
+    vecpar::omp::parallel_map(array_size,
+        [&] (int idx) { c->at(idx) = a->at(idx);});
+  #endif
+#endif
+}
+
+template <class T>
+void VecparStream<T>::mul()
+{
+  const T scalar = startScalar;
+#if defined(SINGLE_SOURCE)
+  vecpar_mul<T> algorithm;
+  vecpar::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar);
+#else
+  #ifdef VECPAR_GPU
+    vecpar::cuda::parallel_map(
+        array_size,
+        [=] __device__ (int idx,
+                        vecmem::data::vector_view<T> &b_view,
+                        vecmem::data::vector_view<T> &c_view,
+                        T dscalar) {
+          vecmem::device_vector<T> db(b_view);
+          vecmem::device_vector<T> dc(c_view);
+          db[idx] = dscalar * dc[idx];
+        },
+        vecmem::get_data(d_b),
+        vecmem::get_data(d_c),
+        scalar);
+  #else
+    vecpar::omp::parallel_map(array_size,
+        [&] (int idx) { b->at(idx) = scalar * c->at(idx);});
+  #endif
+#endif
+}
+
+template <class T>
+void VecparStream<T>::add()
+{
+#if defined(SINGLE_SOURCE)
+  vecpar_add<T> algorithm;
+  vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a, *b);
+#else //defined (DEFAULT)
+  #ifdef VECPAR_GPU
+    vecpar::cuda::parallel_map(
+        array_size,
+        [=] __device__ (int idx,
+                        vecmem::data::vector_view<T> &a_view,
+                        vecmem::data::vector_view<T> &b_view,
+                        vecmem::data::vector_view<T> &c_view) {
+          vecmem::device_vector<T> da(a_view);
+          vecmem::device_vector<T> db(b_view);
+          vecmem::device_vector<T> dc(c_view);
+          dc[idx] = da[idx] + db[idx];
+        },
+        vecmem::get_data(d_a),
+        vecmem::get_data(d_b),
+        vecmem::get_data(d_c));
+  #else
+    vecpar::omp::parallel_map(array_size,
+        [&] (int idx) { c->at(idx) = a->at(idx) + b->at(idx);});
+  #endif
+#endif
+}
+
+template <class T>
+void VecparStream<T>::triad()
+{
+  const T scalar = startScalar;
+  int array_size = this->array_size;
+
+  #if defined(SINGLE_SOURCE)
+    vecpar_triad<T> algorithm;
+    vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar);
+  #else //defined (DEFAULT)
+    #if defined (VECPAR_GPU)
+      vecpar::cuda::parallel_map(
+          array_size,
+          [=] __device__ (int idx,
+                          vecmem::data::vector_view<T> &a_view,
+                          vecmem::data::vector_view<T> &b_view,
+                          vecmem::data::vector_view<T> &c_view,
+                          T dscalar) {
+            vecmem::device_vector<T> da(a_view);
+            vecmem::device_vector<T> db(b_view);
+            vecmem::device_vector<T> dc(c_view);
+            da[idx] = db[idx] + dscalar * dc[idx];
+          },
+          vecmem::get_data(d_a),
+          vecmem::get_data(d_b),
+          vecmem::get_data(d_c),
+          scalar);
+    #else
+      vecpar::omp::parallel_map(array_size,
+          [&] (int idx) {
+              a->at(idx) = b->at(idx) + scalar * c->at(idx); });
+    #endif
+  #endif
+}
+
+template <class T>
+void VecparStream<T>::nstream()
+{
+  const T scalar = startScalar;
+
+  #if defined(SINGLE_SOURCE)
+    vecpar_nstream<T> algorithm;
+    vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar);
+  #else
+    #ifdef VECPAR_GPU
+      vecpar::cuda::parallel_map(
+          array_size,
+          [=] __device__ (int idx,
+                          vecmem::data::vector_view<T> &a_view,
+                          vecmem::data::vector_view<T> &b_view,
+                          vecmem::data::vector_view<T> &c_view,
+                          T dscalar) {
+            vecmem::device_vector<T> da(a_view);
+            vecmem::device_vector<T> db(b_view);
+            vecmem::device_vector<T> dc(c_view);
+            da[idx] += db[idx] + dscalar * dc[idx];
+          },
+          vecmem::get_data(d_a),
+          vecmem::get_data(d_b),
+          vecmem::get_data(d_c),
+          scalar);
+    #else
+      vecpar::omp::parallel_map(array_size,
+          [&] (int idx) { a->at(idx) = b->at(idx) + scalar * c->at(idx);});
+    #endif
+#endif
+}
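+
+// dot() below is the only kernel that reduces to a scalar: the single-source
+// path delegates to vecpar's map-reduce algorithm, while the non-single-source
+// GPU path accumulates the per-element products into one device-side value
+// with atomicAdd. The `lock` and `size` parameters follow the functor
+// signature that vecpar::cuda::offload_reduce invokes; `lock` is unused by
+// this particular reduction.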
+template <class T>
+T VecparStream<T>::dot()
+{
+  T* sum = new T();
+  *sum = 0.0;
+#if defined(SINGLE_SOURCE)
+  vecpar_dot<T> algorithm;
+  *sum = vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b);
+#else
+  #ifdef VECPAR_GPU
+    T* dsum;
+    cudaMalloc(&dsum, sizeof(T));
+    vecpar::cuda::offload_reduce(array_size,
+        [=](int* lock, int size, T* dsum,
+            vecmem::data::vector_view<T> a_view,
+            vecmem::data::vector_view<T> b_view) {
+          vecmem::device_vector<T> da(a_view);
+          vecmem::device_vector<T> db(b_view);
+          size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+          if (idx >= size)
+            return;
+          atomicAdd(dsum, (da[idx]*db[idx]));
+        }, array_size, dsum,
+        vecmem::get_data(d_a),
+        vecmem::get_data(d_b));
+    cudaMemcpy(sum, dsum, sizeof(T), cudaMemcpyDeviceToHost);
+    cudaFree(dsum);
+  #else
+    vecmem::vector<T> result(array_size, &memoryResource);
+    vecpar::omp::parallel_map(array_size,
+        [&] (int idx) { result.at(idx) = a->at(idx) * b->at(idx); });
+    vecpar::omp::parallel_reduce(array_size, sum,
+        [&] (T* sum, T& value) { *sum += value; }, result);
+  #endif
+#endif
+  return *sum;
+}
+
+void listDevices(void)
+{
+#ifdef VECPAR_GPU
+  std::cout << "Not implemented yet" << std::endl;
+#else
+  std::cout << "0: CPU" << std::endl;
+#endif
+}
+
+std::string getDeviceName(const int)
+{
+  return std::string("Device name unavailable");
+}
+
+std::string getDeviceDriver(const int)
+{
+  return std::string("Device driver unavailable");
+}
+
+template class VecparStream<float>;
+template class VecparStream<double>;
\ No newline at end of file
diff --git a/src/vecpar/VecparStream.hpp b/src/vecpar/VecparStream.hpp
new file mode 100644
index 00000000..27aa6e28
--- /dev/null
+++ b/src/vecpar/VecparStream.hpp
@@ -0,0 +1,170 @@
+#pragma once
+
+#include <iostream>
+
+#include <vecmem/containers/vector.hpp>
+#include <vecmem/memory/host_memory_resource.hpp>
+
+#include "Stream.h"
+
+#ifdef VECPAR_GPU
+#include "cuda.h"
+#include <vecmem/containers/device_vector.hpp>
+#include <vecmem/containers/data/vector_buffer.hpp>
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+#include <vecmem/memory/cuda/managed_memory_resource.hpp>
+#include <vecmem/utils/cuda/copy.hpp>
+#endif
+
+#include <vecpar/all/main.hpp>
+
+#if defined(VECPAR_GPU) && defined(MANAGED)
+  #define IMPLEMENTATION_STRING "vecpar_gpu_single_source_managed_memory"
+  #define SINGLE_SOURCE "1"
+#elif defined(VECPAR_GPU) && defined(DEFAULT)
+  #define IMPLEMENTATION_STRING "vecpar_gpu_host_device_memory"
+#else
+//##elif defined(MANAGED)
+  #define IMPLEMENTATION_STRING "vecpar_cpu_single_source"
+  #define SINGLE_SOURCE "1"
+//#else
+  // #define IMPLEMENTATION_STRING "vecpar_cpu_host_memory"
+#endif
+
+
+template <class T>
+class VecparStream : public Stream<T>
+{
+protected:
+  // Size of arrays
+  int array_size;
+
+  // Host side pointers or managed memory
+  vecmem::vector<T> *a;
+  vecmem::vector<T> *b;
+  vecmem::vector<T> *c;
+
+#if defined(VECPAR_GPU) && defined(MANAGED)
+  vecmem::cuda::managed_memory_resource memoryResource;
+#elif defined(VECPAR_GPU) && defined(DEFAULT)
+  vecmem::host_memory_resource memoryResource;
+  vecmem::cuda::device_memory_resource dev_mem;
+  vecmem::cuda::copy copy_tool;
+
+  vecmem::data::vector_buffer<T> d_a;
+  vecmem::data::vector_buffer<T> d_b;
+  vecmem::data::vector_buffer<T> d_c;
+#else
+  vecmem::host_memory_resource memoryResource;
+#endif
+
+public:
+  VecparStream(const int, int);
+  ~VecparStream();
+
+  virtual void copy() override;
+  virtual void add() override;
+  virtual void mul() override;
+  virtual void triad() override;
+  virtual void nstream() override;
+  virtual T dot() override;
+
+  virtual void init_arrays(T initA, T initB, T initC) override;
+  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+};
+
+/// if SHARED MEMORY is set, then vecpar single source code can be used;
+/// define one algorithm per function
+#ifdef MANAGED
+ template <class T>
+ struct vecpar_triad :
+     public vecpar::algorithm::parallelizable_mmap<
+         vecpar::collection::Three,
+         vecmem::vector<T>, // a
+         vecmem::vector<T>, // b
+         
vecmem::vector, // c + const T // scalar + > { + TARGET T& map(T& a_i, const T& b_i, const T& c_i, const T scalar) const { + a_i = b_i + scalar * c_i; + return a_i; + } + }; + +template +struct vecpar_add : + public vecpar::algorithm::parallelizable_mmap< + vecpar::collection::Three, + vecmem::vector, // c + vecmem::vector, // a + vecmem::vector> // b + { + TARGET T& map(T& c_i, const T& a_i, const T& b_i) const { + c_i = a_i + b_i ; + return c_i; + } +}; + +template +struct vecpar_mul: + public vecpar::algorithm::parallelizable_mmap< + vecpar::collection::Two, + vecmem::vector, // b + vecmem::vector, // c + const T > // scalar + { + TARGET T& map(T& b_i, const T& c_i, const T scalar) const { + b_i = scalar * c_i ; + return b_i; + } +}; + +template +struct vecpar_copy: + public vecpar::algorithm::parallelizable_mmap< + vecpar::collection::Two, + vecmem::vector, // c + vecmem::vector> // a +{ + TARGET T& map(T& c_i, const T& a_i) const { + c_i = a_i; + return c_i; + } +}; + +template +struct vecpar_dot: + public vecpar::algorithm::parallelizable_map_reduce< + vecpar::collection::Two, + T, // reduction result + vecmem::vector, // map result + vecmem::vector, // a + vecmem::vector> // b +{ + TARGET T& map(T& result, T& a_i, const T& b_i) const { + result = a_i * b_i; + return result; + } + + TARGET T* reduce(T* result, T& crt) const { + *result += crt; + return result; + } +}; + +template +struct vecpar_nstream : public vecpar::algorithm::parallelizable_mmap< + vecpar::collection::Three, + vecmem::vector, // a + vecmem::vector, // b + vecmem::vector, // c + const T> // scalar +{ + TARGET T& map(T& a_i, const T& b_i, const T& c_i, const T scalar) const { + a_i += b_i + scalar * c_i; + return a_i; + } + +}; +#endif + diff --git a/src/vecpar/model.cmake b/src/vecpar/model.cmake new file mode 100644 index 00000000..cd979df1 --- /dev/null +++ b/src/vecpar/model.cmake @@ -0,0 +1,147 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that supports OpenMP as per CMake detection (and offloading if enabled with `OFFLOAD`)" + "c++") + +register_flag_optional(ARCH + "This overrides CMake's CMAKE_SYSTEM_PROCESSOR detection which uses (uname -p), this is mainly for use with + specialised accelerators only and not to be confused with offload which is is mutually exclusive with this. + Supported values are: + - NEC" + "") + +register_flag_optional(OFFLOAD + "Whether to use OpenMP offload, the format is |ON|OFF. + We support a small set of known offload flags for clang, gcc, and icpx. + However, as offload support is rapidly evolving, we recommend you directly supply them via OFFLOAD_FLAGS. + For example: + * OFFLOAD=NVIDIA:sm_60 + * OFFLOAD=ON OFFLOAD_FLAGS=..." + OFF) + +register_flag_optional(OFFLOAD_FLAGS + "If OFFLOAD is enabled, this *overrides* the default offload flags" + "") + +register_flag_optional(OFFLOAD_APPEND_LINK_FLAG + "If enabled, this appends all resolved offload flags (OFFLOAD= or directly from OFFLOAD_FLAGS) to the link flags. + This is required for most offload implementations so that offload libraries can linked correctly." + ON) + +#register_flag_optional(VECPAR_BACKEND + # "Valid values: + # * VECPAR_BACKEND=OFF (default) + # * VECPAR_BACKEND=CUDA + # * VECPAR_BACKEND=OMPT" OFF) + +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. 
+ MANAGED - use CUDA Managed Memory" + "DEFAULT") + +set(VECPAR_FLAGS_CLANG -fopenmp) +set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA --language=cuda) +#set(VECPAR_BACKEND "CUDA") + +macro(setup) + + set(CMAKE_CXX_STANDARD 20) + set(LINKER_LANGUAGE CXX) + register_definitions(${MEM}) + + string(TOUPPER ${CMAKE_CXX_COMPILER_ID} COMPILER) + find_package(vecpar REQUIRED 0.0.3) + + find_package(OpenMP QUIET) + #find_package(vecmem QUIET) + + + if (("${OFFLOAD}" STREQUAL OFF) OR (NOT DEFINED OFFLOAD)) + # no offload + + # CPU OpenMP backend can be built by either GCC or Clang + register_link_library(OpenMP::OpenMP_CXX) + + list(APPEND VECPAR_FLAGS -fopenmp) + # resolve the CPU specific flags + # set(VECPAR_FLAGS -fopenmp) + # set(VECPAR_LINK_FLAGS -fopenmp) + # register_append_compiler_and_arch_specific_cxx_flags( + # VECPAR_FLAGS_CPU + # ${COMPILER} + # ${ARCH} + # ) + + # register_append_compiler_and_arch_specific_link_flags( + # VECPAR_LINK_FLAGS_CPU + # ${COMPILER} + # ${ARCH} + #) + + elseif ("${OFFLOAD}" STREQUAL ON) + # offload but with custom flags + find_package(CUDAToolkit QUIET) + register_definitions(VECPAR_GPU) + separate_arguments(OFFLOAD_FLAGS) + set(VECPAR_FLAGS ${OFFLOAD_FLAGS}) + register_link_library(CUDA::cudart) + register_link_library(vecmem::cuda) + register_link_library(vecpar::all) + elseif ((DEFINED OFFLOAD) AND OFFLOAD_FLAGS) + # offload but OFFLOAD_FLAGS overrides + find_package(CUDAToolkit QUIET) + register_definitions(VECPAR_GPU) + separate_arguments(OFFLOAD_FLAGS) + list(VECPAR_FLAGS APPEND ${OFFLOAD_FLAGS}) + register_link_library(CUDA::cudart) + register_link_library(vecmem::cuda) + register_link_library(vecpar::all) + else () + find_package(CUDAToolkit QUIET) + register_definitions(VECPAR_GPU) + # list(APPEND VECPAR_FLAGS "-x cuda") + + # handle the vendor:arch value + string(REPLACE ":" ";" OFFLOAD_TUPLE "${OFFLOAD}") + + list(LENGTH OFFLOAD_TUPLE LEN) + if (LEN EQUAL 1) + # offload with tuple + list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR) + # append VECPAR_FLAGS_OFFLOAD_ if exists + list(APPEND VECPAR_FLAGS ${VECPAR_FLAGS_OFFLOAD_${OFFLOAD_VENDOR}}) + + elseif (LEN EQUAL 2) + # offload with tuple + list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR) + list(GET OFFLOAD_TUPLE 1 OFFLOAD_ARCH) + + # append VECPAR_FLAGS_OFFLOAD__ if exists + list(APPEND VECPAR_FLAGS ${VECPAR_FLAGS_OFFLOAD_${COMPILER}_${OFFLOAD_VENDOR}}) + # append offload arch if VECPAR_FLAGS_OFFLOAD__ARCH_FLAG if exists + if (DEFINED VECPAR_FLAGS_OFFLOAD_${COMPILER}_ARCH_FLAG) + list(APPEND VECPAR_FLAGS + ${VECPAR_FLAGS_OFFLOAD_${COMPILER}_ARCH_FLAG}${OFFLOAD_ARCH}) + endif () + list(APPEND VECPAR_FLAGS --offload-arch=${OFFLOAD_ARCH}) + else () + message(FATAL_ERROR "Unrecognised OFFLOAD format: `${OFFLOAD}`, consider directly using OFFLOAD_FLAGS") + endif () + + register_link_library(CUDA::cudart) + register_link_library(vecmem::cuda) + endif () + + + register_link_library(vecpar::all) + register_link_library(vecmem::core) + + # propagate flags to linker so that it links with the offload stuff as well + register_append_cxx_flags(ANY ${VECPAR_FLAGS}) +# if (OFFLOAD_APPEND_LINK_FLAG) + # register_append_link_flags(${VECPAR_FLAGS}) + # endif () + +endmacro() + + From 8c0114a2d49c49531f7c2abc9d626468187bcac5 Mon Sep 17 00:00:00 2001 From: Georgiana Mania Date: Mon, 27 Feb 2023 17:24:11 +0100 Subject: [PATCH 2/4] add vecpar ompt --- src/vecpar/VecparStream.cpp | 65 ++++++++++++++++++++++++--- src/vecpar/VecparStream.hpp | 88 +++++++++++++++++++++++-------------- src/vecpar/model.cmake | 68 ++++++++++++++-------------- 3 
files changed, 149 insertions(+), 72 deletions(-) diff --git a/src/vecpar/VecparStream.cpp b/src/vecpar/VecparStream.cpp index f9822e57..88c092c1 100644 --- a/src/vecpar/VecparStream.cpp +++ b/src/vecpar/VecparStream.cpp @@ -9,11 +9,24 @@ VecparStream::VecparStream(const int ARRAY_SIZE, int device) this->a = new vecmem::vector(array_size, &memoryResource); this->b = new vecmem::vector(array_size, &memoryResource); this->c = new vecmem::vector(array_size, &memoryResource); + +#if defined(VECPAR_GPU) and defined(OMPT) + d_a = a->data(); + d_b = b->data(); + d_c = c->data(); + + #pragma omp target enter data map(alloc: d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} +#endif } template VecparStream::~VecparStream() { +#if defined(VECPAR_GPU) and defined(OMPT) +#pragma omp target exit data map(release: d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} +#endif free(a); free(b); free(c); @@ -31,7 +44,8 @@ void VecparStream::init_arrays(T initA, T initB, T initC) c->at(i) = initC; } -#if defined(VECPAR_GPU) && defined(DEFAULT) +#if defined(VECPAR_GPU) and defined(DEFAULT) +#if defined(NATIVE) d_a = copy_tool.to(vecmem::get_data(*a), dev_mem, vecmem::copy::type::host_to_device); @@ -41,6 +55,13 @@ void VecparStream::init_arrays(T initA, T initB, T initC) d_c = copy_tool.to(vecmem::get_data(*c), dev_mem, vecmem::copy::type::host_to_device); +#else + d_a = a->data(); + d_b = b->data(); + d_c = c->data(); +#pragma omp target update to(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} +#endif #endif } @@ -48,10 +69,18 @@ template void VecparStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { -#if defined(VECPAR_GPU) && defined(DEFAULT) +#if defined(VECPAR_GPU) and defined(DEFAULT) +#if defined(NATIVE) copy_tool(d_a, *a, vecmem::copy::type::device_to_host); copy_tool(d_b, *b, vecmem::copy::type::device_to_host); copy_tool(d_c, *c, vecmem::copy::type::device_to_host); +#else + d_a = a->data(); + d_b = b->data(); + d_c = c->data(); +#pragma omp target update from(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} +#endif #endif for (int i = 0; i < array_size; i++) @@ -65,9 +94,13 @@ void VecparStream::read_arrays(std::vector& h_a, std::vector& h_b, std: template void VecparStream::copy() { -#if defined(SINGLE_SOURCE) // gpu+managed, cpu +#if defined(SINGLE_SOURCE) // gpu+managed vecpar_copy algorithm; - vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a); +#if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a); +#else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a); +#endif #else #ifdef VECPAR_GPU vecpar::cuda::parallel_map( @@ -94,7 +127,11 @@ void VecparStream::mul() const T scalar = startScalar; #if defined(SINGLE_SOURCE) vecpar_mul algorithm; +#if defined(NATIVE) // omp + cuda vecpar::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); +#else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); +#endif #else #ifdef VECPAR_GPU vecpar::cuda::parallel_map( @@ -122,7 +159,11 @@ void VecparStream::add() { #if defined(SINGLE_SOURCE) vecpar_add algorithm; +#if defined(NATIVE) // omp + cuda vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); +#else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); +#endif #else //defined (DEFAULT) #ifdef VECPAR_GPU vecpar::cuda::parallel_map( @@ -154,7 +195,11 @@ void VecparStream::triad() #if defined(SINGLE_SOURCE) vecpar_triad algorithm; - 
vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#endif #else //defined (DEFAULT) #if defined (VECPAR_GPU) vecpar::cuda::parallel_map( @@ -188,7 +233,11 @@ void VecparStream::nstream() #if defined(SINGLE_SOURCE) vecpar_nstream algorithm; - vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); +#endif #else #ifdef VECPAR_GPU vecpar::cuda::parallel_map( @@ -221,7 +270,11 @@ T VecparStream::dot() *sum = 0.0; #if defined(SINGLE_SOURCE) vecpar_dot algorithm; +#if defined(NATIVE) // omp + cuda *sum = vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b); +#else + *sum = vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b); +#endif #else #ifdef VECPAR_GPU T* dsum; diff --git a/src/vecpar/VecparStream.hpp b/src/vecpar/VecparStream.hpp index 27aa6e28..8f705073 100644 --- a/src/vecpar/VecparStream.hpp +++ b/src/vecpar/VecparStream.hpp @@ -3,34 +3,46 @@ #include #include +#include #include #include "Stream.h" -#ifdef VECPAR_GPU -#include "cuda.h" -#include -#include -#include -#include -#include +#if defined(VECPAR_GPU) + #if defined(NATIVE) + #include "cuda.h" + #include + #include + #include + #include + #include + #else + #include + #endif #endif #include -#if defined(VECPAR_GPU) && defined(MANAGED) - #define IMPLEMENTATION_STRING "vecpar_gpu_single_source_managed_memory" - #define SINGLE_SOURCE "1" -#elif defined(VECPAR_GPU) && defined(DEFAULT) - #define IMPLEMENTATION_STRING "vecpar_gpu_host_device_memory" +//backend = NATIVE/ompt, memory = default/managed, offload=0/1 +#if defined(NATIVE) and defined(DEFAULT) and defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_cuda_hostdevice" +#elif defined(NATIVE) and defined(DEFAULT) and !defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_omp_hostmemory" +#elif defined(NATIVE) and defined(MANAGED) and defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_cuda_singlesource_managedmemory" + #define SINGLE_SOURCE 1 +#elif defined(NATIVE) and defined(MANAGED) and !defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_omp_singlesource_managedmemory" + #define SINGLE_SOURCE 1 +#elif defined(OMPT) and defined(DEFAULT) and defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_ompt_gpu_singlesource_hostdevice" + #define SINGLE_SOURCE 1 +#elif defined(OMPT) and defined(DEFAULT) and !defined(VECPAR_GPU) + #define IMPLEMENTATION_STRING "vecpar_ompt_cpu_singlesource_hostmemory" + #define SINGLE_SOURCE 1 #else -//##elif defined(MANAGED) - #define IMPLEMENTATION_STRING "vecpar_cpu_single_source" - #define SINGLE_SOURCE "1" -//#else - // #define IMPLEMENTATION_STRING "vecpar_cpu_host_memory" -#endif - + #define IMPLEMENTATION_STRING "NOT_RELEVANT" + #endif template class VecparStream : public Stream @@ -47,13 +59,20 @@ class VecparStream : public Stream #if defined(VECPAR_GPU) && defined(MANAGED) vecmem::cuda::managed_memory_resource memoryResource; #elif defined(VECPAR_GPU) && defined(DEFAULT) - vecmem::host_memory_resource memoryResource; - vecmem::cuda::device_memory_resource dev_mem; - vecmem::cuda::copy copy_tool; - - vecmem::data::vector_buffer d_a; - 
vecmem::data::vector_buffer d_b; - vecmem::data::vector_buffer d_c; + #if defined(NATIVE) + vecmem::host_memory_resource memoryResource; + vecmem::cuda::device_memory_resource dev_mem; + vecmem::cuda::copy copy_tool; + + vecmem::data::vector_buffer d_a; + vecmem::data::vector_buffer d_b; + vecmem::data::vector_buffer d_c; + #else + vecmem::host_memory_resource memoryResource; + T* d_a; + T* d_b; + T* d_c; + #endif #else vecmem::host_memory_resource memoryResource; #endif @@ -73,9 +92,10 @@ class VecparStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; + /// if SHARED MEMORY is set, then vecpar single source code can be used; /// define one algorithm per function -#ifdef MANAGED +//#ifdef MANAGED template struct vecpar_triad : public vecpar::algorithm::parallelizable_mmap< @@ -85,7 +105,7 @@ class VecparStream : public Stream vecmem::vector, // c const T // scalar > { - TARGET T& map(T& a_i, const T& b_i, const T& c_i, const T scalar) const { + TARGET T& mapping_function(T& a_i, const T& b_i, const T& c_i, const T scalar) const { a_i = b_i + scalar * c_i; return a_i; } @@ -99,7 +119,7 @@ struct vecpar_add : vecmem::vector, // a vecmem::vector> // b { - TARGET T& map(T& c_i, const T& a_i, const T& b_i) const { + TARGET T& mapping_function(T& c_i, const T& a_i, const T& b_i) const { c_i = a_i + b_i ; return c_i; } @@ -113,7 +133,7 @@ struct vecpar_mul: vecmem::vector, // c const T > // scalar { - TARGET T& map(T& b_i, const T& c_i, const T scalar) const { + TARGET T& mapping_function(T& b_i, const T& c_i, const T scalar) const { b_i = scalar * c_i ; return b_i; } @@ -126,7 +146,7 @@ struct vecpar_copy: vecmem::vector, // c vecmem::vector> // a { - TARGET T& map(T& c_i, const T& a_i) const { + TARGET T& mapping_function(T& c_i, const T& a_i) const { c_i = a_i; return c_i; } @@ -141,12 +161,12 @@ struct vecpar_dot: vecmem::vector, // a vecmem::vector> // b { - TARGET T& map(T& result, T& a_i, const T& b_i) const { + TARGET T& mapping_function(T& result, T& a_i, const T& b_i) const { result = a_i * b_i; return result; } - TARGET T* reduce(T* result, T& crt) const { + TARGET T* reducing_function(T* result, T& crt) const { *result += crt; return result; } @@ -160,7 +180,7 @@ struct vecpar_nstream : public vecpar::algorithm::parallelizable_mmap< vecmem::vector, // c const T> // scalar { - TARGET T& map(T& a_i, const T& b_i, const T& c_i, const T scalar) const { + TARGET T& mapping_function(T& a_i, const T& b_i, const T& c_i, const T scalar) const { a_i += b_i + scalar * c_i; return a_i; } diff --git a/src/vecpar/model.cmake b/src/vecpar/model.cmake index cd979df1..a7168678 100644 --- a/src/vecpar/model.cmake +++ b/src/vecpar/model.cmake @@ -26,13 +26,13 @@ register_flag_optional(OFFLOAD_FLAGS register_flag_optional(OFFLOAD_APPEND_LINK_FLAG "If enabled, this appends all resolved offload flags (OFFLOAD= or directly from OFFLOAD_FLAGS) to the link flags. This is required for most offload implementations so that offload libraries can linked correctly." - ON) + OFF) -#register_flag_optional(VECPAR_BACKEND - # "Valid values: - # * VECPAR_BACKEND=OFF (default) - # * VECPAR_BACKEND=CUDA - # * VECPAR_BACKEND=OMPT" OFF) +register_flag_optional(VECPAR_BACKEND + "Valid values: + MAIN - OpenMP for CPU and CUDA for NVIDIA GPU. + OMPT - OpenMP Target for CPU and GPU" + "NATIVE") register_flag_optional(MEM "Device memory mode: DEFAULT - allocate host and device memory pointers. 
@@ -40,8 +40,23 @@ register_flag_optional(MEM "Device memory mode: "DEFAULT") set(VECPAR_FLAGS_CLANG -fopenmp) -set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA --language=cuda) -#set(VECPAR_BACKEND "CUDA") + +if ("${VECPAR_BACKEND}" STREQUAL "OMPT") + set(VECPAR_FLAGS_OFFLOAD_GNU_NVIDIA + -foffload=nvptx-none -DCOMPILE_FOR_DEVICE) + set(VECPAR_FLAGS_OFFLOAD_GNU_AMD + -foffload=amdgcn-amdhsa -DCOMPILE_FOR_DEVICE) + set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA -fopenmp -fopenmp-targets=nvptx64 -gline-tables-only -DCOMPILE_FOR_DEVICE) + set(VECPAR_FLAGS_OFFLOAD_CLANG_AMD + -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -DCOMPILE_FOR_DEVICE) +else() + set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA --language=cuda) +endif() + +register_definitions(${VECPAR_BACKEND}) + +register_definitions(MEM=${MEM}) + macro(setup) @@ -53,8 +68,16 @@ macro(setup) find_package(vecpar REQUIRED 0.0.3) find_package(OpenMP QUIET) - #find_package(vecmem QUIET) + if (("${VECPAR_BACKEND}" STREQUAL "NATIVE") AND (DEFINED OFFLOAD)) + find_package(CUDAToolkit) + include_directories(CUDAToolkit_INCLUDE_DIRS) + register_link_library(CUDA::cudart) + endif() + + if ("${VECPAR_BACKEND}" STREQUAL "OMPT") + register_link_library(OpenMP::OpenMP_CXX) + endif() if (("${OFFLOAD}" STREQUAL OFF) OR (NOT DEFINED OFFLOAD)) # no offload @@ -63,24 +86,9 @@ macro(setup) register_link_library(OpenMP::OpenMP_CXX) list(APPEND VECPAR_FLAGS -fopenmp) - # resolve the CPU specific flags - # set(VECPAR_FLAGS -fopenmp) - # set(VECPAR_LINK_FLAGS -fopenmp) - # register_append_compiler_and_arch_specific_cxx_flags( - # VECPAR_FLAGS_CPU - # ${COMPILER} - # ${ARCH} - # ) - - # register_append_compiler_and_arch_specific_link_flags( - # VECPAR_LINK_FLAGS_CPU - # ${COMPILER} - # ${ARCH} - #) elseif ("${OFFLOAD}" STREQUAL ON) # offload but with custom flags - find_package(CUDAToolkit QUIET) register_definitions(VECPAR_GPU) separate_arguments(OFFLOAD_FLAGS) set(VECPAR_FLAGS ${OFFLOAD_FLAGS}) @@ -89,15 +97,13 @@ macro(setup) register_link_library(vecpar::all) elseif ((DEFINED OFFLOAD) AND OFFLOAD_FLAGS) # offload but OFFLOAD_FLAGS overrides - find_package(CUDAToolkit QUIET) register_definitions(VECPAR_GPU) separate_arguments(OFFLOAD_FLAGS) - list(VECPAR_FLAGS APPEND ${OFFLOAD_FLAGS}) + list(APPEND VECPAR_FLAGS ${OFFLOAD_FLAGS}) register_link_library(CUDA::cudart) register_link_library(vecmem::cuda) register_link_library(vecpar::all) else () - find_package(CUDAToolkit QUIET) register_definitions(VECPAR_GPU) # list(APPEND VECPAR_FLAGS "-x cuda") @@ -110,7 +116,6 @@ macro(setup) list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR) # append VECPAR_FLAGS_OFFLOAD_ if exists list(APPEND VECPAR_FLAGS ${VECPAR_FLAGS_OFFLOAD_${OFFLOAD_VENDOR}}) - elseif (LEN EQUAL 2) # offload with tuple list(GET OFFLOAD_TUPLE 0 OFFLOAD_VENDOR) @@ -127,7 +132,6 @@ macro(setup) else () message(FATAL_ERROR "Unrecognised OFFLOAD format: `${OFFLOAD}`, consider directly using OFFLOAD_FLAGS") endif () - register_link_library(CUDA::cudart) register_link_library(vecmem::cuda) endif () @@ -138,9 +142,9 @@ macro(setup) # propagate flags to linker so that it links with the offload stuff as well register_append_cxx_flags(ANY ${VECPAR_FLAGS}) -# if (OFFLOAD_APPEND_LINK_FLAG) - # register_append_link_flags(${VECPAR_FLAGS}) - # endif () + if ("${VECPAR_BACKEND}" STREQUAL "OMPT") #(OFFLOAD_APPEND_LINK_FLAG) + register_append_link_flags(${VECPAR_FLAGS}) + endif () endmacro() From 0587972e72007d3a1adb56f39fb27d46371f6260 Mon Sep 17 00:00:00 2001 From: Georgiana Mania Date: Thu, 2 Mar 2023 15:40:55 +0100 
Subject: [PATCH 3/4] update vecpar functionality --- src/vecpar/VecparStream.cpp | 96 ++++++++++++------------------------- src/vecpar/VecparStream.hpp | 9 ++-- src/vecpar/model.cmake | 2 +- 3 files changed, 36 insertions(+), 71 deletions(-) diff --git a/src/vecpar/VecparStream.cpp b/src/vecpar/VecparStream.cpp index 88c092c1..5a99ce86 100644 --- a/src/vecpar/VecparStream.cpp +++ b/src/vecpar/VecparStream.cpp @@ -96,13 +96,12 @@ void VecparStream::copy() { #if defined(SINGLE_SOURCE) // gpu+managed vecpar_copy algorithm; -#if defined(NATIVE) // omp + cuda - vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a); -#else - vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a); -#endif + #if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a); + #else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a); + #endif #else - #ifdef VECPAR_GPU vecpar::cuda::parallel_map( array_size, [=] __device__ (int idx, @@ -114,10 +113,6 @@ void VecparStream::copy() }, vecmem::get_data(d_c), vecmem::get_data(d_a)); - #else // lambda - vecpar::omp::parallel_map(array_size, - [&] (int idx) { c->at(idx) = a->at(idx);}); - #endif #endif } @@ -127,13 +122,12 @@ void VecparStream::mul() const T scalar = startScalar; #if defined(SINGLE_SOURCE) vecpar_mul algorithm; -#if defined(NATIVE) // omp + cuda - vecpar::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); -#else - vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); -#endif + #if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); + #else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *b, *c, scalar); + #endif #else - #ifdef VECPAR_GPU vecpar::cuda::parallel_map( array_size, [=] __device__ (int idx, @@ -147,10 +141,6 @@ void VecparStream::mul() vecmem::get_data(d_b), vecmem::get_data(d_c), scalar); - #else - vecpar::omp::parallel_map(array_size, - [&] (int idx) { b->at(idx) = scalar * c->at(idx);}); - #endif #endif } @@ -159,13 +149,12 @@ void VecparStream::add() { #if defined(SINGLE_SOURCE) vecpar_add algorithm; -#if defined(NATIVE) // omp + cuda - vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); -#else - vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); -#endif + #if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); + #else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *c, *a, *b); + #endif #else //defined (DEFAULT) - #ifdef VECPAR_GPU vecpar::cuda::parallel_map( array_size, [=] __device__ (int idx, @@ -180,10 +169,6 @@ void VecparStream::add() vecmem::get_data(d_a), vecmem::get_data(d_b), vecmem::get_data(d_c)); - #else - vecpar::omp::parallel_map(array_size, - [&] (int idx) { c->at(idx) = a->at(idx) + b->at(idx);}); - #endif #endif } @@ -195,13 +180,12 @@ void VecparStream::triad() #if defined(SINGLE_SOURCE) vecpar_triad algorithm; -#if defined(NATIVE) // omp + cuda - vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); -#else - vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); -#endif + #if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); + #else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); + #endif #else //defined (DEFAULT) - #if defined (VECPAR_GPU) vecpar::cuda::parallel_map( array_size, [=] __device__ (int idx, @@ -218,11 
+202,6 @@ void VecparStream::triad() vecmem::get_data(d_b), vecmem::get_data(d_c), scalar); - #else - vecpar::omp::parallel_map(array_size, - [&] (int idx) { - a->at(idx) = b->at(idx) + scalar * c->at(idx); }); - #endif #endif } @@ -233,13 +212,12 @@ void VecparStream::nstream() #if defined(SINGLE_SOURCE) vecpar_nstream algorithm; -#if defined(NATIVE) // omp + cuda - vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); -#else - vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); -#endif + #if defined(NATIVE) // omp + cuda + vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); + #else + vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b, *c, scalar); + #endif #else - #ifdef VECPAR_GPU vecpar::cuda::parallel_map( array_size, [=] __device__ (int idx, @@ -256,11 +234,7 @@ void VecparStream::nstream() vecmem::get_data(d_b), vecmem::get_data(d_c), scalar); - #else - vecpar::omp::parallel_map(array_size, - [&] (int idx) { a->at(idx) = b->at(idx) + scalar * c->at(idx);}); - #endif -#endif + #endif } template @@ -270,13 +244,12 @@ T VecparStream::dot() *sum = 0.0; #if defined(SINGLE_SOURCE) vecpar_dot algorithm; -#if defined(NATIVE) // omp + cuda - *sum = vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b); -#else - *sum = vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b); -#endif + #if defined(NATIVE) // omp + cuda + *sum = vecpar::parallel_algorithm(algorithm, memoryResource, *a, *b); + #else + *sum = vecpar::ompt::parallel_algorithm(algorithm, memoryResource, *a, *b); + #endif #else - #ifdef VECPAR_GPU T* dsum; cudaMalloc(&dsum, sizeof(T)); vecpar::cuda::offload_reduce(array_size, @@ -294,13 +267,6 @@ T VecparStream::dot() vecmem::get_data(d_b)); cudaMemcpy(sum, dsum, sizeof(T), cudaMemcpyDeviceToHost); cudaFree(dsum); - #else - vecmem::vector result(array_size, &memoryResource); - vecpar::omp::parallel_map(array_size, - [&] (int idx) { result.at(idx) = a->at(idx) * b->at(idx); }); - vecpar::omp::parallel_reduce(array_size, sum, - [&] (T* sum, T& value) { *sum += value; }, result); - #endif #endif return *sum; } diff --git a/src/vecpar/VecparStream.hpp b/src/vecpar/VecparStream.hpp index 8f705073..30607a53 100644 --- a/src/vecpar/VecparStream.hpp +++ b/src/vecpar/VecparStream.hpp @@ -26,8 +26,10 @@ //backend = NATIVE/ompt, memory = default/managed, offload=0/1 #if defined(NATIVE) and defined(DEFAULT) and defined(VECPAR_GPU) #define IMPLEMENTATION_STRING "vecpar_cuda_hostdevice" + #undef SINGLE_SOURCE #elif defined(NATIVE) and defined(DEFAULT) and !defined(VECPAR_GPU) #define IMPLEMENTATION_STRING "vecpar_omp_hostmemory" + #define SINGLE_SOURCE 1 #elif defined(NATIVE) and defined(MANAGED) and defined(VECPAR_GPU) #define IMPLEMENTATION_STRING "vecpar_cuda_singlesource_managedmemory" #define SINGLE_SOURCE 1 @@ -92,10 +94,7 @@ class VecparStream : public Stream virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; }; - -/// if SHARED MEMORY is set, then vecpar single source code can be used; -/// define one algorithm per function -//#ifdef MANAGED +/// define vecpar algorithms template struct vecpar_triad : public vecpar::algorithm::parallelizable_mmap< @@ -186,5 +185,5 @@ struct vecpar_nstream : public vecpar::algorithm::parallelizable_mmap< } }; -#endif + diff --git a/src/vecpar/model.cmake b/src/vecpar/model.cmake index a7168678..fe0d5a0a 100644 --- a/src/vecpar/model.cmake +++ b/src/vecpar/model.cmake @@ -46,7 +46,7 @@ if ("${VECPAR_BACKEND}" STREQUAL 
"OMPT") -foffload=nvptx-none -DCOMPILE_FOR_DEVICE) set(VECPAR_FLAGS_OFFLOAD_GNU_AMD -foffload=amdgcn-amdhsa -DCOMPILE_FOR_DEVICE) - set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA -fopenmp -fopenmp-targets=nvptx64 -gline-tables-only -DCOMPILE_FOR_DEVICE) + set(VECPAR_FLAGS_OFFLOAD_CLANG_NVIDIA -fopenmp -fopenmp-targets=nvptx64 -DCOMPILE_FOR_DEVICE) set(VECPAR_FLAGS_OFFLOAD_CLANG_AMD -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -DCOMPILE_FOR_DEVICE) else() From 967a7df3750832c2945f39d24ae7a78546a93521 Mon Sep 17 00:00:00 2001 From: Georgiana Mania Date: Mon, 6 Mar 2023 16:41:30 +0100 Subject: [PATCH 4/4] add new kernel for vecpar-cuda-mm case --- src/main.cpp | 2 +- src/vecpar/VecparStream.cpp | 78 ++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 5e8f1583..36b32425 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -58,7 +58,7 @@ int ARRAY_SIZE = 33554432; unsigned int num_times = 100; unsigned int deviceIndex = 0; bool use_float = false; -bool output_as_csv = false; +bool output_as_csv = true; bool mibibytes = false; std::string csv_separator = ","; diff --git a/src/vecpar/VecparStream.cpp b/src/vecpar/VecparStream.cpp index 5a99ce86..fd19262e 100644 --- a/src/vecpar/VecparStream.cpp +++ b/src/vecpar/VecparStream.cpp @@ -45,23 +45,45 @@ void VecparStream::init_arrays(T initA, T initB, T initC) } #if defined(VECPAR_GPU) and defined(DEFAULT) -#if defined(NATIVE) - d_a = copy_tool.to(vecmem::get_data(*a), - dev_mem, - vecmem::copy::type::host_to_device); - d_b = copy_tool.to(vecmem::get_data(*b), - dev_mem, - vecmem::copy::type::host_to_device); - d_c = copy_tool.to(vecmem::get_data(*c), - dev_mem, - vecmem::copy::type::host_to_device); -#else - d_a = a->data(); - d_b = b->data(); - d_c = c->data(); -#pragma omp target update to(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) - {} -#endif + #if defined(NATIVE) + d_a = copy_tool.to(vecmem::get_data(*a), + dev_mem, + vecmem::copy::type::host_to_device); + d_b = copy_tool.to(vecmem::get_data(*b), + dev_mem, + vecmem::copy::type::host_to_device); + d_c = copy_tool.to(vecmem::get_data(*c), + dev_mem, + vecmem::copy::type::host_to_device); + #else + d_a = a->data(); + d_b = b->data(); + d_c = c->data(); + #pragma omp target update to(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} + #endif +#elif defined(VECPAR_GPU) and defined(MANAGED) + // make sure the data is initialized on the GPU using an init kernel + auto fn = [=] __device__ (int idx, + vecmem::data::vector_view &a_view, + vecmem::data::vector_view &b_view, + vecmem::data::vector_view &c_view, + T ia, T ib, T ic) { + vecmem::device_vector da(a_view); + vecmem::device_vector db(b_view); + vecmem::device_vector dc(c_view); + + da[idx] = ia; + db[idx] = ib; + dc[idx] = ic; + }; + vecpar::cuda::kernel<<>>(array_size, fn, + vecmem::get_data(*a), + vecmem::get_data(*b), + vecmem::get_data(*c), + initA, initB, initC); + cudaDeviceSynchronize(); + #endif } @@ -70,17 +92,17 @@ void VecparStream::read_arrays(std::vector& h_a, std::vector& h_b, std: { #if defined(VECPAR_GPU) and defined(DEFAULT) -#if defined(NATIVE) - copy_tool(d_a, *a, vecmem::copy::type::device_to_host); - copy_tool(d_b, *b, vecmem::copy::type::device_to_host); - copy_tool(d_c, *c, vecmem::copy::type::device_to_host); -#else - d_a = a->data(); - d_b = b->data(); - d_c = c->data(); -#pragma omp target update from(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) - {} -#endif + #if 
defined(NATIVE) + copy_tool(d_a, *a, vecmem::copy::type::device_to_host); + copy_tool(d_b, *b, vecmem::copy::type::device_to_host); + copy_tool(d_c, *c, vecmem::copy::type::device_to_host); + #else + d_a = a->data(); + d_b = b->data(); + d_c = c->data(); + #pragma omp target update from(d_a[0:array_size], d_b[0:array_size], d_c[0:array_size]) + {} + #endif #endif for (int i = 0; i < array_size; i++)
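
Notes on configuring the vecpar model (an illustrative sketch, not part of the
patches above): the commands below assume BabelStream's usual -DMODEL=
convention together with the VECPAR_BACKEND, MEM, and OFFLOAD options that
src/vecpar/model.cmake registers; the sm_70 architecture is only an example
value, and the OFFLOAD tuple format follows the <vendor>:<arch> form documented
in the flag description.

    # CUDA backend, managed memory, offloading to an NVIDIA GPU
    cmake -Bbuild -H. -DMODEL=vecpar -DVECPAR_BACKEND=NATIVE \
          -DMEM=MANAGED -DOFFLOAD=NVIDIA:sm_70
    cmake --build build

    # OpenMP target backend, host memory, CPU only
    cmake -Bbuild -H. -DMODEL=vecpar -DVECPAR_BACKEND=OMPT -DMEM=DEFAULT
    cmake --build build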