diff --git a/.clang-format b/.clang-format index a057d0bef..29924594b 100644 --- a/.clang-format +++ b/.clang-format @@ -1,4 +1,7 @@ --- +Language: Json +DisableFormat: true +--- Language: Cpp AccessModifierOffset: -2 AlignAfterOpenBracket: Align @@ -79,7 +82,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^"plssvm/' Priority: 1 - - Regex: '^"(cuda|hip|CL|sycl|omp|hpx)' + - Regex: '^"(cuda|hip|CL|sycl|omp|hpx|Kokkos)' Priority: 2 - Regex: '^"(tests|bindings)/' Priority: 3 diff --git a/CMakeLists.txt b/CMakeLists.txt index 593b7b8f5..4993f4ae4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ endif () ######################################################################################################################## ## set base sources set(PLSSVM_BASE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/Kokkos/execution_space.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/implementation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/kernel_invocation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/stdpar/implementation_types.cpp @@ -411,6 +412,13 @@ if (PLSSVM_ENABLE_SYCL_BACKEND MATCHES "AUTO" OR PLSSVM_ENABLE_SYCL_BACKEND) add_subdirectory(src/plssvm/backends/SYCL) endif () +## check for Kokkos backend +set(PLSSVM_ENABLE_KOKKOS_BACKEND AUTO CACHE STRING "Enable Kokkos Backend") +set_property(CACHE PLSSVM_ENABLE_KOKKOS_BACKEND PROPERTY STRINGS AUTO ON OFF) +if (PLSSVM_ENABLE_KOKKOS_BACKEND MATCHES "AUTO" OR PLSSVM_ENABLE_KOKKOS_BACKEND) + add_subdirectory(src/plssvm/backends/Kokkos) +endif () + ## check if ANY backend is available/has been enabled get_target_property(PLSSVM_LINKED_BACKENDS ${PLSSVM_ALL_LIBRARY_NAME} INTERFACE_LINK_LIBRARIES) if (NOT PLSSVM_LINKED_BACKENDS) @@ -642,7 +650,7 @@ if (PLSSVM_ENABLE_FORMATTING) list(APPEND CMAKE_MESSAGE_INDENT "Formatting: ") ## install library to add a clang-format target - set(PLSSVM_format_VERSION 7021abbf066e2e577926731c3fa4141f456c5024) + set(PLSSVM_format_VERSION d22c36043bea6ef85f3eb68b823f50703bd1cc21) find_package(format QUIET) if (format_FOUND) message(STATUS "Found package format.") @@ -734,6 +742,10 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) endforeach () list(APPEND PLSSVM_BACKEND_NAME_LIST "sycl") endif () +if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + message(STATUS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING}") + list(APPEND PLSSVM_BACKEND_NAME_LIST "kokkos") +endif () message(STATUS "") ######################################################################################################################## @@ -758,8 +770,8 @@ message(STATUS "Generating manpage files.") string(TIMESTAMP PLSSVM_CURRENT_BUILD_TIME "%d. 
%B %Y") string(REPLACE ";" "|" PLSSVM_PLATFORM_NAME_LIST "${PLSSVM_PLATFORM_NAME_LIST}") string(REPLACE ";" "|" PLSSVM_BACKEND_NAME_LIST "${PLSSVM_BACKEND_NAME_LIST}") -string(REPLACE ";" "|" PLSSVM_SYCL_BACKEND_NAME_LIST "${PLSSVM_SYCL_BACKEND_NAME_LIST}") if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) + string(REPLACE ";" "|" PLSSVM_SYCL_BACKEND_NAME_LIST "${PLSSVM_SYCL_BACKEND_NAME_LIST}") set(PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY " .TP .B --sycl_implementation_type @@ -772,6 +784,15 @@ choose the kernel invocation type when using SYCL as backend: automatic|nd_range ") endif () set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") +## assemble the Kokkos manpage entry +if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + string(REPLACE ";" "|" PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}") + set(PLSSVM_KOKKOS_MANPAGE_ENTRY " +.TP +.B --kokkos_execution_space +choose the Kokkos execution space to be used in the Kokkos backend: automatic|${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES} (default: automatic) +") +endif () ## assemble the performance tracker manpage entry if (PLSSVM_ENABLE_PERFORMANCE_TRACKING) set(PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY " @@ -787,6 +808,7 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-train.1 @ONLY ) +# update manpage entry since plssvm-predict can't recognize the SYCL kernel invocation type set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/docs/plssvm-predict.1.in @@ -866,6 +888,7 @@ install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmHPXTargets.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmAdaptiveCppTargets.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmDPCPPTargets.cmake" + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmKokkosTargets.cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/plssvm/plssvmstdparTargets.cmake" DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/plssvm/cmake ) diff --git a/CMakePresets.json b/CMakePresets.json index c6bf7373f..e226c44fd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,6 +14,7 @@ "cmake/presets/opencl.json", "cmake/presets/acpp.json", "cmake/presets/dpcpp.json", + "cmake/presets/kokkos.json", "cmake/presets/all.json" ] } diff --git a/README.md b/README.md index 394dd8e04..0091ad85c 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ The main highlights of our SVM implementations are: - [HIP](https://github.com/ROCm-Developer-Tools/HIP) - [OpenCL](https://www.khronos.org/opencl/) - [SYCL](https://www.khronos.org/sycl/) (supported implementations are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL); specifically the versions [sycl-nightly/20231201](https://github.com/intel/llvm/tree/sycl-nightly/20230110) and AdaptiveCpp release [v24.06.0](https://github.com/AdaptiveCpp/AdaptiveCpp/releases/tag/v23.10.0)) + - [Kokkos](https://github.com/kokkos/kokkos) (all execution spaces supported except `OpenMPTarget` and `OpenACC`); specifically the version [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00) 3. 
Six different kernel functions to be able to classify a large variety of different problems: - linear: $\vec{u}^T$ $\cdot$ $\vec{v}$ - polynomial: $(\gamma$ $\cdot$ $\vec{u}^T$ $\cdot$ $\vec{v}$ $+$ $coef0)^{d}$ @@ -128,6 +129,10 @@ Additional dependencies for the SYCL backend: - the code must be compiled with a SYCL capable compiler; currently supported are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) +Additional dependencies for the Kokkos backend: + +- a Kokkos installation with the respective execution spaces enabled; currently all execution spaces are supported except `OpenMPTarget` and `OpenACC` + Additional dependencies for the stdpar backend: - the code must be compiled with a stdpar capable compiler; currently supported are [nvc++](https://developer.nvidia.com/hpc-sdk), [roc-stdpar](https://github.com/ROCm/roc-stdpar), [icpx](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html), [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp), and [GNU GCC](https://gcc.gnu.org/)) @@ -243,6 +248,11 @@ The `[optional_options]` can be one or multiple of: - `AUTO`: check for the OpenMP backend but **do not** fail if not available - `OFF`: do not check for the OpenMP backend +- `PLSSVM_ENABLE_HPX_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the HPX backend and fail if not available + - `AUTO`: check for the HPX backend but **do not** fail if not available + - `OFF`: do not check for the HPX backend + - `PLSSVM_ENABLE_STDPAR_BACKEND=ON|OFF|AUTO` (default: `AUTO`): - `ON`: check for the stdpar backend and fail if not available - `AUTO`: check for the stdpar backend but **do not** fail if not available @@ -268,6 +278,11 @@ The `[optional_options]` can be one or multiple of: - `AUTO`: check for the SYCL backend but **do not** fail if not available - `OFF`: do not check for the SYCL backend +- `PLSSVM_ENABLE_KOKKOS_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the Kokkos backend and fail if not available + - `AUTO`: check for the Kokkos backend but **do not** fail if not available + - `OFF`: do not check for the Kokkos backend + **Attention:** at least one backend must be enabled and available! - `PLSSVM_ENABLE_FAST_MATH=ON|OFF` (default depending on `CMAKE_BUILD_TYPE`: `ON` for Release or RelWithDebInfo, `OFF` otherwise): enable `fast-math` compiler flags for all backends @@ -344,6 +359,10 @@ If more than one SYCL implementation is available the environment variables `PLS - `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` (`dpcpp`|`adaptivecpp`): specify the preferred SYCL implementation if the `sycl_implementation_type` option is set to `automatic`; additional the specified SYCL implementation is used in the `plssvm::sycl` namespace, the other implementations are available in the `plssvm::dpcpp` and `plssvm::adaptivecpp` namespace respectively +If the Kokkos backend is available the following additional option is available (**note**: this option takes only effect if the Kokkos SYCL execution space is available): + +- `PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT` (default: `ON`): enable Ahead-of-Time (AOT) compilation for the specified target platforms + If the stdpar backend is available, an additional options can be set. - `PLSSVM_STDPAR_BACKEND_IMPLEMENTATION` (default: `AUTO`): explicitly specify the used stdpar implementation; must be one of: `AUTO`, `NVHPC`, `roc-stdpar`, `IntelLLVM`, `ACPP`, `GNU_TBB`. 
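A hedged illustration (not part of this changeset): the new `PLSSVM_ENABLE_KOKKOS_BACKEND` option documented above follows the same `ON`/`OFF`/`AUTO` pattern as the other backend switches, so a build that must have the Kokkos backend could be configured as sketched below; the build directory name and the `[optional_options]` placeholder are illustrative only.

```bash
# sketch: require the Kokkos backend explicitly (configuration fails if no usable Kokkos installation is found)
cmake -S . -B build -DPLSSVM_ENABLE_KOKKOS_BACKEND=ON [optional_options]
cmake --build build
```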
@@ -363,24 +382,6 @@ Available configure presets: "hpx" - HPX backend "hpx_python" - HPX backend + Python bindings "hpx_test" - HPX backend tests - "cuda" - CUDA backend - "cuda_python" - CUDA backend + Python bindings - "cuda_test" - CUDA backend tests - "hip" - HIP backend - "hip_python" - HIP backend + Python bindings - "hip_test" - HIP backend tests - "opencl" - OpenCL backend - "opencl_python" - OpenCL backend + Python bindings - "opencl_test" - OpenCL backend tests - "acpp" - AdaptiveCpp SYCL backend - "acpp_python" - AdaptiveCpp SYCL backend + Python bindings - "acpp_test" - AdaptiveCpp SYCL backend tests - "dpcpp" - DPC++/icpx SYCL backend - "dpcpp_python" - DPC++/icpx backend + Python bindings - "dpcpp_test" - DPC++/icpx backend tests - "all" - All available backends - "all_python" - All available backends + Python bindings - "all_test" - All available backends tests "stdpar" - stdpar backend "stdpar_python" - stdpar backend + Python bindings "stdpar_test" - stdpar backend tests @@ -399,6 +400,27 @@ Available configure presets: "stdpar_intelllvm" - stdpar IntelLLVM (icpx) backend "stdpar_intelllvm_python" - stdpar IntelLLVM (icpx) backend + Python bindings "stdpar_intelllvm_test" - stdpar IntelLLVM (icpx) backend tests + "cuda" - CUDA backend + "cuda_python" - CUDA backend + Python bindings + "cuda_test" - CUDA backend tests + "hip" - HIP backend + "hip_python" - HIP backend + Python bindings + "hip_test" - HIP backend tests + "opencl" - OpenCL backend + "opencl_python" - OpenCL backend + Python bindings + "opencl_test" - OpenCL backend tests + "acpp" - AdaptiveCpp SYCL backend + "acpp_python" - AdaptiveCpp SYCL backend + Python bindings + "acpp_test" - AdaptiveCpp SYCL backend tests + "dpcpp" - DPC++/icpx SYCL backend + "dpcpp_python" - DPC++/icpx backend + Python bindings + "dpcpp_test" - DPC++/icpx backend tests + "kokkos" - Kokkos backend + "kokkos_python" - Kokkos backend + Python bindings + "kokkos_test" - Kokkos backend tests + "all" - All available backends + "all_python" - All available backends + Python bindings + "all_test" - All available backends tests ``` With these presets, building and testing, e.g., our CUDA backend is as simple as typing (in the PLSSVM root directory): @@ -553,12 +575,14 @@ Usage: -i, --max_iter arg set the maximum number of CG iterations (default: num_features) -l, --solver arg choose the solver: automatic|cg_explicit|cg_implicit (default: automatic) -a, --classification arg the classification strategy to use for multi-class classification: oaa|oao (default: oaa) - -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|stdpar (default: automatic) + -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) --sycl_kernel_invocation_type arg choose the kernel invocation type when using SYCL as backend: automatic|nd_range (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) + --kokkos_execution_space arg + choose the Kokkos execution space to be used in the Kokkos backend: automatic|Cuda|OpenMP|Serial (default: automatic) --performance_tracking arg the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr --use_strings_as_labels use strings as labels instead of plane numbers @@ 
-594,10 +618,10 @@ Another example targeting NVIDIA GPUs using the SYCL backend looks like: The `--backend=automatic` option works as follows: -- if the `gpu_nvidia` target is available, check for existing backends in order `cuda` 🠦 `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `stdpar` -- otherwise, if the `gpu_amd` target is available, check for existing backends in order `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `stdpar` -- otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `stdpar` -- otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `openmp` 🠦 `hpx` 🠦 `stdpar` +- if the `gpu_nvidia` target is available, check for existing backends in order `cuda` 🠦 `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `kokkos` 🠦 `stdpar` +- otherwise, if the `gpu_amd` target is available, check for existing backends in order `hip` 🠦 `opencl` 🠦 `sycl` 🠦 `kokkos` 🠦 `stdpar` +- otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `kokkos` 🠦 `stdpar` +- otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `kokkos` 🠦 `opencl` 🠦 `openmp` 🠦 `hpx` 🠦 `stdpar` Note that during CMake configuration it is guaranteed that at least one of the above combinations does exist. @@ -609,11 +633,13 @@ The `--target_platform=automatic` option works for the different backends as fol - `HIP`: always selects an AMD GPU (if no AMD GPU is available, throws an exception) - `OpenCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU - `SYCL`: tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU +- `Kokkos`: checks which execution spaces are available and which target platforms they support and then tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. If the `--sycl_kernel_invocation_type` is `automatic`, the `nd_range` invocation type is currently always used. If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag. +If the `--kokkos_execution_space` is `automatic`, the best fitting execution space is selected based on the provided and/or available target platforms. ### Predicting using `plssvm-predict` @@ -628,10 +654,12 @@ LS-SVM with multiple (GPU-)backends Usage: ./plssvm-predict [OPTION...] 
test_file model_file [output_file] - -b, --backend arg choose the backend: automatic|openmp|cuda|hip|opencl|sycl|stdpar (default: automatic) + -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) + --kokkos_execution_space arg + choose the Kokkos execution space to be used in the Kokkos backend: automatic|Cuda|OpenMP|Serial (default: automatic) --performance_tracking arg the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr --use_strings_as_labels use strings as labels instead of plane numbers @@ -777,9 +805,6 @@ Roughly the same can be achieved using our Python bindings with the following Py import plssvm from sklearn.metrics import classification_report -# correctly initialize and finalize environments -environment_guard = plssvm.environment.ScopeGuard() - try: # create a new C-SVM parameter set, explicitly overriding the default kernel function params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL) diff --git a/bindings/Python/CMakeLists.txt b/bindings/Python/CMakeLists.txt index f951f77a4..f7d4e571d 100644 --- a/bindings/Python/CMakeLists.txt +++ b/bindings/Python/CMakeLists.txt @@ -41,7 +41,6 @@ set(PLSSVM_PYTHON_BINDINGS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/classification_types.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/data_set.cpp - ${CMAKE_CURRENT_LIST_DIR}/environment.cpp ${CMAKE_CURRENT_LIST_DIR}/file_format_types.cpp ${CMAKE_CURRENT_LIST_DIR}/gamma.cpp ${CMAKE_CURRENT_LIST_DIR}/kernel_function_types.cpp @@ -98,6 +97,9 @@ endif () if (TARGET ${PLSSVM_SYCL_BACKEND_DPCPP_LIBRARY_NAME}) list(APPEND PLSSVM_PYTHON_BINDINGS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/backends/dpcpp_csvm.cpp) endif () +if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + list(APPEND PLSSVM_PYTHON_BINDINGS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/backends/kokkos_csvm.cpp) +endif () # create pybind11 module set(PLSSVM_PYTHON_BINDINGS_LIBRARY_NAME plssvm) diff --git a/bindings/Python/README.md b/bindings/Python/README.md index afe9d6bb4..04d0cee14 100644 --- a/bindings/Python/README.md +++ b/bindings/Python/README.md @@ -10,10 +10,9 @@ - [plssvm.Parameter](#plssvmparameter) - [plssvm.DataSet](#plssvmdataset) - [plssvm.CSVM](#plssvmcsvm) - - [plssvm.openmp.CSVM, plssvm.hpx.CSVM, plssvm.stdpar.CSVM, plssvm.cuda.CSVM, plssvm.hip.CSVM, plssvm.opencl.CSVM, plssvm.sycl.CSVM, plssvm.dpcpp.CSVM, plssvm.adaptivecpp.CSVM](#plssvmopenmpcsvm-plssvmhpxcsvm-plssvmcudacsvm-plssvmhipcsvm-plssvmopenclcsvm-plssvmsyclcsvm-plssvmdpcppcsvm-plssvmadaptivecppcsvm) + - [plssvm.openmp.CSVM, plssvm.hpx.CSVM, plssvm.stdpar.CSVM, plssvm.cuda.CSVM, plssvm.hip.CSVM, plssvm.opencl.CSVM, plssvm.sycl.CSVM, plssvm.dpcpp.CSVM, plssvm.adaptivecpp.CSVM, plssvm.kokkos.CSVM](#plssvmopenmpcsvm-plssvmhpxcsvm-plssvmcudacsvm-plssvmhipcsvm-plssvmopenclcsvm-plssvmsyclcsvm-plssvmdpcppcsvm-plssvmadaptivecppcsvm-plssvmkokkoscsvm) - [plssvm.Model](#plssvmmodel) - [plssvm.Version](#plssvmversion) - - [plssvm.environment.ScopeGuard](#plssvmenvironmentscopeguard) - [plssvm.detail.tracking.PerformanceTracker](#plssvmdetailtrackingperformancetracker) - [plssvm.detail.tracking.Events](#plssvmdetailtrackingevent-plssvmdetailtrackingevents) - [Free 
functions](#free-functions) @@ -188,17 +187,16 @@ More detailed description of the class methods: The following table lists all PLSSVM enumerations exposed on the Python side: -| enumeration | values | description | -|------------------------|----------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `TargetPlatform` | `AUTOMATIC`, `CPU`, `GPU_NVIDIA`, `GPU_AMD`, `GPU_INTEL` | The different supported target platforms (default: `AUTOMATIC`). If `AUTOMATIC` is provided, checks for available devices in the following order: NVIDIA GPUs -> AMD GPUs -> Intel GPUs -> CPUs. | -| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | -| `KernelFunctionType` | `LINEAR`, `POLYNOMIAL`, `RBF`, `SIGMOID`, `LAPLACIAN`, `CHI_SQUARED` | The different supported kernel functions (default: `LINEAR`). | -| `FileFormatType` | `LIBSVM`, `ARFF` | The different supported file format types (default: `LIBSVM`). | -| `GammaCoefficientType` | `AUTOMATIC`, `SCALE` | The different modes for the dynamic gamma calculation (default: `AUTOMATIC`). | -| `ClassificationType` | `OAA`, `OAO` | The different supported multi-class classification strategies (default: `LIBSVM`). | -| `BackendType` | `AUTOMATIC`, `OPENMP`, `HPX`, `CUDA`, `HIP`, `OPENCL`, `SYCL` | The different supported backends (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the selected backend depends on the used target platform. | -| `VerbosityLevel` | `QUIET`, `LIBSVM`, `TIMING`, `FULL` | The different supported log levels (default: `FULL`). `QUIET` means no output, `LIBSVM` output that is as conformant as possible with LIBSVM's output, `TIMING` all timing related outputs, and `FULL` everything. Can be combined via bit-wise operations. | -| `Status` | `UNINITIALIZED`, `INITIALIZED`, `FINALIZED`, `UNNECESSARY` | The different environment status values. **Note**: located in the `plssvm.environment` module. | | +| enumeration | values | description | +|------------------------|-------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `TargetPlatform` | `AUTOMATIC`, `CPU`, `GPU_NVIDIA`, `GPU_AMD`, `GPU_INTEL` | The different supported target platforms (default: `AUTOMATIC`). If `AUTOMATIC` is provided, checks for available devices in the following order: NVIDIA GPUs -> AMD GPUs -> Intel GPUs -> CPUs. | +| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | +| `KernelFunctionType` | `LINEAR`, `POLYNOMIAL`, `RBF`, `SIGMOID`, `LAPLACIAN`, `CHI_SQUARED` | The different supported kernel functions (default: `LINEAR`). | +| `FileFormatType` | `LIBSVM`, `ARFF` | The different supported file format types (default: `LIBSVM`). 
| +| `GammaCoefficientType` | `AUTOMATIC`, `SCALE` | The different modes for the dynamic gamma calculation (default: `AUTOMATIC`). | +| `ClassificationType` | `OAA`, `OAO` | The different supported multi-class classification strategies (default: `OAA`). | +| `BackendType` | `AUTOMATIC`, `OPENMP`, `HPX`, `CUDA`, `HIP`, `OPENCL`, `SYCL`, `KOKKOS` | The different supported backends (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the selected backend depends on the used target platform. | +| `VerbosityLevel` | `QUIET`, `LIBSVM`, `TIMING`, `FULL` | The different supported log levels (default: `FULL`). `QUIET` means no output, `LIBSVM` output that is as conformant as possible with LIBSVM's output, `TIMING` all timing related outputs, and `FULL` everything. Can be combined via bit-wise operations. | If a SYCL implementation is available, additional enumerations are available: @@ -213,6 +211,12 @@ If the stdpar backend is available, an additional enumeration is available: |----------------------|---------------------------------------------------------------|-------------------------------------------------| | `ImplementationType` | `NVHPC`, `ROC_STDPAR`, `INTEL_LLVM`, `ADAPTIVECPP`, `GNU_TBB` | The different supported stdpar implementations. | +If the Kokkos backend is available, an additional enumeration is available: + +| enumeration | values | description | +|------------------|------------------------------------------------------------------------------------------|--------------------------------------------------| +| `ExecutionSpace` | `CUDA`, `HIP`, `SYCL`, `HPX`, `OPENMP`, `OPENMPTARGET`, `OPENACC`, `THREADS`, `SERIAL` | The different supported Kokkos execution spaces. | + ### Classes and submodules The following tables list all PLSSVM classes exposed on the Python side: @@ -337,9 +341,8 @@ If the most performant backend should be used, it is sufficient to use `plssvm.C `sycl_implementation_type` to choose between DPC++ and AdaptiveCpp as SYCL implementations and `sycl_kernel_invocation_type` to choose between the two different SYCL kernel invocation types. -**Note**: if the backend type is `plssvm.BackendType.HPX` it is necessary to initialize and finalize the HPX runtime. -The runtime can be manually managed using `plssvm.environment.initialize()` and `plssvm.environment.finalize()`. -We recommend utilizing `plssvm.environment.ScopeGuard()` to manage the lifetime of the HPX runtime automatically. +**Note**: if the backend type is `plssvm.BackendType.HPX` or `plssvm.BackendType.KOKKOS`, special initialization and finalization functions must be called. +However, this is **automatically** handled by our Python bindings on module import and cleanup. | methods | description | |----------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `score(model)` | Score the model with respect to itself returning its accuracy. | | `score(model, data_set)` | Score the model given the provided data set returning its accuracy. 
| -#### `plssvm.openmp.CSVM`, `plssvm.hpx.CSVM`, `plssvm.stdpar.CSVM`, plssvm.cuda.CSVM`, `plssvm.hip.CSVM`, `plssvm.opencl.CSVM`, `plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, `plssvm.adaptivecpp.CSVM` +#### `plssvm.openmp.CSVM`, `plssvm.hpx.CSVM`, `plssvm.stdpar.CSVM`, plssvm.cuda.CSVM`, `plssvm.hip.CSVM`, `plssvm.opencl.CSVM`, `plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, `plssvm.adaptivecpp.CSVM`, `plssvm.kokkos.CSVM` These classes represent the backend specific CSVMs. **Note**: they are only available if the respective backend has been enabled during PLSSVM's build step. @@ -391,6 +394,14 @@ CSVM. |-----------------------------|---------------------------------------------| | `get_implementation_type()` | Return the used stdpar implementation type. | +In case of the Kokkos CSVM (`plssvm.kokkos.CSVM`) the following method is additional available for the backend specific +CSVM. + + +| methods | description | +|-------------------------|-----------------------------------------| +| `get_execution_space()` | Return the used Kokkos execution space. | + #### `plssvm.Model` A class encapsulating a model learned during a call to `plssvm.CSVM.fit()`. @@ -429,19 +440,6 @@ A class encapsulating the version information of the used PLSSVM installation. | `minor : int` | The minor PLSSVM version. | | `patch : int` | The patch PLSSVM version. | -#### `plssvm.environment.ScopeGuard` - -The environmental scope guard can be used to automatically finalize all necessary backend environments when it goes out of scope. - -| constructors | description | -|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ScopeGuard([backends={}])` | Create a new scope guard initializing all available backend environments. If a list of backends is provided, only initializes these backends. | -| `ScopeGuard(argc, argv, [backends={}])` | Create a new scope guard initializing all available backend environments using the provided command line arguments. If a list of backends is provided, only initializes these backends. | - -| methods | description | -|--------------|-----------------------------------------------------------------------------------------------------------------------------------| -| `backends()` | Return all initialized backends. All backends returned by this function will be finalized when the scope guard goes out of scope. | - #### `plssvm.detail.tracking.PerformanceTracker` A submodule used to track various performance statistics like runtimes, but also the used setup and hyperparameters. @@ -535,15 +533,6 @@ If a stdpar implementation is available, additional free functions are available |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| | `list_available_stdpar_implementations()` | List all available stdpar implementations (determined during PLSSVM's build step; currently always guaranteed to be only one implementation). | -Additional free functions are available under `plssvm.environment.`. 
- -| function | description | -|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `get_backend_status(backend)` | Return the current environment status of the provided backend. | -| `initialize([backends={}])` | Initialize all available backend environments. If a list of backends is provided, only initializes these backends. | -| `initialize(argc, argv, [backends={}])` | Initialize all available backend environments using the provided command line arguments. If a list of backends is provided, only initializes these backends. | -| `finalize([backends={}])` | Finalize all available backend environments. If a list of backends is provided, only finalizes these backends. | - ### Exceptions The PLSSVM Python3 bindings define a few new exception types: @@ -562,6 +551,5 @@ The PLSSVM Python3 bindings define a few new exception types: | `MatrixError` | If something went wrong in the internal matrix class. **Note**: shouldn't occur in user code. | | `KernelLaunchResourcesError` | If something went wrong during a kernel launch due to insufficient ressources. | | `ClassificationReportError` | If something in the classification report went wrong. **Note**: shouldn't occur in user code. | -| `EnvironmentError` | If something during environment initialization or finalization went wrong. | Depending on the available backends, additional `BackendError`s are also available (e.g., `plssvm.cuda.BackendError`). diff --git a/bindings/Python/backend_types.cpp b/bindings/Python/backend_types.cpp index f88f8c2e2..5664cf360 100644 --- a/bindings/Python/backend_types.cpp +++ b/bindings/Python/backend_types.cpp @@ -26,7 +26,8 @@ void init_backend_types(py::module_ &m) { .value("CUDA", plssvm::backend_type::cuda, "CUDA to target NVIDIA GPUs only") .value("HIP", plssvm::backend_type::hip, "HIP to target AMD and NVIDIA GPUs") .value("OPENCL", plssvm::backend_type::opencl, "OpenCL to target CPUs and GPUs from different vendors") - .value("SYCL", plssvm::backend_type::sycl, "SYCL to target CPUs and GPUs from different vendors; currently tested SYCL implementations are DPC++ and AdaptiveCpp"); + .value("SYCL", plssvm::backend_type::sycl, "SYCL to target CPUs and GPUs from different vendors; currently tested SYCL implementations are DPC++ and AdaptiveCpp") + .value("KOKKOS", plssvm::backend_type::kokkos, "Kokkos to target CPUs and GPUs from different vendors; currently all Kokkos execution spaces except Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC are supported"); // bind free functions m.def("list_available_backends", &plssvm::list_available_backends, "list the available backends (as found during CMake configuration)"); diff --git a/bindings/Python/backends/adaptivecpp_csvm.cpp b/bindings/Python/backends/adaptivecpp_csvm.cpp index 767853757..bf81b11ae 100644 --- a/bindings/Python/backends/adaptivecpp_csvm.cpp +++ b/bindings/Python/backends/adaptivecpp_csvm.cpp @@ -11,7 +11,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/csvm.hpp" // plssvm::csvm #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/utility.hpp" // 
check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception diff --git a/bindings/Python/backends/dpcpp_csvm.cpp b/bindings/Python/backends/dpcpp_csvm.cpp index 882d6ea37..906cb5979 100644 --- a/bindings/Python/backends/dpcpp_csvm.cpp +++ b/bindings/Python/backends/dpcpp_csvm.cpp @@ -11,7 +11,7 @@ #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/csvm.hpp" // plssvm::csvm #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::sycl_kernel_invocation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/utility.hpp" // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception diff --git a/bindings/Python/backends/kokkos_csvm.cpp b/bindings/Python/backends/kokkos_csvm.cpp new file mode 100644 index 000000000..ea6c4af80 --- /dev/null +++ b/bindings/Python/backends/kokkos_csvm.cpp @@ -0,0 +1,77 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "plssvm/backends/Kokkos/csvm.hpp" // plssvm::kokkos::csvm +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/csvm.hpp" // plssvm::csvm +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::kokkos_execution_space +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "bindings/Python/utility.hpp" // check_kwargs_for_correctness, convert_kwargs_to_parameter, register_py_exception + +#include "pybind11/pybind11.h" // py::module_, py::class_, py::init +#include "pybind11/stl.h" // support for STL types + +#include // std::make_unique + +namespace py = pybind11; + +void init_kokkos_csvm(py::module_ &m, const py::exception &base_exception) { + // use its own submodule for the Kokkos CSVM bindings + py::module_ kokkos_module = m.def_submodule("kokkos", "a module containing all Kokkos backend specific functionality"); + + // bind the CSVM using the Kokkos backend + py::class_(kokkos_module, "CSVM") + .def(py::init<>(), "create an SVM with the automatic target platform and default parameter object") + .def(py::init(), "create an SVM with the automatic target platform and provided parameter object") + .def(py::init(), "create an SVM with the provided target platform and default parameter object") + .def(py::init(), "create an SVM with the provided target platform and parameter object") + .def(py::init([](const py::kwargs &args) { + // check for valid keys + check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost", "kokkos_execution_space" }); + // if one of the value keyword parameter is provided, set the respective value + const plssvm::parameter params = convert_kwargs_to_parameter(args); + // set Kokkos execution space + const plssvm::kokkos::execution_space space = args.contains("kokkos_execution_space") ? 
args["kokkos_execution_space"].cast() : plssvm::kokkos::execution_space::automatic; + // create CSVM with the default target platform + return std::make_unique(params, plssvm::kokkos_execution_space = space); + }), + "create an SVM with the default target platform and keyword arguments") + .def(py::init([](const plssvm::target_platform target, const py::kwargs &args) { + // check for valid keys + check_kwargs_for_correctness(args, { "kernel_type", "degree", "gamma", "coef0", "cost", "kokkos_execution_space" }); + // if one of the value keyword parameter is provided, set the respective value + const plssvm::parameter params = convert_kwargs_to_parameter(args); + // set Kokkos execution space + const plssvm::kokkos::execution_space space = args.contains("kokkos_execution_space") ? args["kokkos_execution_space"].cast() : plssvm::kokkos::execution_space::automatic; + // create CSVM with the provided target platform + return std::make_unique(target, params, plssvm::kokkos_execution_space = space); + }), + "create an SVM with the provided target platform and keyword arguments") + .def("get_execution_space", &plssvm::kokkos::csvm::get_execution_space, "get the Kokkos execution space used in this Kokkos SVM"); + + // register Kokkos backend specific exceptions + register_py_exception(kokkos_module, "BackendError", base_exception); + + // bind the execution space enum classes + py::enum_(kokkos_module, "ExecutionSpace") + .value("AUTOMATIC", plssvm::kokkos::execution_space::cuda, "automatically determine the used Kokkos execution space (note: this does not necessarily correspond to Kokkos::DefaultExecutionSpace)") + .value("CUDA", plssvm::kokkos::execution_space::cuda, "execution space representing execution on a CUDA device") + .value("HIP", plssvm::kokkos::execution_space::hip, "execution space representing execution on a device supported by HIP") + .value("SYCL", plssvm::kokkos::execution_space::sycl, "execution space representing execution on a device supported by SYCL") + .value("HPX", plssvm::kokkos::execution_space::hpx, "execution space representing execution with the HPX runtime system") + .value("OPENMP", plssvm::kokkos::execution_space::openmp, "execution space representing execution with the OpenMP runtime system") + .value("OPENMPTARGET", plssvm::kokkos::execution_space::openmp_target, "execution space representing execution using the target offloading feature of the OpenMP runtime system") + .value("OPENACC", plssvm::kokkos::execution_space::openacc, "execution space representing execution with the OpenACC runtime system") + .value("THREADS", plssvm::kokkos::execution_space::threads, "execution space representing parallel execution with std::threads") + .value("SERIAL", plssvm::kokkos::execution_space::serial, "execution space representing serial execution on the CPU; should always be available"); + + kokkos_module.def("list_available_execution_spaces", &plssvm::kokkos::list_available_execution_spaces, "list all available Kokkos execution spaces"); +} diff --git a/bindings/Python/environment.cpp b/bindings/Python/environment.cpp deleted file mode 100644 index c9a467187..000000000 --- a/bindings/Python/environment.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/** - * @author Alexander Van Craen - * @author Marcel Breyer - * @copyright 2018-today The PLSSVM project - All Rights Reserved - * @license This file is part of the PLSSVM project which is released under the MIT license. - * See the LICENSE.md file in the project root for full license information. 
- */ - -#include "plssvm/environment.hpp" - -#include "plssvm/backend_types.hpp" // plssvm::backend_type, plssvm::list_available_backends - -#include "bindings/Python/utility.hpp" // check_kwargs_for_correctness - -#include "pybind11/pybind11.h" // py::module_, py::enum_ -#include "pybind11/pytypes.h" // py::kwargs -#include "pybind11/stl.h" // support for STL types: std::variant - -#include // std::size_t -#include // std::make_unique -#include // std::vector - -namespace py = pybind11; - -void init_environment(py::module_ &m) { - // use its own submodule for the environment related bindings - py::module_ env_module = m.def_submodule("environment", "a module containing all environment initialization and finalization functionality"); - - // bind enum class - py::enum_(m, "Status") - .value("UNINITIALIZED", plssvm::environment::status::uninitialized, "the backend environment hasn't been initialized or finalized yet") - .value("INITIALIZED", plssvm::environment::status::initialized, "the backend environment has been initialized but not finalized yet") - .value("FINALIZED", plssvm::environment::status::finalized, "the backend environment has already been initialized and finalized") - .value("UNNECESSARY", plssvm::environment::status::unnecessary, "no backend environment initialization or finalization necessary"); - - // bind free functions - env_module.def("get_backend_status", &plssvm::environment::get_backend_status, "get the environment status for the provided backend"); - env_module.def("is_initialization_necessary", &plssvm::environment::is_initialization_necessary, "check if the provided backend needs a special environment initialization"); - - env_module.def("initialize", [](const py::kwargs &args) { - // check for valid keys - check_kwargs_for_correctness(args, { "backends" }); - if (args.contains("backends")) { - plssvm::environment::initialize(args["backends"].cast>()); - } else { - plssvm::environment::initialize(); - } }, "initialize all available backends or only the optionally provided once"); - env_module.def("initialize", [](std::vector cmd_args, const py::kwargs &args) { - std::vector cmd_args_ptr(cmd_args.size()); - for (std::size_t i = 0; i < cmd_args.size(); ++i) { - cmd_args_ptr[i] = cmd_args[i].data(); - } - // assemble command line arguments - int argc = static_cast(cmd_args_ptr.size()); - char **argv = cmd_args_ptr.data(); - - // check for valid keys - check_kwargs_for_correctness(args, { "backends" }); - if (args.contains("backends")) { - plssvm::environment::initialize(argc, argv, args["backends"].cast>()); - } else { - plssvm::environment::initialize(argc, argv); - } }, "initialize all available backends or only the optionally provided once using the provided command line parameters"); - - env_module.def("finalize", [](const py::kwargs &args) { - // check for valid keys - check_kwargs_for_correctness(args, { "backends" }); - if (args.contains("backends")) { - plssvm::environment::finalize(args["backends"].cast>()); - } else { - plssvm::environment::finalize(); - } }, "finalize all available backends or only the optionally provided once"); - - // bind plssvm::environment::scope_guard - py::class_(env_module, "ScopeGuard") - .def(py::init([](const py::kwargs &args) { - // check for valid keys - check_kwargs_for_correctness(args, { "backends" }); - if (args.contains("backends")) { - return std::make_unique(args["backends"].cast>()); - } else { - return std::make_unique(); - } - }), - "create a new scope_guard and initialize all available backends or only the optionally 
provided once") - .def(py::init([](std::vector cmd_args, const py::kwargs &args) { - std::vector cmd_args_ptr(cmd_args.size()); - for (std::size_t i = 0; i < cmd_args.size(); ++i) { - cmd_args_ptr[i] = cmd_args[i].data(); - } - // assemble command line arguments - int argc = static_cast(cmd_args_ptr.size()); - char **argv = cmd_args_ptr.data(); - - // check for valid keys - check_kwargs_for_correctness(args, { "backends" }); - if (args.contains("backends")) { - return std::make_unique(argc, argv, args["backends"].cast>()); - } else { - return std::make_unique(argc, argv); - } - }), - "create a new scope_guard and initialize all available backends or only the optionally provided once using the provided command line parameters") - .def("backends", &plssvm::environment::scope_guard::backends, "return all initialized backends"); -} diff --git a/bindings/Python/main.cpp b/bindings/Python/main.cpp index 170afa2c3..1c1248fb2 100644 --- a/bindings/Python/main.cpp +++ b/bindings/Python/main.cpp @@ -7,6 +7,7 @@ * See the LICENSE.md file in the project root for full license information. */ +#include "plssvm/environment.hpp" // plssvm::environment::{initialize, finalize} #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception #include "pybind11/pybind11.h" // PYBIND11_MODULE, py::module_, py::exception, py::register_exception_translator @@ -32,7 +33,6 @@ void init_parameter(py::module_ &); void init_model(py::module_ &); void init_data_set(py::module_ &); void init_version(py::module_ &); -void init_environment(py::module_ &); void init_exceptions(py::module_ &, const py::exception &); void init_csvm(py::module_ &); void init_openmp_csvm(py::module_ &, const py::exception &); @@ -42,11 +42,20 @@ void init_cuda_csvm(py::module_ &, const py::exception &); void init_hip_csvm(py::module_ &, const py::exception &); void init_opencl_csvm(py::module_ &, const py::exception &); void init_sycl(py::module_ &, const py::exception &); +void init_kokkos_csvm(py::module_ &, const py::exception &); void init_sklearn(py::module_ &); PYBIND11_MODULE(plssvm, m) { m.doc() = "Parallel Least Squares Support Vector Machine"; + // automatically initialize the environments + plssvm::environment::initialize(); + + // automatically finalize the environments + m.add_object("_cleanup", py::capsule([]() { + plssvm::environment::finalize(); + })); + // register PLSSVM base exception static py::exception base_exception(m, "PLSSVMError"); py::register_exception_translator([](std::exception_ptr p) { @@ -80,7 +89,6 @@ PYBIND11_MODULE(plssvm, m) { init_model(m); init_data_set(m); init_version(m); - init_environment(m); init_exceptions(m, base_exception); init_csvm(m); @@ -106,6 +114,9 @@ PYBIND11_MODULE(plssvm, m) { #if defined(PLSSVM_HAS_SYCL_BACKEND) init_sycl(m, base_exception); #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + init_kokkos_csvm(m, base_exception); +#endif init_sklearn(m); } diff --git a/cmake/plssvm/plssvmConfig.cmake.in b/cmake/plssvm/plssvmConfig.cmake.in index 9636e125e..beb6801bc 100644 --- a/cmake/plssvm/plssvmConfig.cmake.in +++ b/cmake/plssvm/plssvmConfig.cmake.in @@ -25,7 +25,7 @@ find_dependency(fmt REQUIRED) include("${CMAKE_CURRENT_LIST_DIR}/plssvmTargets.cmake") # list all available libraries -set(PLSSVM_SUPPORTED_COMPONENTS "OpenMP;HPX;CUDA;HIP;OpenCL;DPCPP;AdaptiveCpp;stdpar") +set(PLSSVM_SUPPORTED_COMPONENTS "OpenMP;HPX;CUDA;HIP;OpenCL;DPCPP;AdaptiveCpp;Kokkos;stdpar") set(PLSSVM_DISABLED_COMPONENTS "${PLSSVM_SUPPORTED_COMPONENTS}") # check which libraries are available diff --git 
a/cmake/plssvm/plssvmKokkosTargets.cmake b/cmake/plssvm/plssvmKokkosTargets.cmake new file mode 100644 index 000000000..7ec32069a --- /dev/null +++ b/cmake/plssvm/plssvmKokkosTargets.cmake @@ -0,0 +1,21 @@ +## Authors: Alexander Van Craen, Marcel Breyer +## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved +## License: This file is part of the PLSSVM project which is released under the MIT license. +## See the LICENSE.md file in the project root for full license information. +######################################################################################################################## + +include(CMakeFindDependencyMacro) + +# check if the Kokkos backend is available +if (TARGET plssvm::plssvm-Kokkos) + # enable Kokkos + find_dependency(Kokkos CONFIG) + # set alias targets + add_library(plssvm::Kokkos ALIAS plssvm::plssvm-Kokkos) + add_library(plssvm::kokkos ALIAS plssvm::plssvm-Kokkos) + # set COMPONENT to be found + set(plssvm_Kokkos_FOUND ON) +else () + # set COMPONENT to be NOT found + set(plssvm_Kokkos_FOUND OFF) +endif () \ No newline at end of file diff --git a/cmake/presets/all.json b/cmake/presets/all.json index 52d77a14c..978922667 100644 --- a/cmake/presets/all.json +++ b/cmake/presets/all.json @@ -13,7 +13,8 @@ "PLSSVM_ENABLE_CUDA_BACKEND": "AUTO", "PLSSVM_ENABLE_HIP_BACKEND": "AUTO", "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO", - "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO" + "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO", + "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO" } }, { @@ -28,6 +29,7 @@ "PLSSVM_ENABLE_HIP_BACKEND": "AUTO", "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO", "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO", + "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO", "PLSSVM_ENABLE_LANGUAGE_BINDINGS": "ON", "PLSSVM_ENABLE_PYTHON_BINDINGS": "ON" } @@ -43,7 +45,8 @@ "PLSSVM_ENABLE_CUDA_BACKEND": "AUTO", "PLSSVM_ENABLE_HIP_BACKEND": "AUTO", "PLSSVM_ENABLE_OPENCL_BACKEND": "AUTO", - "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO" + "PLSSVM_ENABLE_SYCL_BACKEND": "AUTO", + "PLSSVM_ENABLE_KOKKOS_BACKEND": "AUTO" } } ], @@ -84,7 +87,7 @@ "inherits": "common", "filter": { "include": { - "name": "OpenMP.*|HPX.*|CUDA.*|HIP.*|OpenCL.*|AdaptiveCpp.*|DPCPP.*" + "name": "OpenMP.*|HPX.*|CUDA.*|HIP.*|OpenCL.*|AdaptiveCpp.*|DPCPP.*|Kokkos.*" } } } diff --git a/cmake/presets/common.json b/cmake/presets/common.json index 68da8cd61..82bbea9e9 100644 --- a/cmake/presets/common.json +++ b/cmake/presets/common.json @@ -17,7 +17,8 @@ "PLSSVM_ENABLE_CUDA_BACKEND": "OFF", "PLSSVM_ENABLE_HIP_BACKEND": "OFF", "PLSSVM_ENABLE_OPENCL_BACKEND": "OFF", - "PLSSVM_ENABLE_SYCL_BACKEND": "OFF" + "PLSSVM_ENABLE_SYCL_BACKEND": "OFF", + "PLSSVM_ENABLE_KOKKOS_BACKEND": "OFF" } }, { diff --git a/cmake/presets/kokkos.json b/cmake/presets/kokkos.json new file mode 100644 index 000000000..620e940e5 --- /dev/null +++ b/cmake/presets/kokkos.json @@ -0,0 +1,142 @@ +{ + "version": 6, + "include": [ + "common.json" + ], + "configurePresets": [ + { + "name": "kokkos", + "displayName": "Kokkos backend", + "inherits": "build", + "cacheVariables": { + "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON" + } + }, + { + "name": "kokkos_python", + "displayName": "Kokkos backend + Python bindings", + "inherits": "build", + "cacheVariables": { + "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON", + "PLSSVM_ENABLE_LANGUAGE_BINDINGS": "ON", + "PLSSVM_ENABLE_PYTHON_BINDINGS": "ON" + } + }, + { + "name": "kokkos_test", + "displayName": "Kokkos backend tests", + "inherits": "test", + "cacheVariables": { + "PLSSVM_ENABLE_KOKKOS_BACKEND": "ON" + } + } + ], + "buildPresets": [ + { + "name": "kokkos", + 
"displayName": "Kokkos backend", + "configurePreset": "kokkos", + "configuration": "RelWithDebInfo", + "inherits": "common" + }, + { + "name": "kokkos_python", + "displayName": "Kokkos backend + Python bindings", + "configurePreset": "kokkos_python", + "configuration": "RelWithDebInfo", + "inherits": "common" + }, + { + "name": "kokkos_test", + "displayName": "Kokkos backend tests", + "configurePreset": "kokkos_test", + "configuration": "Debug", + "inherits": "common" + } + ], + "testPresets": [ + { + "name": "kokkos_test", + "displayName": "Kokkos backend all tests", + "configurePreset": "kokkos_test", + "inherits": "common" + }, + { + "name": "kokkos_backend_test", + "displayName": "Kokkos backend specific tests", + "configurePreset": "kokkos_test", + "inherits": "common", + "filter": { + "include": { + "name": "Kokkos.*" + } + } + } + ], + "workflowPresets": [ + { + "name": "kokkos", + "displayName": "Kokkos backend workflow", + "steps": [ + { + "name": "kokkos", + "type": "configure" + }, + { + "name": "kokkos", + "type": "build" + } + ] + }, + { + "name": "kokkos_python", + "displayName": "Kokkos backend + Python bindings workflow", + "steps": [ + { + "name": "kokkos_python", + "type": "configure" + }, + { + "name": "kokkos_python", + "type": "build" + } + ] + }, + { + "name": "kokkos_test", + "displayName": "Kokkos test workflow", + "steps": [ + { + "name": "kokkos_test", + "type": "configure" + }, + { + "name": "kokkos_test", + "type": "build" + }, + { + "name": "kokkos_test", + "type": "test" + } + ] + }, + { + "name": "kokkos_backend_test", + "displayName": "Kokkos backend test workflow", + "steps": [ + { + "name": "kokkos_test", + "type": "configure" + }, + { + "name": "kokkos_test", + "type": "build" + }, + { + "name": "kokkos_backend_test", + "type": "test" + } + ] + } + ] +} \ No newline at end of file diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 3bf366b62..ec8c0c40f 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -32,6 +32,7 @@ set(DOXYGEN_PROJECT_LOGO "${PROJECT_SOURCE_DIR}/docs/resources/logo_90x55.png") set(DOXYGEN_EXCLUDE_SYMBOLS "*_HPP_") set(DOXYGEN_DOT_IMAGE_FORMAT "svg") +set(DOXYGEN_DOT_GRAPH_MAX_NODES "100") set(DOXYGEN_INTERACTIVE_SVG "YES") set(DOXYGEN_INCLUDE_GRAPH "NO") set(DOXYGEN_EXTRACT_PRIVATE "YES") diff --git a/docs/plssvm-predict.1.in b/docs/plssvm-predict.1.in index bb9e29b6b..17d6081fa 100644 --- a/docs/plssvm-predict.1.in +++ b/docs/plssvm-predict.1.in @@ -22,6 +22,8 @@ choose the target platform: @PLSSVM_PLATFORM_NAME_LIST@ (default: automatic) @PLSSVM_SYCL_MANPAGE_ENTRY@ +@PLSSVM_KOKKOS_MANPAGE_ENTRY@ + @PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY@ .TP diff --git a/docs/plssvm-train.1.in b/docs/plssvm-train.1.in index b52853dac..fad2e4fba 100644 --- a/docs/plssvm-train.1.in +++ b/docs/plssvm-train.1.in @@ -17,7 +17,10 @@ plssvm-train is a utility to train an LS-SVM using different backends to target set type of kernel function. 
0 -- linear: u'*v 1 -- polynomial: (gamma*u'*v + coef0)^degree - 2 -- radial basis function: exp(-gamma*|u-v|^2) (default: 2) + 2 -- radial basis function: exp(-gamma*|u-v|^2) + 3 -- sigmoid: tanh(gamma*u'*v+coef0) + 4 -- laplacian: exp(-gamma*|u-v|_1) + 5 -- chi_squared: exp(-gamma*sum_i((x[i]-y[i])^2/(x[i]+y[i]))) (default: 2) .TP .B -d, --degree arg @@ -25,7 +28,7 @@ set degree in kernel function (default: 3) .TP .B -g, --gamma arg -set gamma in kernel function (default: 1 / num_features) +set gamma in kernel function (default: automatic) .TP .B -r, --coef0 arg @@ -61,6 +64,8 @@ choose the target platform: @PLSSVM_PLATFORM_NAME_LIST@ (default: automatic) @PLSSVM_SYCL_MANPAGE_ENTRY@ +@PLSSVM_KOKKOS_MANPAGE_ENTRY@ + @PLSSVM_PERFORMANCE_TRACKER_MANPAGE_ENTRY@ .TP diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox index 84e561a46..fd23efcbc 100644 --- a/docs/resources/dirs.dox +++ b/docs/resources/dirs.dox @@ -153,6 +153,61 @@ * @brief Directory containing kernel implementations for utility functions using the HIP backend. */ +/** + * @dir include/plssvm/backends/Kokkos + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation for the Kokkos backend. + */ + +/** + * @dir include/plssvm/backends/Kokkos/detail + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing implementation details for the Kokkos backend. + */ + +/** + * @dir include/plssvm/backends/Kokkos/kernel + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing all kernels for the Kokkos backend. + */ + +/** + * @dir include/plssvm/backends/Kokkos/kernel/cg_explicit + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing kernel implementations for the explicit CG algorithm using the Kokkos backend. + */ + +/** + * @dir include/plssvm/backends/Kokkos/kernel/cg_implicit + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing kernel implementations for the implicit CG algorithm using the Kokkos backend. 
+ */ + /** * @dir include/plssvm/backends/OpenCL * @author Alexander Van Craen @@ -345,7 +400,7 @@ * @dir include/plssvm/backends/HPX/detail * @author Alexander Van Craen * @author Marcel Breyer - * @authir Alexander Strack + * @author Alexander Strack * @copyright 2018-today The PLSSVM project - All Rights Reserved * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. @@ -381,7 +436,7 @@ * @dir include/plssvm/backends/HPX/kernel/cg_implicit * @author Alexander Van Craen * @author Marcel Breyer - * @author Alexander Strack + * @author Alexander Strack * @copyright 2018-today The PLSSVM project - All Rights Reserved * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. diff --git a/examples/python/main.py b/examples/python/main.py index a200524ff..5efd985ec 100644 --- a/examples/python/main.py +++ b/examples/python/main.py @@ -1,9 +1,6 @@ import plssvm from sklearn.metrics import classification_report -# correctly initialize and finalize environments -environment_guard = plssvm.environment.ScopeGuard() - try: # create a new C-SVM parameter set, explicitly overriding the default kernel function params = plssvm.Parameter(kernel_type=plssvm.KernelFunctionType.POLYNOMIAL) diff --git a/examples/python/sklearn_like_svc.py b/examples/python/sklearn_like_svc.py index 57ab3e148..4b5e5f44f 100644 --- a/examples/python/sklearn_like_svc.py +++ b/examples/python/sklearn_like_svc.py @@ -1,9 +1,6 @@ from sklearn.datasets import make_classification import plssvm -# correctly initialize and finalize environments -environment_guard = plssvm.environment.ScopeGuard() - num_samples = 2**8 num_features = 2**6 diff --git a/include/plssvm/backend_types.hpp b/include/plssvm/backend_types.hpp index 449f5dcdd..90a19bd5c 100644 --- a/include/plssvm/backend_types.hpp +++ b/include/plssvm/backend_types.hpp @@ -36,6 +36,8 @@ enum class backend_type { automatic, /** [OpenMP](https://www.openmp.org/) to target CPUs only (currently no OpenMP target offloading support). */ openmp, + /** [HPX] (https://hpx.stellar-group.org/) to target CPUs only (currently no GPU support). */ + hpx, /** [C++ stdpar](https://en.cppreference.com/w/cpp/algorithm#Execution_policies) to target CPUs and GPUs from different vendors using C++ standard library parallel algorithms. */ stdpar, /** [CUDA](https://developer.nvidia.com/cuda-zone) to target NVIDIA GPUs only. */ @@ -46,8 +48,8 @@ enum class backend_type { opencl, /** [SYCL](https://www.khronos.org/sycl/) to target CPUs and GPUs from different vendors. Currently tested SYCL implementations are [DPC++](https://github.com/intel/llvm) and [AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL). */ sycl, - /** [HPX] (https://hpx.stellar-group.org/) to target CPUs only (currently no GPU support). */ - hpx + /** [Kokkos](https://github.com/kokkos/kokkos) to target CPUs and GPUs from different vendors. */ + kokkos }; /** @@ -93,6 +95,7 @@ namespace hip { class csvm; } namespace opencl { class csvm; } namespace adaptivecpp { class csvm; } namespace dpcpp { class csvm; } +namespace kokkos { class csvm; } // clang-format on @@ -182,6 +185,15 @@ struct csvm_to_backend_type { constexpr static sycl::implementation_type impl = sycl::implementation_type::dpcpp; }; +/** + * @brief Sets the `value` to `plssvm::backend_type::kokkos` for the Kokkos C-SVM. 
+ */ +template <> +struct csvm_to_backend_type { + /// The enum value representing the Kokkos backend. + constexpr static backend_type value = backend_type::kokkos; +}; + } // namespace detail /// @endcond diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp new file mode 100644 index 000000000..2ff662933 --- /dev/null +++ b/include/plssvm/backends/Kokkos/csvm.hpp @@ -0,0 +1,217 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines a C-SVM using the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_ +#pragma once + +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/backends/Kokkos/detail/device_ptr.hpp" // plssvm::kokkos::detail::device_ptr +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp" // plssvm::kokkos::detail::pinned_memory +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/csvm.hpp" // plssvm::detail::csvm_backend_exists +#include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES +#include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::parameter +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "igor/igor.hpp" // igor::parser + +#include // std::size_t +#include // std::true_type +#include // std::forward +#include // std::vector + +namespace plssvm { + +namespace kokkos { + +/** + * @brief A C-SVM implementation using Kokkos as backend. + */ +class csvm : public ::plssvm::detail::gpu_csvm { + protected: + // protected for the test mock class + /// The template base type of the Kokkos C-SVM class. + using base_type = ::plssvm::detail::gpu_csvm; + + using base_type::data_distribution_; + using base_type::devices_; + + public: + using base_type::device_ptr_type; + using typename base_type::pinned_memory_type; + using typename base_type::queue_type; + + /** + * @brief Construct a new C-SVM using the Kokkos backend with the parameters given through @p params. + * @param[in] params struct encapsulating all possible parameters + * @throws plssvm::exception all exceptions thrown in the base class constructor + * @throws plssvm::kokkos::backend_exception if the requested target is not available + * @throws plssvm::kokkos::backend_exception if no device for the requested target was found + */ + explicit csvm(parameter params = {}); + /** + * @brief Construct a new C-SVM using the Kokkos backend on the @p target platform with the parameters given through @p params. 
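+     * @details A minimal usage sketch (the chosen target platform is only an example):
+     * @code
+     * const plssvm::kokkos::csvm svm{ plssvm::target_platform::gpu_nvidia, plssvm::parameter{} };
+     * @endcode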
+ * @param[in] target the target platform used for this C-SVM + * @param[in] params struct encapsulating all possible SVM parameters + * @throws plssvm::exception all exceptions thrown in the base class constructor + * @throws plssvm::kokkos::backend_exception if the requested target is not available + * @throws plssvm::kokkos::backend_exception if no device for the requested target was found + */ + explicit csvm(target_platform target, parameter params = {}); + + /** + * @brief Construct a new C-SVM using the Kokkos backend and the optionally provided @p named_args. + * @param[in] named_args the additional optional named arguments + * @throws plssvm::exception all exceptions thrown in the base class constructor + * @throws plssvm::kokkos::backend_exception if the requested target is not available + * @throws plssvm::kokkos::backend_exception if no device for the requested target was found + */ + template )> + explicit csvm(Args &&...named_args) : + csvm{ plssvm::target_platform::automatic, std::forward(named_args)... } { } + + /** + * @brief Construct a new C-SVM using the Kokkos backend on the @p target platform and the optionally provided @p named_args. + * @param[in] target the target platform used for this C-SVM + * @param[in] named_args the additional optional named-parameters + * @throws plssvm::exception all exceptions thrown in the base class constructor + * @throws plssvm::kokkos::backend_exception if the requested target is not available + * @throws plssvm::kokkos::backend_exception if no device for the requested target was found + */ + template )> + explicit csvm(const target_platform target, Args &&...named_args) : + base_type{ std::forward(named_args)... } { + // check igor parameter + igor::parser parser{ std::forward(named_args)... }; + + // check whether a specific Kokkos execution space has been requested + if constexpr (parser.has(kokkos_execution_space)) { + // compile time check: the value must have the correct type + space_ = ::plssvm::detail::get_value_from_named_parameter(parser, kokkos_execution_space); + } + this->init(target); + } + + /** + * @copydoc plssvm::csvm::csvm(const plssvm::csvm &) + */ + csvm(const csvm &) = delete; + /** + * @copydoc plssvm::csvm::csvm(plssvm::csvm &&) noexcept + */ + csvm(csvm &&) noexcept = default; + /** + * @copydoc plssvm::csvm::operator=(const plssvm::csvm &) + */ + csvm &operator=(const csvm &) = delete; + /** + * @copydoc plssvm::csvm::operator=(plssvm::csvm &&) noexcept + */ + csvm &operator=(csvm &&) noexcept = default; + /** + * @brief Wait for all operations on all Kokkos devices to finish. + * @details Terminates the program, if any exception is thrown. + */ + ~csvm() override; + + /** + * @brief Return the currently used Kokkos `execution_space`. + * @return the execution space (`[[nodiscard]]`) + */ + [[nodiscard]] execution_space get_execution_space() const noexcept { return space_; } + + protected: + /** + * @brief Initialize all important states related to the Kokkos backend. 
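+     * @details Called by every constructor after the (optional) Kokkos execution space has been parsed from the named parameters, e.g. (usage sketch; the exact named-parameter spelling is an assumption):
+     * @code
+     * const plssvm::kokkos::csvm svm{ plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda };
+     * @endcode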
+ * @param[in] target the target platform to use + * @throws plssvm::kokkos::backend_exception if the requested target is not available + * @throws plssvm::kokkos::backend_exception if no device for the requested target was found + */ + void init(target_platform target); + + /** + * @copydoc plssvm::csvm::get_device_memory + */ + [[nodiscard]] std::vector<::plssvm::detail::memory_size> get_device_memory() const final; + /** + * @copydoc plssvm::csvm::get_max_mem_alloc_size + */ + [[nodiscard]] std::vector<::plssvm::detail::memory_size> get_max_mem_alloc_size() const final; + /** + * @copydoc plssvm::detail::gpu_csvm::get_max_work_group_size + */ + [[nodiscard]] std::size_t get_max_work_group_size(std::size_t device_id) const final; + /** + * @copydoc plssvm::detail::gpu_csvm::get_max_grid_size + */ + [[nodiscard]] ::plssvm::detail::dim_type get_max_grid_size(std::size_t device_id) const override; + + //***************************************************// + // fit // + //***************************************************// + /** + * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_explicit + */ + [[nodiscard]] device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const final; + /** + * @copydoc plssvm::detail::gpu_csvm::run_blas_level_3_kernel_explicit + */ + void run_blas_level_3_kernel_explicit(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, real_type beta, device_ptr_type &C_d) const final; + /** + * @copydoc plssvm::detail::gpu_csvm::run_assemble_kernel_matrix_implicit_blas_level_3 + */ + void run_assemble_kernel_matrix_implicit_blas_level_3(std::size_t device_id, const ::plssvm::detail::execution_range &exec, real_type alpha, const device_ptr_type &A_d, const parameter ¶ms, const device_ptr_type &q_red_d, real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const final; + /** + * @copydoc plssvm::detail::gpu_csvm::run_inplace_matrix_addition + */ + void run_inplace_matrix_addition(std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const override; + /** + * @copydoc plssvm::detail::gpu_csvm::run_inplace_matrix_scale + */ + void run_inplace_matrix_scale(std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, real_type scale) const override; + + //***************************************************// + // predict, score // + //***************************************************// + /** + * @copydoc plssvm::detail::gpu_csvm::run_w_kernel + */ + [[nodiscard]] device_ptr_type run_w_kernel(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const device_ptr_type &alpha_d, const device_ptr_type &sv_d) const final; + /** + * @copydoc plssvm::detail::gpu_csvm::run_predict_kernel + */ + [[nodiscard]] device_ptr_type run_predict_kernel(std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const final; + + /// The used Kokkos execution space. 
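+    /// Value-initialized to `execution_space::automatic` (the first enumerator), i.e., the execution space is determined automatically unless explicitly requested.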
+ execution_space space_{}; +}; + +} // namespace kokkos + +namespace detail { + +/** + * @brief Sets the `value` to `true` since C-SVMs using the Kokkos backend are available. + */ +template <> +struct csvm_backend_exists : std::true_type { }; + +} // namespace detail + +} // namespace plssvm + +#endif // PLSSVM_BACKENDS_KOKKOS_CSVM_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp new file mode 100644 index 000000000..559c9e75c --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/conditional_execution.hpp @@ -0,0 +1,238 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Conditionally defined macros for the different available Kokkos ExecutionSpaces. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_ +#pragma once + +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space + +#include "Kokkos_Core.hpp" // Kokkos macros + +#include "fmt/core.h" // fmt::format + +#include // std::invoke + +namespace plssvm::kokkos::detail { + +//***************************************************// +// Kokkos::Cuda // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA` macro if `KOKKOS_ENABLE_CUDA` is defined, i.e., the Kokkos CUDA ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_CUDA` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA` macro if `KOKKOS_ENABLE_CUDA` is defined, i.e., the Kokkos CUDA ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_CUDA` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + */ +#if defined(KOKKOS_ENABLE_CUDA) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::cuda) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::cuda) } +#endif + +//***************************************************// +// Kokkos::HIP // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP` macro if `KOKKOS_ENABLE_HIP` is defined, i.e., the Kokkos HIP ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_HIP` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. 
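+ * For example (illustrative sketch; `query_hip_device_count` is a hypothetical helper):
+ * @code
+ * // inside a function: expands to `return std::invoke(...);` if HIP is available, otherwise throws a backend_exception
+ * PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP([]() { return query_hip_device_count(); });
+ * @endcode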
+ * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP` macro if `KOKKOS_ENABLE_HIP` is defined, i.e., the Kokkos HIP ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_HIP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + */ +#if defined(KOKKOS_ENABLE_HIP) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hip) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hip) } +#endif + +//***************************************************// +// Kokkos::SYCL // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL` macro if `KOKKOS_ENABLE_SYCL` is defined, i.e., the Kokkos SYCL ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_SYCL` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL` macro if `KOKKOS_ENABLE_SYCL` is defined, i.e., the Kokkos SYCL ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_SYCL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + */ +#if defined(KOKKOS_ENABLE_SYCL) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::sycl) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::sycl) } +#endif + +//***************************************************// +// Kokkos::Experimental::HPX // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX` macro if `KOKKOS_ENABLE_HPX` is defined, i.e., the Kokkos HPX ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_HPX` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX` macro if `KOKKOS_ENABLE_HPX` is defined, i.e., the Kokkos HPX ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_HPX` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. 
+ */ +#if defined(KOKKOS_ENABLE_HPX) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HPX(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hpx) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::hpx) } +#endif + +//***************************************************// +// Kokkos::OpenMP // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP` macro if `KOKKOS_ENABLE_OPENMP` is defined, i.e., the Kokkos OpenMP ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENMP` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP` macro if `KOKKOS_ENABLE_OPENMP` is defined, i.e., the Kokkos OpenMP ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENMP` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + */ +#if defined(KOKKOS_ENABLE_OPENMP) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMP(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp) } +#endif + +//***************************************************// +// Kokkos::Experimental::OpenMPTarget // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET` macro if `KOKKOS_ENABLE_OPENMPTARGET` is defined, i.e., the Kokkos OpenMP target offloading ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENMPTARGET` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET` macro if `KOKKOS_ENABLE_OPENMPTARGET` is defined, i.e., the Kokkos OpenMP target offloading ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENMPTARGET` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. 
+ */ +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENMPTARGET(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp_target) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openmp_target) } +#endif + +//***************************************************// +// Kokkos::Experimental::OpenACC // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC` macro if `KOKKOS_ENABLE_OPENACC` is defined, i.e., the Kokkos OpenACC ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENACC` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC` macro if `KOKKOS_ENABLE_OPENACC` is defined, i.e., the Kokkos OpenACC ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_OPENACC` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + */ +#if defined(KOKKOS_ENABLE_OPENACC) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_OPENACC(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openacc) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::openacc) } +#endif + +//***************************************************// +// Kokkos::Threads // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS` macro if `KOKKOS_ENABLE_THREADS` is defined, i.e., the Kokkos std::thread ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_THREADS` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS` macro if `KOKKOS_ENABLE_THREADS` is defined, i.e., the Kokkos std::thread ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_THREADS` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. 
+ */ +#if defined(KOKKOS_ENABLE_THREADS) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_THREADS(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::threads) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::threads) } +#endif + +//***************************************************// +// Kokkos::Serial // +//***************************************************// + +/** + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL` macro if `KOKKOS_ENABLE_SERIAL` is defined, i.e., the Kokkos serial ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_SERIAL` is enabled, invokes the provided function (normally a lambda function) and returns the return value, otherwise throws an exception. + * @note This ExecutionSpace *should* always be available! + * + * @def PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL + * @brief Defines the `PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL` macro if `KOKKOS_ENABLE_SERIAL` is defined, i.e., the Kokkos serial ExecutionSpace is available. + * @details If `KOKKOS_ENABLE_SERIAL` is enabled, invokes the provided function (normally a lambda function), otherwise throws an exception. + * @note This ExecutionSpace *should* always be available! + */ +#if defined(KOKKOS_ENABLE_SERIAL) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(func) return std::invoke(func) + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) std::invoke(func) +#else + #define PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SERIAL(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::serial) } + #define PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL(func) \ + throw backend_exception { fmt::format("The Kokkos ExecutionSpace {} is not available!", execution_space::serial) } +#endif + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONDITIONAL_EXECUTION_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp new file mode 100644 index 000000000..80d3f8cd9 --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp @@ -0,0 +1,73 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Function to list all available execution spaces at compile time. + * @note Must be a separate file such that the Kokkos header must not be included in the "execution_space.hpp" file. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_ + +/** + * @def PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES + * @brief Set the macro `PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES` to 0 if it isn't already defined, i.e., no Kokkos execution space is available. 
+ * Will normally be propagated by CMake with the number of available Kokkos execution spaces. + */ +#if !defined(PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES) + #define PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES 0 +#endif + +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space + +#include // std::array + +namespace plssvm::kokkos::detail { + +/** + * @brief List all available Kokkos::ExecutionSpaces at compile time. + * @details At least one execution space must **always** be available! + * @return a `std::array` containing all available execution spaces (`[[nodiscard]]`) + */ +[[nodiscard]] inline constexpr auto constexpr_available_execution_spaces() noexcept { + // Note: The execution_space::automatic value may NEVER be added here! + // Note: the trailing comma is explicitly allowed by the standard + // Note: the order is intentionally chosen this way -> the order of the entries determines the priority when using a backend to run our code + return std::array{ +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA) + execution_space::cuda, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HIP) + execution_space::hip, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL) + execution_space::sycl, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMPTARGET) + execution_space::openmp_target, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENACC) + execution_space::openacc, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMP) + execution_space::openmp, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_THREADS) + execution_space::threads, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) + execution_space::hpx, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SERIAL) + execution_space::serial, +#endif + }; +} + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_EXECUTION_SPACES_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp new file mode 100644 index 000000000..ad067d00b --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp @@ -0,0 +1,134 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Small wrapper around a Kokkos::View. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_ +#pragma once + +#include "plssvm/backends/gpu_device_ptr.hpp" // plssvm::detail::gpu_device_ptr +#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp" // plssvm::kokkos::detail::device_view_wrapper +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/shape.hpp" // plssvm::shape + +#include // std::size_t + +namespace plssvm::kokkos::detail { + +/** + * @brief Small wrapper class around a Kokkos view together with commonly used device functions. + * @tparam T the type of the kernel view to wrap + */ +template +class device_ptr : public ::plssvm::detail::gpu_device_ptr, device_ptr> { + /// The template base type of the Kokkos device_ptr class. 
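+    /// It provides the backend-agnostic interface (copy, fill, memset, ...) in terms of the Kokkos-specific overrides declared below.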
+ using base_type = ::plssvm::detail::gpu_device_ptr, device_ptr>; + + using base_type::data_; + using base_type::queue_; + using base_type::shape_; + + public: + // Be able to use overloaded base class functions. + using base_type::copy_to_device; + using base_type::copy_to_device_strided; + using base_type::copy_to_host; + using base_type::copy_to_other_device; + using base_type::fill; + using base_type::memset; + + using typename base_type::const_host_pointer_type; + using typename base_type::device_pointer_type; + using typename base_type::host_pointer_type; + using typename base_type::queue_type; + using typename base_type::size_type; + using typename base_type::value_type; + + /** + * @brief Default construct a Kokkos device_ptr with a size of 0. + * @details Always associated with device 0. + */ + device_ptr() = default; + /** + * @brief Allocates `size * sizeof(T)` bytes in the Kokkos execution space @p exec. + * @param[in] size the number of elements represented by the device_ptr + * @param[in] device the device wrapper + */ + explicit device_ptr(size_type size, const device_wrapper &device); + /** + * @brief Allocates `shape.x * shape.y * sizeof(T)` bytes in the Kokkos execution space @p exec. + * @param[in] shape the number of elements represented by the device_ptr + * @param[in] device the device wrapper + */ + explicit device_ptr(plssvm::shape shape, const device_wrapper &device); + /** + * @brief Allocates `(shape.x + padding.x) * (shape.y + padding.y) * sizeof(T)` bytes in the Kokkos execution space @p exec. + * @param[in] shape the number of elements represented by the device_ptr + * @param[in] padding the number of padding elements added to the extent values + * @param[in] device the device wrapper + */ + device_ptr(plssvm::shape shape, plssvm::shape padding, const device_wrapper &device); + + /** + * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(const plssvm::detail::gpu_device_ptr &) + */ + device_ptr(const device_ptr &) = delete; + /** + * @copydoc plssvm::detail::gpu_device_ptr::gpu_device_ptr(plssvm::detail::gpu_device_ptr &&) + */ + device_ptr(device_ptr &&other) noexcept = default; + + /** + * @copydoc plssvm::detail::gpu_device_ptr::operator=(const plssvm::detail::gpu_device_ptr &) + */ + device_ptr &operator=(const device_ptr &) = delete; + /** + * @copydoc plssvm::detail::gpu_device_ptr::operator=(plssvm::detail::gpu_device_ptr &&) + */ + device_ptr &operator=(device_ptr &&other) noexcept = default; + + /** + * @copydoc plssvm::detail::gpu_device_ptr::~gpu_device_ptr() + * @details Kokkos automatically frees the memory of a Kokkos::View if the View goes out of scope. 
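+     *          Consequently, no explicit deallocation is required here, which is why the destructor can simply be defaulted.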
+ */ + ~device_ptr() override = default; + + /** + * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type) + */ + void memset(int pattern, size_type pos, size_type num_bytes) override; + /** + * @copydoc plssvm::detail::gpu_device_ptr::fill(value_type, size_type, size_type) + */ + void fill(value_type value, size_type pos, size_type count) override; + /** + * @copydoc plssvm::detail::gpu_device_ptr::copy_to_device(const_host_pointer_type, size_type, size_type) + */ + void copy_to_device(const_host_pointer_type data_to_copy, size_type pos, size_type count) override; + /** + * @copydoc plssvm::detail::gpu_device_ptr::copy_to_device_strided(const_host_pointer_type, std::size_t, std::size_t, std::size_t) + */ + void copy_to_device_strided(const_host_pointer_type data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) override; + /** + * @copydoc plssvm::detail::gpu_device_ptr::copy_to_host(host_pointer_type, size_type, size_type) const + */ + void copy_to_host(host_pointer_type buffer, size_type pos, size_type count) const override; + /** + * @copydoc plssvm::detail::gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &, size_type, size_type) const + */ + void copy_to_other_device(device_ptr &target, size_type pos, size_type count) const override; +}; + +extern template class device_ptr; +extern template class device_ptr; + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_PTR_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp new file mode 100644 index 000000000..ea60bb1fd --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/device_view_wrapper.hpp @@ -0,0 +1,187 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief A wrapper around a Kokkos::View. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_ + +#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp" // plssvm::kokkos::detail::constexpr_available_execution_spaces +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" // plssvm::kokkos::execution_space_to_kokkos_type_t +#include "plssvm/detail/type_traits.hpp" // plssvm::detail::remove_cvref_t + +#include "Kokkos_Core.hpp" // Kokkos::View, Kokkos::ExecutionSpace + +#include // std::array +#include // std::size_t +#include // std::invoke +#include // std::make_index_sequence, std::index_sequence, std::move +#include // std::variant, std::get, std::visit + +namespace plssvm::kokkos::detail { + +namespace impl { + +/** + * @brief Uninstantiated base type to create a `std::variant` containing all available Kokkos::View types. 
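+ * @details For illustration only: if, e.g., only the CUDA and Serial execution spaces are enabled, the resulting variant is roughly of the form
+ * @code
+ * std::variant<Kokkos::View<T *, Kokkos::Cuda>, Kokkos::View<T *, Kokkos::Serial>>
+ * @endcode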
+ */ +template +struct create_view_variant_type_helper; + +/** + * @brief Helper struct to create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of + * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * @tparam T the value type of the underlying Kokkos::View + * @tparam Is the indices to index the `std::array` + */ +template +struct create_view_variant_type_helper> { + /// The array containing all available execution spaces. + constexpr static auto array = detail::constexpr_available_execution_spaces(); + /// The resulting variant type. + using type = std::variant>...>; +}; + +/** + * @brief Create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of + * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * @tparam T the value type of the underlying Kokkos::View + */ +template +struct create_view_variant_type { + /// The number of types in the final variant. + constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size(); + /// The final variant type. + using type = typename create_view_variant_type_helper>::type; +}; + +} // namespace impl + +/** + * @brief A wrapper class around a `std::variant` that contains all available Kokkos::View types. + * @tparam T the value type of the underlying Kokkos::View + */ +template +class device_view_wrapper { + public: + /// The `std::variant` type containing all Kokkos::View types. + using variant_type = typename impl::create_view_variant_type::type; + + /** + * @brief Default construct the `std::variant` wrapper. + */ + device_view_wrapper() = default; + + /** + * @brief Construct the wrapper using the provided Kokkos::View instance by forwarding its value to the underlying `std::variant`. + * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type of the Kokkos::View + * @param[in] view the Kokkos::View instance + */ + template + explicit device_view_wrapper(Kokkos::View &&view) : + v_{ std::move(view) } { } + + /** + * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type. + * @tparam space the `execution_space` enum value + * @return the Kokkos::View instance (`[[nodiscard]]`) + */ + template + [[nodiscard]] Kokkos::View> &get() { + return std::get>>(v_); + } + + /** + * @copydoc plssvm::kokkos::detail::device_view_wrapper::get + */ + template + [[nodiscard]] const Kokkos::View> &get() const { + return std::get>>(v_); + } + + /** + * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::View type. + * @return the `execution_space` enum value (`[[nodiscard]]`) + */ + [[nodiscard]] execution_space get_execution_space() const noexcept { + return detail::constexpr_available_execution_spaces()[v_.index()]; + } + + /** + * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally. 
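+     * @details Usage sketch, assuming a `device_view_wrapper<real_type>` named `wrapper`:
+     * @code
+     * wrapper.execute([](auto &view) { Kokkos::deep_copy(view, real_type{ 0.0 }); });
+     * @endcode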
+ * @tparam Func the type of the function + * @param[in] func the function to invoke + */ + template + void execute(const Func &func) { + // clang-format off + std::visit([&func](auto &view) { + std::invoke(func, view); + }, v_); + // clang-format on + } + + /** + * @copydoc plssvm::kokkos::detail::device_view_wrapper::execute + */ + template + void execute(const Func &func) const { + // clang-format off + std::visit([&func](const auto &view) { + std::invoke(func, view); + }, v_); + // clang-format on + } + + /** + * @brief Compare two device view wrappers for equality by comparing the wrapped `std::variant`s. + * @param[in] lhs the first device view wrapper + * @param[in] rhs the second device view wrapper + * @return `true` if both underlying `std::variant`s are equal, otherwise `false` (`[[nodiscard]]`) + */ + [[nodiscard]] friend bool operator==(const device_view_wrapper &lhs, const device_view_wrapper &rhs) noexcept { + return lhs.v_ == rhs.v_; + } + + /** + * @brief Compare two device view wrappers for inequality by comparing the wrapped `std::variant`s. + * @param[in] lhs the first device view wrapper + * @param[in] rhs the second device view wrapper + * @return `true` if both underlying `std::variant`s are unequal, otherwise `false` (`[[nodiscard]]`) + */ + [[nodiscard]] friend bool operator!=(const device_view_wrapper &lhs, const device_view_wrapper &rhs) noexcept { + return !(lhs == rhs); + } + + private: + /// The wrapped `std::variant` type. + variant_type v_; +}; + +/** + * @brief Given a execution @p space and the number of elements @p size, creates a Kokkos::View in the respective memory space. + * @tparam T the value type of the underlying Kokkos::View + * @param[in] device the device for which this view should be allocated + * @param[in] size the size of the Kokkos::View (number of elements **not** byte!) + * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::ExecutionSpace specified by @p space (`[[nodiscard]]`) + */ +template +[[nodiscard]] device_view_wrapper make_device_view_wrapper(const device_wrapper &device, const std::size_t size) { + return device.execute_and_return([&](const auto &value) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + + return device_view_wrapper{ Kokkos::View{ Kokkos::view_alloc(value, "device_ptr_view"), size } }; + }); +} + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_VIEW_WRAPPER_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp new file mode 100644 index 000000000..da0aaf755 --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/device_wrapper.hpp @@ -0,0 +1,199 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief A wrapper around a Kokkos::ExecutionSpace representing a single device. 
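+ * @details Since every Kokkos::ExecutionSpace is a distinct C++ type, the device selected at runtime is stored in a `std::variant` over all execution spaces enabled at compile time.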
+ */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_ + +#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp" // plssvm::kokkos::detail::constexpr_available_execution_spaces +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" // plssvm::kokkos::execution_space_to_kokkos_type_t +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include // std::array +#include // std::size_t +#include // std::invoke +#include // std::make_index_sequence, std::index_sequence, std::forward +#include // std::variant, std::get, std::visit +#include // std::vector + +namespace plssvm::kokkos::detail { + +namespace impl { + +/** + * @brief Uninstantiated base type to create a `std::variant` containing all available Kokkos::ExecutionSpace types. + */ +template +struct create_device_variant_type_helper; + +/** + * @brief Helper struct to create a `std::variant` containing all available Kokkos::ExecutionSpace types by iterating over the `std::array` of + * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * @tparam Is the indices to index the `std::array` + */ +template +struct create_device_variant_type_helper> { + /// The array containing all available execution spaces. + constexpr static auto array = detail::constexpr_available_execution_spaces(); + /// The resulting variant type. + using type = std::variant...>; +}; + +/** + * @brief Create a `std::variant` containing all available Kokkos::ExecutionSpace types by iterating over the `std::array` of + * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + */ +struct create_device_variant_type { + /// The number of types in the final variant. + constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size(); + /// The final variant type. + using type = typename create_device_variant_type_helper>::type; +}; + +} // namespace impl + +/** + * @brief A wrapper class around a `std::variant` that contains all available Kokkos::ExecutionSpace types. + */ +class device_wrapper { + public: + /// The `std::variant` type containing all Kokkos::ExecutionSpace types. + using variant_type = typename impl::create_device_variant_type::type; + + /** + * @brief Default construct the `std::variant` wrapper. + */ + device_wrapper() = default; + + /** + * @brief Construct the wrapper using the provided Kokkos::ExecutionSpace instance by forwarding its value to the underlying `std::variant`. + * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type + * @param[in] exec the Kokkos::ExecutionSpace instance + */ + template + explicit device_wrapper(ExecutionSpace &&exec) : + v_{ std::forward(exec) } { } + + /** + * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type. 
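+     * @details Usage sketch (assuming `execution_space::cuda` maps to `Kokkos::Cuda`; `std::get` throws `std::bad_variant_access` if a different space is currently active):
+     * @code
+     * Kokkos::Cuda &exec = device.get<execution_space::cuda>();
+     * @endcode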
+ * @tparam space the `execution_space` enum value + * @return the Kokkos::ExecutionSpace instance (`[[nodiscard]]`) + */ + template + [[nodiscard]] execution_space_to_kokkos_type_t &get() { + return std::get>(v_); + } + + /** + * @copydoc plssvm::kokkos::detail::device_wrapper::get + */ + template + const execution_space_to_kokkos_type_t &get() const { + return std::get>(v_); + } + + /** + * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::ExecutionSpace type. + * @return the `execution_space` enum value (`[[nodiscard]]`) + */ + [[nodiscard]] execution_space get_execution_space() const noexcept { + return detail::constexpr_available_execution_spaces()[v_.index()]; + } + + /** + * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally. + * @tparam Func the type of the function + * @param[in] func the function to invoke + */ + template + void execute(const Func &func) { + // clang-format off + std::visit([&func](auto &device) { + std::invoke(func, device); + }, v_); + // clang-format on + } + + /** + * @copydoc plssvm::kokkos::detail::device_wrapper::execute + */ + template + void execute(const Func &func) const { + // clang-format off + std::visit([&func](const auto &device) { + std::invoke(func, device); + }, v_); + // clang-format on + } + + /** + * @brief Invoke the function @p func on the active `std::variant` member using `std::visit` internally returning the result value of the function invocation. + * @tparam Func the type of the function + * @param[in] func the function to invoke + * @return the return value of function @p func (`[[nodiscard]]`) + */ + template + [[nodiscard]] auto execute_and_return(const Func &func) { + // clang-format off + return std::visit([&func](auto &device) { + return std::invoke(func, device); + }, v_); + // clang-format on + } + + /** + * @copydoc plssvm::kokkos::detail::device_wrapper::execute_and_return + */ + template + [[nodiscard]] auto execute_and_return(const Func &func) const { + // clang-format off + return std::visit([&func](const auto &device) { + return std::invoke(func, device); + }, v_); + // clang-format on + } + + /** + * @brief Compare two device wrappers for equality by comparing the wrapped `std::variant`s. + * @param[in] lhs the first device wrapper + * @param[in] rhs the second device wrapper + * @return `true` if both underlying `std::variant`s are equal, otherwise `false` (`[[nodiscard]]`) + */ + [[nodiscard]] friend bool operator==(const device_wrapper &lhs, const device_wrapper &rhs) noexcept { + return lhs.v_ == rhs.v_; + } + + /** + * @brief Compare two device wrappers for inequality by comparing the wrapped `std::variant`s. + * @param[in] lhs the first device wrapper + * @param[in] rhs the second device wrapper + * @return `true` if both underlying `std::variant`s are unequal, otherwise `false` (`[[nodiscard]]`) + */ + [[nodiscard]] friend bool operator!=(const device_wrapper &lhs, const device_wrapper &rhs) noexcept { + return !(lhs == rhs); + } + + private: + /// The wrapped `std::variant` type. + variant_type v_{}; +}; + +/** + * @brief Get a list of all available devices in the execution @p space that are supported by the @p target platform. 
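+ * @details Usage sketch (the chosen arguments are only an example):
+ * @code
+ * const auto devices = get_device_list(execution_space::cuda, target_platform::gpu_nvidia);
+ * @endcode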
+ * @param[in] space the Kokkos::ExecutionSpace to retrieve the devices from + * @param[in] target the target platform that must be supported + * @return all devices for the @p target in the Kokkos::ExecutionSpace @p space (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector get_device_list(execution_space space, target_platform target); + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_DEVICE_WRAPPER_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp new file mode 100644 index 000000000..cb328e6d3 --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/pinned_memory.hpp @@ -0,0 +1,93 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Small wrapper around RAII for registering memory as pinned memory. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_ +#pragma once + +#include "plssvm/backends/host_pinned_memory.hpp" // plssvm::detail::host_pinned_memory +#include "plssvm/matrix.hpp" // plssvm::matrix, plssvm::layout_type + +#include // std::size_t +#include // std::vector + +namespace plssvm::kokkos::detail { + +/** + * @brief A small RAII wrapper class to register/unregister pinned memory. + * @tparam T the type of the data array that should be pinned + */ +template +class [[nodiscard]] pinned_memory final : public ::plssvm::detail::host_pinned_memory { + /// The template base type of the CUDA pinned_memory class. + using base_type = ::plssvm::detail::host_pinned_memory; + + using base_type::is_pinned_; + using base_type::ptr_; + + public: + using typename base_type::value_type; + + /** + * @brief Register the memory managed by the matrix @p matr to use pinned memory. + * @tparam layout the layout type of the matrix + * @param[in] matr the memory to pin + */ + template + explicit pinned_memory(const matrix &matr) : + pinned_memory{ matr.data(), matr.size_padded() } { } + + /** + * @brief Register the memory managed by the vector @p vec to use pinned memory. + * @param[in] vec the memory to pin + */ + explicit pinned_memory(const std::vector &vec); + /** + * @brief Register the memory managed by the pointer @p ptr with @p size to use pinned memory. + * @param[in] ptr the memory to pin + * @param[in] size the number of elements in the memory region to pin (**not** bytes!) + */ + pinned_memory(const T *ptr, std::size_t size); + /** + * @brief Unregister the memory managed by this object. + */ + ~pinned_memory() override; + + /** + * @brief Must provide a memory that should be pinned. + */ + pinned_memory() = delete; + /** + * @brief Delete the copy-constructor. + */ + pinned_memory(const pinned_memory &) = delete; + /** + * @brief Delete the move-constructor. + */ + pinned_memory(pinned_memory &&) noexcept = delete; + /** + * @brief Delete the copy-assignment operator. + * @return `*this` + */ + pinned_memory &operator=(const pinned_memory &) = delete; + /** + * @brief Delete the move-assignment operator. 
+ * @return `*this` + */ + pinned_memory &operator=(pinned_memory &&) noexcept = delete; +}; + +extern template class pinned_memory; +extern template class pinned_memory; + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_PINNED_MEMORY_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp new file mode 100644 index 000000000..5b26f5e98 --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp @@ -0,0 +1,131 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implementation of a basic and minimalistic tuple class which is standard-layout conform. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::real_type + +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION + +#include // std::size_t +#include // std::is_standard_layout +#include // std::forward + +namespace plssvm::kokkos::detail { + +/* + * Empty base implementation. + */ +template +struct standard_layout_tuple; + +/** + * @brief Save the value of type @p T as scalar and the remaining values of type @p Rest recursively in another standard layout tuple. + * @tparam T the type of the value to save in this tuple + * @tparam Rest the remaining types saved in a recursive tuple + */ +template +struct standard_layout_tuple { + /// The stored value. + T value; + /// The remaining values stored in their own tuple. + standard_layout_tuple remaining; +}; + +/** + * @brief Special case for an empty tuple (recursion termination criterion). + */ +template <> +struct standard_layout_tuple<> { }; + +namespace impl { + +/** + * @brief Recursively traverse (at compile time) the tuple @p t and retrieve the value at position @p I. + * @tparam I the index of the tuple value to get + */ +template +struct get_impl { + /** + * @brief Recursively traverse (at compile time) the tuple @p t and retrieve the value at position @p I. + * @tparam Types the types in the tuple + * @param[in] t the tuple to traverse + * @return the requested value (`[[nodiscard]]`) + */ + template + KOKKOS_INLINE_FUNCTION constexpr static auto get(const standard_layout_tuple &t) { + return get_impl::get(t.remaining); + } +}; + +/** + * @brief Special case to retrieve the currently held value (recursion termination criterion). + */ +template <> +struct get_impl<0> { + /** + * @brief Get the held value from @p t. + * @tparam Types the types in the tuple + * @param[in] t the tuple to get the value from + * @return the requested value (`[[nodiscard]]`) + */ + template + KOKKOS_INLINE_FUNCTION constexpr static auto get(const standard_layout_tuple &t) { + return t.value; + } +}; + +} // namespace impl + +/** + * @brief Get the value at position @p I of the tuple @p t holding the @p Types. 
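+ * @details Usage sketch:
+ * @code
+ * constexpr auto t = make_standard_layout_tuple(42, real_type{ 1.5 });
+ * static_assert(get<0>(t) == 42);
+ * @endcode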
+ * @tparam I the position of the element in the tuple to get + * @tparam Types the types stored in the tuple + * @param[in] t the tuple + * @return the value of the tuple @p t at position @p I (`[[nodiscard]]`) + */ +template +KOKKOS_INLINE_FUNCTION constexpr auto get(const standard_layout_tuple &t) { + static_assert(I < sizeof...(Types), "Invalid standard_layout_tuple index!"); + return impl::get_impl::get(t); +} + +/** + * @brief Special case: return an empty tuple if no values have bee provided. + * @return an empty tuple (`[[nodiscard]]`) + */ +[[nodiscard]] inline constexpr standard_layout_tuple<> make_standard_layout_tuple() { + return standard_layout_tuple<>{}; +} + +/** + * @brief Create a new tuple storing the values @p arg and @p remaining. + * @tparam T the type of the first value + * @tparam Rest the types of the remaining values (if any) + * @param[in,out] arg the first value + * @param[in,out] remaining the remaining values (if any) + * @return the constructed tuple (`[[nodiscard]]`) + */ +template +[[nodiscard]] inline constexpr standard_layout_tuple make_standard_layout_tuple(T &&arg, Rest &&...remaining) { + return standard_layout_tuple{ std::forward(arg), make_standard_layout_tuple(std::forward(remaining)...) }; +} + +// sanity checks: be sure that the important use cases are indeed standard layout types! +static_assert(std::is_standard_layout_v>, "standard_layout_tuple<> has no standard layout!"); +static_assert(std::is_standard_layout_v>, "standard_layout_tuple has no standard layout!"); +static_assert(std::is_standard_layout_v>, "standard_layout_tuple has no standard layout!"); + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_STANDARD_LAYOUT_TUPLE_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/utility.hpp b/include/plssvm/backends/Kokkos/detail/utility.hpp new file mode 100644 index 000000000..9bbc9b172 --- /dev/null +++ b/include/plssvm/backends/Kokkos/detail/utility.hpp @@ -0,0 +1,103 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Utility functions for the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_ +#pragma once + +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "Kokkos_Core.hpp" // Kokkos::ExecutionSpace::fence + +#include // std::map +#include // std::string +#include // std::disjunction, std::is_same +#include // std::variant +#include // std::vector + +namespace plssvm::kokkos::detail { + +namespace impl { + +/** + * @brief Uninstantiated base type for the check whether a type @p appears in a std::variant @p Variant. + * @tparam T the type to check for inclusion + * @tparam Variant the std::variant that should include the type @p T + */ +template +struct is_type_in_variant; + +/** + * @brief Implement the inclusion check using `std::disjunction`. 
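+ * @details For example, `is_type_in_variant_v<int, std::variant<int, float>>` evaluates to `true`, while `is_type_in_variant_v<double, std::variant<int, float>>` evaluates to `false`.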
+ * @tparam T the type to check for inclusion + * @tparam Variant the std::variant that should include the type @p T + */ +template +struct is_type_in_variant> : std::disjunction...> { }; + +/** + * @copydoc plssvm::kokkos::detail::impl::is_type_in_variant + */ +template +inline constexpr bool is_type_in_variant_v = is_type_in_variant::value; + +} // namespace impl + +/** + * @brief Convert a `plssvm::detail::dim_type` to a Kokkos native one-dimensional value. + * @param[in] dims the dimensional value to convert + * @return the native one-dimensional value (`[[nodiscard]]`) + */ +[[nodiscard]] int dim_type_to_native(const ::plssvm::detail::dim_type &dims); + +/** + * @brief Return a `std::map` containing a mapping from all available target platforms to the available Kokkos::ExecutionSpace that supports said target platform. + * @details If a target platform is supported by multiple Kokkos::ExecutionSpace, the order is determined by the order as returned by `list_available_execution_spaces`. + * @return the mapping of all available target_platform <-> Kokkos::ExecutionSpace combinations (`[[nodiscard]]`) + */ +[[nodiscard]] std::map> available_target_platform_to_execution_space_mapping(); + +/** + * @brief Get the name of the device represented by the `device_wrapper` @p dev. + * @param[in] dev the device wrapper + * @return the device name (`[[nodiscard]]`) + */ +[[nodiscard]] std::string get_device_name(const device_wrapper &dev); + +/** + * @brief Wait for all kernel and/or other operations on the device wrapper in the @p dev to finish. + * @param[in] dev the device wrapper + */ +void device_synchronize(const device_wrapper &dev); + +/** + * @brief Wait for all kernel and/or other operations on the device represented by the Kokkos::ExecutionSpace @p exec to finish. + * @tparam ExecutionSpace the type of the Kokkos::ExecutionSpace + * @param[in] exec the device represented by a Kokkos::ExecutionSpace + */ +template )> +void device_synchronize(const ExecutionSpace &exec) { + exec.fence(); +} + +/** + * @brief Get the used Kokkos library version. + * @return the library version (`[[nodiscard]]`) + */ +[[nodiscard]] std::string get_kokkos_version(); + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_UTILITY_HPP_ diff --git a/include/plssvm/backends/Kokkos/exceptions.hpp b/include/plssvm/backends/Kokkos/exceptions.hpp new file mode 100644 index 000000000..60a9fc8dd --- /dev/null +++ b/include/plssvm/backends/Kokkos/exceptions.hpp @@ -0,0 +1,38 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implements custom exception classes specific to the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_ +#pragma once + +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/exceptions/source_location.hpp" // plssvm::source_location + +#include // std::string + +namespace plssvm::kokkos { + +/** + * @brief Exception type thrown if a problem with the Kokkos backend occurs. + */ +class backend_exception : public exception { + public: + /** + * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception. 
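+ * @note Illustrative usage sketch (not part of the original sources); assumes only the `std::exception` interface inherited via `plssvm::exception`:
+ * @code{.cpp}
+ * #include <iostream>
+ *
+ * try {
+ *     throw plssvm::kokkos::backend_exception{ "no suitable Kokkos execution space available" };
+ * } catch (const plssvm::exception &e) {
+ *     std::cerr << e.what() << '\n';
+ * }
+ * @endcode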
+ * @param[in] msg the exception's `what()` message
+ * @param[in] loc the exception's call site information
+ */
+    explicit backend_exception(const std::string &msg, source_location loc = source_location::current());
+};
+
+} // namespace plssvm::kokkos
+
+#endif // PLSSVM_BACKENDS_KOKKOS_EXCEPTIONS_HPP_
diff --git a/include/plssvm/backends/Kokkos/execution_space.hpp b/include/plssvm/backends/Kokkos/execution_space.hpp
new file mode 100644
index 000000000..cc9114412
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/execution_space.hpp
@@ -0,0 +1,82 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Execution space enumeration for the ExecutionSpaces in Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_
+#pragma once
+
+#include "fmt/base.h"     // fmt::formatter
+#include "fmt/ostream.h"  // fmt::ostream_formatter
+
+#include <iosfwd>  // std::ostream forward declaration
+#include <vector>  // std::vector
+
+namespace plssvm::kokkos {
+
+/**
+ * @brief Enum class for all execution spaces supported by [Kokkos](https://github.com/kokkos/kokkos).
+ */
+enum class execution_space {
+    /** Automatically determine the used Kokkos execution space. Note: this does not necessarily correspond to Kokkos::DefaultExecutionSpace! */
+    automatic,
+    /** Execution space representing execution on a CUDA device. */
+    cuda,
+    /** Execution space representing execution on a device supported by HIP. */
+    hip,
+    /** Execution space representing execution on a device supported by SYCL. */
+    sycl,
+    /** Execution space representing execution with the HPX runtime system. */
+    hpx,
+    /** Execution space representing execution with the OpenMP runtime system. */
+    openmp,
+    /** Execution space representing execution using the target offloading feature of the OpenMP runtime system. */
+    openmp_target,
+    /** Execution space representing execution with the OpenACC runtime system. */
+    openacc,
+    /** Execution space representing parallel execution with std::threads. */
+    threads,
+    /** Execution space representing serial execution on the CPU. Should always be available. */
+    serial
+};
+
+/**
+ * @brief Output the execution @p space to the given output-stream @p out.
+ * @param[in,out] out the output-stream to write the execution space to
+ * @param[in] space the Kokkos execution space
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, execution_space space);
+
+/**
+ * @brief Use the input-stream @p in to initialize the execution @p space.
+ * @param[in,out] in input-stream to extract the execution space from
+ * @param[out] space the Kokkos execution space
+ * @return the input-stream
+ */
+std::istream &operator>>(std::istream &in, execution_space &space);
+
+/**
+ * @brief List all available Kokkos::ExecutionSpaces.
+ * @details Only Kokkos::ExecutionSpaces that were enabled during the CMake configuration are available.
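+ * @note Illustrative usage sketch (not part of the original sources); relies on the `fmt::formatter` specialization declared at the end of this header:
+ * @code{.cpp}
+ * #include "fmt/core.h"  // fmt::print
+ *
+ * for (const plssvm::kokkos::execution_space space : plssvm::kokkos::list_available_execution_spaces()) {
+ *     fmt::print("{}\n", space);
+ * }
+ * @endcode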
+ * @return the available Kokkos::ExecutionSpaces (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector list_available_execution_spaces(); + +} // namespace plssvm::kokkos + +/// @cond + +template <> +struct fmt::formatter : fmt::ostream_formatter { }; + +/// @endcond + +#endif // PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_HPP_ diff --git a/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp b/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp new file mode 100644 index 000000000..aa5e31751 --- /dev/null +++ b/include/plssvm/backends/Kokkos/execution_space_type_traits.hpp @@ -0,0 +1,238 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Execution space type traits for the ExecutionSpaces in Kokkos. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_ +#pragma once + +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space + +#include "Kokkos_Core.hpp" // Kokkos macros, Kokkos ExecutionSpace types + +namespace plssvm::kokkos { + +//***************************************************// +// execution_space_to_kokkos_type // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert an `execution_space` enum value to a Kokkos::ExecutionSpace type. + */ +template +struct execution_space_to_kokkos_type; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert an `execution_space::cuda` enum value to a `Kokkos::Cuda` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Cuda; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert an `execution_space::hip` enum value to a `Kokkos::HIP` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::HIP; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert an `execution_space::sycl` enum value to a `Kokkos::SYCL` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::SYCL; +}; +#endif + +#if defined(KOKKOS_ENABLE_HPX) +/** + * @brief Convert an `execution_space::hpx` enum value to a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Experimental::HPX; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +/** + * @brief Convert an `execution_space::openmp` enum value to a `Kokkos::OpenMP` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::OpenMP; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +/** + * @brief Convert an `execution_space::openmp_target` enum value to a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Experimental::OpenMPTarget; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENACC) +/** + * @brief Convert an `execution_space::openacc` enum value to a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type. 
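+ * @note Illustrative sketch (not part of the original sources) of how this trait family is meant to be used (requires `<type_traits>`):
+ * @code{.cpp}
+ * #if defined(KOKKOS_ENABLE_OPENACC)
+ * static_assert(std::is_same_v<execution_space_to_kokkos_type_t<execution_space::openacc>, Kokkos::Experimental::OpenACC>);
+ * #endif
+ * @endcode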
+ */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Experimental::OpenACC; +}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +/** + * @brief Convert an `execution_space::threads` enum value to a `Kokkos::Threads` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Threads; +}; +#endif + +#if defined(KOKKOS_ENABLE_SERIAL) +/** + * @brief Convert an `execution_space::serial` enum value to a `Kokkos::Serial` Kokkos::ExecutionSpace type. + */ +template <> +struct execution_space_to_kokkos_type { + using type = Kokkos::Serial; +}; +#endif + +/** + * @brief Convert the `execution_space` @p space to the corresponding Kokkos::ExecutionSpace type. + * @tparam space the enum value to convert + */ +template +using execution_space_to_kokkos_type_t = typename execution_space_to_kokkos_type::type; + +//***************************************************// +// kokkos_type_to_execution_space // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert a Kokkos::ExecutionSpace type to a `execution_space` enum value. + */ +template +struct kokkos_type_to_execution_space; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert a `Kokkos::Cuda` Kokkos::ExecutionSpace type to an `execution_space::cuda` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::cuda; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert a `Kokkos::HIP` Kokkos::ExecutionSpace type to an `execution_space::hip` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::hip; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert a `Kokkos::SYCL` Kokkos::ExecutionSpace type to an `execution_space::sycl` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::sycl; +}; +#endif + +#if defined(KOKKOS_ENABLE_HPX) +/** + * @brief Convert a `Kokkos::Experimental::HPX` Kokkos::ExecutionSpace type to an `execution_space::hpx` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::hpx; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +/** + * @brief Convert a `Kokkos::OpenMP` Kokkos::ExecutionSpace type to an `execution_space::openmp` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::openmp; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +/** + * @brief Convert a `Kokkos::Experimental::OpenMPTarget` Kokkos::ExecutionSpace type to an `execution_space::openmp_target` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::openmp_target; +}; +#endif + +#if defined(KOKKOS_ENABLE_OPENACC) +/** + * @brief Convert a `Kokkos::Experimental::OpenACC` Kokkos::ExecutionSpace type to an `execution_space::openacc` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::openacc; +}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +/** + * @brief Convert a `Kokkos::Threads` Kokkos::ExecutionSpace type to an `execution_space::threads` enum value. 
+ */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::threads; +}; +#endif + +#if defined(KOKKOS_ENABLE_SERIAL) +/** + * @brief Convert a `Kokkos::Serial` Kokkos::ExecutionSpace type to an `execution_space::serial` enum value. + */ +template <> +struct kokkos_type_to_execution_space { + constexpr static execution_space value = execution_space::serial; +}; +#endif + +/** + * @brief Convert the Kokkos::ExecutionSpace type @p ExecutionSpace to the corresponding `execution_space` enum value. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace type to convert + */ +template +inline constexpr execution_space kokkos_type_to_execution_space_v = kokkos_type_to_execution_space::value; + +} // namespace plssvm::kokkos + +#endif // PLSSVM_BACKENDS_KOKKOS_EXECUTION_SPACE_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp new file mode 100644 index 000000000..bddadac01 --- /dev/null +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp @@ -0,0 +1,450 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly performing a BLAS GEMM like matrix-matrix multiplication using the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_ +#pragma once + +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} + +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents + +#include // std::size_t + +namespace plssvm::kokkos::detail { + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_symm { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
+ * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * 
THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // determine on which side of the diagonal we are located + if (dim + threadIdx_y < global_j) { + A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + // determine on which side of the diagonal we are located + if (dim + threadIdx_y + THREAD_BLOCK_SIZE < global_j) { + A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + + B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // apply the (partial) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); 
+ const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + device_view_type A_; + device_view_type B_; + const real_type beta_; + device_view_type C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +/** + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_symm_mirror { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + num_rows_{ num_rows }, + num_rhs_{ num_rhs }, + num_mirror_rows_{ num_mirror_rows }, + device_specific_num_rows_{ device_specific_num_rows }, + row_offset_{ row_offset }, + alpha_{ alpha }, + A_{ A }, + B_{ B }, + beta_{ beta }, + C_{ C }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
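+ * @note For orientation (not part of the original sources): the flat `team.team_rank()` is decomposed into a CUDA-style 2D thread index, i.e.
+ * @code{.cpp}
+ * const auto threadIdx_x = team.team_rank() / THREAD_BLOCK_SIZE;  // current thread in block x-dimension
+ * const auto threadIdx_y = team.team_rank() % THREAD_BLOCK_SIZE;  // current thread in block y-dimension
+ * @endcode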
+ * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over the remaining features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y) + global_j]; + A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - std::size_t{ 1 }) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) / 
std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) + global_j]; + B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // apply the (remaining) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto partial_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses + if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { + C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + } + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_rows_; + const std::size_t num_rhs_; + const std::size_t num_mirror_rows_; + const std::size_t device_specific_num_rows_; + const std::size_t row_offset_; + const real_type alpha_; + device_view_type A_; + device_view_type B_; + const real_type beta_; + device_view_type C_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix addition: lhs += rhs. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_inplace_matrix_add { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
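+ * @note Illustrative sketch (not part of the original sources): ignoring the blocked thread mapping, the kernel is equivalent to
+ * @code{.cpp}
+ * for (std::size_t row = 0; row < num_rows; ++row) {      // num_rows: placeholder for the rows covered by the execution grid
+ *     for (std::size_t col = 0; col < num_cols; ++col) {
+ *         lhs[row * (num_cols + PADDING_SIZE) + col] += rhs[row * (num_cols + PADDING_SIZE) + col];
+ *     }
+ * }
+ * @endcode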
+ * @param[in] num_cols the number of columns in both matrices + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] rhs the second matrix + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_inplace_matrix_add(const std::size_t num_cols, device_view_type lhs, device_view_type rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + num_cols_{ num_cols }, + lhs_{ lhs }, + rhs_{ rhs }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // Calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j]; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + device_view_type lhs_; + device_view_type rhs_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +/** + * @brief Perform a simple inplace matrix scale: lhs *= scalar. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_inplace_matrix_scale { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
+ * @param[in] num_cols the number of columns in the matrix + * @param[in,out] lhs the first matrix (updated inplace) + * @param[in] scale the value to scale + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_inplace_matrix_scale(const std::size_t num_cols, device_view_type lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + num_cols_{ num_cols }, + lhs_{ lhs }, + scale_{ scale }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // Calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = i + static_cast(internal_i); + const auto global_j = j + static_cast(internal_j); + + lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] *= scale_; + } + } + } + + private: + /// @cond Doxygen_suppress + const std::size_t num_cols_; + device_view_type lhs_; + const real_type scale_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_ diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp new file mode 100644 index 000000000..8e42e8b41 --- /dev/null +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -0,0 +1,181 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * 
@copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for explicitly assembling the kernel matrix using the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ +#pragma once + +#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple +#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents + +#include // std::size_t + +namespace plssvm::kokkos::detail { + +/** + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `standard_layout_tuple` + */ +template +class device_kernel_assembly { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. + * @param[out] kernel_matrix_d the calculated kernel matrix + * @param[in] data_d the data points to calculate the kernel matrix from + * @param[in] num_rows the number of data points + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] q the vector used in the dimensional reduction + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_d_{ kernel_matrix_d }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + q_{ q }, + QA_cost_{ QA_cost }, + cost_{ cost }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x }, + kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward(kernel_function_parameter)...) } { + } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + if (blockIdx_x >= blockIdx_y) { + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_j = row_offset_ + j_linear + 
static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; + data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; + data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), + data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the kernel matrix (the part stored on the current device) + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + real_type temp_ij = temp[internal_i][internal_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp_ij += cost_; + } + // update the kernel matrix + kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + } + } + } + } + } + + private: + /// @cond Doxygen_suppress + device_view_type kernel_matrix_d_; + device_view_type data_d_; + const std::size_t num_rows_; + const std::size_t device_num_rows_; + const std::size_t row_offset_; + const std::size_t num_features_; + device_view_type q_; + const real_type QA_cost_; + const real_type cost_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + const detail::standard_layout_tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::kokkos::detail + +#endif // 
PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_KERNEL_MATRIX_ASSEMBLY_HPP_ diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp new file mode 100644 index 000000000..b22f69885 --- /dev/null +++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -0,0 +1,286 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Functions for implicitly assembling the kernel matrix using the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ +#pragma once + +#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple +#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add + +#include // std::size_t + +namespace plssvm::kokkos::detail { + +/** + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function + */ +template +class device_kernel_assembly_symm { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
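+ * @note Illustrative summary (not part of the original sources): per entry, the implicitly assembled kernel matrix value is
+ * @code{.cpp}
+ * A(i, j) = kernel_function(x_i, x_j) + QA_cost - q[i] - q[j] + (i == j ? cost : real_type{ 0.0 });  // never materialized in global memory
+ * @endcode
+ * which this functor immediately folds into `C += alpha * A * B`, using `Kokkos::atomic_add` to update @p C.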
+ * @param[in] alpha the scalar alpha value + * @param[in] q the vector used in the dimensional reduction + * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] num_rows the total number of data points (= total number of rows) + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] num_features the number of features per data point + * @param[in] QA_cost the scalar used in the dimensional reduction + * @param[in] cost the cost factor the diagonal is scaled with + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] num_classes the number of classes in the data set + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function + */ + device_kernel_assembly_symm(const real_type alpha, device_view_type q, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type B, device_view_type C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : + alpha_{ alpha }, + q_{ q }, + data_d_{ data_d }, + num_rows_{ num_rows }, + device_num_rows_{ device_num_rows }, + row_offset_{ row_offset }, + num_features_{ num_features }, + QA_cost_{ QA_cost }, + cost_{ cost }, + B_{ B }, + C_{ C }, + num_classes_{ num_classes }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x }, + kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward(kernel_function_parameter)...) } { } + + /** + * @brief Function call operator overload performing the actual calculation. 
+ * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + if (blockIdx_x >= blockIdx_y) { + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + + { + // create the shared memory arrays used for caching data point features + Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; + data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + 
PADDING_SIZE_sz) + global_i]; + data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), + data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + } + + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto device_global_i = i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto device_global_j = j + static_cast(internal_j); + + // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + if ((device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j)) { + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the cost on the diagonal + if (global_i == global_j) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + // calculate C += alpha * temp * B for the UPPER triangular matrix + { + // same shared memory size but with different dimensions + Kokkos::mdspan> B_cache{ data_cache_ptr, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; + Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; + B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; + C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = real_type{ 0.0 }; + C_out_cache(internal * 
THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = real_type{ 0.0 }; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE) += + temp[internal_i][internal_j] * B_cache(threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE); + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // add intermediate cached results to C + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j + static_cast(internal); + Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x)); + Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x + THREAD_BLOCK_SIZE)); + } + team.team_barrier(); // wai until all threads updated C with their values + } + } + + // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_j = row_offset_ + j + static_cast(internal_j); + + if (global_i == global_j) { + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + // calculate C += alpha * temp * B for the LOWER triangular matrix + { + // same shared memory size but with different dimensions + Kokkos::mdspan> B_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; + B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; + C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; 
class_idx < FEATURE_BLOCK_SIZE; ++class_idx) {
+ for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
+ for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
+ C_out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + threadIdx_x) +=
+ temp[internal_i][internal_j] * B_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j);
+ }
+ }
+ team.team_barrier(); // wait until all threads performed their part of the calculations
+ }
+
+ // add intermediate cached results to C
+ for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) {
+ const auto global_i = row_offset_ + i + static_cast<std::size_t>(internal);
+ Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+ Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x));
+ }
+ team.team_barrier(); // wait until all threads updated C with their values
+ }
+ }
+ }
+ }
+
+ private:
+ /// @cond Doxygen_suppress
+ const real_type alpha_;
+ device_view_type<const real_type> q_;
+ device_view_type<const real_type> data_d_;
+ const std::size_t num_rows_;
+ const std::size_t device_num_rows_;
+ const std::size_t row_offset_;
+ const std::size_t num_features_;
+ const real_type QA_cost_;
+ const real_type cost_;
+ device_view_type<const real_type> B_;
+ device_view_type<real_type> C_;
+ const std::size_t num_classes_;
+ const std::size_t grid_x_offset_;
+ const std::size_t grid_y_offset_;
+ const std::size_t grid_size_x_;
+ const detail::standard_layout_tuple<Args...> kernel_function_parameter_;
+ /// @endcond
+};
+
+} // namespace plssvm::kokkos::detail
+
+#endif // PLSSVM_BACKENDS_KOKKOS_CG_IMPLICIT_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
new file mode 100644
index 000000000..35cbe8ed1
--- /dev/null
+++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp
@@ -0,0 +1,127 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ * See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Implement the different kernel functions on the GPU using Kokkos.
+ */
+
+#ifndef PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
+#define PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_
+
+#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple
+#include "plssvm/constants.hpp" // plssvm::real_type
+#include "plssvm/detail/utility.hpp" // plssvm::detail::always_false_v
+#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type
+
+#include "Kokkos_MathematicalFunctions.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::pow, Kokkos::exp, Kokkos::tanh, Kokkos::fabs
+
+#include <cfloat> // FLT_MIN, DBL_MIN
+#include <type_traits> // std::is_same_v
+
+namespace plssvm::kokkos::detail {
+
+//***************************************************//
+// feature reductions //
+//***************************************************//
+
+/**
+ * @brief Compute the default feature reduction, i.e., a simple dot-product.
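+ * @details Accumulated over all features this yields the dot product \f$\vec{x}^T \vec{y} = \sum_{d} x_{d} \cdot y_{d}\f$; it is used for every kernel function without a specialized feature reduction (i.e., the linear, polynomial, and sigmoid kernels).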
+ * @tparam kernel_function the kernel function type
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <kernel_function_type kernel_function>
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce(const real_type val1, const real_type val2) {
+ return val1 * val2;
+}
+
+/**
+ * @brief Compute the feature reduction for the radial basis function kernel function, i.e., the squared euclidean distance.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::rbf>(const real_type val1, const real_type val2) {
+ const real_type d = val1 - val2;
+ return d * d;
+}
+
+/**
+ * @brief Compute the feature reduction for the laplacian kernel function, i.e., the Manhattan distance.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::laplacian>(const real_type val1, const real_type val2) {
+ return Kokkos::fabs(val1 - val2);
+}
+
+/**
+ * @brief Return the minimum possible floating point value for type @p T.
+ * @details Function necessary such that the `if constexpr` depends on a template parameter and, therefore, no false-positive implicit conversion warnings are reported.
+ * @tparam T the type to retrieve the minimum value
+ * @return the minimum floating point value for type @p T (`[[nodiscard]]`)
+ */
+template <typename T>
+[[nodiscard]] constexpr KOKKOS_INLINE_FUNCTION T real_type_min() {
+ if constexpr (std::is_same_v<T, float>) {
+ return FLT_MIN;
+ } else {
+ return DBL_MIN;
+ }
+}
+
+/**
+ * @brief Compute the feature reduction for the chi-squared kernel function.
+ * @note Be sure that the denominator isn't 0.0 which may be the case for padding values.
+ * @param[in] val1 the first feature value
+ * @param[in] val2 the second feature value
+ * @return the reduced value (`[[nodiscard]]`)
+ */
+template <>
+[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type feature_reduce<kernel_function_type::chi_squared>(const real_type val1, const real_type val2) {
+ const real_type d = val1 - val2;
+ return (real_type{ 1.0 } / (val1 + val2 + real_type_min<real_type>())) * d * d;
+}
+
+//***************************************************//
+// kernel functions //
+//***************************************************//
+
+/**
+ * @brief Compute the @p kernel_function using @p value and the @p params.
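+ * @details For example, given the already reduced value \f$v\f$, the radial basis function, laplacian, and chi-squared kernels evaluate to \f$\exp(-\gamma \cdot v)\f$, the polynomial kernel to \f$(\gamma \cdot v + coef0)^{degree}\f$, and the sigmoid kernel to \f$\tanh(\gamma \cdot v + coef0)\f$.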
+ * @tparam kernel_function the kernel function type + * @tparam Args the types of the potential kernel function parameters + * @param[in] value the value to apply the kernel function to + * @param[in] params the potential kernel function parameters + * @return the result value (`[[nodiscard]]`) + */ +template +[[nodiscard]] KOKKOS_INLINE_FUNCTION real_type apply_kernel_function(const real_type value, [[maybe_unused]] const detail::standard_layout_tuple params) { + if constexpr (kernel_function == kernel_function_type::linear) { + return value; + } else if constexpr (kernel_function == kernel_function_type::polynomial) { + return Kokkos::pow(detail::get<1>(params) * value + detail::get<2>(params), detail::get<0>(params)); + } else if constexpr (kernel_function == kernel_function_type::rbf) { + return Kokkos::exp(-detail::get<0>(params) * value); + } else if constexpr (kernel_function == kernel_function_type::sigmoid) { + return Kokkos::tanh(detail::get<0>(params) * value + detail::get<1>(params)); + } else if constexpr (kernel_function == kernel_function_type::laplacian) { + return Kokkos::exp(-detail::get<0>(params) * value); + } else if constexpr (kernel_function == kernel_function_type::chi_squared) { + return Kokkos::exp(-detail::get<0>(params) * value); + } else { + static_assert(::plssvm::detail::always_false_v, "Unsupported kernel function!"); + } +} + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_KERNEL_KERNEL_FUNCTIONS_HPP_ diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp new file mode 100644 index 000000000..767bfc958 --- /dev/null +++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp @@ -0,0 +1,452 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the functions used for prediction for the C-SVM using the Kokkos backend. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_ +#pragma once + +#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type + +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add + +#include // std::size_t + +namespace plssvm::kokkos::detail { + +/** + * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_w_linear { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
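+ * @details The kernel computes the weight matrix \f$w_{f,c} = \sum_{s} \alpha_{c,s} \cdot sv_{s,f}\f$ over the support vectors assigned to the current device, so that the subsequent linear prediction reduces to a single matrix-matrix multiplication.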
+ * @param[in,out] w_d the vector to speedup the linear prediction + * @param[in] alpha_d the previously learned weights + * @param[in] sv_d the support vectors + * @param[in] num_classes the number of classes + * @param[in] num_sv the number of support vectors + * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_w_linear(device_view_type w_d, device_view_type alpha_d, device_view_type sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + w_d_{ w_d }, + alpha_d_{ alpha_d }, + sv_d_{ sv_d }, + num_classes_{ num_classes }, + num_sv_{ num_sv }, + device_specific_num_sv_{ device_specific_num_sv }, + sv_offset_{ sv_offset }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + Kokkos::mdspan> data_cache_feature{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * 
THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_cache_alpha{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + data_cache_feature(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_sz) + sv + threadIdx_y]; // SoA + data_cache_alpha(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_sz) + sv + sv_offset_ + threadIdx_y]; // AoS + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += data_cache_alpha(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_feature(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_feature); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // update global array with local one + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_feature][internal_class]; + } + } + } + + private: + /// @cond Doxygen_suppress + device_view_type w_d_; + device_view_type alpha_d_; + device_view_type sv_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t device_specific_num_sv_; + const std::size_t sv_offset_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + */ +template +class device_kernel_predict_linear { + /** + * @brief The type of the used Kokkos::View. + */ + template + using device_view_type = Kokkos::View; + + public: + /** + * @brief Initialize the Kokkos kernel function object. 
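+ * @details The kernel computes \f$prediction_{p,c} = \sum_{f} pp_{p,f} \cdot w_{f,c} - rho_{c}\f$ for every predict point \f$p\f$ and class \f$c\f$.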
+ * @param[out] prediction_d the predicted values + * @param[in] w_d the vector to speedup the calculations + * @param[in] rho_d the previously learned bias + * @param[in] predict_points_d the data points to predict + * @param[in] num_classes the number of classes + * @param[in] num_predict_points the number of data points to predict + * @param[in] num_features the number of features per data point + * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used + * @param[in] grid_size_x the size of the execution grid in x-dimension + */ + device_kernel_predict_linear(device_view_type prediction_d, device_view_type w_d, device_view_type rho_d, device_view_type predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + prediction_d_{ prediction_d }, + w_d_{ w_d }, + rho_d_{ rho_d }, + predict_points_d_{ predict_points_d }, + num_classes_{ num_classes }, + num_predict_points_{ num_predict_points }, + num_features_{ num_features }, + grid_x_offset_{ grid_x_offset }, + grid_y_offset_{ grid_y_offset }, + grid_size_x_{ grid_size_x } { } + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + // create the shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, 
INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_cache_w{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; + data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; + data_cache_w(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx]; + data_cache_w(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pd][internal_class] += data_cache_w(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // update global array with local one + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + const auto global_pp_idx = pp_idx + static_cast(internal_pd); + const auto global_class_idx = class_idx + static_cast(internal_class); + + prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + } + } + } + + private: + /// @cond Doxygen_suppress + device_view_type prediction_d_; + device_view_type w_d_; + device_view_type rho_d_; + device_view_type predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + /// @endcond +}; + +/** + * @brief Predict the @p predict_points_d using the @p kernel_function. 
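+ * @details Conceptually, the kernel evaluates \f$prediction_{p,c} = \sum_{s} \alpha_{c,s} \cdot k(pp_{p}, sv_{s}) - rho_{c}\f$: it first performs the feature reduction between all predict points and support vectors, then applies the kernel function \f$k\f$, and finally accumulates the class-wise weighted sums, subtracting the bias \f$rho_{c}\f$ exactly once per class.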
+ * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel
+ * @tparam kernel_function the type of the used kernel function
+ * @tparam Args the types of the parameters necessary for the specific kernel function
+ */
+template <typename ExecutionSpace, kernel_function_type kernel_function, typename... Args>
+class device_kernel_predict {
+ /**
+ * @brief The type of the used Kokkos::View.
+ */
+ template <typename T>
+ using device_view_type = Kokkos::View<T *>;
+
+ public:
+ /**
+ * @brief Initialize the Kokkos kernel function object.
+ * @param[out] prediction_d the predicted values
+ * @param[in] alpha_d the previously learned weights
+ * @param[in] rho_d the previously learned biases
+ * @param[in] sv_d the support vectors
+ * @param[in] predict_points_d the data points to predict
+ * @param[in] num_classes the number of classes
+ * @param[in] num_sv the number of support vectors
+ * @param[in] num_predict_points the number of data points to predict
+ * @param[in] num_features the number of features per data point
+ * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
+ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
+ * @param[in] grid_size_x the size of the execution grid in x-dimension
+ * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
+ */
+ device_kernel_predict(device_view_type<real_type> prediction_d, device_view_type<const real_type> alpha_d, device_view_type<const real_type> rho_d, device_view_type<const real_type> sv_d, device_view_type<const real_type> predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) :
+ prediction_d_{ prediction_d },
+ alpha_d_{ alpha_d },
+ rho_d_{ rho_d },
+ sv_d_{ sv_d },
+ predict_points_d_{ predict_points_d },
+ num_classes_{ num_classes },
+ num_sv_{ num_sv },
+ num_predict_points_{ num_predict_points },
+ num_features_{ num_features },
+ grid_x_offset_{ grid_x_offset },
+ grid_y_offset_{ grid_y_offset },
+ grid_size_x_{ grid_size_x },
+ kernel_function_parameter_{ detail::make_standard_layout_tuple(std::forward<Args>(kernel_function_parameter)...) } { }
+
+ /**
+ * @brief Function call operator overload performing the actual calculation.
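+ * @details The index computations assume a team of `THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE` threads; the team scratch memory must be large enough to hold two `FEATURE_BLOCK_SIZE x (INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE)` caches of `real_type` values.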
+ * @param[in] team the Kokkos team representing the current point in the execution space + */ + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); + const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); + const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + + constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + + // create a thread private array used for internal caching + real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + + { + // create the shared memory arrays used for caching data point features + Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_cache_sv{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; + data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; + data_cache_sv(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + data_cache_sv(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + 
PADDING_SIZE_sz) + global_sv_idx]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv), + data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd)); + } + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + } + + // update temp using the respective kernel function + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + } + } + + { + // create the shared memory arrays used for caching data point features + Kokkos::mdspan> alpha_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const std::size_t global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + alpha_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + alpha_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y]; + out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; + } else { + out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + } + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_pd * THREAD_BLOCK_SIZE + threadIdx_x) += + temp[internal_pd][internal_sv] * alpha_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv); + } + } + team.team_barrier(); // wait until all threads performed their part of the calculations + } + + // 
add intermediate cached results to prediction_d + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + const auto global_pp_idx = pp_idx + static_cast(internal); + + Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + } + team.team_barrier(); // wait until all threads updated their part of the prediction + } + } + } + + private: + /// @cond Doxygen_suppress + device_view_type prediction_d_; + device_view_type alpha_d_; + device_view_type rho_d_; + device_view_type sv_d_; + device_view_type predict_points_d_; + const std::size_t num_classes_; + const std::size_t num_sv_; + const std::size_t num_predict_points_; + const std::size_t num_features_; + const std::size_t grid_x_offset_; + const std::size_t grid_y_offset_; + const std::size_t grid_size_x_; + const detail::standard_layout_tuple kernel_function_parameter_; + /// @endcond +}; + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_ diff --git a/include/plssvm/backends/execution_range.hpp b/include/plssvm/backends/execution_range.hpp index 3f4bae359..5be842f9a 100644 --- a/include/plssvm/backends/execution_range.hpp +++ b/include/plssvm/backends/execution_range.hpp @@ -12,6 +12,8 @@ #ifndef PLSSVM_BACKENDS_EXECUTION_RANGE_HPP_ #define PLSSVM_BACKENDS_EXECUTION_RANGE_HPP_ +#include "plssvm/backend_types.hpp" // plssvm::backend_type + #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // fmt::ostream_formatter diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index e1b47a5c8..78729691f 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -415,14 +415,14 @@ void gpu_device_ptr::swap( template void gpu_device_ptr::memset(const int pattern, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->memset(pattern, pos, this->size_padded() * sizeof(value_type)); } template void gpu_device_ptr::fill(const value_type value, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->fill(value, pos, this->size_padded()); } @@ -430,7 +430,7 @@ void gpu_device_ptr::fill( template template void gpu_device_ptr::copy_to_device(const matrix &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (data_to_copy.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) }; @@ -440,14 +440,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_device(data_to_copy, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (data_to_copy.size() < rcount) { @@ -458,7 +458,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const_host_pointer_type data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_device(data_to_copy, 0, this->size_padded()); @@ -467,7 +467,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_device_strided(const matrix &data_to_copy, const std::size_t start_row, const std::size_t num_rows) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (start_row + num_rows > data_to_copy.num_rows()) { throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) }; @@ -493,7 +493,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device_strided(const std::vector &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (width > spitch) { throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; @@ -508,7 +508,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_host(matrix &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (buffer.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) }; @@ -518,14 +518,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(std::vector &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_host(buffer, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_host(std::vector &buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (buffer.size() < rcount) { @@ -536,7 +536,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(host_pointer_type buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_host(buffer, 0, this->size_padded()); @@ -544,8 +544,8 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &target) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); - PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); this->copy_to_other_device(target, 0, this->size_padded()); } diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 4e1fd1be1..96e56d8e1 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -132,4 +132,10 @@ using namespace plssvm::PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION; /// Namespace containing the C-SVM using the SYCL backend with the preferred SYCL implementation. **Should not** directly be used by users. namespace plssvm::sycl::detail { } +/// Namespace containing the C-SVM using the Kokkos backend. +namespace plssvm::kokkos { } + +/// Namespace containing Kokkos backend specific implementation details. **Should not** directly be used by users. +namespace plssvm::kokkos::detail { } + #endif // PLSSVM_CORE_HPP_ diff --git a/include/plssvm/csvm_factory.hpp b/include/plssvm/csvm_factory.hpp index a1272a5e0..fb7760221 100644 --- a/include/plssvm/csvm_factory.hpp +++ b/include/plssvm/csvm_factory.hpp @@ -48,6 +48,9 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" // plssvm::adaptivecpp::csvm, plssvm::csvm_backend_exists_v #endif #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + #include "plssvm/backends/Kokkos/csvm.hpp" // plssvm::kokkos::csvm, plssvm::csvm_backend_exists_v +#endif #include "fmt/format.h" // fmt::format #include "igor/igor.hpp" // igor::parser, igor::has_unnamed_arguments @@ -143,6 +146,8 @@ template return make_csvm_default_impl(std::forward(args)...); case backend_type::sycl: return make_csvm_sycl_impl(std::forward(args)...); + case backend_type::kokkos: + return make_csvm_default_impl(std::forward(args)...); } throw unsupported_backend_exception{ "Unrecognized backend provided!" 
}; } diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp index 2b96416ae..4ba2e1a65 100644 --- a/include/plssvm/detail/cmd/parser_predict.hpp +++ b/include/plssvm/detail/cmd/parser_predict.hpp @@ -14,6 +14,7 @@ #pragma once #include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform @@ -37,7 +38,7 @@ struct parser_predict { */ parser_predict(int argc, char **argv); - /// The used backend: automatic (depending on the specified target_platforms), OpenMP, HPX, stdpar, CUDA, HIP, OpenCL, or SYCL. + /// The used backend: automatic (depending on the specified target_platforms), OpenMP, HPX, stdpar, CUDA, HIP, OpenCL, SYCL, or Kokkos. backend_type backend{ backend_type::automatic }; /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; @@ -45,6 +46,9 @@ struct parser_predict { /// The SYCL implementation to use with `--backend sycl`. sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; + /// The Kokkos execution space to use with --backend=kokkos. + kokkos::execution_space kokkos_execution_space{ kokkos::execution_space::automatic }; + /// `true` if `std::string` should be used as label type instead of the default type `ìnt`. bool strings_as_labels{ false }; diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp index a723fa82e..73897249a 100644 --- a/include/plssvm/detail/cmd/parser_train.hpp +++ b/include/plssvm/detail/cmd/parser_train.hpp @@ -14,6 +14,7 @@ #pragma once #include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/classification_types.hpp" // plssvm::classification_type @@ -53,7 +54,7 @@ struct parser_train { /// The multi-class classification strategy used. classification_type classification{ classification_type::oaa }; - /// The used backend: automatic (depending on the specified target_platforms), OpenMP, HPX, stdpar, CUDA, HIP, OpenCL, or SYCL. + /// The used backend: automatic (depending on the specified target_platforms), OpenMP, HPX, stdpar, CUDA, HIP, OpenCL, SYCL, or Kokkos. backend_type backend{ backend_type::automatic }; /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; @@ -65,6 +66,9 @@ struct parser_train { /// The SYCL implementation to use with --backend=sycl. sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; + /// The Kokkos execution space to use with --backend=kokkos. + kokkos::execution_space kokkos_execution_space{ kokkos::execution_space::automatic }; + /// `true` if `std::string` should be used as label type instead of the default type `ìnt`. 
bool strings_as_labels{ false }; diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp index 69a6dab24..cddb3f31c 100644 --- a/include/plssvm/environment.hpp +++ b/include/plssvm/environment.hpp @@ -20,21 +20,27 @@ #include "plssvm/detail/utility.hpp" // plssvm::detail::{contains, unreachable} #include "plssvm/exceptions/exceptions.hpp" // plssvm::environment_exception -#include "fmt/base.h" // fmt::formatter -#include "fmt/ostream.h" // fmt::ostream_formatter -#include "fmt/ranges.h" // fmt::join - -#include // std::ios::failbit -#include // std::istream -#include // std::ostream -#include // std::string -#include // std::vector - #if defined(PLSSVM_HAS_HPX_BACKEND) #include // ::hpx::post #include // ::hpx::{start, stop, finalize} #include // ::hpx::{is_running, is_stopped} #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + #include "Kokkos_Core.hpp" // Kokkos::is_initialized, Kokkos::is_finalized, Kokkos::initialize, Kokkos::finalize +#endif + +#include "fmt/base.h" // fmt::formatter +#include "fmt/format.h" // fmt::format +#include "fmt/ostream.h" // fmt::ostream_formatter +#include "fmt/ranges.h" // fmt::join + +#include // std::remove_if +#include // std::ios::failbit +#include // std::istream +#include // std::ostream +#include // std::string +#include // std::move +#include // std::vector namespace plssvm::environment { @@ -161,6 +167,14 @@ template return detail::determine_status_from_initialized_finalized_functions<::hpx::is_running, ::hpx::is_stopped>(); #else return status::unnecessary; +#endif + } + case backend_type::kokkos: + { +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + return detail::determine_status_from_initialized_finalized_functions(); +#else + return status::unnecessary; #endif } } @@ -176,7 +190,7 @@ template constexpr bool is_initialization_necessary([[maybe_unused]] const backend_type backend) { // Note: must be implemented for the backends that need environmental setup // currently false for all available backends - return false; + return backend == backend_type::hpx || backend == backend_type::kokkos; } //****************************************************************************// @@ -198,6 +212,11 @@ inline void initialize_backend([[maybe_unused]] const backend_type backend) { ::hpx::start(nullptr, 0, nullptr); } #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + if (backend == backend_type::kokkos) { + Kokkos::initialize(); + } +#endif } /** @@ -215,6 +234,11 @@ inline void initialize_backend([[maybe_unused]] const backend_type backend, [[ma ::hpx::start(nullptr, argc, argv); } #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + if (backend == backend_type::kokkos) { + Kokkos::initialize(argc, argv); + } +#endif } /** @@ -231,6 +255,11 @@ inline void finalize_backend([[maybe_unused]] const backend_type backend) { ::hpx::stop(); } #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + if (backend == backend_type::kokkos) { + Kokkos::finalize(); + } +#endif } /** @@ -430,7 +459,8 @@ inline std::vector finalize() { class [[nodiscard]] scope_guard { public: /** - * @copydoc initialize() + * @brief Initialize all **available** backends. + * @details Only initializes backends that are currently uninitialized. */ scope_guard() { backends_ = initialize(); @@ -445,7 +475,10 @@ class [[nodiscard]] scope_guard { } /** - * @copydoc initialize(int &, char **) + * @brief Initialize all **available** backends. + * @details Only initializes backends that are currently uninitialized. 
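+ * A typical (purely illustrative) usage in a `main` function; the backends initialized by the guard are finalized again when it goes out of scope:
+ * @code
+ * int main(int argc, char **argv) {
+ *     const plssvm::environment::scope_guard guard{ argc, argv };
+ *     // ... use PLSSVM ...
+ * }
+ * @endcode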
+ * @param[in,out] argc the number of provided command line arguments + * @param[in,out] argv the provided command line arguments */ scope_guard(int &argc, char **argv) { backends_ = initialize(argc, argv); diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index 4e51b90d7..516c66386 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -56,6 +56,8 @@ IGOR_MAKE_NAMED_ARGUMENT(classification); IGOR_MAKE_NAMED_ARGUMENT(sycl_implementation_type); /// Create a named argument for the SYCL backend specific kernel invocation type. IGOR_MAKE_NAMED_ARGUMENT(sycl_kernel_invocation_type); +/// Create a named argument for the Kokkos backend specific execution space. +IGOR_MAKE_NAMED_ARGUMENT(kokkos_execution_space); /// @endcond @@ -73,6 +75,12 @@ constexpr bool has_only_parameter_named_args_v = !igor::has_other_than( template constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type); +/** + * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including Kokkos specific named-parameters. + */ +template +constexpr bool has_only_kokkos_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::kokkos_execution_space); + } // namespace detail /** @@ -185,7 +193,7 @@ struct parameter { // compile time check: each named parameter must only be passed once static_assert(!parser.has_duplicates(), "Can only use each named parameter once!"); // compile time check: only some named parameters are allowed - static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type), + static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type, plssvm::kokkos_execution_space), "An illegal named parameter has been passed!"); // shorthand function for emitting a warning if a provided parameter is not used by the current kernel function diff --git a/src/main_predict.cpp b/src/main_predict.cpp index ff28028c8..3d47ad53f 100644 --- a/src/main_predict.cpp +++ b/src/main_predict.cpp @@ -15,6 +15,7 @@ #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE, // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HWS_ENTRY // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/utility.hpp" // PLSSVM_IS_DEFINED #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED) @@ -32,6 +33,7 @@ #include // std::ofstream #include // std::mem_fn #include // std::cerr, std::endl +#include // std::unique_ptr, std::make_unique #include // std::pair #include // std::visit #include // std::vector @@ -74,20 +76,31 @@ int main(int argc, char *argv[]) { // check whether SYCL is used as backend (it is either requested directly or as automatic backend) const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && 
plssvm::determine_default_backend() == plssvm::backend_type::sycl) }; - // check whether HPX is used as backend (it is either requested directly or as automatic backend) const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) }; + // check whether Kokkos is used as backend (it is either requested directly or as automatic backend) + const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) }; // initialize environments if necessary std::vector backends_to_initialize{}; if (use_hpx_as_backend) { backends_to_initialize.push_back(plssvm::backend_type::hpx); } + if (use_kokkos_as_backend) { + backends_to_initialize.push_back(plssvm::backend_type::kokkos); + } environment_guard = std::make_unique(backends_to_initialize); // create default csvm - const std::unique_ptr svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type) - : plssvm::make_csvm(cmd_parser.backend, cmd_parser.target); + const std::unique_ptr svm = [&]() { + if (use_sycl_as_backend) { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type); + } else if (use_kokkos_as_backend) { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); + } else { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target); + } + }(); // create model const plssvm::model model{ cmd_parser.model_filename }; diff --git a/src/main_train.cpp b/src/main_train.cpp index 32ac09d71..2e2a39905 100644 --- a/src/main_train.cpp +++ b/src/main_train.cpp @@ -14,6 +14,7 @@ #include "plssvm/detail/logging.hpp" // plssvm::detail::log #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE, // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HWS_ENTRY, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SET_REFERENCE_TIME +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/utility.hpp" // PLSSVM_IS_DEFINED #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED) @@ -27,7 +28,7 @@ #include // std::exception #include // std::mem_fn #include // std::cerr, std::endl -#include // std::unique_ptr +#include // std::unique_ptr, std::make_unique #include // std::remove_reference_t #include // std::pair #include // std::visit @@ -71,20 +72,31 @@ int main(int argc, char *argv[]) { // check whether SYCL is used as backend (it is either requested directly or as automatic backend) const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) }; - // check whether HPX is used as backend (it is either requested directly or as automatic backend) const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) }; + // check whether Kokkos is used as backend (it is either requested directly or as automatic backend) + const bool 
use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) }; // initialize environments if necessary std::vector backends_to_initialize{}; if (use_hpx_as_backend) { backends_to_initialize.push_back(plssvm::backend_type::hpx); } + if (use_kokkos_as_backend) { + backends_to_initialize.push_back(plssvm::backend_type::kokkos); + } environment_guard = std::make_unique(backends_to_initialize); // create SVM - const std::unique_ptr svm = use_sycl_as_backend ? plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type) - : plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params); + const std::unique_ptr svm = [&]() { + if (use_sycl_as_backend) { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); + } else if (use_kokkos_as_backend) { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); + } else { + return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params); + } + }(); // only specify plssvm::max_iter if it isn't its default value const plssvm::model model = diff --git a/src/plssvm/backend_types.cpp b/src/plssvm/backend_types.cpp index 34789a764..a1021e7dd 100644 --- a/src/plssvm/backend_types.cpp +++ b/src/plssvm/backend_types.cpp @@ -51,6 +51,9 @@ std::vector list_available_backends() { #if defined(PLSSVM_HAS_SYCL_BACKEND) available_backends.push_back(backend_type::sycl); #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + available_backends.push_back(backend_type::kokkos); +#endif // automatic is ALWAYS available but AT LEAST ONE other backend must be available in addition PLSSVM_ASSERT(available_backends.size() > 1, "Besides \"automatic\" at least one other backend must be available!"); @@ -62,10 +65,10 @@ backend_type determine_default_backend(const std::vector &availabl // the decision order based on empiric findings using decision_order_type = std::pair>; const std::array decision_order = { - decision_order_type{ target_platform::gpu_nvidia, { backend_type::cuda, backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::stdpar } }, - decision_order_type{ target_platform::gpu_amd, { backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::stdpar } }, - decision_order_type{ target_platform::gpu_intel, { backend_type::sycl, backend_type::opencl, backend_type::stdpar } }, - decision_order_type{ target_platform::cpu, { backend_type::sycl, backend_type::opencl, backend_type::openmp, backend_type::hpx, backend_type::stdpar } } + decision_order_type{ target_platform::gpu_nvidia, { backend_type::cuda, backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::kokkos, backend_type::stdpar } }, + decision_order_type{ target_platform::gpu_amd, { backend_type::hip, backend_type::opencl, backend_type::sycl, backend_type::kokkos, backend_type::stdpar } }, + decision_order_type{ target_platform::gpu_intel, { backend_type::sycl, backend_type::opencl, backend_type::kokkos, backend_type::stdpar } }, + 
decision_order_type{ target_platform::cpu, { backend_type::sycl, backend_type::kokkos, backend_type::opencl, backend_type::openmp, backend_type::hpx, backend_type::stdpar } } }; // return the default backend based on the previously defined decision order @@ -101,6 +104,8 @@ std::ostream &operator<<(std::ostream &out, const backend_type backend) { return out << "opencl"; case backend_type::sycl: return out << "sycl"; + case backend_type::kokkos: + return out << "kokkos"; } return out << "unknown"; } @@ -126,6 +131,8 @@ std::istream &operator>>(std::istream &in, backend_type &backend) { backend = backend_type::opencl; } else if (str == "sycl") { backend = backend_type::sycl; + } else if (str == "kokkos") { + backend = backend_type::kokkos; } else { in.setstate(std::ios::failbit); } diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt new file mode 100644 index 000000000..bf37122f2 --- /dev/null +++ b/src/plssvm/backends/Kokkos/CMakeLists.txt @@ -0,0 +1,182 @@ +## Authors: Alexander Van Craen, Marcel Breyer +## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved +## License: This file is part of the PLSSVM project which is released under the MIT license. +## See the LICENSE.md file in the project root for full license information. +######################################################################################################################## + +list(APPEND CMAKE_MESSAGE_INDENT "Kokkos: ") + +# check if Kokkos can be enabled +message(CHECK_START "Checking for Kokkos backend") + +find_package(Kokkos) + +if (NOT Kokkos_FOUND) + message(CHECK_FAIL "not found") + if (PLSSVM_ENABLE_KOKKOS_BACKEND MATCHES "ON") + message(SEND_ERROR "Cannot find requested backend: Kokkos!") + endif () + return() +endif () +message(CHECK_PASS "found") + +# explicitly set sources +set(PLSSVM_KOKKOS_SOURCES + ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/device_wrapper.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp + ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp + ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp +) + +# set target properties +set_local_and_parent(PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME plssvm-Kokkos) +add_library(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} SHARED ${PLSSVM_KOKKOS_SOURCES}) +target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC Kokkos::kokkos) + +if (Kokkos_ENABLE_SYCL) + # set SYCL (icpx) specific compilation flags + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM") + message(FATAL_ERROR "For Kokkos::SYCL to work, the compiler must be IntelLLVM, but is ${CMAKE_CXX_COMPILER}!") + endif () + + # set icpx specific compiler flags based on the provided PLSSVM_TARGET_PLATFORMS + set(PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "") + # cpu targets + if (DEFINED PLSSVM_CPU_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "spir64_x86_64") + endif () + # nvidia targets + if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "nvptx64-nvidia-cuda") + endif () + # amd targets + if (DEFINED PLSSVM_AMD_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "amdgcn-amd-amdhsa") + # add target specific flags for AOT -> must always be specified von amd targets + if (NOT PLSSVM_NUM_AMD_TARGET_ARCHS EQUAL 1) + message(SEND_ERROR "IntelLLVM currently only supports a single AMD architecture specification but ${PLSSVM_NUM_AMD_TARGET_ARCHS} 
were provided!") + endif () + target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${PLSSVM_AMD_TARGET_ARCHS}) + endif () + # intel targets + if (DEFINED PLSSVM_INTEL_TARGET_ARCHS) + # assemble -fsycl-targets + list(APPEND PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "spir64_gen") + endif () + # set -fsycl-targets + list(JOIN PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS "," PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING) + target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -sycl-std=2020 -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING}) + target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -fsycl -fsycl-targets=${PLSSVM_KOKKOS_SYCL_FSYCL_TARGETS_STRING}) + + # add option for IntelLLVM Ahead-of-Time (AOT) compilation + option(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT "Enables Ahead-of-Time compilation for the Kokkos::SYCL execution space using IntelLLVM." ON) + if (PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT) + message(STATUS "Enabled Ahead-of-Time (AOT) compilation for the Kokkos::SYCL execution space using IntelLLVM.") + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT) + target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT) + ## set AOT compiler flags + # cpu targets + if (DEFINED PLSSVM_CPU_TARGET_ARCHS) + # add target specific flags for AOT + if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_x86_64 "-march=${PLSSVM_CPU_TARGET_ARCHS}") + endif () + endif () + # nvidia targets + if (DEFINED PLSSVM_NVIDIA_TARGET_ARCHS) + # add target specific flags for AOT + if (NOT PLSSVM_NUM_NVIDIA_TARGET_ARCHS EQUAL 1) + message(SEND_ERROR "IntelLLVM currently only supports a single NVIDIA architecture specification for AOT but ${PLSSVM_NUM_NVIDIA_TARGET_ARCHS} were provided!") + endif () + target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS}) + target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=${PLSSVM_NVIDIA_TARGET_ARCHS}) + endif () + # intel targets + if (DEFINED PLSSVM_INTEL_TARGET_ARCHS) + # add target specific flags for AOT + list(JOIN PLSSVM_INTEL_TARGET_ARCHS "," PLSSVM_INTEL_TARGET_ARCHS_STRING) + target_compile_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") + target_link_options(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PRIVATE -Xsycl-target-backend=spir64_gen "-device ${PLSSVM_INTEL_TARGET_ARCHS_STRING}") + endif () + endif () +endif () + +# link base library against Kokkos library +target_link_libraries(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC ${PLSSVM_BASE_LIBRARY_NAME}) + +# set compile definition that the Kokkos backend is available +target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PRIVATE PLSSVM_HAS_KOKKOS_BACKEND) +target_compile_definitions(${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME} PUBLIC PLSSVM_HAS_KOKKOS_BACKEND) + +# link against 
interface library +target_link_libraries(${PLSSVM_ALL_LIBRARY_NAME} INTERFACE ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + +# mark backend library as install target +append_local_and_parent(PLSSVM_TARGETS_TO_INSTALL ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + +# assemble Kokkos available execution space string +# also set compile definitions -> can't use KOKKOS_ENABLE_* directly inside the "constexpr_available_execution_space.hpp" +# header since we can't include "Kokkos_Core.hpp" there (transitively used in the base library that doesn't know anything about Kokkos +set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "") +if (Kokkos_ENABLE_CUDA) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Cuda") +endif () +if (Kokkos_ENABLE_HIP) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_HIP) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HIP") +endif () +if (Kokkos_ENABLE_SYCL) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "SYCL") +endif () +if (Kokkos_ENABLE_HPX) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "HPX") +endif () +if (Kokkos_ENABLE_OPENMP) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMP) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMP") +endif () +if (Kokkos_ENABLE_OPENMPTARGET) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENMPTARGET) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenMPTarget") +endif () +if (Kokkos_ENABLE_OPENACC) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_OPENACC) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "OpenACC") +endif () +if (Kokkos_ENABLE_THREADS) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_THREADS) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Threads") +endif () +if (Kokkos_ENABLE_SERIAL) + target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_ENABLE_SERIAL) + list(APPEND PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "Serial") +endif () +set(PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}" PARENT_SCOPE) + +# also set the number of available Kokkos execution spaces to explicitly set the type of the used std::array +# -> necessary if NO Kokkos execution space is available and, therefore, the size of the std::array would be 0 (can't automatically be deduced) +list(LENGTH PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES) +target_compile_definitions(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES=${PLSSVM_KOKKOS_BACKEND_NUM_AVAILABLE_EXECUTION_SPACES}) + +# generate summary string +set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER " - Kokkos (${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}):") +include(${PROJECT_SOURCE_DIR}/cmake/assemble_summary_string.cmake) +assemble_summary_string(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS) +# do not print any special target architecture information +string(REPLACE " 
(${PLSSVM_CPU_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_NVIDIA_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_AMD_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}") +string(REPLACE " (${PLSSVM_INTEL_TARGET_ARCHS})" "" PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}") +set(PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING "${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_COMPILER}${PLSSVM_KOKKOS_BACKEND_SUMMARY_STRING_ARCHS}" PARENT_SCOPE) + +list(POP_BACK CMAKE_MESSAGE_INDENT) \ No newline at end of file diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp new file mode 100644 index 000000000..603a5216c --- /dev/null +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -0,0 +1,732 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "plssvm/backends/Kokkos/csvm.hpp" + +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::{execution_range, dim_type} +#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp" // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_*, PLSSVM_KOKKOS_BACKEND_INVOKE_IF_ +#include "plssvm/backends/Kokkos/detail/device_ptr.hpp" // plssvm::kokkos::detail::device_ptr +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::{device_wrapper, get_device_list} +#include "plssvm/backends/Kokkos/detail/utility.hpp" // plssvm::kokkos::detail::{available_target_platform_to_execution_space_mapping, get_kokkos_version, dim_type_to_native, get_device_name, device_synchronize} +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{execution_space, list_available_execution_spaces} +#include "plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp" // plssvm::kokkos::detail::{device_kernel_symm, device_kernel_symm_mirror, device_kernel_inplace_matrix_add, device_kernel_inplace_matrix_scale} +#include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly +#include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm +#include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution +#include "plssvm/detail/logging.hpp" // plssvm::detail::log +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size +#include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry +#include "plssvm/detail/type_traits.hpp" // plssvm::detail::remove_cvref_t +#include "plssvm/detail/utility.hpp" // plssvm::detail::{get_system_memory, unreachable} +#include 
"plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "Kokkos_Core.hpp" // Kokkos::TeamPolicy, Kokkos::ParallelForTag, Kokkos::parallel_for, Kokkos::PerTeam + // Kokkos::Experimental::HPX::impl_max_hardware_threads, Kokkos::OpenMP::impl_max_hardware_threads, Kokkos::Threads::impl_max_hardware_threads + +#include "fmt/core.h" // fmt::format +#include "fmt/format.h" // fmt::format + +#include // std::sqrt +#include // std::size_t +#include // std::terminate +#include // std::cout, std::endl +#include // std::numeric_limits::max +#include // std::map +#include // std::string +#include // std::move +#include // std::vector + +// a dummy class used as functor to the team_size_max function +template +struct dummy { + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type &) const { } +}; + +namespace plssvm::kokkos { + +csvm::csvm(parameter params) : + csvm{ plssvm::target_platform::automatic, params } { } + +csvm::csvm(target_platform target, parameter params) : + base_type{ params } { + this->init(target); +} + +void csvm::init(const target_platform target) { + // check whether the requested target platform has been enabled + switch (target) { + case target_platform::automatic: + break; + case target_platform::cpu: +#if !defined(PLSSVM_HAS_CPU_TARGET) + throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) }; +#endif + break; + case target_platform::gpu_nvidia: +#if !defined(PLSSVM_HAS_NVIDIA_TARGET) + throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) }; +#endif + break; + case target_platform::gpu_amd: +#if !defined(PLSSVM_HAS_AMD_TARGET) + throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) }; +#endif + break; + case target_platform::gpu_intel: +#if !defined(PLSSVM_HAS_INTEL_TARGET) + throw backend_exception{ fmt::format("Requested target platform '{}' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!", target) }; +#endif + break; + } + + // check whether the requested execution space is available + if (!::plssvm::detail::contains(list_available_execution_spaces(), space_)) { + throw backend_exception{ fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space_, fmt::join(list_available_execution_spaces(), ", ")) }; + } + + // get all available target_platform <-> Kokkos::ExecutionSpace combinations + const std::map> available_combinations = detail::available_target_platform_to_execution_space_mapping(); + + // check whether the provided execution space is the automatic one + if (space_ == execution_space::automatic) { + // automatically determine the execution space and potentially automatically determine the target platform + if (target == target_platform::automatic) { + bool found_combination{ false }; + // go through all combinations and choose the first execution space in order: gpu_nvidia -> gpu_amd -> gpu_intel -> cpu + for (const target_platform target_order : list_available_target_platforms()) { + if (::plssvm::detail::contains(available_combinations, target_order)) { + // the target 
platform is supported -> choose the first execution space to use in the Kokkos backend + space_ = available_combinations.at(target_order).front(); + target_ = target_order; + found_combination = true; + break; + } + } + // check whether a valid combination could be found + if (!found_combination) { + throw backend_exception{ fmt::format("Couldn't find a valid Kokkos::ExecutionSpace ({}) and target_platform ({}) combination!", fmt::join(list_available_execution_spaces(), ", "), fmt::join(list_available_target_platforms(), ", ")) }; + } + } else { + // check whether the provided target platform is compatible with the currently available Kokkos::ExecutionSpaces + if (::plssvm::detail::contains(available_combinations, target)) { + // the target platform is supported -> choose the first execution space to use in the Kokkos backend + space_ = available_combinations.at(target).front(); + target_ = target; + } else { + // the provided target platform is unsupported -> throw an exception + throw backend_exception{ fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform {}!", fmt::join(list_available_execution_spaces(), ", "), target) }; + } + } + + // output what we use as automatic Kokkos execution space + plssvm::detail::log(verbosity_level::full, + "\nUsing {} as automatic Kokkos::ExecutionSpace.", + space_); + } else { + // execution space explicitly provided and potentially automatically determine the target platform + if (target == target_platform::automatic) { + bool found_combination{ false }; + // go through all combinations (gpu_nvidia -> gpu_amd -> gpu_intel -> cpu) and check whether the requested execution space supports that target platform + for (const target_platform target_order : list_available_target_platforms()) { + if (::plssvm::detail::contains(available_combinations, target_order) && ::plssvm::detail::contains(available_combinations.at(target_order), space_)) { + // the provided execution space supports the target platform + target_ = target_order; + found_combination = true; + break; + } + } + // check whether a valid combination could be found + if (!found_combination) { + throw backend_exception{ fmt::format("Couldn't find a valid target_platform for the Kokkos::ExecutionSpace {}!", space_) }; + } + } else { + if (::plssvm::detail::contains(available_combinations, target) && ::plssvm::detail::contains(available_combinations.at(target), space_)) { + // update target + target_ = target; + } else { + // the provided execution space and target platform combination is unsupported + throw backend_exception{ fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform {}!", space_, target) }; + } + } + } + + // At this point, space_ may NEVER be execution_space::automatic! + PLSSVM_ASSERT(space_ != execution_space::automatic, "At this point, the Kokkos execution space must be determined and must NOT be automatic!"); + PLSSVM_ASSERT(target_ != target_platform::automatic, "At this point, the target platform must be determined and must NOT be automatic!"); + + // Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC currently not supported! 
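The execution-space resolution implemented here is what a library user interacts with through the new plssvm::kokkos_execution_space named parameter and the extended plssvm::environment handling. The following stand-alone sketch shows that caller-side view; it is illustrative only and not part of the diff: it assumes the public umbrella header is named "plssvm/core.hpp" and that the named parameter accepts a plssvm::kokkos::execution_space enumerator (the type forwarded from cmd_parser in main_train.cpp/main_predict.cpp).

#include "plssvm/core.hpp"                             // assumed umbrella header: make_csvm, backend_type, target_platform, environment::scope_guard
#include "plssvm/backends/Kokkos/execution_space.hpp"  // plssvm::kokkos::execution_space

int main() {
    // RAII environment handling: initializes every backend for which
    // is_initialization_necessary() returns true (now HPX and Kokkos) and
    // finalizes them on destruction, mirroring environment_guard in main_train.cpp
    const plssvm::environment::scope_guard guard{};

    // explicitly request the Kokkos backend with the CUDA execution space;
    // omitting the named parameter (or passing execution_space::automatic)
    // lets csvm::init() above pick a space matching the target platform
    const auto svm = plssvm::make_csvm(plssvm::backend_type::kokkos,
                                       plssvm::target_platform::gpu_nvidia,
                                       plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda);

    // an unavailable execution space or an incompatible space/target combination
    // throws plssvm::kokkos::backend_exception (see the checks in init() above)
    return 0;
}

Design-wise, the two-step resolution (target platform first, then the first execution space registered for it) keeps the automatic path deterministic while still allowing the explicit named parameter to override it.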
+ if (space_ == execution_space::openmp_target || space_ == execution_space::openacc) { + throw backend_exception{ fmt::format("The Kokkos execution space {} is currently not supported!", space_) }; + } + + plssvm::detail::log(verbosity_level::full, + "\nUsing Kokkos ({}) as backend with the Kokkos::ExecutionSpace {}.\n", + plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_version", detail::get_kokkos_version() }, + plssvm::detail::tracking::tracking_entry{ "dependencies", "kokkos_default_execution_space", space_ }); + + // output automatic target platform information + if (target == target_platform::automatic) { + plssvm::detail::log(verbosity_level::full, + "Using {} as automatic target platform.\n", + target_); + } + + // get all available devices wrt the requested target platform + devices_ = detail::get_device_list(space_, target_); + + // throw exception if no devices in the current execution space could be found + if (devices_.empty()) { + throw backend_exception{ fmt::format("No devices found for the Kokkos execution space {} with the target platform {}!", space_, target_) }; + } + + // print found Kokkos devices + plssvm::detail::log(verbosity_level::full, + "Found {} Kokkos device(s) for the target platform {}:\n", + plssvm::detail::tracking::tracking_entry{ "backend", "num_devices", devices_.size() }, + plssvm::detail::tracking::tracking_entry{ "backend", "target_platform", target_ }); + + std::vector device_names{}; + device_names.reserve(devices_.size()); + for (typename std::vector::size_type device = 0; device < devices_.size(); ++device) { + const std::string device_name = detail::get_device_name(devices_[device]); + plssvm::detail::log(verbosity_level::full, + " [{}, {}]\n", + device, + device_name); + device_names.emplace_back(device_name); + } + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "backend", "device", device_names })); + plssvm::detail::log(verbosity_level::full | verbosity_level::timing, + "\n"); +} + +csvm::~csvm() { + try { + // be sure that all operations on the CUDA devices have finished before destruction + for (const queue_type &device : devices_) { + detail::device_synchronize(device); + } + } catch (const plssvm::exception &e) { + std::cout << e.what_with_loc() << std::endl; + std::terminate(); + } +} + +std::vector<::plssvm::detail::memory_size> csvm::get_device_memory() const { + PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!"); + + std::vector<::plssvm::detail::memory_size> device_memory(this->num_available_devices()); + switch (space_) { + case execution_space::automatic: + throw backend_exception{ "Unsupported execution_space::automatic provided!" 
}; + case execution_space::cuda: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() { + for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) { + device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast(devices_[device_id].get().cuda_device_prop().totalGlobalMem) }; + } + }); + break; + case execution_space::hip: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() { + for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) { + device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast(devices_[device_id].get().hip_device_prop().totalGlobalMem) }; + } + }); + break; + case execution_space::sycl: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() { + for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) { + device_memory[device_id] = ::plssvm::detail::memory_size{ static_cast(devices_[device_id].get().sycl_queue().get_device().get_info<::sycl::info::device::global_mem_size>()) }; + } + }); + break; + case execution_space::hpx: + case execution_space::openmp: + case execution_space::threads: + case execution_space::serial: + // NOTE: for these execution spaces, this->num_available_devices will always return 1 + PLSSVM_ASSERT(this->num_available_devices() == 1, "The host side Kokkos execution spaces should always only be represented using a single device!"); + device_memory[0] = ::plssvm::detail::get_system_memory(); + break; + // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC + case execution_space::openmp_target: + case execution_space::openacc: + throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) }; + } + return device_memory; +} + +std::vector<::plssvm::detail::memory_size> csvm::get_max_mem_alloc_size() const { + PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!"); + + std::vector<::plssvm::detail::memory_size> max_mem_alloc_size(this->num_available_devices()); + switch (space_) { + case execution_space::automatic: + throw backend_exception{ "Unsupported execution_space::automatic provided!" 
}; + case execution_space::cuda: + case execution_space::hip: + max_mem_alloc_size = this->get_device_memory(); + break; + case execution_space::sycl: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() { + for (std::size_t device_id = 0; device_id < this->num_available_devices(); ++device_id) { + max_mem_alloc_size[device_id] = ::plssvm::detail::memory_size{ static_cast(devices_[device_id].get().sycl_queue().get_device().get_info<::sycl::info::device::max_mem_alloc_size>()) }; + } + }); + break; + case execution_space::hpx: + case execution_space::openmp: + case execution_space::threads: + case execution_space::serial: + max_mem_alloc_size = this->get_device_memory(); + break; + // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC + case execution_space::openmp_target: + case execution_space::openacc: + throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) }; + } + return max_mem_alloc_size; +} + +std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { + PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id); + PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!"); + + // NOTE: the maximum theoretical work-group size, may be additionally limited by the amount of used scratch memory + return devices_[device_id].execute_and_return([](const auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + // NOTE: CUDA + HIP + SYCL: returns the maximum possible number of threads, due to no further limitations in the dummy functor (like, e.g., scratch memory) + // NOTE: HPX + Serial: hardcoded to 1 + // NOTE: OpenMP: should be 1-2; most likely 1 + // NOTE: Threads: should be equal to number of hardware threads IF hwloc is enabled; otherwise 1 + // NOTE: OpenMPTarget: hardcoded to 256 + // NOTE: OpenACC: hardcoded to 512 + + // NOTE: the functor types doesn't matter -> the dummy class + return Kokkos::TeamPolicy{}.team_size_max(dummy{}, Kokkos::ParallelForTag{}); + }); +} + +::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::size_t device_id) const { + PLSSVM_ASSERT(device_id < this->num_available_devices(), "Invalid device {} requested!", device_id); + PLSSVM_ASSERT(space_ != execution_space::automatic, "The automatic execution_space may not be provided to this function!"); + + // NOTE: Kokkos only supports one-dimensional execution ranges! + // NOTE: we only use two-dimensional kernels! + switch (space_) { + case execution_space::automatic: + throw backend_exception{ "Unsupported execution_space::automatic provided!" 
}; + case execution_space::cuda: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA(([&]() -> ::plssvm::detail::dim_type { + const cudaDeviceProp &prop = devices_[device_id].get().cuda_device_prop(); + const auto max_grid_size = static_cast(std::sqrt(prop.maxGridSize[0])); + return { max_grid_size, max_grid_size, 1ull }; + })); + case execution_space::hip: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP(([&]() -> ::plssvm::detail::dim_type { + const hipDeviceProp_t &prop = devices_[device_id].get().hip_device_prop(); + const auto max_grid_size = static_cast(std::sqrt(prop.maxGridSize[0])); + return { max_grid_size, max_grid_size, 1ull }; + })); + case execution_space::sycl: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL(([&]() -> ::plssvm::detail::dim_type { + // TODO: replace with standardized function if there will be one in the future +#if defined(SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY) + const ::sycl::id<3> native_range = devices_[device_id].get().sycl_queue().get_device().get_info<::sycl::ext::oneapi::experimental::info::device::max_work_groups<3>>(); +#else + // fallback to maximum theoretical value, may break at runtime! + ::sycl::id<3> native_range{}; + const std::size_t max_int32 = std::numeric_limits::max(); + const std::size_t max_uint16 = std::numeric_limits::max(); + if (target_ == target_platform::cpu) { + native_range = ::sycl::id<3>{ max_int32, max_int32, max_int32 }; + } else { + native_range = ::sycl::id<3>{ max_int32, max_uint16, max_uint16 }; + } +#endif + // note: account for SYCL's different iteration range! + return { native_range[2], native_range[1], native_range[0] }; + })); + case execution_space::hpx: + case execution_space::openmp: + case execution_space::threads: + case execution_space::serial: + return { std::numeric_limits::max(), std::numeric_limits::max(), 1ull }; + case execution_space::openmp_target: + case execution_space::openacc: + // TODO: implement for Kokkos::Experimental::OpenMPTarget and Kokkos::Experimental::OpenACC + throw backend_exception{ fmt::format("Currently not implemented for the execution space: {}!", space_) }; + } + // all possible cases should be handled by the previous switch + // -> silence missing return statement compiler warnings due to throw statement + ::plssvm::detail::unreachable(); +} + +//***************************************************// +// fit // +//***************************************************// + +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { + const unsigned long long num_rows_reduced = data_d.shape().x - 1; + const unsigned long long num_features = data_d.shape().y; + + // calculate the number of data points this device is responsible for + const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); + + // get the offset of the data points this device is responsible for + const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); + + // calculate the number of matrix entries + const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); + const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); + + device_ptr_type kernel_matrix_d{ num_entries_padded, devices_[device_id] }; // only explicitly store the upper 
triangular matrix + const real_type cost_factor = real_type{ 1.0 } / params.cost; + const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + return devices_[device_id].execute_and_return([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + switch (params.kernel_type) { + case kernel_function_type::linear: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x }); + } + break; + case kernel_function_type::polynomial: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, params.degree, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::rbf: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::sigmoid: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::laplacian: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::chi_squared: + { + using functor_type = detail::device_kernel_assembly; + Kokkos::parallel_for("assemble_kernel_matrix_explicit_chi_squared", team_policy.set_scratch_size(0, 
Kokkos::PerTeam(scratch_memory_size)), functor_type{ kernel_matrix_d.get().get(), data_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get().get(), QA_cost, cost_factor, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + } + } + detail::device_synchronize(device); + + return std::move(kernel_matrix_d); + }); +} + +void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const { + const unsigned long long num_rhs = B_d.shape().x; + const unsigned long long num_rows = B_d.shape().y; + + devices_[device_id].execute([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + // calculate the number of data points this device is responsible for + const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); + // get the offset of the data points this device is responsible for + const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); + // the necessary amount of scratch memory for the kernels + const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + Kokkos::parallel_for("blas_level_3_kernel_explicit", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm{ num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get().get(), B_d.get().get(), beta, C_d.get().get(), offsets.x, offsets.y, partial_grid.x }); + } + + // save the team size + const int mirror_team_size = detail::dim_type_to_native(mirror_exec.block); + + for (const auto &[partial_grid, offsets] : mirror_exec.grids) { + const unsigned long long num_mirror_rows = num_rows - row_offset - device_specific_num_rows; + + if (num_mirror_rows > 0) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, mirror_team_size }; + + Kokkos::parallel_for("blas_level_3_kernel_explicit_mirror", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_symm_mirror{ num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get().get(), B_d.get().get(), beta, C_d.get().get(), offsets.x, offsets.y, partial_grid.x }); + } + } + detail::device_synchronize(device); + }); +} + +void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const device_ptr_type &rhs_d) const { + const unsigned long long num_rhs = lhs_d.shape().x; + + devices_[device_id].execute([&](auto &device) { + using 
kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + const Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + Kokkos::parallel_for("inplace_matrix_addition", team_policy, detail::device_kernel_inplace_matrix_add{ num_rhs, lhs_d.get().get(), rhs_d.get().get(), offsets.x, offsets.y, partial_grid.x }); + } + detail::device_synchronize(device); + }); +} + +void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, device_ptr_type &lhs_d, const real_type scale) const { + const unsigned long long num_rhs = lhs_d.shape().x; + + devices_[device_id].execute([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + const Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + Kokkos::parallel_for("inplace_matrix_scale", team_policy, detail::device_kernel_inplace_matrix_scale{ num_rhs, lhs_d.get().get(), scale, offsets.x, offsets.y, partial_grid.x }); + } + detail::device_synchronize(device); + }); +} + +void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const real_type alpha, const device_ptr_type &A_d, const parameter ¶ms, const device_ptr_type &q_red, const real_type QA_cost, const device_ptr_type &B_d, device_ptr_type &C_d) const { + const unsigned long long num_rows_reduced = A_d.shape().x - 1; + const unsigned long long num_features = A_d.shape().y; + const unsigned long long num_classes = B_d.shape().x; + + devices_[device_id].execute([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + // calculate the number of data points this device is responsible for + const unsigned long long device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); + // get the offset of the data points this device is responsible for + const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); + + const real_type cost_factor = real_type{ 1.0 } / params.cost; + const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + switch 
(params.kernel_type) { + case kernel_function_type::linear: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x }); + } + break; + case kernel_function_type::polynomial: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x, params.degree, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::rbf: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::sigmoid: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::laplacian: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::chi_squared: + { + using functor_type = detail::device_kernel_assembly_symm; + Kokkos::parallel_for("assemble_kernel_matrix_implicit_blas_level_3_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ alpha, q_red.get().get(), A_d.get().get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get().get(), C_d.get().get(), num_classes, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + } + } + detail::device_synchronize(device); + }); +} + +//***************************************************// +// predict, score // +//***************************************************// + +auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const device_ptr_type &alpha_d, const 
device_ptr_type &sv_d) const -> device_ptr_type { + const unsigned long long num_classes = alpha_d.shape().x; + const unsigned long long num_sv = alpha_d.shape().y; + const unsigned long long device_specific_num_sv = sv_d.shape().x; + const unsigned long long num_features = sv_d.shape().y; + + // get the offset of the data points this device is responsible for + const unsigned long long sv_offset = data_distribution_->place_row_offset(device_id); + + device_ptr_type w_d{ shape{ num_classes, num_features }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] }; + + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + return devices_[device_id].execute_and_return([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + Kokkos::parallel_for("w_kernel", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), detail::device_kernel_w_linear{ w_d.get().get(), alpha_d.get().get(), sv_d.get().get(), num_classes, num_sv, device_specific_num_sv, sv_offset, offsets.x, offsets.y, partial_grid.x }); + } + detail::device_synchronize(device); + + return std::move(w_d); + }); +} + +auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &alpha_d, const device_ptr_type &rho_d, const device_ptr_type &sv_or_w_d, const device_ptr_type &predict_points_d) const -> device_ptr_type { + const unsigned long long num_classes = alpha_d.shape().x; + const unsigned long long num_predict_points = predict_points_d.shape().x; // = device_specific_num_rows + const unsigned long long num_features = predict_points_d.shape().y; + const unsigned long long num_sv = sv_or_w_d.shape().x; + + device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] }; + + const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + + // save the team size + const int team_size = detail::dim_type_to_native(exec.block); + + return devices_[device_id].execute_and_return([&](auto &device) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + constexpr execution_space space = kokkos_type_to_execution_space_v; + + for (const auto &[partial_grid, offsets] : exec.grids) { + // convert execution range partial_grid to Kokkos' native one-dimensional size + const int native_partial_grid = detail::dim_type_to_native(partial_grid); + + // create a Kokkos TeamPolicy + Kokkos::TeamPolicy team_policy{ device, native_partial_grid, team_size }; + + switch (params.kernel_type) { + case kernel_function_type::linear: + { + using functor_type = detail::device_kernel_predict_linear; + Kokkos::parallel_for("predict_kernel_linear", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), sv_or_w_d.get().get(), rho_d.get().get(), 
predict_points_d.get().get(), num_classes, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x }); + } + break; + case kernel_function_type::polynomial: + { + using functor_type = detail::device_kernel_predict; + Kokkos::parallel_for("predict_kernel_polynomial", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), alpha_d.get().get(), rho_d.get().get(), sv_or_w_d.get().get(), predict_points_d.get().get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, params.degree, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::rbf: + { + using functor_type = detail::device_kernel_predict; + Kokkos::parallel_for("predict_kernel_rbf", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), alpha_d.get().get(), rho_d.get().get(), sv_or_w_d.get().get(), predict_points_d.get().get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::sigmoid: + { + using functor_type = detail::device_kernel_predict; + Kokkos::parallel_for("predict_kernel_sigmoid", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), alpha_d.get().get(), rho_d.get().get(), sv_or_w_d.get().get(), predict_points_d.get().get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma), params.coef0 }); + } + break; + case kernel_function_type::laplacian: + { + using functor_type = detail::device_kernel_predict; + Kokkos::parallel_for("predict_kernel_laplacian", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), alpha_d.get().get(), rho_d.get().get(), sv_or_w_d.get().get(), predict_points_d.get().get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + case kernel_function_type::chi_squared: + { + using functor_type = detail::device_kernel_predict; + Kokkos::parallel_for("predict_kernel_chi_squared", team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_memory_size)), functor_type{ out_d.get().get(), alpha_d.get().get(), rho_d.get().get(), sv_or_w_d.get().get(), predict_points_d.get().get(), num_classes, num_sv, num_predict_points, num_features, offsets.x, offsets.y, partial_grid.x, std::get(params.gamma) }); + } + break; + } + } + detail::device_synchronize(device); + + return std::move(out_d); + }); +} + +} // namespace plssvm::kokkos diff --git a/src/plssvm/backends/Kokkos/detail/device_ptr.cpp b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp new file mode 100644 index 000000000..0dfe9adc0 --- /dev/null +++ b/src/plssvm/backends/Kokkos/detail/device_ptr.cpp @@ -0,0 +1,186 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
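All of the kernel launches above share one pattern: a named Kokkos::parallel_for over a Kokkos::TeamPolicy whose level-0 per-team scratch size is set via set_scratch_size(0, Kokkos::PerTeam(...)), with the functor pulling its shared buffer out of team.team_scratch(0). A minimal, self-contained sketch of that pattern (all names here are illustrative and not part of the patch):

#include <Kokkos_Core.hpp>
#include <cstddef>

// toy functor: every team publishes its league rank through a per-team scratch buffer
struct team_scratch_example {
    using member_type = Kokkos::TeamPolicy<>::member_type;
    using scratch_view = Kokkos::View<double *, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryUnmanaged>;

    Kokkos::View<double *> out_;

    KOKKOS_FUNCTION void operator()(const member_type &team) const {
        // one double of level-0 scratch memory per team
        scratch_view cache{ team.team_scratch(0), 1 };
        if (team.team_rank() == 0) {
            cache(0) = static_cast<double>(team.league_rank());
        }
        team.team_barrier();
        if (team.team_rank() == 0) {
            out_(team.league_rank()) = cache(0);
        }
    }
};

int main(int argc, char **argv) {
    Kokkos::initialize(argc, argv);
    {
        const int league_size = 4;
        Kokkos::View<double *> out{ "out", static_cast<std::size_t>(league_size) };
        // request the scratch bytes needed for one double per team
        const std::size_t scratch_bytes = team_scratch_example::scratch_view::shmem_size(1);
        Kokkos::TeamPolicy<> policy{ league_size, Kokkos::AUTO };
        Kokkos::parallel_for("team_scratch_example",
                             policy.set_scratch_size(0, Kokkos::PerTeam(scratch_bytes)),
                             team_scratch_example{ out });
        Kokkos::fence();
    }
    Kokkos::finalize();
    return 0;
}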
+ */ + +#include "plssvm/backends/Kokkos/detail/device_ptr.hpp" + +#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp" // plssvm::kokkos::detail::{device_view_wrapper, make_device_view_wrapper} +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/detail/utility.hpp" // plssvm::detail::device_synchronize +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/type_traits.hpp" // plssvm::detail::remove_cvref_t +#include "plssvm/shape.hpp" // plssvm::shape + +#include "Kokkos_Core.hpp" // Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged, Kokkos::subview, Kokkos::parallel_for, Kokkos::deep_copy + +#include "fmt/core.h" // fmt::format + +#include // std::min +#include // std::size_t +#include // std::memcpy +#include // std::make_pair +#include // std::vector + +namespace plssvm::kokkos::detail { + +/** + * @brief Typedef for a simple Kokkos::View always targeting the Kokkos::HostSpace. + * @tparam T the type of the view's data + */ +template +using host_view_type = Kokkos::View; + +template +device_ptr::device_ptr(const size_type size, const device_wrapper &device) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } + +template +device_ptr::device_ptr(const plssvm::shape shape, const device_wrapper &device) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } + +template +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const device_wrapper &device) : + base_type{ shape, padding, device } { + data_ = make_device_view_wrapper(device, this->size_padded()); + this->memset(0); +} + +template +void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + + if (pos >= this->size_padded()) { + throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; + } + const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); + + data_.execute([&](const auto &data) { + queue_.execute([&](const auto &exec) { + using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; + + // create view of the device data cast to unsigned char + const Kokkos::View view{ reinterpret_cast(data.data() + pos), rnum_bytes }; + // fill the view with the pattern -> acts like a memset + Kokkos::deep_copy(exec, view, static_cast(pattern)); + }); + }); + + detail::device_synchronize(queue_); +} + +template +void device_ptr::fill(const value_type value, const size_type pos, const size_type count) { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); + + if (pos >= this->size_padded()) { + throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) }; + } + const size_type rcount = std::min(count, this->size_padded() - pos); + + data_.execute([&](const auto &data) { + // create subview of the device data + auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount)); + queue_.execute([&](const auto &exec) { + // fill subview with constant data + Kokkos::deep_copy(exec, data_subview, value); + }); + }); + + detail::device_synchronize(queue_); +} + +template +void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); + + const size_type rcount = std::min(count, this->size_padded() - pos); + + data_.execute([&](const auto &data) { + // create view of the host data + const host_view_type host_view{ data_to_copy, rcount }; + // create subview of the device data + auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount)); + queue_.execute([&](const auto &exec) { + // fill subview with constant data + Kokkos::deep_copy(exec, data_subview, host_view); + }); + }); + + detail::device_synchronize(queue_); +} + +template +void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); + + if (width > spitch) { + throw backend_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; + } + + // TODO: strided copy to device in Kokkos currently not possible + if (spitch == width) { + // can use normal copy since we have no line strides + this->copy_to_device(data_to_copy, 0, width * height); + } else { + std::vector temp(this->shape_padded().x * height, value_type{ 0.0 }); + value_type *pos = temp.data(); + for (std::size_t row = 0; row < height; ++row) { + std::memcpy(pos, data_to_copy + row * spitch, width * sizeof(value_type)); + pos += this->shape_padded().x; + } + this->copy_to_device(temp); + } + + detail::device_synchronize(queue_); +} + +template +void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); + PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); + + const size_type rcount = std::min(count, this->size_padded() - pos); + + data_.execute([&](const auto &data) { + // create view of the host data + const host_view_type host_view{ buffer, rcount }; + // create subview of the device data + auto data_subview = Kokkos::subview(data, std::make_pair(pos, pos + rcount)); + queue_.execute([&](const auto &exec) { + // fill subview with constant data + Kokkos::deep_copy(exec, host_view, data_subview); + }); + }); + + detail::device_synchronize(queue_); +} + +template +void device_ptr::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const { + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); + + const size_type rcount = std::min(count, this->size_padded() - pos); + if (target.size_padded() < rcount) { + throw backend_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", rcount, target.size_padded()) }; + } + + // TODO: use Kokkos function? + std::vector temp(rcount); + this->copy_to_host(temp, pos, rcount); + target.copy_to_device(temp); + + detail::device_synchronize(queue_); +} + +template class device_ptr; +template class device_ptr; + +} // namespace plssvm::kokkos::detail diff --git a/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp new file mode 100644 index 000000000..35dd6c2e9 --- /dev/null +++ b/src/plssvm/backends/Kokkos/detail/device_wrapper.cpp @@ -0,0 +1,148 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
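memset, fill, and the host-device copies above are all expressed as Kokkos::deep_copy calls on (sub)views tied to the device's execution-space instance, followed by a fence. A stripped-down, standalone sketch of the same idea (illustrative only):

#include <Kokkos_Core.hpp>
#include <cstddef>
#include <utility>
#include <vector>

int main(int argc, char **argv) {
    Kokkos::initialize(argc, argv);
    {
        Kokkos::DefaultExecutionSpace exec{};
        Kokkos::View<double *> device_data{ "device_data", 16 };

        // fill a sub-range with a constant -> the memset/fill pattern
        auto sub = Kokkos::subview(device_data, std::make_pair(std::size_t{ 4 }, std::size_t{ 12 }));
        Kokkos::deep_copy(exec, sub, 1.5);

        // copy host data into the same sub-range through an unmanaged host view -> the copy_to_device pattern
        const std::vector<double> host(8, 2.5);
        const Kokkos::View<const double *, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> host_view{ host.data(), host.size() };
        Kokkos::deep_copy(exec, sub, host_view);

        // make the asynchronous copies visible -> the device_synchronize pattern
        exec.fence();
    }
    Kokkos::finalize();
    return 0;
}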
+ */ + +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" + +#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp" // PLSSVM_KOKKOS_BACKEND_INVOKE_IF_* +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/logging_without_performance_tracking.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/string_utility.hpp" // plssvm::detail::as_lower_case +#include "plssvm/detail/utility.hpp" // plssvm::detail::contains +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "Kokkos_Core.hpp" // Kokkos::num_devices, Kokkos::ExecutionSpace + +#include // std::vector + +#if defined(KOKKOS_ENABLE_CUDA) + #define PLSSVM_CUDA_ERROR_CHECK(err) \ + if ((err) != cudaSuccess) { \ + throw plssvm::kokkos::backend_exception{ fmt::format("Kokkos::Cuda assert '{}': {}", cudaGetErrorName(err), cudaGetErrorString(err)) }; \ + } +#endif + +#if defined(KOKKOS_ENABLE_HIP) + #define PLSSVM_HIP_ERROR_CHECK(err) \ + if ((err) != hipSuccess) { \ + throw plssvm::kokkos::backend_exception{ fmt::format("HIP assert '{}': {}", hipGetErrorName(err), hipGetErrorString(err)) }; \ + } +#endif + +namespace plssvm::kokkos::detail { + +std::vector get_device_list(const execution_space space, [[maybe_unused]] const target_platform target) { + PLSSVM_ASSERT(space != execution_space::automatic, "The automatic execution_space may not be provided to this function!"); + + std::vector devices{}; + switch (space) { + case execution_space::automatic: + throw backend_exception{ "Unsupported execution_space::automatic provided!" 
}; + case execution_space::cuda: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_CUDA([&]() { + for (int device = 0; device < Kokkos::num_devices(); ++device) { + // create CUDA stream using the CUDA specific functions + PLSSVM_CUDA_ERROR_CHECK(cudaSetDevice(device)); + cudaStream_t stream{}; + PLSSVM_CUDA_ERROR_CHECK(cudaStreamCreate(&stream)); + // create Kokkos execution space for the specific device + // Note: it is important to pass the cudaStream_t lifetime to be managed by Kokkos + devices.emplace_back(Kokkos::Cuda(stream, Kokkos::Impl::ManageStream::yes)); + } + }); + break; + case execution_space::hip: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() { + for (int device = 0; device < Kokkos::num_devices(); ++device) { + // HIP CUDA stream using the HIP specific functions + PLSSVM_HIP_ERROR_CHECK(hipSetDevice(device)); + hipStream_t stream{}; + PLSSVM_HIP_ERROR_CHECK(hipStreamCreate(&stream)); + // create Kokkos execution space for the specific device + // Note: it is important to pass the hipStream_t lifetime to be managed by Kokkos + devices.emplace_back(Kokkos::HIP(stream, Kokkos::Impl::ManageStream::yes)); + } + }); + break; + case execution_space::sycl: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL(([&]() { + // all user provided sycl::queues must be in-order queues + ::sycl::property_list props{ ::sycl::property::queue::in_order{} }; + + for (const auto &platform : ::sycl::platform::get_platforms()) { + for (const auto &device : platform.get_devices()) { + // Note: Kokkos is IntelLLVM/DPC++/icpx only + if (device.is_cpu() && target == target_platform::cpu) { + devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } }); + } else if (device.is_gpu()) { + // the current device is a GPU + // get vendor string and convert it to all lower case + const std::string vendor_string = ::plssvm::detail::as_lower_case(device.get_info<::sycl::info::device::vendor>()); + // get platform name of current GPU device and convert it to all lower case + const std::string platform_string = ::plssvm::detail::as_lower_case(platform.get_info<::sycl::info::platform::name>()); + + // check vendor string and insert to correct target platform + if (::plssvm::detail::contains(vendor_string, "nvidia") && target == target_platform::gpu_nvidia) { + devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } }); + } else if ((::plssvm::detail::contains(vendor_string, "amd") || ::plssvm::detail::contains(vendor_string, "advanced micro devices")) && target == target_platform::gpu_amd) { + devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } }); + } else if (::plssvm::detail::contains(vendor_string, "intel") && target == target_platform::gpu_intel) { + devices.emplace_back(Kokkos::SYCL{ ::sycl::queue{ device, props } }); + } + } + } + } + })); + break; + case execution_space::hpx: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HPX([&]() { + devices.emplace_back(Kokkos::Experimental::HPX{}); + }); + break; + case execution_space::openmp: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMP([&]() { + // Note: if OpenMP should be used as device must be set in order for it to work! 
+ if (omp_get_nested() == 0) { + ::plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: In order for Kokkos::OpenMP to work properly, we have to set \"omp_set_nested(1)\"!\n"); + // enable OMP_NESTED support + // Note: function is officially deprecated but still necessary for Kokkos::OpenMP to work properly + omp_set_nested(1); + } + devices.emplace_back(Kokkos::OpenMP{}); + }); + break; + case execution_space::openmp_target: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENMPTARGET([&]() { + // TODO: implement multi-GPU support? + devices.emplace_back(Kokkos::Experimental::OpenMPTarget{}); + }); + break; + case execution_space::openacc: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_OPENACC([&]() { + // TODO: implement multi-GPU support? + devices.emplace_back(Kokkos::Experimental::OpenACC{}); + }); + break; + case execution_space::threads: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_THREADS([&]() { + devices.emplace_back(Kokkos::Threads{}); + }); + break; + case execution_space::serial: + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SERIAL([&]() { + devices.emplace_back(Kokkos::Serial{}); + }); + break; + } + return devices; +} + +} // namespace plssvm::kokkos::detail diff --git a/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp new file mode 100644 index 000000000..919cbdaa1 --- /dev/null +++ b/src/plssvm/backends/Kokkos/detail/pinned_memory.cpp @@ -0,0 +1,46 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp" + +#include "plssvm/backends/host_pinned_memory.hpp" // plssvm::detail::host_pinned_memory +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception + +#include // std::size_t +#include // std::terminate +#include // std::cerr, std::endl +#include // std::vector + +namespace plssvm::kokkos::detail { + +template +pinned_memory::pinned_memory(const std::vector &vec) : + pinned_memory{ vec.data(), vec.size() } { } + +template +pinned_memory::pinned_memory(const T *ptr, const std::size_t size) : + ::plssvm::detail::host_pinned_memory{ ptr } { + this->pin_memory(size * sizeof(T)); +} + +template +pinned_memory::~pinned_memory() { + try { + if (is_pinned_ && ptr_ != nullptr) { + this->unpin_memory(); + } + } catch (const plssvm::exception &e) { + std::cerr << e.what_with_loc() << std::endl; + std::terminate(); + } +} + +template class pinned_memory; +template class pinned_memory; + +} // namespace plssvm::kokkos::detail diff --git a/src/plssvm/backends/Kokkos/detail/utility.cpp b/src/plssvm/backends/Kokkos/detail/utility.cpp new file mode 100644 index 000000000..5dc3f8cda --- /dev/null +++ b/src/plssvm/backends/Kokkos/detail/utility.cpp @@ -0,0 +1,168 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
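get_device_list above builds one execution-space instance per physical device, e.g. by giving every GPU its own CUDA stream and wrapping it in a Kokkos::Cuda instance. A simplified sketch of that idea, assuming a CUDA-enabled Kokkos build and using the public non-owning Kokkos::Cuda(cudaStream_t) constructor (the patch instead hands stream ownership to Kokkos via Kokkos::Impl::ManageStream::yes and checks every CUDA call):

#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>
#include <vector>

int main(int argc, char **argv) {
    Kokkos::initialize(argc, argv);
    {
        std::vector<Kokkos::Cuda> devices{};
        for (int device = 0; device < Kokkos::num_devices(); ++device) {
            // error checking (see the PLSSVM_CUDA_ERROR_CHECK macro above) omitted for brevity
            cudaSetDevice(device);
            cudaStream_t stream{};
            cudaStreamCreate(&stream);
            // non-owning Kokkos::Cuda instance bound to this device's stream
            devices.emplace_back(stream);
        }
        for (Kokkos::Cuda &exec : devices) {
            // per-device synchronization, as used by device_synchronize
            exec.fence();
        }
        // note: with the non-owning constructor the created streams must be destroyed manually
    }
    Kokkos::finalize();
    return 0;
}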
+ */ + +#include "plssvm/backends/Kokkos/detail/utility.hpp" + +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/Kokkos/detail/conditional_execution.hpp" // PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_* +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/string_utility.hpp" // plssvm::detail::as_lower_case +#include "plssvm/detail/utility.hpp" // plssvm::detail::contains +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "Kokkos_Core.hpp" // Kokkos::ExecutionSpace, Kokkos::Impl::ManageStream +#include "Kokkos_Macros.hpp" // Kokkos macros + +#include "fmt/core.h" // fmt::format + +#include // std::map +#include // std::string +#include // std::unordered_set +#include // std::vector + +namespace plssvm::kokkos::detail { + +int dim_type_to_native(const ::plssvm::detail::dim_type &dims) { + return static_cast(dims.x * dims.y * dims.z); +} + +std::map> available_target_platform_to_execution_space_mapping() { + std::map> available_map{}; + + // iterate over all available execution spaces + for (const execution_space space : list_available_execution_spaces()) { + switch (space) { + case execution_space::automatic: + // nothing to do here + break; + case execution_space::cuda: + // NVIDIA GPUs only + available_map[target_platform::gpu_nvidia].push_back(execution_space::cuda); + break; + case execution_space::hip: + // NVIDIA or AMD GPUs possible (both simultaneously are unsupported) + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_HIP([&]() { +#if defined(__HIP_PLATFORM_AMD__) + available_map[target_platform::gpu_amd].push_back(execution_space::hip); +#elif defined(__HIP_PLATFORM_NVIDIA__) + available_map[target_platform::gpu_nvidia].push_back(execution_space::hip); +#endif + }); + break; + case execution_space::sycl: + // list all potential target platforms currently available in SYCL + PLSSVM_KOKKOS_BACKEND_INVOKE_IF_SYCL([&]() { + std::unordered_set targets{}; + for (const auto &platform : ::sycl::platform::get_platforms()) { + for (const auto &device : platform.get_devices()) { + // Note: Kokkos is Intel LLVM/DPC++/icpx only + if (device.is_cpu()) { + targets.insert(target_platform::cpu); + } else if (device.is_gpu()) { + // the current device is a GPU + // get vendor string and convert it to all lower case + const std::string vendor_string = ::plssvm::detail::as_lower_case(device.get_info<::sycl::info::device::vendor>()); + // get platform name of current GPU device and convert it to all lower case + const std::string platform_string = ::plssvm::detail::as_lower_case(platform.get_info<::sycl::info::platform::name>()); + + // check vendor string and insert to correct target platform + if (::plssvm::detail::contains(vendor_string, "nvidia")) { + targets.insert(target_platform::gpu_nvidia); + } else if (::plssvm::detail::contains(vendor_string, "amd") || ::plssvm::detail::contains(vendor_string, "advanced micro devices")) { + targets.insert(target_platform::gpu_amd); + } else if (::plssvm::detail::contains(vendor_string, "intel")) { + targets.insert(target_platform::gpu_intel); + } + } + } + } + // now we know which target platforms are available in SYCL -> add them to our mapping + for (const target_platform target : targets) { + available_map[target].push_back(execution_space::sycl); + } + }); + break; + case 
execution_space::openacc: + // TODO: restrict to available devices + // all GPUs and CPU possible + available_map[target_platform::gpu_nvidia].push_back(execution_space::sycl); + available_map[target_platform::gpu_amd].push_back(execution_space::sycl); + available_map[target_platform::gpu_intel].push_back(execution_space::sycl); + available_map[target_platform::cpu].push_back(execution_space::sycl); + break; + case execution_space::openmp_target: + // TODO: restrict to available devices + // all GPUs + available_map[target_platform::gpu_nvidia].push_back(execution_space::openmp_target); + available_map[target_platform::gpu_amd].push_back(execution_space::openmp_target); + available_map[target_platform::gpu_intel].push_back(execution_space::openmp_target); + break; + case execution_space::hpx: + case execution_space::openmp: + case execution_space::threads: + case execution_space::serial: + // all these execution spaces are CPU only + available_map[target_platform::cpu].push_back(space); + break; + } + } + + // the map must at least have one entry + PLSSVM_ASSERT(!available_map.empty(), "At least one target platform must be available!"); + // the automatic target platform must not be present + PLSSVM_ASSERT(!::plssvm::detail::contains(available_map, target_platform::automatic), "The automatic target platform may not be present!"); + + return available_map; +} + +std::string get_device_name([[maybe_unused]] const device_wrapper &dev) { + switch (dev.get_execution_space()) { + case execution_space::automatic: + throw backend_exception{ "Unsupported execution_space::automatic provided!" }; + case execution_space::cuda: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_CUDA([&]() { + return std::string{ dev.get().cuda_device_prop().name }; + }); + case execution_space::hip: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_HIP([&]() { + return std::string{ dev.get().hip_device_prop().name }; + }); + case execution_space::sycl: + PLSSVM_KOKKOS_BACKEND_INVOKE_RETURN_IF_SYCL([&]() { + return dev.get().sycl_queue().get_device().get_info<::sycl::info::device::name>(); + }); + case execution_space::hpx: + return "HPX CPU host device"; + case execution_space::openmp: + return "OpenMP CPU host device"; + case execution_space::openmp_target: + return "OpenMP target device"; + case execution_space::openacc: + return "OpenACC target device"; + case execution_space::threads: + return "std::threads CPU host device"; + case execution_space::serial: + return "serial CPU host device"; + } + return "unknown"; +} + +void device_synchronize(const device_wrapper &dev) { + dev.execute([](const auto &device) { + device.fence(); + }); +} + +std::string get_kokkos_version() { + // get the Kokkos version + return fmt::format("{}.{}.{}", KOKKOS_VERSION_MAJOR, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH); +} + +} // namespace plssvm::kokkos::detail diff --git a/src/plssvm/backends/Kokkos/exceptions.cpp b/src/plssvm/backends/Kokkos/exceptions.cpp new file mode 100644 index 000000000..4186e4008 --- /dev/null +++ b/src/plssvm/backends/Kokkos/exceptions.cpp @@ -0,0 +1,21 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. 
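The remaining helpers above reduce to a handful of plain Kokkos queries: get_device_name reads a backend-specific device property, device_synchronize fences the instance, and get_kokkos_version stitches together the KOKKOS_VERSION_* macros. A tiny standalone sketch of the portable subset:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char **argv) {
    Kokkos::initialize(argc, argv);
    {
        Kokkos::DefaultExecutionSpace exec{};
        // backend name and available concurrency of the default execution space
        std::printf("execution space: %s (concurrency: %d)\n", exec.name(), static_cast<int>(exec.concurrency()));
        // the same version triple assembled by get_kokkos_version
        std::printf("Kokkos version: %d.%d.%d\n", KOKKOS_VERSION_MAJOR, KOKKOS_VERSION_MINOR, KOKKOS_VERSION_PATCH);
        // the fence used by device_synchronize
        exec.fence();
    }
    Kokkos::finalize();
    return 0;
}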
+ */ + +#include "plssvm/backends/Kokkos/exceptions.hpp" + +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/exceptions/source_location.hpp" // plssvm::source_location + +#include // std::string + +namespace plssvm::kokkos { + +backend_exception::backend_exception(const std::string &msg, source_location loc) : + ::plssvm::exception{ msg, "kokkos::backend_exception", loc } { } + +} // namespace plssvm::kokkos diff --git a/src/plssvm/backends/Kokkos/execution_space.cpp b/src/plssvm/backends/Kokkos/execution_space.cpp new file mode 100644 index 000000000..0caae212f --- /dev/null +++ b/src/plssvm/backends/Kokkos/execution_space.cpp @@ -0,0 +1,89 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "plssvm/backends/Kokkos/execution_space.hpp" + +#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp" // plssvm::kokkos::detail::constexpr_available_execution_spaces +#include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case + +#include // std::array +#include // std::ios::failbit +#include // std::istream +#include // std::ostream +#include // std::string +#include // std::vector + +namespace plssvm::kokkos { + +std::ostream &operator<<(std::ostream &out, const execution_space space) { + switch (space) { + case execution_space::automatic: + return out << "automatic"; + case execution_space::cuda: + return out << "Cuda"; + case execution_space::hip: + return out << "HIP"; + case execution_space::sycl: + return out << "SYCL"; + case execution_space::hpx: + return out << "HPX"; + case execution_space::openmp: + return out << "OpenMP"; + case execution_space::openmp_target: + return out << "OpenMPTarget"; + case execution_space::openacc: + return out << "OpenACC"; + case execution_space::threads: + return out << "Threads"; + case execution_space::serial: + return out << "Serial"; + } + return out << "unknown"; +} + +std::istream &operator>>(std::istream &in, execution_space &space) { + std::string str{}; + in >> str; + ::plssvm::detail::to_lower_case(str); + + if (str == "automatic" || str == "auto") { + space = execution_space::automatic; + } else if (str == "cuda") { + space = execution_space::cuda; + } else if (str == "hip") { + space = execution_space::hip; + } else if (str == "sycl") { + space = execution_space::sycl; + } else if (str == "hpx") { + space = execution_space::hpx; + } else if (str == "openmp") { + space = execution_space::openmp; + } else if (str == "openmp_target" || str == "openmptarget") { + space = execution_space::openmp_target; + } else if (str == "openacc") { + space = execution_space::openacc; + } else if (str == "threads" || str == "std::threads") { + space = execution_space::threads; + } else if (str == "serial") { + space = execution_space::serial; + } else { + in.setstate(std::ios::failbit); + } + return in; +} + +std::vector list_available_execution_spaces() { + // always add the automatic execution space + std::vector spaces{ execution_space::automatic }; + // add all other available execution spaces + constexpr auto arr = detail::constexpr_available_execution_spaces(); + spaces.insert(spaces.cend(), arr.begin(), arr.end()); + return spaces; +} + +} // namespace plssvm::kokkos diff --git 
a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp index c1a8a5be3..656d9a76d 100644 --- a/src/plssvm/detail/cmd/parser_predict.cpp +++ b/src/plssvm/detail/cmd/parser_predict.cpp @@ -9,6 +9,7 @@ #include "plssvm/detail/cmd/parser_predict.hpp" #include "plssvm/backend_types.hpp" // plssvm::list_available_backends +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::list_available_execution_spaces #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::list_available_sycl_implementations #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT @@ -51,6 +52,9 @@ parser_predict::parser_predict(int argc, char **argv) { #if defined(PLSSVM_HAS_SYCL_BACKEND) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + ("kokkos_execution_space", fmt::format("choose the Kokkos execution space to be used in the Kokkos backend: {}", fmt::join(kokkos::list_available_execution_spaces(), "|")), cxxopts::value()->default_value(fmt::format("{}", kokkos_execution_space))) +#endif #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED) ("performance_tracking", "the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr", cxxopts::value()) #endif @@ -101,18 +105,38 @@ parser_predict::parser_predict(int argc, char **argv) { target = result["target_platform"].as(); #if defined(PLSSVM_HAS_SYCL_BACKEND) - // parse SYCL implementation used in the SYCL backend - sycl_implementation_type = result["sycl_implementation_type"].as(); - - // assembly warning condition - const std::vector target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target }; - const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); - - // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend - if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { - detail::log_untracked(verbosity_level::full | verbosity_level::warning, - "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n", - sycl_implementation_type); + { + // parse SYCL implementation used in the SYCL backend + sycl_implementation_type = result["sycl_implementation_type"].as(); + + // assembly warning condition + const std::vector target_platforms = { target == target_platform::automatic ? 
determine_default_target_platform() : target }; + const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); + + // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend + if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n", + sycl_implementation_type); + } + } +#endif + +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + { + // parse execution space when using Kokkos as backend + kokkos_execution_space = result["kokkos_execution_space"].as(); + + // assemble warning condition + const std::vector target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target }; + const bool kokkos_backend_is_used = backend == backend_type::kokkos || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::kokkos); + + // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend + if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n", + kokkos_execution_space); + } } #endif @@ -178,6 +202,10 @@ std::ostream &operator<<(std::ostream &out, const parser_predict ¶ms) { out << fmt::format("SYCL implementation type: {}\n", params.sycl_implementation_type); } + if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) { + out << fmt::format("Kokkos execution space: {}\n", params.kokkos_execution_space); + } + out << fmt::format( "label_type: {}\n" "real_type: {}\n" diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp index d0cc4cb26..31d5b8719 100644 --- a/src/plssvm/detail/cmd/parser_train.cpp +++ b/src/plssvm/detail/cmd/parser_train.cpp @@ -9,6 +9,7 @@ #include "plssvm/detail/cmd/parser_train.hpp" #include "plssvm/backend_types.hpp" // plssvm::list_available_backends, plssvm::determine_default_backend +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{list_available_execution_spaces, execution_space} #include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{list_available_sycl_implementations, implementation_type} #include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/classification_types.hpp" // plssvm::classification_type, plssvm::classification_type_to_full_string @@ -77,6 +78,9 @@ parser_train::parser_train(int argc, char **argv) { ("sycl_kernel_invocation_type", "choose the kernel invocation type when using SYCL as backend: automatic|nd_range", cxxopts::value()->default_value(fmt::format("{}", sycl_kernel_invocation_type))) ("sycl_implementation_type", fmt::format("choose the SYCL implementation to be used in the SYCL backend: {}", fmt::join(sycl::list_available_sycl_implementations(), "|")), cxxopts::value()->default_value(fmt::format("{}", sycl_implementation_type))) #endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + 
("kokkos_execution_space", fmt::format("choose the Kokkos execution space to be used in the Kokkos backend: {}", fmt::join(kokkos::list_available_execution_spaces(), "|")), cxxopts::value()->default_value(fmt::format("{}", kokkos_execution_space))) +#endif #if defined(PLSSVM_PERFORMANCE_TRACKER_ENABLED) ("performance_tracking", "the output YAML file where the performance tracking results are written to; if not provided, the results are dumped to stderr", cxxopts::value()) #endif @@ -185,28 +189,48 @@ parser_train::parser_train(int argc, char **argv) { solver = result["solver"].as(); #if defined(PLSSVM_HAS_SYCL_BACKEND) - // parse kernel invocation type when using SYCL as backend - sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as(); - - // assembly warning condition - const std::vector target_platforms = { target == target_platform::automatic ? determine_default_target_platform() : target }; - const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); - - // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend - if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) { - detail::log_untracked(verbosity_level::full | verbosity_level::warning, - "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n", - sycl_kernel_invocation_type); + { + // parse kernel invocation type when using SYCL as backend + sycl_kernel_invocation_type = result["sycl_kernel_invocation_type"].as(); + + // assemble warning condition + const std::vector target_platforms = { target == target_platform::automatic ? 
determine_default_target_platform() : target }; + const bool sycl_backend_is_used = backend == backend_type::sycl || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::sycl); + + // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend + if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n", + sycl_kernel_invocation_type); + } + + // parse SYCL implementation used in the SYCL backend + sycl_implementation_type = result["sycl_implementation_type"].as(); + + // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend + if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n", + sycl_implementation_type); + } } +#endif - // parse SYCL implementation used in the SYCL backend - sycl_implementation_type = result["sycl_implementation_type"].as(); +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + { + // parse execution space when using Kokkos as backend + kokkos_execution_space = result["kokkos_execution_space"].as(); - // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend - if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) { - detail::log_untracked(verbosity_level::full | verbosity_level::warning, - "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n", - sycl_implementation_type); + // assemble warning condition + const std::vector target_platforms = { target == target_platform::automatic ? 
determine_default_target_platform() : target }; + const bool kokkos_backend_is_used = backend == backend_type::kokkos || (backend == backend_type::automatic && determine_default_backend(list_available_backends(), target_platforms) == backend_type::kokkos); + + // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend + if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) { + detail::log_untracked(verbosity_level::full | verbosity_level::warning, + "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n", + kokkos_execution_space); + } } #endif @@ -302,6 +326,10 @@ std::ostream &operator<<(std::ostream &out, const parser_train ¶ms) { params.sycl_kernel_invocation_type); } + if (params.backend == backend_type::kokkos || params.backend == backend_type::automatic) { + out << fmt::format("Kokkos execution space: {}\n", params.kokkos_execution_space); + } + out << fmt::format( "classification_type: {}\n" "label_type: {}\n" diff --git a/src/plssvm/detail/tracking/performance_tracker.cpp b/src/plssvm/detail/tracking/performance_tracker.cpp index 26ebda7d3..6d1323e8e 100644 --- a/src/plssvm/detail/tracking/performance_tracker.cpp +++ b/src/plssvm/detail/tracking/performance_tracker.cpp @@ -116,6 +116,7 @@ void performance_tracker::add_tracking_entry(const tracking_entry{ fmt::format("{}", entry.entry_value.target) }); tracking_entries_[entry.entry_category].emplace("sycl_kernel_invocation_type", std::vector{ fmt::format("{}", entry.entry_value.sycl_kernel_invocation_type) }); tracking_entries_[entry.entry_category].emplace("sycl_implementation_type", std::vector{ fmt::format("{}", entry.entry_value.sycl_implementation_type) }); + tracking_entries_[entry.entry_category].emplace("kokkos_execution_space", std::vector{ fmt::format("{}", entry.entry_value.kokkos_execution_space) }); tracking_entries_[entry.entry_category].emplace("strings_as_labels", std::vector{ fmt::format("{}", entry.entry_value.strings_as_labels) }); tracking_entries_[entry.entry_category].emplace("real_type", std::vector{ std::string{ arithmetic_type_name() } }); tracking_entries_[entry.entry_category].emplace("input_filename", std::vector{ fmt::format("\"{}\"", entry.entry_value.input_filename) }); @@ -133,6 +134,7 @@ void performance_tracker::add_tracking_entry(const tracking_entry{ fmt::format("{}", entry.entry_value.backend) }); tracking_entries_[entry.entry_category].emplace("target", std::vector{ fmt::format("{}", entry.entry_value.target) }); tracking_entries_[entry.entry_category].emplace("sycl_implementation_type", std::vector{ fmt::format("{}", entry.entry_value.sycl_implementation_type) }); + tracking_entries_[entry.entry_category].emplace("kokkos_execution_space", std::vector{ fmt::format("{}", entry.entry_value.kokkos_execution_space) }); tracking_entries_[entry.entry_category].emplace("strings_as_labels", std::vector{ fmt::format("{}", entry.entry_value.strings_as_labels) }); tracking_entries_[entry.entry_category].emplace("real_type", std::vector{ std::string{ arithmetic_type_name() } }); tracking_entries_[entry.entry_category].emplace("input_filename", std::vector{ fmt::format("\"{}\"", entry.entry_value.input_filename) }); @@ -297,6 +299,14 @@ void performance_tracker::save(std::ostream &out) { " ADAPTIVECPP_with_accelerated_CPU: {}\n", adaptivecpp_sscp, adaptivecpp_accelerated_cpu); +#endif +#if defined(PLSSVM_HAS_KOKKOS_BACKEND) + // check whether 
Kokkos::SYCL AOT has been enabled + constexpr bool kokkos_sycl_aot = PLSSVM_IS_DEFINED(PLSSVM_KOKKOS_BACKEND_INTEL_LLVM_ENABLE_AOT); + + out << fmt::format( + " KOKKOS_sycl_intel_llvm_with_aot: {}\n", + kokkos_sycl_aot); #endif out << "\n"; diff --git a/src/plssvm/target_platforms.cpp b/src/plssvm/target_platforms.cpp index f5569b51d..8fc47e223 100644 --- a/src/plssvm/target_platforms.cpp +++ b/src/plssvm/target_platforms.cpp @@ -22,9 +22,6 @@ namespace plssvm { std::vector list_available_target_platforms() { std::vector available_targets = { target_platform::automatic }; -#if defined(PLSSVM_HAS_CPU_TARGET) - available_targets.push_back(target_platform::cpu); -#endif #if defined(PLSSVM_HAS_NVIDIA_TARGET) available_targets.push_back(target_platform::gpu_nvidia); #endif @@ -34,6 +31,9 @@ std::vector list_available_target_platforms() { #if defined(PLSSVM_HAS_INTEL_TARGET) available_targets.push_back(target_platform::gpu_intel); #endif +#if defined(PLSSVM_HAS_CPU_TARGET) + available_targets.push_back(target_platform::cpu); +#endif // automatic is ALWAYS available but AT LEAST ONE other target must be available in addition PLSSVM_ASSERT(available_targets.size() > 1, "Besides \"automatic\" at least one other target must be available!"); diff --git a/tests/backend_types.cpp b/tests/backend_types.cpp index 4b0f27aae..8a735a26b 100644 --- a/tests/backend_types.cpp +++ b/tests/backend_types.cpp @@ -40,11 +40,12 @@ TEST(BackendType, to_string) { EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::hip, "hip"); EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::opencl, "opencl"); EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::sycl, "sycl"); + EXPECT_CONVERSION_TO_STRING(plssvm::backend_type::kokkos, "kokkos"); } TEST(BackendType, to_string_unknown) { // check conversions to std::string from unknown backend_type - EXPECT_CONVERSION_TO_STRING(static_cast(8), "unknown"); + EXPECT_CONVERSION_TO_STRING(static_cast(9), "unknown"); } // check whether the std::string -> plssvm::backend_type conversions are correct @@ -68,6 +69,8 @@ TEST(BackendType, from_string) { EXPECT_CONVERSION_FROM_STRING("OpenCL", plssvm::backend_type::opencl); EXPECT_CONVERSION_FROM_STRING("sycl", plssvm::backend_type::sycl); EXPECT_CONVERSION_FROM_STRING("SYCL", plssvm::backend_type::sycl); + EXPECT_CONVERSION_FROM_STRING("Kokkos", plssvm::backend_type::kokkos); + EXPECT_CONVERSION_FROM_STRING("KOKKOS", plssvm::backend_type::kokkos); } TEST(BackendType, from_string_unknown) { @@ -133,6 +136,7 @@ INSTANTIATE_TEST_SUITE_P(BackendType, BackendTypeSupportedCombination, ::testing supported_combination_type{ { plssvm::backend_type::hip }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::hip }, supported_combination_type{ { plssvm::backend_type::opencl }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::opencl }, supported_combination_type{ { plssvm::backend_type::sycl }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::sycl }, + supported_combination_type{ { plssvm::backend_type::kokkos }, { plssvm::target_platform::cpu, plssvm::target_platform::gpu_nvidia, plssvm::target_platform::gpu_amd, plssvm::target_platform::gpu_intel }, plssvm::backend_type::kokkos }, supported_combination_type{ { 
plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::cpu }, plssvm::backend_type::sycl }, supported_combination_type{ { plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::gpu_nvidia }, plssvm::backend_type::cuda }, supported_combination_type{ { plssvm::backend_type::openmp, plssvm::backend_type::cuda, plssvm::backend_type::hip, plssvm::backend_type::opencl, plssvm::backend_type::sycl }, { plssvm::target_platform::gpu_amd }, plssvm::backend_type::hip }, @@ -151,6 +155,7 @@ TEST(BackendType, csvm_to_backend_type) { EXPECT_EQ(plssvm::csvm_to_backend_type::value, plssvm::backend_type::sycl); EXPECT_EQ(plssvm::csvm_to_backend_type::value, plssvm::backend_type::sycl); EXPECT_EQ(plssvm::csvm_to_backend_type::value, plssvm::backend_type::sycl); + EXPECT_EQ(plssvm::csvm_to_backend_type::value, plssvm::backend_type::kokkos); EXPECT_EQ(plssvm::csvm_to_backend_type::impl, plssvm::sycl::implementation_type::adaptivecpp); EXPECT_EQ(plssvm::csvm_to_backend_type::impl, plssvm::sycl::implementation_type::dpcpp); @@ -167,4 +172,5 @@ TEST(BackendType, csvm_to_backend_type_v) { EXPECT_EQ(plssvm::csvm_to_backend_type_v, plssvm::backend_type::sycl); EXPECT_EQ(plssvm::csvm_to_backend_type_v, plssvm::backend_type::sycl); EXPECT_EQ(plssvm::csvm_to_backend_type_v, plssvm::backend_type::sycl); + EXPECT_EQ(plssvm::csvm_to_backend_type_v, plssvm::backend_type::kokkos); } diff --git a/tests/backends/CMakeLists.txt b/tests/backends/CMakeLists.txt index ec6a5fa76..dffe57615 100644 --- a/tests/backends/CMakeLists.txt +++ b/tests/backends/CMakeLists.txt @@ -38,3 +38,8 @@ endif () if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) add_subdirectory(SYCL) endif () + +# create Kokkos tests if the Kokkos backend is available +if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) + add_subdirectory(Kokkos) +endif () diff --git a/tests/backends/Kokkos/CMakeLists.txt b/tests/backends/Kokkos/CMakeLists.txt new file mode 100644 index 000000000..f29367a27 --- /dev/null +++ b/tests/backends/Kokkos/CMakeLists.txt @@ -0,0 +1,54 @@ +## Authors: Alexander Van Craen, Marcel Breyer +## Copyright (C): 2018-today The PLSSVM project - All Rights Reserved +## License: This file is part of the PLSSVM project which is released under the MIT license. +## See the LICENSE.md file in the project root for full license information. 
+######################################################################################################################## + +## create Kokkos tests +set(PLSSVM_KOKKOS_TEST_NAME Kokkos_tests) + +# list all necessary sources +set(PLSSVM_KOKKOS_TEST_SOURCES + ${CMAKE_CURRENT_LIST_DIR}/detail/constexpr_available_execution_spaces.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/device_ptr.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/device_view_wrapper.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/device_wrapper.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/standard_layout_tuple.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/pinned_memory.cpp + ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp + ${CMAKE_CURRENT_LIST_DIR}/kokkos_csvm.cpp + ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp + ${CMAKE_CURRENT_LIST_DIR}/execution_space.cpp + ${CMAKE_CURRENT_LIST_DIR}/execution_space_type_traits.cpp +) + +find_package(Kokkos REQUIRED) + +# add test executable +add_executable(${PLSSVM_KOKKOS_TEST_NAME} ${CMAKE_CURRENT_LIST_DIR}/../../main.cpp ${PLSSVM_KOKKOS_TEST_SOURCES}) + +if (Kokkos_ENABLE_CUDA) + # fix template limit when using Kokkos::Cuda + target_compile_options(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE -Xcudafe --pending_instantiations=0) + + # tests won't compile with nvcc + if (NOT PLSSVM_TEST_WITH_REDUCED_LABEL_TYPES) + message(FATAL_ERROR "Due to template instantiation limits within nvcc, only reduced label type tests are currently supported!") + endif () +endif () + +# increase recursive template instantiation limit +target_compile_options(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE $<$:$<$:-ftemplate-depth=2048>>) + +# link against test library +target_link_libraries(${PLSSVM_KOKKOS_TEST_NAME} PRIVATE ${PLSSVM_BASE_TEST_LIBRARY_NAME}) + +# add tests to google test +include(GoogleTest) +include(${PROJECT_SOURCE_DIR}/cmake/discover_tests_with_death_test_filter.cmake) +discover_tests_with_death_test_filter(${PLSSVM_KOKKOS_TEST_NAME}) + +# add test as coverage dependency +if (TARGET coverage) + add_dependencies(coverage ${PLSSVM_KOKKOS_TEST_NAME}) +endif () \ No newline at end of file diff --git a/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp b/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp new file mode 100644 index 000000000..2e8f064e7 --- /dev/null +++ b/tests/backends/Kokkos/detail/constexpr_available_execution_spaces.cpp @@ -0,0 +1,18 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the Kokkos `constexpr_available_execution_spaces()` function. 
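The source list above also registers an execution_space.cpp test. Based on the operator<< and operator>> overloads added earlier in this patch, a round-trip test for the stream operators could look roughly like this (a sketch, not the patch's actual test code):

#include "plssvm/backends/Kokkos/execution_space.hpp"

#include "gtest/gtest.h"

#include <sstream>

TEST(KokkosExecutionSpace, stream_operator_round_trip) {
    // to_string direction: openmp_target is printed as "OpenMPTarget"
    std::ostringstream out{};
    out << plssvm::kokkos::execution_space::openmp_target;
    EXPECT_EQ(out.str(), "OpenMPTarget");

    // from_string direction: parsing is case-insensitive and accepts both spellings
    plssvm::kokkos::execution_space space{};
    std::istringstream in{ "openmptarget" };
    in >> space;
    EXPECT_EQ(space, plssvm::kokkos::execution_space::openmp_target);
}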
+ */ + +#include "plssvm/backends/Kokkos/detail/constexpr_available_execution_spaces.hpp" + +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_FALSE + +TEST(KokkosConstexprAvailableExecutionSpaces, constexpr_available_execution_spaces) { + // at least one execution space must always be available + EXPECT_FALSE(plssvm::kokkos::detail::constexpr_available_execution_spaces().empty()); +} diff --git a/tests/backends/Kokkos/detail/device_ptr.cpp b/tests/backends/Kokkos/detail/device_ptr.cpp new file mode 100644 index 000000000..ec525dad5 --- /dev/null +++ b/tests/backends/Kokkos/detail/device_ptr.cpp @@ -0,0 +1,55 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the Kokkos backend device pointer. + */ + +#include "plssvm/backends/Kokkos/detail/device_ptr.hpp" // plssvm::kokkos::detail::device_ptr + +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" // plssvm::kokkos::execution_space_to_kokkos_type_t + +#include "tests/backends/generic_device_ptr_tests.hpp" // generic device pointer tests to instantiate +#include "tests/backends/Kokkos/utility.hpp" // util::create_kokkos_test_tuple_impl +#include "tests/naming.hpp" // naming::test_parameter_to_name +#include "tests/types_to_test.hpp" // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list}, + // util::detail::concat_tuple_types_t + +#include "gtest/gtest.h" // INSTANTIATE_TYPED_TEST_SUITE_P + +#include // std::tuple + +template +struct kokkos_device_ptr_test_type { + using device_ptr_type = plssvm::kokkos::detail::device_ptr; + using queue_type = plssvm::kokkos::detail::device_wrapper; + constexpr static plssvm::kokkos::execution_space space = exec_space; + + static const queue_type &default_queue() { + static const queue_type queue{ plssvm::kokkos::execution_space_to_kokkos_type_t{} }; + return queue; + } +}; + +template +using kokkos_device_ptr_test_type_float = kokkos_device_ptr_test_type; +template +using kokkos_device_ptr_test_type_double = kokkos_device_ptr_test_type; + +using kokkos_device_ptr_tuple = util::detail::concat_tuple_types_t, + util::create_kokkos_test_tuple_t>; + +// the tests used in the instantiated GTest test suites +using kokkos_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; +using kokkos_device_ptr_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtr, DevicePtr, kokkos_device_ptr_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtr, DevicePtrLayout, kokkos_device_ptr_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosDevicePtrDeathTest, DevicePtrDeathTest, kokkos_device_ptr_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/Kokkos/detail/device_view_wrapper.cpp b/tests/backends/Kokkos/detail/device_view_wrapper.cpp new file mode 100644 index 000000000..28dc97cba --- /dev/null +++ b/tests/backends/Kokkos/detail/device_view_wrapper.cpp @@ -0,0 +1,95 @@ +/** + * @author 
Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the device_view_wrapper class. + */ + +#include "plssvm/backends/Kokkos/detail/device_view_wrapper.hpp" + +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v} + +#include "Kokkos_Core.hpp" // Kokkos::DefaultExecutionSpace, Kokkos::View + +#include "gtest/gtest.h" // TEST, EXPECT_EQ, EXPECT_TRUE, EXPECT_FALSE + +#include // std::size_t + +TEST(KokkosDeviceViewWrapper, default_construct) { + // default construct a device view wrapper + const plssvm::kokkos::detail::device_view_wrapper view{}; + + // per std::variant specification, the first type in the underlying variant is now the active member + // -> this always corresponds to the first entry in our constexpr_available_execution_spaces array + constexpr auto spaces = plssvm::kokkos::detail::constexpr_available_execution_spaces(); + EXPECT_EQ(view.get_execution_space(), spaces.front()); +} + +TEST(KokkosDeviceViewWrapper, construct) { + // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View{} }; + + // check that the device view is associated with the correct execution space + EXPECT_EQ(view.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v); +} + +TEST(KokkosDeviceViewWrapper, get) { + // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace + plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View{} }; + + // check that the returned Kokkos::View has the correct type + constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + ::testing::StaticAssertTypeEq()), Kokkos::View &>(); +} + +TEST(KokkosDeviceViewWrapper, get_const) { + // construct a device view wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View{} }; + + // check that the returned Kokkos::View has the correct type + constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + ::testing::StaticAssertTypeEq()), const Kokkos::View &>(); +} + +TEST(KokkosDeviceViewWrapper, get_execution_space) { + // construct a device wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_view_wrapper view{ Kokkos::View{} }; + + // check that the device view is associated with the correct execution space + EXPECT_EQ(view.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v); +} + +TEST(KokkosDeviceViewWrapper, equality) { + const plssvm::kokkos::detail::device_view_wrapper view1{ Kokkos::View{} }; + const plssvm::kokkos::detail::device_view_wrapper view2{ Kokkos::View{} }; + + // should be equal + EXPECT_TRUE(view1 == view2); +} + +TEST(KokkosDeviceViewWrapper, inequality) { + const plssvm::kokkos::detail::device_view_wrapper view1{ Kokkos::View{} }; + const plssvm::kokkos::detail::device_view_wrapper view2{ Kokkos::View{} }; + + // should not be unequal + EXPECT_FALSE(view1 != view2); +} + +TEST(KokkosDeviceViewWrapper, make_device_view_wrapper) { + // 
create a device wrapper for the Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} }; + + // create device view wrapper + const plssvm::kokkos::detail::device_view_wrapper view = plssvm::kokkos::detail::make_device_view_wrapper(device, 42); + + // check that the returned Kokkos::View has the correct type + constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + ::testing::StaticAssertTypeEq()), const Kokkos::View &>(); + + // check the number of elements + EXPECT_EQ(view.get().size(), std::size_t{ 42 }); +} diff --git a/tests/backends/Kokkos/detail/device_wrapper.cpp b/tests/backends/Kokkos/detail/device_wrapper.cpp new file mode 100644 index 000000000..ca644ece7 --- /dev/null +++ b/tests/backends/Kokkos/detail/device_wrapper.cpp @@ -0,0 +1,115 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the device_wrapper class. + */ + +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" + +#include "plssvm/backends/Kokkos/detail/utility.hpp" // plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v} +#include "plssvm/detail/utility.hpp" // plssvm::detail::contains +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "Kokkos_Core.hpp" // Kokkos::DefaultExecutionSpace + +#include "tests/utility.hpp" // util::for_each_variant_type + +#include "gtest/gtest.h" // TEST, EXPECT_GE, EXPECT_EQ + +#include // std::vector + +TEST(KokkosDeviceWrapper, default_construct) { + // default construct a device wrapper + const plssvm::kokkos::detail::device_wrapper device{}; + + // per std::variant specification, the first type in the underlying variant is now the active member + // -> this always corresponds to the first entry in our constexpr_available_execution_spaces array + constexpr auto spaces = plssvm::kokkos::detail::constexpr_available_execution_spaces(); + EXPECT_EQ(device.get_execution_space(), spaces.front()); +} + +TEST(KokkosDeviceWrapper, construct) { + // construct a device wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} }; + + // check that the device is associated with the correct execution space + EXPECT_EQ(device.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v); +} + +TEST(KokkosDeviceWrapper, get) { + // construct a device wrapper using the current Kokkos::DefaultExecutionSpace + plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} }; + + // check that the returned Kokkos::ExecutionSpace has the correct type + constexpr plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + ::testing::StaticAssertTypeEq()), Kokkos::DefaultExecutionSpace &>(); +} + +TEST(KokkosDeviceWrapper, get_const) { + // construct a device wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} }; + + // check that the returned Kokkos::ExecutionSpace has the correct type + constexpr 
plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + ::testing::StaticAssertTypeEq()), const Kokkos::DefaultExecutionSpace &>(); +} + +TEST(KokkosDeviceWrapper, get_execution_space) { + // construct a device wrapper using the current Kokkos::DefaultExecutionSpace + const plssvm::kokkos::detail::device_wrapper device{ Kokkos::DefaultExecutionSpace{} }; + + // check that the device is associated with the correct execution space + EXPECT_EQ(device.get_execution_space(), plssvm::kokkos::kokkos_type_to_execution_space_v); +} + +TEST(KokkosDeviceWrapper, equality) { + const plssvm::kokkos::detail::device_wrapper device1{ Kokkos::DefaultExecutionSpace{} }; + const plssvm::kokkos::detail::device_wrapper device2{ Kokkos::DefaultExecutionSpace{} }; + + // should be equal + EXPECT_TRUE(device1 == device2); +} + +TEST(KokkosDeviceWrapper, inequality) { + const plssvm::kokkos::detail::device_wrapper device1{ Kokkos::DefaultExecutionSpace{} }; + const plssvm::kokkos::detail::device_wrapper device2{ Kokkos::DefaultExecutionSpace{} }; + + // should not be unequal + EXPECT_FALSE(device1 != device2); +} + +struct device_list_test { + template + void operator()() const { + // get the default device list + const plssvm::kokkos::execution_space space = plssvm::kokkos::kokkos_type_to_execution_space_v; + plssvm::target_platform default_target{}; + for (const auto &[target, spaces] : plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping()) { + if (::plssvm::detail::contains(spaces, space)) { + default_target = target; + break; + } + } + const std::vector devices = plssvm::kokkos::detail::get_device_list(space, default_target); + + // check the number of returned devices + if (space == plssvm::kokkos::execution_space::cuda || space == plssvm::kokkos::execution_space::hip || space == plssvm::kokkos::execution_space::sycl) { + // TODO: Change if multi-GPU support for Kokkos::Experimental::OpenMPTarget and/or Kokkos::Experimental::OpenACC is implemented + // for the device execution spaces AT LEAST ONE device must be found + EXPECT_GE(devices.size(), 1); + } else { + // for all other execution spaces EXACTLY ONE device must be found + EXPECT_EQ(devices.size(), 1); + } + } +}; + +TEST(KokkosDeviceWrapper, get_device_list) { + using variant_type = typename plssvm::kokkos::detail::impl::create_device_variant_type::type; + util::for_each_variant_type(device_list_test{}); +} diff --git a/tests/backends/Kokkos/detail/pinned_memory.cpp b/tests/backends/Kokkos/detail/pinned_memory.cpp new file mode 100644 index 000000000..2569e68e7 --- /dev/null +++ b/tests/backends/Kokkos/detail/pinned_memory.cpp @@ -0,0 +1,39 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the Kokkos backend pinned memory. 
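+ * @details The generic pinned memory test suites below are instantiated with can_pin = false, i.e., the Kokkos
+ *          backend's pinned memory wrapper is expected to report that no host memory pinning is performed.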
+ */ + +#include "plssvm/backends/Kokkos/detail/pinned_memory.hpp" // plssvm::kokkos::detail::pinned_memory + +#include "tests/backends/generic_pinned_memory_tests.hpp" // generic pinned memory tests to instantiate +#include "tests/naming.hpp" // naming::test_parameter_to_name +#include "tests/types_to_test.hpp" // util::{combine_test_parameters_gtest_t, cartesian_type_product_t, layout_type_list} + +#include "gtest/gtest.h" // INSTANTIATE_TYPED_TEST_SUITE_P + +#include // std::tuple + +template +struct kokkos_pinned_memory_test_type { + using pinned_memory_type = plssvm::kokkos::detail::pinned_memory; + + constexpr static bool can_pin = false; +}; + +using kokkos_pinned_memory_tuple = std::tuple, kokkos_pinned_memory_test_type>; + +// the tests used in the instantiated GTest test suites +using kokkos_pinned_memory_type_gtest = util::combine_test_parameters_gtest_t>; +using kokkos_pinned_memory_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemory, PinnedMemory, kokkos_pinned_memory_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemory, PinnedMemoryLayout, kokkos_pinned_memory_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemoryDeathTest, PinnedMemoryDeathTest, kokkos_pinned_memory_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosPinnedMemoryDeathTest, PinnedMemoryLayoutDeathTest, kokkos_pinned_memory_layout_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/Kokkos/detail/standard_layout_tuple.cpp b/tests/backends/Kokkos/detail/standard_layout_tuple.cpp new file mode 100644 index 000000000..7b4fb6cd8 --- /dev/null +++ b/tests/backends/Kokkos/detail/standard_layout_tuple.cpp @@ -0,0 +1,33 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the custom standard layout tuple implementation necessary for Kokkos. 
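+ * @details The custom tuple is used instead of std::tuple, presumably because std::tuple is not guaranteed to be a
+ *          standard-layout type, which is typically required for data passed to device kernels. Illustrative usage,
+ *          mirroring the tests below:
+ * @code
+ * const auto tuple = plssvm::kokkos::detail::make_standard_layout_tuple(true, 42, 3.1415);
+ * const int i = plssvm::kokkos::detail::get<1>(tuple);  // i == 42
+ * @endcode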
+ */ + +#include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::{standard_layout_tuple, make_standard_layout_tuple, get} + +#include "gtest/gtest.h" // TEST, EXPECT_EQ, testing::StaticAssertTypeEq + +#include // std::remove_const_t + +TEST(KokkosStandardLayoutTuple, make_standard_layout_tuple) { + // create a new standard layout tuple + [[maybe_unused]] const auto tuple = plssvm::kokkos::detail::make_standard_layout_tuple(true, 42, 3.1415); + + // check the tuple type + ::testing::StaticAssertTypeEq, std::remove_const_t>(); +} + +TEST(KokkosStandardLayoutTuple, get) { + // create a new standard layout tuple + const auto tuple = plssvm::kokkos::detail::make_standard_layout_tuple(true, 42, 3.1415); + + // check getter functions + EXPECT_EQ(plssvm::kokkos::detail::get<0>(tuple), true); + EXPECT_EQ(plssvm::kokkos::detail::get<1>(tuple), 42); + EXPECT_EQ(plssvm::kokkos::detail::get<2>(tuple), 3.1415); +} diff --git a/tests/backends/Kokkos/detail/utility.cpp b/tests/backends/Kokkos/detail/utility.cpp new file mode 100644 index 000000000..ec18a977b --- /dev/null +++ b/tests/backends/Kokkos/detail/utility.cpp @@ -0,0 +1,93 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the custom utility functions related to the Kokkos backend. + */ + +#include "plssvm/backends/Kokkos/detail/utility.hpp" + +#include "plssvm/backends/execution_range.hpp" // plssvm::detail::dim_type +#include "plssvm/backends/Kokkos/detail/device_wrapper.hpp" // plssvm::kokkos::detail::device_wrapper +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::{execution_space, kokkos_type_to_execution_space_v} +#include "plssvm/detail/utility.hpp" // plssvm::detail::contains +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "Kokkos_Core.hpp" // Kokkos::ExecutionSpace + +#include "tests/custom_test_macros.hpp" // EXPECT_THROW_WHAT +#include "tests/utility.hpp" // util::for_each_variant_type + +#include "fmt/core.h" // fmt::format +#include "gmock/gmock.h" // EXPECT_THAT; ::testing::AnyOf +#include "gtest/gtest.h" // TEST, EXPECT_NE + +#include // std::map +#include // std::regex, std::regex::extended, std::regex_match +#include // std::string +#include // std::variant +#include // std::vector + +TEST(KokkosUtility, is_type_in_variant) { + // check type trait that determines if a type is contained in a type trait + using variant_type = std::variant; + + EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); + EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); + EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); + EXPECT_TRUE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); + EXPECT_FALSE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); + EXPECT_FALSE((plssvm::kokkos::detail::impl::is_type_in_variant_v) ); +} + +TEST(KokkosUtility, dim_type_to_native) { + // create a dim_type + constexpr plssvm::detail::dim_type dim{ 128ull, 64ull, 32ull }; + + // convert it to a Kokkos one-dimensional execution range + const int native_dim = plssvm::kokkos::detail::dim_type_to_native(dim); + + // check values for correctness + EXPECT_EQ(native_dim, 
262'144); // = 128 * 64 * 32 +} + +TEST(KokkosUtility, available_target_platform_to_execution_space_mapping) { + // get the target_platform <-> execution_space mappings + const std::map> mapping = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + + // the map must not be empty + EXPECT_FALSE(mapping.empty()); + + // each vector must at least have one entry + the automatic target platform must not be present + for (const auto &[target, spaces] : mapping) { + EXPECT_NE(target, plssvm::target_platform::automatic); + EXPECT_GE(spaces.size(), 1); + } +} + +struct device_name_test { + template + void operator()() const { + // get the device name of the default Kokkos execution space + const std::string name = plssvm::kokkos::detail::get_device_name(plssvm::kokkos::detail::device_wrapper{ ExecutionSpace{} }); + SCOPED_TRACE(name); + + // the returned device name may not be empty or unknown + EXPECT_FALSE(name.empty()); + EXPECT_NE(name, std::string{ "unknown" }); + } +}; + +TEST(KokkosUtility, get_device_name) { + using variant_type = typename plssvm::kokkos::detail::impl::create_device_variant_type::type; + util::for_each_variant_type(device_name_test{}); +} + +TEST(KokkosUtility, get_kokkos_version) { + const std::regex reg{ "[0-9]+\\.[0-9]+\\.[0-9]+", std::regex::extended }; + EXPECT_TRUE(std::regex_match(plssvm::kokkos::detail::get_kokkos_version(), reg)); +} diff --git a/tests/backends/Kokkos/exceptions.cpp b/tests/backends/Kokkos/exceptions.cpp new file mode 100644 index 000000000..d78ac7801 --- /dev/null +++ b/tests/backends/Kokkos/exceptions.cpp @@ -0,0 +1,25 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the custom exception classes related to the Kokkos backend. + */ + +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception + +#include "tests/backends/generic_exceptions_tests.hpp" // generic exception tests to instantiate + +#include "gtest/gtest.h" // INSTANTIATE_TYPED_TEST_SUITE_P + +#include // std::string_view + +struct exception_test_type { + using exception_type = plssvm::kokkos::backend_exception; + constexpr static std::string_view name = "kokkos::backend_exception"; +}; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(KokkosExceptions, Exception, exception_test_type); diff --git a/tests/backends/Kokkos/execution_space.cpp b/tests/backends/Kokkos/execution_space.cpp new file mode 100644 index 000000000..3e54f3be5 --- /dev/null +++ b/tests/backends/Kokkos/execution_space.cpp @@ -0,0 +1,81 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for functions related to the different Kokkos execution spaces. 
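+ * @details The tests below cover the std::string <-> plssvm::kokkos::execution_space conversions (both spellings,
+ *          e.g. "Cuda" and "CUDA", map to execution_space::cuda). Illustrative usage, mirroring the conversions
+ *          checked below:
+ * @code
+ * std::istringstream input{ "CUDA" };
+ * plssvm::kokkos::execution_space space{};
+ * input >> space;  // space == plssvm::kokkos::execution_space::cuda
+ * @endcode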
+ */ + +#include "plssvm/backends/Kokkos/execution_space.hpp" + +#include "tests/custom_test_macros.hpp" // EXPECT_CONVERSION_TO_STRING, EXPECT_CONVERSION_FROM_STRING + +#include "gtest/gtest.h" // TEST, EXPECT_TRUE, EXPECT_FALSE + +#include // std::istringstream + +// check whether the plssvm::kokkos::execution_space -> std::string conversions are correct +TEST(KokkosExecutionSpace, to_string) { + // check conversions to std::string + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::automatic, "automatic"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::cuda, "Cuda"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::hip, "HIP"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::sycl, "SYCL"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::hpx, "HPX"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openmp, "OpenMP"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openmp_target, "OpenMPTarget"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::openacc, "OpenACC"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::threads, "Threads"); + EXPECT_CONVERSION_TO_STRING(plssvm::kokkos::execution_space::serial, "Serial"); +} + +TEST(KokkosExecutionSpace, to_string_unknown) { + // check conversions to std::string from unknown execution_space + EXPECT_CONVERSION_TO_STRING(static_cast(10), "unknown"); +} + +// check whether the std::string -> plssvm::kokkos::execution_space conversions are correct +TEST(KokkosExecutionSpace, from_string) { + // check conversion from std::string + EXPECT_CONVERSION_FROM_STRING("Automatic", plssvm::kokkos::execution_space::automatic); + EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::kokkos::execution_space::automatic); + EXPECT_CONVERSION_FROM_STRING("Cuda", plssvm::kokkos::execution_space::cuda); + EXPECT_CONVERSION_FROM_STRING("CUDA", plssvm::kokkos::execution_space::cuda); + EXPECT_CONVERSION_FROM_STRING("Hip", plssvm::kokkos::execution_space::hip); + EXPECT_CONVERSION_FROM_STRING("HIP", plssvm::kokkos::execution_space::hip); + EXPECT_CONVERSION_FROM_STRING("Sycl", plssvm::kokkos::execution_space::sycl); + EXPECT_CONVERSION_FROM_STRING("SYCL", plssvm::kokkos::execution_space::sycl); + EXPECT_CONVERSION_FROM_STRING("Hpx", plssvm::kokkos::execution_space::hpx); + EXPECT_CONVERSION_FROM_STRING("HPX", plssvm::kokkos::execution_space::hpx); + EXPECT_CONVERSION_FROM_STRING("OpenMP", plssvm::kokkos::execution_space::openmp); + EXPECT_CONVERSION_FROM_STRING("OPENMP", plssvm::kokkos::execution_space::openmp); + EXPECT_CONVERSION_FROM_STRING("OpenMP_Target", plssvm::kokkos::execution_space::openmp_target); + EXPECT_CONVERSION_FROM_STRING("OPENMPTARGET", plssvm::kokkos::execution_space::openmp_target); + EXPECT_CONVERSION_FROM_STRING("OpenACC", plssvm::kokkos::execution_space::openacc); + EXPECT_CONVERSION_FROM_STRING("OPENACC", plssvm::kokkos::execution_space::openacc); + EXPECT_CONVERSION_FROM_STRING("threads", plssvm::kokkos::execution_space::threads); + EXPECT_CONVERSION_FROM_STRING("THREADS", plssvm::kokkos::execution_space::threads); + EXPECT_CONVERSION_FROM_STRING("std::threads", plssvm::kokkos::execution_space::threads); + EXPECT_CONVERSION_FROM_STRING("Serial", plssvm::kokkos::execution_space::serial); + EXPECT_CONVERSION_FROM_STRING("SERIAL", plssvm::kokkos::execution_space::serial); +} + +TEST(KokkosExecutionSpace, from_string_unknown) { + // foo isn't a valid execution_space + std::istringstream input{ "foo" }; + 
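    // the extraction operator is expected to set the failbit on the stream for the unrecognized name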
plssvm::kokkos::execution_space space{}; + input >> space; + EXPECT_TRUE(input.fail()); +} + +TEST(KokkosExecutionSpace, list_available_execution_spaces) { + const std::vector execution_spaces = plssvm::kokkos::list_available_execution_spaces(); + + // at least one must be available (automatic)! + EXPECT_GE(execution_spaces.size(), 1); + + // the automatic execution space must always be present + EXPECT_THAT(execution_spaces, ::testing::Contains(plssvm::kokkos::execution_space::automatic)); +} diff --git a/tests/backends/Kokkos/execution_space_type_traits.cpp b/tests/backends/Kokkos/execution_space_type_traits.cpp new file mode 100644 index 000000000..f813fa836 --- /dev/null +++ b/tests/backends/Kokkos/execution_space_type_traits.cpp @@ -0,0 +1,75 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for functions related to the different Kokkos execution spaces. + */ + +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" + +#include "gtest/gtest.h" // TEST, EXPECT_EQ, ::testing::StaticAssertTypeEq + +TEST(KokkosExecutionSpaceTypeTraits, execution_space_to_kokkos_type) { + // check conversions +#if defined(KOKKOS_ENABLE_CUDA) + ::testing::StaticAssertTypeEq, Kokkos::Cuda>(); +#endif +#if defined(KOKKOS_ENABLE_HIP) + ::testing::StaticAssertTypeEq, Kokkos::HIP>(); +#endif +#if defined(KOKKOS_ENABLE_SYCL) + ::testing::StaticAssertTypeEq, Kokkos::SYCL>(); +#endif +#if defined(KOKKOS_ENABLE_HPX) + ::testing::StaticAssertTypeEq, Kokkos::Experimental::HPX>(); +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + ::testing::StaticAssertTypeEq, Kokkos::OpenMP>(); +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + ::testing::StaticAssertTypeEq, Kokkos::Experimental::OpenMPTarget>(); +#endif +#if defined(KOKKOS_ENABLE_OPENACC) + ::testing::StaticAssertTypeEq, Kokkos::Experimental::OpenACC>(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + ::testing::StaticAssertTypeEq, Kokkos::Threads>(); +#endif +#if defined(KOKKOS_ENABLE_SERIAL) + ::testing::StaticAssertTypeEq, Kokkos::Serial>(); +#endif +} + +TEST(KokkosExecutionSpaceTypeTraits, kokkos_type_to_execution_space) { + // check conversions +#if defined(KOKKOS_ENABLE_CUDA) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::cuda); +#endif +#if defined(KOKKOS_ENABLE_HIP) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::hip); +#endif +#if defined(KOKKOS_ENABLE_SYCL) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::sycl); +#endif +#if defined(KOKKOS_ENABLE_HPX) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::hpx); +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::openmp); +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::openmp_target); +#endif +#if defined(KOKKOS_ENABLE_OPENACC) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::openacc); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::threads); 
+#endif +#if defined(KOKKOS_ENABLE_SERIAL) + EXPECT_EQ(plssvm::kokkos::kokkos_type_to_execution_space_v, plssvm::kokkos::execution_space::serial); +#endif +} diff --git a/tests/backends/Kokkos/kokkos_csvm.cpp b/tests/backends/Kokkos/kokkos_csvm.cpp new file mode 100644 index 000000000..c99f8c7d5 --- /dev/null +++ b/tests/backends/Kokkos/kokkos_csvm.cpp @@ -0,0 +1,771 @@ +/** + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Tests for the functionality related to the Kokkos backend. + */ + +#include "plssvm/backends/Kokkos/csvm.hpp" // plssvm::kokkos::csvm +#include "plssvm/backends/Kokkos/detail/utility.hpp" // plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping +#include "plssvm/backends/Kokkos/exceptions.hpp" // plssvm::kokkos::backend_exception +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/detail/type_list.hpp" // plssvm::detail::label_type_list +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/target_platforms.hpp" // plssvm::target_platform, plssvm::list_available_target_platforms + +#include "tests/backends/generic_csvm_tests.hpp" // generic CSVM tests to instantiate +#include "tests/backends/generic_gpu_csvm_tests.hpp" // generic GPU CSVM tests to instantiate +#include "tests/backends/Kokkos/mock_kokkos_csvm.hpp" +#include "tests/backends/Kokkos/utility.hpp" // util::create_kokkos_test_tuple_impl +#include "tests/custom_test_macros.hpp" // EXPECT_THROW_WHAT +#include "tests/naming.hpp" // naming::test_parameter_to_name +#include "tests/types_to_test.hpp" // util::{cartesian_type_product_t, combine_test_parameters_gtest_t} +#include "tests/utility.hpp" // util::redirect_output + +#include "gtest/gtest.h" // TEST_F, EXPECT_NO_THROW, INSTANTIATE_TYPED_TEST_SUITE_P, ::testing::Test + +#include // std::array +#include // std::size_t +#include // std::map +#include // std::make_tuple, std::tuple +#include // std::vector + +class KokkosCSVM : public ::testing::Test, + private util::redirect_output<> { }; + +TEST_F(KokkosCSVM, construct_parameter) { // execution_space automatic, target_platform automatic + // check whether the execution space would be automatically determined as either OpenMPTarget or OpenACC + const std::map> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + plssvm::kokkos::execution_space space{}; + for (const plssvm::target_platform target : plssvm::list_available_target_platforms()) { + if (plssvm::detail::contains(available_combinations, target)) { + space = available_combinations.at(target).front(); + break; + } + } + + // must throw an exception if the execution space would be OpenMPTarget or OpenACC + if (space == plssvm::kokkos::execution_space::openmp_target || space == plssvm::kokkos::execution_space::openacc) { + EXPECT_THROW_WHAT(plssvm::kokkos::csvm{ plssvm::parameter{} }, + plssvm::kokkos::backend_exception, + fmt::format("The Kokkos execution space {} is currently not supported !", space)); + } else { + EXPECT_NO_THROW(plssvm::kokkos::csvm{ plssvm::parameter{} }); + } +} + +TEST_F(KokkosCSVM, construct_target_and_parameter) { // execution_space automatic, 
target_platform explicit + // create parameter struct + const plssvm::parameter params{}; + + // automatic should always work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::automatic, params })); + + const std::map> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + const auto target_supported = [&](const plssvm::target_platform target) { + return plssvm::detail::contains(available_combinations, target); + }; + +#if defined(PLSSVM_HAS_CPU_TARGET) + if (target_supported(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform cpu!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params }), + plssvm::kokkos::backend_exception, + "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_NVIDIA_TARGET) + if (target_supported(plssvm::target_platform::gpu_nvidia)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_nvidia!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_AMD_TARGET) + if (target_supported(plssvm::target_platform::gpu_amd)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_amd!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_INTEL_TARGET) + if (target_supported(plssvm::target_platform::gpu_intel)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_intel!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif +} + +TEST_F(KokkosCSVM, construct_execution_space_and_parameter) { // execution_space explicit, target_platform 
automatic + // create parameter struct + const plssvm::parameter params{}; + + // automatic should always work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic })); + + const auto target_is_available = [](const plssvm::target_platform target) { + return plssvm::detail::contains(plssvm::list_available_target_platforms(), target); + }; + +#if defined(KOKKOS_ENABLE_CUDA) + // explicitly providing the Cuda execution space should work + if (target_is_available(plssvm::target_platform::gpu_nvidia)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Cuda!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace Cuda is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_HIP) + // explicitly providing the HIP execution space should work + if (target_is_available(plssvm::target_platform::gpu_nvidia) || target_is_available(plssvm::target_platform::gpu_amd)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HIP!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace HIP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_SYCL) + // explicitly providing the SYCL execution space should work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl })); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace SYCL is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_HPX) + // explicitly providing the HPX execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HPX!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }), + plssvm::kokkos::backend_exception, + fmt::format("The provided 
Kokkos::ExecutionSpace HPX is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) + // explicitly providing the OpenMP execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace OpenMP!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenMP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + // explicitly providing the OpenMPTarget execution space currently unsupported + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }), + plssvm::kokkos::backend_exception, + "The Kokkos execution space OpenMPTarget is currently not supported !"); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenMPTarget is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENACC) + // explicitly providing the OpenACC execution space currently unsupported + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }), + plssvm::kokkos::backend_exception, + "The Kokkos execution space OpenACC is currently not supported !"); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenACC is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_THREADS) + // explicitly providing the Threads execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Threads!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace Threads is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_SERIAL) + // explicitly providing the Serial execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + 
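        // a CPU target platform is available, so constructing the CSVM with the Serial execution space is expected to succeed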
EXPECT_NO_THROW((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Serial!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ params, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace Serial is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif +} + +TEST_F(KokkosCSVM, construct_target_and_execution_space_and_parameter) { // execution_space explicit, target_platform explicit + // create parameter struct + const plssvm::parameter params{}; + + // list all possible execution spaces + std::vector all_execution_spaces{ + plssvm::kokkos::execution_space::cuda, + plssvm::kokkos::execution_space::hip, + plssvm::kokkos::execution_space::sycl, + plssvm::kokkos::execution_space::hpx, + plssvm::kokkos::execution_space::openmp, + plssvm::kokkos::execution_space::openmp_target, + plssvm::kokkos::execution_space::openacc, + plssvm::kokkos::execution_space::threads, + plssvm::kokkos::execution_space::serial + }; + const std::map> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + const auto combination_exists = [&](const plssvm::target_platform target, const plssvm::kokkos::execution_space space) { + return plssvm::detail::contains(available_combinations, target) && plssvm::detail::contains(available_combinations.at(target), space); + }; + const auto execution_space_available = [&](const plssvm::kokkos::execution_space space) { + return plssvm::detail::contains(plssvm::kokkos::list_available_execution_spaces(), space); + }; + +#if defined(PLSSVM_HAS_CPU_TARGET) + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + if (!execution_space_available(space)) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } else if (combination_exists(plssvm::target_platform::cpu, space)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform cpu!", space)); + } + } +#else + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); + } +#endif + +#if defined(PLSSVM_HAS_NVIDIA_TARGET) + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + if (!execution_space_available(space)) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, 
params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } else if (combination_exists(plssvm::target_platform::gpu_nvidia, space)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_nvidia!", space)); + } + } +#else + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); + } +#endif + +#if defined(PLSSVM_HAS_AMD_TARGET) + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + if (!execution_space_available(space)) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } else if (combination_exists(plssvm::target_platform::gpu_amd, space)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_amd!", space)); + } + } +#else + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); + } +#endif + +#if defined(PLSSVM_HAS_INTEL_TARGET) + for (const plssvm::kokkos::execution_space space : all_execution_spaces) { + if (!execution_space_available(space)) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } else if (combination_exists(plssvm::target_platform::gpu_intel, space)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_intel!", space)); + } + } +#else + for (const 
plssvm::kokkos::execution_space space : all_execution_spaces) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, params, plssvm::kokkos_execution_space = space }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); + } +#endif +} + +TEST_F(KokkosCSVM, construct_named_args) { // execution_space automatic, target_platform automatic + // check whether the execution space would be automatically determined as either OpenMPTarget or OpenACC + const std::map> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + plssvm::kokkos::execution_space space{}; + for (const plssvm::target_platform target : plssvm::list_available_target_platforms()) { + if (plssvm::detail::contains(available_combinations, target)) { + space = available_combinations.at(target).front(); + break; + } + } + + // must throw an exception if the execution space would be OpenMPTarget or OpenACC + if (space == plssvm::kokkos::execution_space::openmp_target || space == plssvm::kokkos::execution_space::openacc) { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + fmt::format("The Kokkos execution space {} is currently not supported !", space)); + } else { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + EXPECT_NO_THROW(plssvm::kokkos::csvm{ plssvm::cost = 2.0 }); + } +} + +TEST_F(KokkosCSVM, construct_target_and_named_args) { // execution_space automatic, target_platform explicit + // automatic should always work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::automatic, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + + const std::map> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping(); + const auto target_supported = [&](const plssvm::target_platform target) { + return plssvm::detail::contains(available_combinations, target); + }; + +#if defined(PLSSVM_HAS_CPU_TARGET) + if (target_supported(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform cpu!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_NVIDIA_TARGET) + if (target_supported(plssvm::target_platform::gpu_nvidia)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), 
+ plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_nvidia!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_AMD_TARGET) + if (target_supported(plssvm::target_platform::gpu_amd)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_amd!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif + +#if defined(PLSSVM_HAS_INTEL_TARGET) + if (target_supported(plssvm::target_platform::gpu_intel)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + fmt::format("No Kokkos::ExecutionSpace available ({}) for that requested target platform gpu_intel!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0 }), + plssvm::kokkos::backend_exception, + "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!"); +#endif +} + +TEST_F(KokkosCSVM, construct_execution_space_and_named_args) { // execution_space explicit, target_platform automatic + // automatic should always work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::automatic })); + + const auto target_is_available = [](const plssvm::target_platform target) { + return plssvm::detail::contains(plssvm::list_available_target_platforms(), target); + }; + +#if defined(KOKKOS_ENABLE_CUDA) + // explicitly providing the Cuda execution space should work + if (target_is_available(plssvm::target_platform::gpu_nvidia)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }), + 
plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Cuda!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::cuda }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace Cuda is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_HIP) + // explicitly providing the HIP execution space should work + if (target_is_available(plssvm::target_platform::gpu_nvidia) || target_is_available(plssvm::target_platform::gpu_amd)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HIP!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hip }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace HIP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_SYCL) + // explicitly providing the SYCL execution space should work + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl })); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::sycl }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace SYCL is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_HPX) + // explicitly providing the HPX execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace HPX!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::hpx }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace HPX is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) + // explicitly 
providing the OpenMP execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace OpenMP!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenMP is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + // explicitly providing the OpenMPTarget execution space currently unsupported + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }), + plssvm::kokkos::backend_exception, + "The Kokkos execution space OpenMPTarget is currently not supported !"); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openmp_target }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenMPTarget is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_OPENACC) + // explicitly providing the OpenACC execution space currently unsupported + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }), + plssvm::kokkos::backend_exception, + "The Kokkos execution space OpenACC is currently not supported !"); +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::openacc }), + plssvm::kokkos::backend_exception, + fmt::format("The provided Kokkos::ExecutionSpace OpenACC is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", "))); +#endif + +#if defined(KOKKOS_ENABLE_THREADS) + // explicitly providing the Threads execution space should work + if (target_is_available(plssvm::target_platform::cpu)) { + EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads })); + } else { + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::threads }), + plssvm::kokkos::backend_exception, + "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Threads!"); + } +#else + EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = 
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Threads is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+
+#if defined(KOKKOS_ENABLE_SERIAL)
+    // explicitly providing the Serial execution space should work
+    if (target_is_available(plssvm::target_platform::cpu)) {
+        EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }));
+    } else {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                          plssvm::kokkos::backend_exception,
+                          "Couldn't find a valid target_platform for the Kokkos::ExecutionSpace Serial!");
+    }
+#else
+    EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = plssvm::kokkos::execution_space::serial }),
+                      plssvm::kokkos::backend_exception,
+                      fmt::format("The provided Kokkos::ExecutionSpace Serial is not available, available are: {}!", fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+#endif
+}
+
+TEST_F(KokkosCSVM, construct_target_and_execution_space_and_named_args) {  // execution_space explicit, target_platform explicit
+    // list all possible execution spaces
+    std::vector<plssvm::kokkos::execution_space> all_execution_spaces{
+        plssvm::kokkos::execution_space::cuda,
+        plssvm::kokkos::execution_space::hip,
+        plssvm::kokkos::execution_space::sycl,
+        plssvm::kokkos::execution_space::hpx,
+        plssvm::kokkos::execution_space::openmp,
+        plssvm::kokkos::execution_space::openmp_target,
+        plssvm::kokkos::execution_space::openacc,
+        plssvm::kokkos::execution_space::threads,
+        plssvm::kokkos::execution_space::serial
+    };
+    const std::map<plssvm::target_platform, std::vector<plssvm::kokkos::execution_space>> available_combinations = plssvm::kokkos::detail::available_target_platform_to_execution_space_mapping();
+    const auto combination_exists = [&](const plssvm::target_platform target, const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(available_combinations, target) && plssvm::detail::contains(available_combinations.at(target), space);
+    };
+    const auto execution_space_available = [&](const plssvm::kokkos::execution_space space) {
+        return plssvm::detail::contains(plssvm::kokkos::list_available_execution_spaces(), space);
+    };
+
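+    // for each target platform, every execution space must either be rejected because it is not
+    // available at all, construct successfully if the platform/space combination exists, or be
+    // rejected because it does not support the requested target platform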
+#if defined(PLSSVM_HAS_CPU_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::cpu, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform cpu!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::cpu, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'cpu' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_NVIDIA_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_nvidia, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_nvidia!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_nvidia, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_nvidia' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_AMD_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_amd, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_amd!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_amd, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_amd' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+
+#if defined(PLSSVM_HAS_INTEL_TARGET)
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        if (!execution_space_available(space)) {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} is not available, available are: {}!", space, fmt::join(plssvm::kokkos::list_available_execution_spaces(), ", ")));
+        } else if (combination_exists(plssvm::target_platform::gpu_intel, space)) {
+            EXPECT_NO_THROW((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }));
+        } else {
+            EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                              plssvm::kokkos::backend_exception,
+                              fmt::format("The provided Kokkos::ExecutionSpace {} does not support the requested target platform gpu_intel!", space));
+        }
+    }
+#else
+    for (const plssvm::kokkos::execution_space space : all_execution_spaces) {
+        EXPECT_THROW_WHAT((plssvm::kokkos::csvm{ plssvm::target_platform::gpu_intel, plssvm::kernel_type = plssvm::kernel_function_type::linear, plssvm::cost = 2.0, plssvm::kokkos_execution_space = space }),
+                          plssvm::kokkos::backend_exception,
+                          "Requested target platform 'gpu_intel' that hasn't been enabled using PLSSVM_TARGET_PLATFORMS!");
+    }
+#endif
+}
+
+TEST_F(KokkosCSVM, get_execution_space) {
+    // construct default CSVM
+    const plssvm::kokkos::csvm svm{ plssvm::parameter{} };
+
+    // after construction: get_execution_space must refer to a plssvm::kokkos::execution_space that is not automatic
+    EXPECT_NE(svm.get_execution_space(), plssvm::kokkos::execution_space::automatic);
+}
+
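+// test types used to instantiate the generic, backend-agnostic test suites for the Kokkos backend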
+template <plssvm::kokkos::execution_space space, bool mock_grid_size>
+struct kokkos_csvm_test_type {
+    using mock_csvm_type = mock_kokkos_csvm<mock_grid_size>;
+    using csvm_type = plssvm::kokkos::csvm;
+    using device_ptr_type = typename csvm_type::device_ptr_type;
+    inline static auto additional_arguments = std::make_tuple(std::make_pair(plssvm::kokkos_execution_space, space));
+};
+
+template <plssvm::kokkos::execution_space space>
+using kokkos_csvm_test_type_without_mock = kokkos_csvm_test_type<space, false>;
+
+using kokkos_csvm_test_tuple = util::create_kokkos_test_tuple_t;
+using kokkos_csvm_test_label_type_list = util::cartesian_type_product_t;
+using kokkos_csvm_test_type_list = util::cartesian_type_product_t;
+
+// the tests used in the instantiated GTest test suites
+using kokkos_csvm_test_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_solver_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_kernel_function_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_solver_and_kernel_function_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_label_type_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_label_type_solver_kernel_function_and_classification_type_gtest = util::combine_test_parameters_gtest_t;
+
+// instantiate type-parameterized tests
+// generic CSVM tests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolver, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunction, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMKernelFunctionClassification, kokkos_label_type_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+#if !defined(KOKKOS_ENABLE_CUDA)
+// test case doesn't compile with nvcc when Kokkos::Cuda is enabled due to template instantiation limits
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericCSVMSolverKernelFunctionClassification, kokkos_label_type_solver_kernel_function_and_classification_type_gtest, naming::test_parameter_to_name);
+#endif
+
+// generic CSVM DeathTests
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverDeathTest, kokkos_solver_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMKernelFunctionDeathTest, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericCSVMSolverKernelFunctionDeathTest, kokkos_solver_and_kernel_function_type_gtest, naming::test_parameter_to_name);
+
+// generic GPU CSVM tests - correct grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVM, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVM, GenericGPUCSVMKernelFunction, kokkos_kernel_function_type_gtest, naming::test_parameter_to_name);
+
+// generic GPU CSVM DeathTests - correct grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMDeathTest, GenericGPUCSVMDeathTest, kokkos_csvm_test_type_gtest, naming::test_parameter_to_name);
+
+template <plssvm::kokkos::execution_space space>
+using kokkos_csvm_test_type_with_mock = kokkos_csvm_test_type<space, true>;
+
+using kokkos_mock_csvm_test_tuple = util::create_kokkos_test_tuple_t;
+using kokkos_mock_csvm_test_type_list = util::cartesian_type_product_t;
+
+using kokkos_mock_csvm_test_type_gtest = util::combine_test_parameters_gtest_t;
+using kokkos_mock_kernel_function_type_gtest = util::combine_test_parameters_gtest_t;
+
+// generic GPU CSVM tests - mocked grid sizes
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVM, kokkos_mock_csvm_test_type_gtest, naming::test_parameter_to_name);
+INSTANTIATE_TYPED_TEST_SUITE_P(KokkosCSVMFakedGridSize, GenericGPUCSVMKernelFunction, kokkos_mock_kernel_function_type_gtest, naming::test_parameter_to_name);
diff --git a/tests/backends/Kokkos/mock_kokkos_csvm.hpp b/tests/backends/Kokkos/mock_kokkos_csvm.hpp
new file mode 100644
index 000000000..6fb35cd9c
--- /dev/null
+++ b/tests/backends/Kokkos/mock_kokkos_csvm.hpp
@@ -0,0 +1,85 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief MOCK class for the C-SVM class using the Kokkos backend.
+ */
+
+#ifndef PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
+#define PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
+#pragma once
+
+#include "plssvm/backends/execution_range.hpp"  // plssvm::detail::dim_type
+#include "plssvm/backends/Kokkos/csvm.hpp"      // plssvm::kokkos::csvm
+
+#include "gmock/gmock.h"  // MOCK_METHOD, ON_CALL, ::testing::Return
+
+#include <cstddef>  // std::size_t
+#include <utility>  // std::forward
+
+/**
+ * @brief GTest mock class for the Kokkos CSVM.
+ * @tparam mock_grid_size `true` if the `plssvm::kokkos::csvm::get_max_grid_size()` function should be mocked, otherwise `false`
+ */
+template <bool mock_grid_size>
+class mock_kokkos_csvm final : public plssvm::kokkos::csvm {
+    using base_type = plssvm::kokkos::csvm;
+
+  public:
+    using base_type::device_ptr_type;
+
+    template <typename... Args>
+    explicit mock_kokkos_csvm(Args &&...args) :
+        base_type{ std::forward<Args>(args)... } {
+        this->fake_functions();
+    }
+
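+    // gmock'ed grid-size query; fake_functions() installs its default behavior below,
+    // either returning hardcoded grid sizes or delegating to the real implementation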
+    MOCK_METHOD((plssvm::detail::dim_type), get_max_grid_size, (const std::size_t), (const, override));
+
+    // make protected member functions public
+    using base_type::assemble_kernel_matrix;
+    using base_type::blas_level_3;
+    using base_type::get_device_memory;
+    using base_type::get_max_work_group_size;
+    using base_type::num_available_devices;
+
+    using base_type::predict_values;
+
+    using base_type::conjugate_gradients;
+    using base_type::perform_dimensional_reduction;
+    using base_type::run_assemble_kernel_matrix_implicit_blas_level_3;
+    using base_type::run_blas_level_3;
+    using base_type::solve_lssvm_system_of_linear_equations;
+
+    using base_type::get_max_mem_alloc_size;
+
+    using base_type::run_assemble_kernel_matrix_explicit;
+    using base_type::run_blas_level_3_kernel_explicit;
+    using base_type::run_inplace_matrix_addition;
+    using base_type::run_inplace_matrix_scale;
+    using base_type::run_predict_kernel;
+    using base_type::run_w_kernel;
+
+    using base_type::data_distribution_;
+    using base_type::devices_;
+
+  private:
+    /**
+     * @brief Fake the plssvm::kokkos::csvm::get_max_grid_size() function if requested.
+     */
+    void fake_functions() const {
+        if constexpr (mock_grid_size) {
+            // mock the function using hardcoded maximum grid sizes
+            ON_CALL(*this, get_max_grid_size).WillByDefault(::testing::Return(plssvm::detail::dim_type{ std::size_t{ 4 }, std::size_t{ 4 }, std::size_t{ 4 } }));
+        } else {
+            // use the real implementation otherwise
+            ON_CALL(*this, get_max_grid_size).WillByDefault([this](const std::size_t device_id) { return base_type::get_max_grid_size(device_id); });
+        }
+    }
+};
+
+#endif  // PLSSVM_TESTS_BACKENDS_KOKKOS_MOCK_KOKKOS_CSVM_HPP_
diff --git a/tests/backends/Kokkos/utility.hpp b/tests/backends/Kokkos/utility.hpp
new file mode 100644
index 000000000..3c3458198
--- /dev/null
+++ b/tests/backends/Kokkos/utility.hpp
@@ -0,0 +1,95 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Determine the execution spaces available for tests with the Kokkos backend.
+ */
+
+#ifndef PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
+#define PLSSVM_TESTS_BACKENDS_KOKKOS_UTILITY_HPP_
+#pragma once
+
+namespace util {
+
+/**
+ * @brief Determine which execution spaces can be tested based on the available Kokkos::ExecutionSpaces and PLSSVM target platforms.
+ * @return the available execution spaces for testing (`[[nodiscard]]`)
+ */
+[[nodiscard]] constexpr auto constexpr_available_execution_spaces_to_test() {
+    return std::array{
+#if defined(KOKKOS_ENABLE_CUDA) && defined(PLSSVM_HAS_NVIDIA_TARGET)  // for Kokkos::Cuda, an NVIDIA target must be available
+        plssvm::kokkos::execution_space::cuda,
+#endif
+#if defined(KOKKOS_ENABLE_HIP) && (defined(PLSSVM_HAS_NVIDIA_TARGET) || defined(PLSSVM_HAS_AMD_TARGET))  // for Kokkos::HIP, an NVIDIA or AMD target must be available
+        plssvm::kokkos::execution_space::hip,
+#endif
+#if defined(KOKKOS_ENABLE_SYCL)  // for Kokkos::SYCL, any target is ok
+        plssvm::kokkos::execution_space::sycl,
+#endif
+#if defined(KOKKOS_ENABLE_HPX) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Experimental::HPX, a CPU target must be available
+        plssvm::kokkos::execution_space::hpx,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMP) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::OpenMP, a CPU target must be available
+        plssvm::kokkos::execution_space::openmp,
+#endif
+#if defined(KOKKOS_ENABLE_OPENMPTARGET)  // for Kokkos::Experimental::OpenMPTarget, any target is ok  // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openmp_target,
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // for Kokkos::Experimental::OpenACC, any target is ok  // TODO: implement correctly based on allowed target platforms
+        plssvm::kokkos::execution_space::openacc,
+#endif
+#if defined(KOKKOS_ENABLE_THREADS) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Threads, a CPU target must be available
+        plssvm::kokkos::execution_space::threads,
+#endif
+#if defined(KOKKOS_ENABLE_SERIAL) && defined(PLSSVM_HAS_CPU_TARGET)  // for Kokkos::Serial, a CPU target must be available
+        plssvm::kokkos::execution_space::serial,
+#endif
+    };
+}
+
+/**
+ * @brief Uninstantiated base type to create a `std::tuple` containing all available `kokkos_csvm_test_type` types.
+ */
+template