diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 0e03341ec02..64fb643377c 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1134,6 +1134,75 @@ struct __early_exit_find_or
 // parallel_find_or - sync pattern
 //------------------------------------------------------------------------
 
+template <typename _BackendTag>
+struct __parallel_find_or_nd_range_tuner
+{
+    // Tune the amount of work-groups and work-group size
+    template <typename _ExecutionPolicy>
+    std::tuple<std::size_t, std::size_t>
+    operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const
+    {
+        // TODO: find a way to generalize getting of reliable work-group size
+        // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
+        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
+        const std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
+        std::size_t __n_groups = 1;
+        // If there are no more than 32 data elements per work-item, a single work-group will be used
+        if (__rng_n > __wgroup_size * 32)
+        {
+            // Compute the number of groups and limit it by the number of compute units
+            __n_groups = std::min<std::size_t>(oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size),
+                                               oneapi::dpl::__internal::__max_compute_units(__exec));
+        }
+
+        return {__n_groups, __wgroup_size};
+    }
+};
+
+// No tuning for FPGA_EMU because we are not going to tune the performance of FPGA emulation here.
+#if !_ONEDPL_FPGA_EMU
+template <>
+struct __parallel_find_or_nd_range_tuner<oneapi::dpl::__internal::__device_backend_tag>
+{
+    // Tune the amount of work-groups and work-group size
+    template <typename _ExecutionPolicy>
+    std::tuple<std::size_t, std::size_t>
+    operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const
+    {
+        // Call the common tuning function to get the work-group size
+        auto [__n_groups, __wgroup_size] =
+            __parallel_find_or_nd_range_tuner<oneapi::dpl::__internal::__fpga_backend_tag>{}(__exec, __rng_n);
+
+        if (__n_groups > 1)
+        {
+            auto __iters_per_work_item =
+                oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size);
+
+            // If our work capacity is not enough to process all data in one iteration, tune the number of work-groups
+            if (__iters_per_work_item > 1)
+            {
+                // Empirically found formula for GPU devices.
+                // TODO: need to re-evaluate this formula.
+                const float __rng_x = (float)__rng_n / 4096.f;
+                const float __desired_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.f);
+
+                if (__iters_per_work_item < __desired_iters_per_work_item)
+                {
+                    // Multiply the work per item by a power of 2 to reach the desired number of iterations.
+                    // __dpl_bit_ceil rounds the ratio up to the next power of 2.
+                    const std::size_t __k = oneapi::dpl::__internal::__dpl_bit_ceil(
+                        (std::size_t)std::ceil(__desired_iters_per_work_item / __iters_per_work_item));
+                    // Proportionally reduce the number of work-groups.
+                    __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(
+                        __rng_n, __wgroup_size * __iters_per_work_item * __k);
+                }
+            }
+        }
+
+        return {__n_groups, __wgroup_size};
+    }
+};
+#endif // !_ONEDPL_FPGA_EMU
 
 // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
 template <typename _ExecutionPolicy, typename _Brick, typename _BrickTag, typename... _Ranges>
@@ -1274,23 +1343,11 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
     auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...);
     assert(__rng_n > 0);
 
-    // TODO: find a way to generalize getting of reliable work-group size
-    // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
-    // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
-    std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);
-
-    const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
-    auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size);
-    __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu));
-
-    // Pass all small data into single WG implementation
-    constexpr std::size_t __max_iters_per_work_item = 32;
-    if (__rng_n <= __wgroup_size * __max_iters_per_work_item)
-    {
-        __n_groups = 1;
-    }
+    // Evaluate the amount of work-groups and work-group size
+    const auto [__n_groups, __wgroup_size] =
+        __parallel_find_or_nd_range_tuner<oneapi::dpl::__internal::__device_backend_tag>{}(__exec, __rng_n);
 
-    _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu);
+    _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size);
 
     using _AtomicType = typename _BrickTag::_AtomicType;
     const _AtomicType __init_value = _BrickTag::__init_value(__rng_n);
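Taken together, the two tuners above reduce to a few lines of integer arithmetic: the generic tuner keeps everything in one work-group until there are more than 32 elements per work-item, then clamps the group count to the number of compute units; the GPU specialization additionally grows the per-item iteration count toward sqrt(__rng_n / 4096) in power-of-2 steps and shrinks the group count to match. Below is a minimal standalone sketch of that arithmetic, not oneDPL code: the device queries `__max_work_group_size` and `__max_compute_units` are replaced by hypothetical constants, and `std::bit_ceil` plus a hand-rolled `ceiling_div` stand in for `__dpl_bit_ceil` and `__dpl_ceiling_div`.

```cpp
// Standalone sketch of the nd-range tuning arithmetic from the hunk above.
// Device limits are hard-coded placeholders; real code queries the device.
// Build with: c++ -std=c++20 tune.cpp
#include <algorithm>
#include <bit>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <utility>

// Placeholder device limits (assumed values, not queried from a real device).
constexpr std::size_t max_work_group_size = 4096; // clamped as in the patch
constexpr std::size_t max_compute_units = 64;

constexpr std::size_t ceiling_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// Stage 1, the generic tuner: one work-group for small inputs, otherwise
// as many groups as needed, clamped to the number of compute units.
std::pair<std::size_t, std::size_t> tune_generic(std::size_t n)
{
    const std::size_t wg_size = max_work_group_size;
    std::size_t n_groups = 1;
    if (n > wg_size * 32) // up to 32 elements per work-item stay in one group
        n_groups = std::min(ceiling_div(n, wg_size), max_compute_units);
    return {n_groups, wg_size};
}

// Stage 2, the GPU refinement: grow the per-item iteration count toward
// sqrt(n / 4096) in power-of-2 steps, shrinking the group count to match.
std::pair<std::size_t, std::size_t> tune_gpu(std::size_t n)
{
    auto [n_groups, wg_size] = tune_generic(n);
    if (n_groups > 1)
    {
        const std::size_t iters = ceiling_div(n, n_groups * wg_size);
        if (iters > 1)
        {
            const float desired = std::max(std::sqrt((float)n / 4096.f), 1.f);
            if ((float)iters < desired)
            {
                const std::size_t k = std::bit_ceil((std::size_t)std::ceil(desired / (float)iters));
                n_groups = ceiling_div(n, wg_size * iters * k);
            }
        }
    }
    return {n_groups, wg_size};
}

int main()
{
    for (std::size_t n : {std::size_t{1} << 17, std::size_t{1} << 22, std::size_t{1} << 27})
    {
        auto [groups, wg] = tune_gpu(n);
        std::printf("n=%zu -> %zu group(s) x %zu work-items, ~%zu iter(s)/item\n",
                    n, groups, wg, ceiling_div(n, groups * wg));
    }
}
```

Worked through for n = 2^22 under these placeholder limits: the generic stage yields min(1024, 64) = 64 groups at 16 iterations per item; the desired count is sqrt(2^22 / 4096) = 32, so k = bit_ceil(32 / 16) = 2 and the refinement halves the group count to 32, giving each work-item the desired 32 iterations.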
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index de7e8fa1796..4606709bd45 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -298,7 +298,7 @@ class __kernel_compiler
 template <typename _Policy>
 inline void
 // Passing policy by value should be enough for debugging
-__print_device_debug_info(_Policy __policy, size_t __wg_size = 0, size_t __max_cu = 0)
+__print_device_debug_info(const _Policy& __policy, size_t __wg_size = 0, size_t __max_cu = 0)
 {
     ::std::cout << "Device info" << ::std::endl;
     ::std::cout << " > device name: " << oneapi::dpl::__internal::__device_info(__policy) << ::std::endl;
@@ -309,7 +309,8 @@
 }
 #else
 template <typename _Policy>
-inline void __print_device_debug_info(_Policy, size_t = 0, size_t = 0)
+inline void
+__print_device_debug_info(const _Policy& __policy, size_t = 0, size_t = 0)
 {
 }
 #endif
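The parallel_backend_sycl_utils.h hunks only change how the policy reaches the debug printer: by const reference instead of by value, in both the real overload and the empty stub. As a rough illustration of the pattern, and not the library's code, a printer along these lines can be written against plain SYCL 2020; it assumes a policy-like object exposing a `queue()` accessor, as oneDPL device policies do, and all names here are illustrative.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <iostream>

// Sketch of a device-info printer in the spirit of __print_device_debug_info.
// Taking the policy by const reference (as the patch above now does) avoids
// copying the policy object just to read device properties from its queue.
template <typename Policy>
void
print_device_debug_info(const Policy& policy, std::size_t wg_size = 0, std::size_t max_cu = 0)
{
    const sycl::device device = policy.queue().get_device();
    std::cout << "Device info" << std::endl;
    std::cout << " > device name: " << device.get_info<sycl::info::device::name>() << std::endl;
    if (wg_size > 0)
        std::cout << " > work-group size: " << wg_size << std::endl;
    if (max_cu > 0)
        std::cout << " > max compute units: " << max_cu << std::endl;
}
```

In non-debug builds the stub overload turns the call into a no-op, which is why the patch keeps both signatures in sync when switching to a const reference.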