From 6c112719372f58dcb20df9f0b10f3ebaeebd5ce6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 27 Sep 2024 16:24:34 -0500 Subject: [PATCH 01/31] Initial commit of reduce_by_segment with the reduce-then-scan path Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 6 +- .../hetero/algorithm_ranges_impl_hetero.h | 7 ++ .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 60 +++++++++++--- .../parallel_backend_sycl_reduce_then_scan.h | 81 ++++++++++--------- 4 files changed, 103 insertions(+), 51 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index d0979554786..a22f2970b49 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,18 +191,19 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator, typename _KnownIdentity> oneapi::dpl::__internal::__difference_t<_Range3> __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::false_type /* has_known_identity */) + _KnownIdentity) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } +#if 0 template oneapi::dpl::__internal::__difference_t<_Range3> @@ -573,6 +574,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy return __end_idx.get_host_access()[0] + 1; } +#endif template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index b9cd154a044..f845c209cfe 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,6 +913,12 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op) + .wait(); + return 1; + #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
// @@ -1043,6 +1049,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& .__deferrable_wait(); return __result_end; + #endif } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 77b2e6c35f9..3141374ebcd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -773,9 +773,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -788,9 +788,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -804,9 +804,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(_InRng&& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -1012,9 +1012,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); } } @@ -1186,6 +1186,48 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t } } +template +auto +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + auto __n = __keys.size(); + auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + using _ValueType = oneapi::dpl::__internal::__value_t; + if (__idx == 0) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + }; + auto __reduce_op = [=](const auto& __lhs_tup, 
const auto& __rhs_tup) { + if (std::get<0>(__rhs_tup) == 0) + { + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + } + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), + std::get<1>(__rhs_tup)); + }; + auto __gen_scan_input = __gen_reduce_input; + auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; + auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { + // Will be present in L1 cache + if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + { + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + }; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); +} + template auto __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8c0762f2a38..d5b44d1abcc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename... _Rngs> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, + _LazyValueType& __sub_group_carry, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups) + std::uint32_t __active_subgroups, _Rngs&&... 
__rngs) { - using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; + using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? 
__start_id : __n - 1; - _GenInputType __v = __gen_input(__in_rng, __local_id); + _GenInputType __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } } else { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __local_id, __v); + __write_op(__local_id, __v, __rngs...); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__out_rng, __offset, __v); + __write_op(__offset, __v, __rngs...); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... 
__rngs) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... __rngs) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - __write_op.__assign(__in_rng[0], __out_rng[0]); + //__write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,15 +746,16 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename _IsUniquePattern, typename... 
_Rngs> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, + _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern) + _Inclusive, _IsUniquePattern, + std::size_t __n, _Rngs&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -780,7 +781,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - const std::size_t __n = __in_rng.size(); + //const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -848,11 +849,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From b7328e8e063cc6b3d4ec3804554289ca27a23e9a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 14 Oct 2024 13:36:40 -0700 Subject: [PATCH 02/31] Revert change to ranges and use zip_view over segments / values instead Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 47 ++++++----- .../parallel_backend_sycl_reduce_then_scan.h | 81 +++++++++---------- 2 files changed, 67 insertions(+), 61 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3141374ebcd..148c2565ba8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -773,9 +773,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -788,9 +788,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const + operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -804,9 +804,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -937,9 +937,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
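For orientation, the callback convention these hunks restore passes the ranges first and the flat element index after them: a gen-input callable is invoked as (in_rng, id) and a write callable as (in_rng, out_rng, id, value). A minimal sketch of a conforming pair, with hypothetical functor names that are not part of oneDPL:

#include <cstddef>

// Produces the scan input for element `id`; mirrors the (in_rng, id)
// signature restored above.
struct gen_input_sketch
{
    template <typename InRng>
    auto
    operator()(const InRng& in, std::size_t id) const
    {
        return in[id];
    }
};

// Writes the scanned value for element `id`; mirrors the
// (in_rng, out_rng, id, value) signature restored in these hunks.
struct write_out_sketch
{
    template <typename InRng, typename OutRng, typename ValueType>
    void
    operator()(const InRng&, OutRng& out, std::size_t id, const ValueType& v) const
    {
        out[id] = v;
    }
};
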
@@ -955,9 +955,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, @@ -1012,9 +1012,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); } } @@ -1185,7 +1185,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1194,7 +1193,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { auto __n = __keys.size(); - auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; if (__idx == 0) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); @@ -1213,8 +1214,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { - // Will be present in L1 cache + auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __out_keys = std::get<0>(__out_rng.tuple()); + auto&& __out_vals = std::get<1>(__out_rng.tuple()); + // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; @@ -1223,9 +1227,12 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + 
oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} + ); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d5b44d1abcc..09c0d754bd7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename _InRng, typename _OutRng> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, + _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups, _Rngs&&... __rngs) + std::uint32_t __active_subgroups) { - using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; + using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. 
for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? __start_id : __n - 1; - _GenInputType __v = __gen_input(__local_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } } else { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__local_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __local_id, __v); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__offset, __v, __rngs...); + __write_op(__in_rng, __out_rng, __offset, __v); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... 
__rngs) const + const std::size_t __block_num) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... __rngs) const + const std::size_t __block_num) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - //__write_op.__assign(__in_rng[0], __out_rng[0]); + __write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,16 +746,15 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename 
_IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _GenReduceInput __gen_reduce_input, + _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern, - std::size_t __n, _Rngs&&... __rngs) + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -781,7 +780,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - //const std::size_t __n = __in_rng.size(); + const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -849,11 +848,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From f017ffd753248c9cfe5b054bd2a0bd66eda4ec22 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 06:42:29 -0700 Subject: [PATCH 03/31] Implement correct return for reduce_by_segment Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 13 ++++++++----- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 ++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index f845c209cfe..7558e29b4d5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,11 +913,14 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op) - .wait(); - return 1; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + // TODO: this needs to be enabled if reduce then scan cannot be satisfied. #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
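For reference, the returned value can be modeled serially: the scan carries a (write index, partial value) pair per element, and the final carry's first component is the write index of the last segment. A sketch assuming a non-empty input and an equality-style predicate; count_segments_model is a hypothetical helper, not part of oneDPL:

#include <cstddef>
#include <vector>

template <typename Key, typename Val, typename Pred, typename Op>
std::size_t
count_segments_model(const std::vector<Key>& keys, const std::vector<Val>& vals, Pred pred, Op op)
{
    std::size_t seg = 0; // first tuple component: write index of the current segment
    Val acc = vals[0];   // second tuple component: the running segment reduction
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        if (pred(keys[i - 1], keys[i]))
            acc = op(acc, vals[i]); // same segment: fold the value in
        else
        {
            ++seg;                  // new segment: advance the write index
            acc = vals[i];
        }
    }
    // `seg` is the zero-based write index of the last segment, so the number
    // of segments is `seg + 1` -- the `+ 1` returned above.
    return seg + 1;
}
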
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 148c2565ba8..8d782dbc38c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1197,11 +1197,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0) + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) @@ -1231,8 +1229,8 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} - ); + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From 5cfa661b0505e7dc21433066845392c9604dfdc1 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 07:29:49 -0700 Subject: [PATCH 04/31] Add support for flag predicates Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8d782dbc38c..f26c4e097b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1185,6 +1185,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } + template auto @@ -1196,19 +1197,20 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, 
_ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1219,17 +1221,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; + using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From 4f8059afba9fd24da298095d79fa7aaa07c0952e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:16:44 -0700 Subject: [PATCH 05/31] Revert "Add support for flag predicates" This reverts commit 0e0d50e8eea85685d46ccbb32b74f1211bfbab20. 
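The reverted commit threaded each segment's key through the scan as a third tuple component. A serial model of that combine step follows (combine_model is a hypothetical name, not part of oneDPL); carrying the key widens the scanned state for every element, which the follow-up commit avoids by reading keys directly at write-out:

#include <tuple>

template <typename Count, typename Val, typename Key, typename Op>
std::tuple<Count, Val, Key>
combine_model(const std::tuple<Count, Val, Key>& lhs, const std::tuple<Count, Val, Key>& rhs, Op op)
{
    // rhs does not start a new segment: extend lhs's reduction, keep lhs's key
    if (std::get<0>(rhs) == 0)
        return {std::get<0>(lhs), op(std::get<1>(lhs), std::get<1>(rhs)), std::get<2>(lhs)};
    // rhs starts at least one new segment: its value and key take over
    return {std::get<0>(lhs) + std::get<0>(rhs), std::get<1>(rhs), std::get<2>(rhs)};
}
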
--- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f26c4e097b0..8d782dbc38c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1185,7 +1185,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1197,20 +1196,19 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); + std::get<1>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1221,18 +1219,17 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; - using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From d56bc5a11eb27356ab99191c75a182c523b284a8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:37:53 -0700 Subject: [PATCH 06/31] Re-implement support for flag predicates in a more performant manner Signed-off-by: Matthew Michel --- 
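The idea, modeled serially below (reduce_by_segment_model is a hypothetical helper, not part of oneDPL): a segment is identified by its first key, even for non-symmetric "flag" predicates, so it suffices to write the 0th key once and, at each segment end, the current segment's value together with the next segment's key. Only forward reads of the key range are needed, and the scanned tuple stays a (write index, value) pair:

#include <cstddef>
#include <vector>

template <typename Key, typename Val, typename Pred, typename Op>
std::size_t
reduce_by_segment_model(const std::vector<Key>& keys, const std::vector<Val>& vals,
                        std::vector<Key>& out_keys, std::vector<Val>& out_vals, Pred pred, Op op)
{
    // Assumes a non-empty input and outputs pre-sized to the segment count.
    const std::size_t n = keys.size();
    std::size_t seg = 0;   // write index of the segment currently being reduced
    Val acc = vals[0];
    out_keys[0] = keys[0]; // the first key always opens segment 0
    for (std::size_t i = 1; i < n; ++i)
    {
        if (pred(keys[i - 1], keys[i])) // same segment: keep reducing
            acc = op(acc, vals[i]);
        else
        {
            out_vals[seg] = acc;         // close the current segment...
            out_keys[seg + 1] = keys[i]; // ...and open the next one with its key
            acc = vals[i];
            ++seg;
        }
    }
    out_vals[seg] = acc; // close the final segment
    return seg + 1;      // number of segments written
}
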
.../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8d782dbc38c..ebb86585133 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1197,7 +1197,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; @@ -1216,10 +1216,16 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // Assuming this will be present in L1 cache - if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + // TODO: substantial improvement expected with special handling in kernel + // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. + if (__idx == 0) + __out_keys[0] = __in_keys[0]; + if (__idx == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; From 543e82f60ffffd3209f4c2754ac10c857433ad93 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 09:51:35 -0700 Subject: [PATCH 07/31] Add fallback and remove old SYCL implementation Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 388 +----------------- .../hetero/algorithm_ranges_impl_hetero.h | 26 +- 2 files changed, 23 insertions(+), 391 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index a22f2970b49..23259e5e550 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,391 +191,17 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator> oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - _KnownIdentity) +__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), 
::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } -#if 0 -template -oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::true_type /* has_known_identity */) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceWgPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - - const ::std::size_t __n = __keys.size(); - - constexpr ::std::uint16_t __vals_per_item = - 16; // Each work item serially processes 16 items. Best observed performance on gpu - - // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. - // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); - - // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use - // of SLM. 
- __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); - -#if _ONEDPL_COMPILE_KERNEL - auto __seg_reduce_count_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); - auto __seg_reduce_offset_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); - auto __seg_reduce_wg_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); - auto __seg_reduce_prefix_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); - __wgroup_size = - ::std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); -#endif - - ::std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); - - // intermediate reductions within a workgroup - auto __partials = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); - - auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); - - // the number of segment ends found in each work group - auto __seg_ends = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // buffer that stores an exclusive scan of the results - auto __seg_ends_scanned = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // 1. Count the segment ends in each workgroup - auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceCountKernel>( - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_count_kernel, -#endif - sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - // 1a. Work item scan to identify segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - // 1b. Work group reduction - ::std::size_t __num_segs = __dpl_sycl::__reduce_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 1c. 
First work item writes segment count to global memory - if (__local_id == 0) - __seg_ends_acc[__group_id] = __num_segs; - }); - }); - - // 1.5 Small single-group kernel - auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__seg_end_identification); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceOffsetKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_offset_kernel, -#endif - sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); - auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); - __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, - __diff_type(0), sycl::plus<__diff_type>()); - }); - }); - - // 2. Work group reduction - auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__single_group_scan); - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_wg_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - ::std::array<__val_type, __vals_per_item> __loc_partials; - - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - // 2a. Lookup the number of prior segs - auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 2b. Perform a serial scan within the work item over assigned elements. Store partial - // reductions in work group local memory. - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - - ::std::size_t __max_end = 0; - ::std::size_t __item_segments = 0; - auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - __val_type __accumulator = __identity; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - __accumulator = __binary_op(__accumulator, __values[__i]); - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - __loc_partials[__i - __start] = __accumulator; - ++__item_segments; - __max_end = __local_id; - __accumulator = __identity; - } - } - - // 2c. Count the number of prior work segments cooperatively over group - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - ::std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; - - // 2d. 
Find the greatest segment end less than the current index (inclusive) - ::std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); - - // __wg_segmented_scan is a derivative work and responsible for the third header copyright - __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( - __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, - __binary_op, __wgroup_size); - - // 2e. Update local partial reductions in first segment and write to global memory. - bool __apply_aggs = true; - ::std::size_t __item_offset = 0; - - // first item in group does not have any work-group aggregates to apply - if (__local_id == 0) - { - __apply_aggs = false; - if (__global_id == 0 && __n > 0) - { - // first segment identifier is always the first key - __out_keys[0] = __keys[0]; - } - } - - // apply the aggregates and copy the locally stored values to destination buffer - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __start_idx + __item_offset; - if (__apply_aggs) - { - __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); - __apply_aggs = false; - } - else - { - __out_values[__idx] = __loc_partials[__i - __start]; - } - if (__i != __n - 1) - { - __out_keys[__idx + 1] = __keys[__i + 1]; - } - ++__item_offset; - } - } - - // 2f. Output the work group aggregate and total number of segments for use in phase 3. - if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out - { - // If no segment ends in the item, the aggregates from previous work groups must be applied. - if (__max_end == 0) - { - // needs to be inclusive with last element - __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); - } - else - { - __partials_acc[__group_id] = __accumulator; - } - } - }); - }); - - // 3. 
Apply inter work-group aggregates - __exec.queue() - .submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __end_idx_acc = __end_idx.template get_access(__cgh); - - __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); - __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); - - __cgh.depends_on(__wg_reduce); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReducePrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_prefix_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::int64_t __group_id = __item.get_group(0); - ::std::size_t __global_id = __item.get_global_id(0); - ::std::size_t __local_id = __item.get_local_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - ::std::int64_t __wg_agg_idx = __group_id - 1; - __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - bool __ag_exists = false; - // 3a. Check to see if an aggregate exists and compute that value in the first - // work item. - if (__group_id != 0) - { - __ag_exists = __start < __n; - // local reductions followed by a sweep - constexpr ::std::int32_t __vals_to_explore = 16; - bool __last_it = false; - __loc_seg_ends_acc[__local_id] = false; - __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - for (::std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; - __i -= __wgroup_size * __vals_to_explore) - { - __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - // exploration phase - for (::std::int32_t __j = __i; - __j > __dpl_sycl::__maximum<::std::int32_t>{}(-1L, __i - __vals_to_explore); --__j) - { - __local_collector = __binary_op(__partials_acc[__j], __local_collector); - if (__seg_ends_acc[__j] || __j == 0) - { - __loc_seg_ends_acc[__local_id] = true; - break; - } - } - __loc_partials_acc[__local_id] = __local_collector; - __dpl_sycl::__group_barrier(__item); - // serial aggregate collection and synchronization - if (__local_id == 0) - { - for (::std::size_t __j = 0; __j < __wgroup_size; ++__j) - { - __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); - if (__loc_seg_ends_acc[__j]) - { - __last_it = true; - break; - } - } - } - __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); - __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); - } - - // Check to see if aggregates exist. - // The last group must always stay to write the final index - __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); - if (!__ag_exists && __group_id != __n_groups - 1) - return; - } - // 3b. 
count the segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 3c. Determine prior index - ::std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 3d. Second pass over the keys, reidentifying end segments and applying work group - // aggregates if appropriate. Both the key and reduction value are written to the final output at the - // computed index - ::std::size_t __item_offset = 0; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; - - // apply the aggregate if it is the first segment end in the workgroup only - if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) - __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); - - ++__item_offset; - // the last item must write the last index's position to return - if (__i == __n - 1) - __end_idx_acc[0] = __idx; - } - } - }); - }) - .wait(); - - return __end_idx.get_host_access()[0] + 1; -} -#endif - template ::std::pair @@ -615,9 +241,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol typename ::std::iterator_traits::value_type>::type; // number of unique keys - _CountType __n = __sycl_reduce_by_segment( - __tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op, has_known_identity{}); + _CountType __n = + __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), + key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); return ::std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7558e29b4d5..72782105d93 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,15 +913,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - // TODO: this needs to be enabled if reduce then scan cannot be satisfied. - #if 0 +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
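For context on what both the reduce-then-scan path and the fallback must produce, here is a usage sketch of the public entry point. It assumes the extension is reachable through <oneapi/dpl/execution> and <oneapi/dpl/algorithm>, as in current oneDPL releases; it is not part of this patch.

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <vector>

int main()
{
    std::vector<int> keys = {1, 1, 2, 2, 2, 3};
    std::vector<int> vals = {1, 2, 3, 4, 5, 6};
    std::vector<int> out_keys(keys.size()), out_vals(vals.size());

    // Default predicate is equality of adjacent keys, default reduction is plus.
    auto ends = oneapi::dpl::reduce_by_segment(oneapi::dpl::execution::dpcpp_default,
                                               keys.begin(), keys.end(), vals.begin(),
                                               out_keys.begin(), out_vals.begin());
    // out_keys = {1, 2, 3}, out_vals = {3, 12, 6}; ends points past the last segment.
    (void)ends;
}

This also explains the "+ 1" in the branch below: the scan's first tuple component is the index of the last write, so adding one yields the number of segments, i.e. the past-the-end position.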
+ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } +#endif // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. // @@ -1052,7 +1059,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& .__deferrable_wait(); return __result_end; - #endif } } // namespace __ranges From 7c92238a6994c86daa1e024d47743f34e2b2d3ac Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 11:42:03 -0700 Subject: [PATCH 08/31] Switch from using lambdas to functors Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 131 +++++++++++------- 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ebb86585133..ccd891b954b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -786,6 +786,68 @@ struct __gen_transform_input _UnaryOp __unary_op; }; +template +struct __gen_red_by_seg_input +{ + template + auto + operator()(const _InRng& __in_rng, std::size_t __id) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _ValueType = oneapi::dpl::__internal::__value_t; + if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]}); + } + _BinaryPred __binary_pred; +}; + +template +struct __red_by_seg_op +{ + template + auto + operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const + { + if (std::get<0>(__rhs_tup) == 0) + { + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + } + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), + std::get<1>(__rhs_tup)); + } + _BinaryOp __binary_op; +}; + +template +struct __write_red_by_seg +{ + template + void + operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __out_keys = std::get<0>(__out_rng.tuple()); + auto&& __out_values = std::get<1>(__out_rng.tuple()); + // TODO: substantial improvement expected with special handling in kernel + // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. 
+ if (__id == 0) + __out_keys[0] = __in_keys[0]; + if (__id == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + { + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + } + _BinaryPred __binary_pred; + std::size_t __n; +}; + struct __simple_write_to_id { template @@ -1185,58 +1247,29 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } -template + +template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - auto __n = __keys.size(); - auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - }; - auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { - if (std::get<0>(__rhs_tup) == 0) - { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); - } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); - }; - auto __gen_scan_input = __gen_reduce_input; - auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. 
- if (__idx == 0) - __out_keys[0] = __in_keys[0]; - if (__idx == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) - { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - } - }; +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + using _GenScanInput = _GenReduceInput; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + std::size_t __n = __keys.size(); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, - /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _WriteOp{__binary_pred, __n}, + oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From 6c4267be04faaa120e5ca5d6aadae5778337ef31 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:04:35 -0700 Subject: [PATCH 09/31] Add device copyable specializations for red-by-seg functors and update testing Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 29 ++++++++++++++++++ .../device_copyable.pass.cpp | 30 +++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ccd891b954b..5c22ff49dc9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -868,7 +868,7 @@ struct __gen_mask { template bool - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(_InRng&& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 2d0e88fd34b..40713cf8621 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -236,6 +236,9 @@ namespace oneapi::dpl::__par_backend_hetero template struct __gen_transform_input; +template +struct __gen_red_by_seg_input; + template struct __gen_mask; 
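These forward declarations exist only so the is_device_copyable specializations further down can name the functors. The pattern, in standalone form (my_functor and _Pred here are placeholders, not oneDPL types; assumes a SYCL 2020 implementation):

#include <sycl/sycl.hpp>
#include <functional>

template <typename _Pred>
struct my_functor
{
    _Pred __pred; // the only captured state
};

// Propagate device copyability from the member to the wrapper, exactly as
// the specializations in this patch do for the red-by-seg functors.
template <typename _Pred>
struct sycl::is_device_copyable<my_functor<_Pred>> : sycl::is_device_copyable<_Pred>
{
};

static_assert(sycl::is_device_copyable_v<my_functor<std::less<int>>>);

Named functors make these specializations possible at all; the lambdas they replace could not be forward declared or specialized on.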
@@ -254,12 +257,18 @@ struct __write_to_id_if; template struct __write_to_id_if_else; +template +struct __write_red_by_seg; + template struct __early_exit_find_or; template struct __leaf_sorter; +template +struct __red_by_seg_op; + } // namespace oneapi::dpl::__par_backend_hetero template @@ -276,6 +285,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, _BinaryPredicate)> @@ -309,6 +325,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_red_by_seg, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> @@ -323,6 +346,12 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__red_by_seg_op, _BinaryOp)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOp> +{ +}; + namespace oneapi::dpl::unseq_backend { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 9b1b09e8c2e..ee507860633 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -155,6 +155,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); + //__gen_red_by_seg_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is not device copyable with device copyable types"); + //__gen_mask static_assert(sycl::is_device_copyable_v>, "__gen_mask is not device copyable with device copyable types"); @@ -184,6 +189,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__write_to_id_if_else is not device copyable with device copyable types"); + //__write_red_by_seg + static_assert( + sycl::is_device_copyable_v>, + "__write_red_by_seg is not device copyable with device copyable types"); + // __early_exit_find_or static_assert( sycl::is_device_copyable_v< @@ -199,6 +209,11 @@ test_device_copyable() noop_device_copyable>>, "__leaf_sorter is not device copyable with device copyable types"); + //__red_by_seg_op + static_assert( + sycl::is_device_copyable_v>, + "__red_by_seg_op is not device copyable with device copyable types"); + //__not_pred static_assert(sycl::is_device_copyable_v>, "__not_pred is not device copyable with device copyable types"); @@ -397,6 +412,11 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); + //__gen_red_by_seg_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is device copyable with device copyable types"); + //__gen_mask static_assert(!sycl::is_device_copyable_v>, "__gen_mask is device copyable with non device copyable types"); @@ -426,6 +446,11 @@ test_non_device_copyable() 
oneapi::dpl::__par_backend_hetero::__write_to_id_if_else>, "__write_to_id_if_else is device copyable with non device copyable types"); + //__write_red_by_seg + static_assert( + !sycl::is_device_copyable_v>, + "__write_red_by_seg is device copyable with device copyable types"); + // __early_exit_find_or static_assert( !sycl::is_device_copyable_v>, "__leaf_sorter is device copyable with non device copyable types"); + //__red_by_seg_op + static_assert( + !sycl::is_device_copyable_v>, + "__red_by_seg_op is device copyable with device copyable types"); + //__not_pred static_assert(!sycl::is_device_copyable_v>, "__not_pred is device copyable with non device copyable types"); From 3eee2429ea170294fe6e814852be2d7ee7bca246 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:08:05 -0700 Subject: [PATCH 10/31] Fix typo in error message in device_copyable.pass.cpp Signed-off-by: Matthew Michel --- .../general/implementation_details/device_copyable.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index ee507860633..f7af1f9e9af 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -415,7 +415,7 @@ test_non_device_copyable() //__gen_red_by_seg_input static_assert( !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with device copyable types"); + "__gen_red_by_seg_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -449,7 +449,7 @@ test_non_device_copyable() //__write_red_by_seg static_assert( !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with device copyable types"); + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( @@ -465,7 +465,7 @@ test_non_device_copyable() //__red_by_seg_op static_assert( !sycl::is_device_copyable_v>, - "__red_by_seg_op is device copyable with device copyable types"); + "__red_by_seg_op is device copyable with non device copyable types"); //__not_pred static_assert(!sycl::is_device_copyable_v>, From b334fb010530edb69b3daa9b0c2c8cf2ce65e98e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 07:20:31 -0700 Subject: [PATCH 11/31] Introduce separate input generation for scan phase and update tests Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 66 +++++++++++++------ .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 14 +++- .../device_copyable.pass.cpp | 22 +++++-- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 72782105d93..2f42f5e4b60 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -919,7 +919,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& // whether or not to use reduce-then-scan here. 
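The gate used in the branch below checks for a GPU that supports sub-group size 32. A plausible standalone sketch of such a check follows; the real __is_gpu_with_sg_32 helper is not shown in this series and may query the device differently.

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

template <typename _ExecutionPolicy>
bool
is_gpu_with_sg_32(const _ExecutionPolicy& __exec)
{
    const sycl::device __d = __exec.queue().get_device();
    // sub_group_sizes reports every sub-group size the device can launch with.
    const auto __sizes = __d.get_info<sycl::info::device::sub_group_sizes>();
    return __d.is_gpu() && std::find(__sizes.begin(), __sizes.end(), std::size_t{32}) != __sizes.end();
}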
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, __binary_op); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5c22ff49dc9..5a08852b79a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -787,7 +787,7 @@ struct __gen_transform_input }; template -struct __gen_red_by_seg_input +struct __gen_red_by_seg_reduce_input { template auto @@ -796,11 +796,33 @@ struct __gen_red_by_seg_input auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]}); + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; + std::size_t __n; +}; + +template +struct __gen_red_by_seg_scan_input +{ + template + auto + operator()(const _InRng& __in_rng, std::size_t __id) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _ValueType = oneapi::dpl::__internal::__value_t; + // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + // Each last element in a segment is marked with an output flag to store its reduction in the write phase + bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); + const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; + return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + __output_mask, __candidate_key); + } + _BinaryPred __binary_pred; + std::size_t __n; }; template @@ -826,22 +848,26 @@ struct __write_red_by_seg { template void - operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { + using std::get; auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. + // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. 
+ // The first key must be output to __out_keys for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. For index zero we must special handle + // and write the first key from the current index. if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = __in_keys[0]; + // We are at the end of the input so there is no key to output for the next segment if (__id == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); + // Update the current segment's output value and the next segment's key value + else if (get<1>(__tup)) { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + __out_keys[get<0>(get<0>(__tup)) + 1] = get<2>(__tup); + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); } } _BinaryPred __binary_pred; @@ -1251,14 +1277,14 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; using _ReduceOp = __red_by_seg_op<_BinaryOperator>; - using _GenScanInput = _GenReduceInput; - using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + using _ScanInputTransform = __get_zeroth_element; using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); @@ -1266,7 +1292,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 40713cf8621..7d3fd829cc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,7 +237,10 @@ template struct __gen_transform_input; template -struct __gen_red_by_seg_input; +struct __gen_red_by_seg_reduce_input; + +template +struct __gen_red_by_seg_scan_input; template struct __gen_mask; @@ -286,7 +289,14 @@ struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input, _BinaryPred)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index f7af1f9e9af..572c16b818a 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -155,10 +155,15 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + + //__gen_red_by_seg_scan_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -412,10 +417,15 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + + //__gen_red_by_seg_reduce_input static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with non device copyable types"); + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, From d564b3306d473c86f624b241ce4d61134e592fe9 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:09:56 -0700 Subject: [PATCH 12/31] Improve code readability Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 65 +++++++++++-------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5a08852b79a..cf932d2df07 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -806,20 +806,28 @@ struct __gen_red_by_seg_reduce_input template struct __gen_red_by_seg_scan_input { + // Returns the following tuple: + // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) + // new_seg_mask : 1 for a start of a new segment, 0 otherwise + // value : Current element's value for reduction + // output_value : Whether this work-item should write an output + // candidate_key: The key of the next segment to write if output_value is true. 
template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); + auto __in_keys = std::get<0>(__in_rng.tuple()); + auto __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - // Each last element in a segment is marked with an output flag to store its reduction in the write phase - bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); - const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; - return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - __output_mask, __candidate_key); + if (__id == __n - 1) + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, + _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}); } _BinaryPred __binary_pred; std::size_t __n; @@ -832,13 +840,15 @@ struct __red_by_seg_op auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const { + using std::get; + // The left-hand side has processed elements from the same segment, so update the reduction value. if (std::get<0>(__rhs_tup) == 0) { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup), + __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup))); } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + // We are looking at elements from a previous segment so just update the output index. + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup) + get<0>(__rhs_tup), get<1>(__rhs_tup)); } _BinaryOp __binary_op; }; @@ -851,23 +861,24 @@ struct __write_red_by_seg operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. - // The first key must be output to __out_keys for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. For index zero we must special handle - // and write the first key from the current index. 
+        auto __in_keys = get<0>(__in_rng.tuple());
+        auto __out_keys = get<0>(__out_rng.tuple());
+        auto __out_values = get<1>(__out_rng.tuple());
+        using _KeyType = oneapi::dpl::__internal::__value_t;
+        using _ValType = oneapi::dpl::__internal::__value_t;
+
+        const _KeyType& __next_segment_key = get<2>(__tup);
+        const _ValType& __cur_segment_value = get<1>(get<0>(__tup));
+        const bool __is_seg_end = get<1>(__tup);
+        const std::size_t __out_idx = get<0>(get<0>(__tup));
+
         if (__id == 0)
-            __out_keys[0] = __in_keys[0];
+            __out_keys[0] = __in_keys[0];
         if (__is_seg_end)
         {
             __out_values[__out_idx] = __cur_segment_value;
             if (__id != __n - 1)
                 __out_keys[__out_idx + 1] = __next_segment_key;
         }
     }
     _BinaryPred __binary_pred;

From 358ec3be82665a4cbe8c9deb031dd147f39e91dd Mon Sep 17 00:00:00 2001
From: Matthew Michel 
Date: Fri, 18 Oct 2024 09:46:54 -0700
Subject: [PATCH 13/31] Add optional first key field to scan input and remove
 input range in write operations

Signed-off-by: Matthew Michel 
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index cf932d2df07..1260d5450a1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include <optional>
 
 #include "../../iterator_impl.h"
 #include "../../execution_impl.h"
@@ -793,8 +794,8 @@ struct __gen_red_by_seg_reduce_input
     auto
     operator()(const _InRng& __in_rng, std::size_t __id) const
     {
-        auto&& __in_keys = std::get<0>(__in_rng.tuple());
-        auto&& __in_vals = std::get<1>(__in_rng.tuple());
+        auto __in_keys = std::get<0>(__in_rng.tuple());
+        auto __in_vals = std::get<1>(__in_rng.tuple());
         using _ValueType = oneapi::dpl::__internal::__value_t;
         std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]});
@@ -819,15 +820,19 @@ struct __gen_red_by_seg_scan_input
         auto __in_keys = std::get<0>(__in_rng.tuple());
         auto __in_vals = std::get<1>(__in_rng.tuple());
         using _KeyType = oneapi::dpl::__internal::__value_t;
+        using _OptKeyType = std::optional<_KeyType>;
         using _ValueType = oneapi::dpl::__internal::__value_t;
+        _OptKeyType __first_key;
+        if (__id == 0)
+            __first_key = _OptKeyType{__in_keys[0]};
         std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         if (__id == __n - 1)
             return oneapi::dpl::__internal::make_tuple(
                 oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true,
-                _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder
+                _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder
         return oneapi::dpl::__internal::make_tuple(
             oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}),
+
!__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); } _BinaryPred __binary_pred; std::size_t __n; @@ -856,12 +861,11 @@ struct __red_by_seg_op template struct __write_red_by_seg { - template + template void - operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(_OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto __in_keys = get<0>(__in_rng.tuple()); auto __out_keys = get<0>(__out_rng.tuple()); auto __out_values = get<1>(__out_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; @@ -873,7 +877,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = *get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -887,9 +891,9 @@ struct __write_red_by_seg struct __simple_write_to_id { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -1036,9 +1040,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
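After this refactor every write functor receives only the output range, an index, and the scanned value; the input range is no longer threaded through, since the scan-input tuples already carry everything the write phase needs. A minimal stand-in showing the reduced signature (write_to_id is a placeholder name, not the oneDPL functor):

#include <cstddef>
#include <vector>

struct write_to_id
{
    template <typename _OutRng, typename _ValueType>
    void
    operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const
    {
        // The real functors additionally cast __v through oneDPL's internal
        // tuple type before storing, as the comment above describes.
        __out_rng[__id] = __v;
    }
};

int main()
{
    std::vector<int> __out(4, 0);
    write_to_id{}(__out, 2, 42); // __out == {0, 0, 42, 0}
}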
@@ -1054,9 +1058,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, From db0bc25056188a81d6979fdf38e3ac3b57c4e071 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:49:30 -0700 Subject: [PATCH 14/31] Update __write_op in reduce-then-scan Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 09c0d754bd7..8c0762f2a38 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -173,7 +173,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } if (__is_full_block) @@ -187,7 +187,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -202,7 +202,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -225,7 +225,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } } else @@ -235,7 +235,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) @@ -246,7 +246,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __local_id, __v); + __write_op(__out_rng, __local_id, __v); } } @@ -259,7 +259,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__offset < __n) - __write_op(__in_rng, __out_rng, __offset, __v); + __write_op(__out_rng, __offset, __v); } } } From 7abbff8abd0a02f4b826543f97a236c737e1a234 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 13:17:58 -0700 Subject: [PATCH 15/31] Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro Signed-off-by: Matthew Michel --- CMakeLists.txt | 10 
-------- cmake/README.md | 1 - .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++----------- test/CMakeLists.txt | 5 ---- .../numeric.ops/reduce_by_segment.pass.cpp | 23 ++++--------------- 5 files changed, 8 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3cb28a1693d..c682e745f9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,16 +287,6 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() - if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") - else() - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") - endif() - endif() - # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 7335b7e2312..0683a377820 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,7 +18,6 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO | | ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON | -| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results, which may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | | Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version): diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index b17c619de89..9c588d0bf52 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -34,21 +34,11 @@ namespace unseq_backend //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace. //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard. 
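The guard above matters because the backend seeds group reductions and scan carries from sycl::known_identity. A standalone illustration of that SYCL 2020 facility, which __has_known_identity builds on (not oneDPL code):

#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

int main()
{
    // With the workaround removed, 64-bit arithmetic types may once again
    // report a known identity and take the optimized reduction paths.
    static_assert(sycl::has_known_identity_v<sycl::plus<std::uint64_t>, std::uint64_t>);
    std::cout << sycl::known_identity_v<sycl::plus<std::uint64_t>, std::uint64_t> << '\n'; // prints 0
}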
-template -inline constexpr bool __can_use_known_identity = -# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types - !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t)); -# else - true; -# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to //std::multiplies, std::bit_or, std::bit_and and std::bit_xor operations will be fixed. //std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = ::std::conditional_t< - __can_use_known_identity<_Tp>, +using __has_known_identity = # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -60,16 +50,15 @@ using __has_known_identity = ::std::conditional_t< ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 - ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 90eb3d5c737..e85e8e9f5f8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,7 +195,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") - set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -209,10 +208,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") - elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) - onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") - string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) - 
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 4de95e26e9b..c75be5f2694 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -13,14 +13,6 @@ // //===----------------------------------------------------------------------===// -#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - -#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - #include "support/test_config.h" #include "oneapi/dpl/execution" @@ -307,17 +299,12 @@ run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT // Skip 64-byte types testing when the algorithm is broken and there is no the workaround -#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - if constexpr (sizeof(ValueType) != 8) -#endif + if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) - { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory - test4buffers>(); - } + // Run tests for USM shared memory + test4buffers>(); + // Run tests for USM device memory + test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT } From 92cf29a88593eb740042d0b5cae71226e27e6d33 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:53:35 -0500 Subject: [PATCH 16/31] Alternate testing between usm shared and device to prevent excessive binary size Signed-off-by: Matthew Michel --- .../numeric.ops/reduce_by_segment.pass.cpp | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index c75be5f2694..2cee63239b6 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -293,17 +293,14 @@ test_flag_pred() } #endif -template +template void run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT - // Skip 64-byte types testing when the algorithm is broken and there is no the workaround if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory + constexpr sycl::usm::alloc allocation_type = use_device_alloc ? 
sycl::usm::alloc::device : sycl::usm::alloc::shared; test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT @@ -322,12 +319,12 @@ run_test_on_host() #endif // !_PSTL_ICC_TEST_SIMD_UDS_BROKEN && !_PSTL_ICPX_TEST_RED_BY_SEG_OPTIMIZER_CRASH } -template +template void run_test() { run_test_on_host(); - run_test_on_device(); + run_test_on_device(); } int @@ -337,7 +334,7 @@ main() // kernels. This is being filed to the compiler team. In the meantime, we can rearrange this test // to resolve the issue on our side. #if _PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif #if TEST_DPCPP_BACKEND_PRESENT @@ -347,17 +344,17 @@ main() #endif // TEST_DPCPP_BACKEND_PRESENT #if !_PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif - run_test, ::std::plus>(); - run_test, ::std::plus>(); - run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); // TODO investigate possible overflow: see issue #1416 - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); return TestUtils::done(); } From d292e07f16b4e984e9c76030b1e77947f20743f9 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:48:28 -0700 Subject: [PATCH 17/31] Performance tuning within scan input functor Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1260d5450a1..7895ace6730 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,7 +28,6 @@ #include #include #include -#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -808,11 +807,12 @@ template struct __gen_red_by_seg_scan_input { // Returns the following tuple: - // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) - // new_seg_mask : 1 for a start of a new segment, 0 otherwise - // value : Current element's value for reduction - // output_value : Whether this work-item should write an output - // candidate_key: The key of the next segment to write if output_value is true. + // ((new_seg_mask, value), output_value, next_key, current_key) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction + // bool output_value : Whether this work-item should write an output (end of segment) + // KeyType next_key : The key of the next segment to write if output_value is true + // KeyType current_key : The current element's key. 
This is only ever used by work-item 0 to write the first key template auto operator()(const _InRng& __in_rng, std::size_t __id) const @@ -820,21 +820,37 @@ struct __gen_red_by_seg_scan_input auto __in_keys = std::get<0>(__in_rng.tuple()); auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; - using _OptKeyType = std::optional<_KeyType>; using _ValueType = oneapi::dpl::__internal::__value_t; - _OptKeyType __first_key; - if (__id == 0) - __first_key = _OptKeyType{__in_keys[0]}; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - if (__id == __n - 1) + const _KeyType& __current_key = __in_keys[__id]; + // Ordering the most common condition first has yielded the best results. + if (__id > 0 && __id < __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + const _KeyType& __next_key = __in_keys[__id + 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), + __next_key, __current_key); + } + else if (__id == __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder - return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); + __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + } + else + { + const _KeyType& __next_key = __in_keys[__id + 1]; + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(std::size_t{0}, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); + } } _BinaryPred __binary_pred; + // For correctness of the function call operator, __n must be greater than 1. 
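+    // Illustrative trace (hypothetical input; std::equal_to as __binary_pred, __n == 3):
+    //   keys = {3, 3, 5}, vals = {1, 2, 7}
+    //   __id == 0 -> ((0, 1), false, 3, 3) : the same key follows, so no output is produced yet
+    //   __id == 1 -> ((0, 2), true,  5, 3) : the segment of 3s ends; the next segment's key is 5
+    //   __id == 2 -> ((1, 7), true,  5, 5) : a new segment starts and, as the last element, also ends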
std::size_t __n; }; @@ -877,7 +893,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = *get<3>(__tup); + __out_keys[0] = get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -1303,6 +1319,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); + // __gen_red_by_seg_scan_input requires that __n > 1 + assert(__n > 1); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), From e41298dfc84854324a8a01cddb3090344b45c1bf Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:49:44 -0700 Subject: [PATCH 18/31] Handle n=0, n=1 first in reduce_by_segment Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 2f42f5e4b60..7179e1f864a 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,22 +913,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - } -#endif // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. // @@ -960,6 +944,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
+ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } +#endif using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; From fb9a30635b5cec9041ef0e9ae9e1ffd9f7c6e69e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 08:50:00 -0700 Subject: [PATCH 19/31] Code cleanup Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 56 +++---------------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 20 +++---- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 23259e5e550..8a04717c5c0 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,9 +57,8 @@ #include "../pstl/utils_ranges.h" #include "../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/glue_algorithm_ranges_impl.h" +#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. 
-#include "scan_by_segment_impl.h" #endif namespace oneapi @@ -169,42 +168,9 @@ reduce_by_segment_impl(_Tag, Policy&& policy, InputIterator1 first1, InputIterat #if _ONEDPL_BACKEND_SYCL -template -class __seg_reduce_count_kernel; -template -class __seg_reduce_offset_kernel; -template -class __seg_reduce_wg_kernel; -template -class __seg_reduce_prefix_kernel; - -namespace -{ -template -using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; -template -using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; -template -using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; -template -using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; -} // namespace - -template -oneapi::dpl::__internal::__difference_t<_Range3> -__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - return oneapi::dpl::experimental::ranges::reduce_by_segment( - ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), - ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); -} - template -::std::pair +std::pair reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& policy, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator1 result1, OutputIterator2 result2, BinaryPred binary_pred, BinaryOperator binary_op) @@ -218,14 +184,14 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol // keys_result = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } -- result1 // values_result = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } -- result2 - using _CountType = ::std::uint64_t; + using _CountType = std::uint64_t; namespace __bknd = __par_backend_hetero; - const auto n = ::std::distance(first1, last1); + const auto n = std::distance(first1, last1); if (n == 0) - return ::std::make_pair(result1, result2); + return std::make_pair(result1, result2); auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); auto key_buf = keep_keys(first1, last1); @@ -236,16 +202,12 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); auto value_output_buf = keep_value_outputs(result2, result2 + n); - using has_known_identity = - typename unseq_backend::__has_known_identity::value_type>::type; - // number of unique keys - _CountType __n = - __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), - key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( + __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), + value_output_buf.all_view(), binary_pred, binary_op); - return ::std::make_pair(result1 + __n, result2 + __n); + return std::make_pair(result1 + __n, result2 + __n); } #endif } // namespace internal diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7895ace6730..54646e0b182 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -829,17 +829,16 
@@ struct __gen_red_by_seg_scan_input const _KeyType& __next_key = __in_keys[__id + 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__current_key, __next_key), - __next_key, __current_key); + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); } else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, + __current_key); // Passing __current_key as the next key for the last element is a placeholder } else { @@ -887,18 +886,19 @@ struct __write_red_by_seg using _KeyType = oneapi::dpl::__internal::__value_t; using _ValType = oneapi::dpl::__internal::__value_t; - const _KeyType& __next_segment_key = get<2>(__tup); - const _ValType& __cur_segment_value = get<1>(get<0>(__tup)); + const _KeyType& __next_key = get<2>(__tup); + const _KeyType& __current_key = get<3>(__tup); + const _ValType& __current_value = get<1>(get<0>(__tup)); const bool __is_seg_end = get<1>(__tup); const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = get<3>(__tup); + __out_keys[0] = __current_key; if (__is_seg_end) { - __out_values[__out_idx] = __cur_segment_value; + __out_values[__out_idx] = __current_value; if (__id != __n - 1) - __out_keys[__out_idx + 1] = __next_segment_key; + __out_keys[__out_idx + 1] = __next_key; } } _BinaryPred __binary_pred; From d32ea4d02dc37d02f3d6e5da3ee8eb5328e87e22 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 09:16:12 -0700 Subject: [PATCH 20/31] Improve comments and mark relevant variables as const Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 54646e0b182..204b8058b61 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -789,18 +789,21 @@ struct __gen_transform_input template struct __gen_red_by_seg_reduce_input { + // Returns the following tuple: + // (new_seg_mask, value) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); return oneapi::dpl::__internal::make_tuple(__new_seg_mask, 
_ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; - std::size_t __n; }; template @@ -817,8 +820,8 @@ struct __gen_red_by_seg_scan_input auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; const _KeyType& __current_key = __in_keys[__id]; @@ -827,7 +830,7 @@ struct __gen_red_by_seg_scan_input { const _KeyType& __prev_key = __in_keys[__id - 1]; const _KeyType& __next_key = __in_keys[__id + 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), !__binary_pred(__current_key, __next_key), __next_key, __current_key); @@ -835,7 +838,7 @@ struct __gen_red_by_seg_scan_input else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder @@ -1325,7 +1328,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); From aa4baafb931f6ba7c2e06b5c508438d377449116 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:16:18 -0700 Subject: [PATCH 21/31] Add condition to ensure value type is trivially copyable to call reduce-then-scan Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7179e1f864a..fc04e0296ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -944,26 +944,28 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; #if _ONEDPL_BACKEND_SYCL // We would normally dispatch to the parallel implementation which would make the decision to invoke // reduce-then-scan. 
However, since the fallback is implemented at the ranges level we must choose // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + if constexpr (std::is_trivially_copyable_v<__val_type>) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } } #endif - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. 
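As an aside, the dispatch shape introduced by the patch above can be summarized with a small
standalone sketch (all names here are hypothetical, not oneDPL API): the value type is gated at
compile time, and only then is the runtime device query consulted.

    #include <type_traits>

    template <typename _ValType, typename _RtsPath, typename _Fallback>
    auto
    __choose_rbs_path(bool __is_gpu_with_sg_32, _RtsPath __rts, _Fallback __fallback)
    {
        // Reduce-then-scan is only attempted for trivially copyable value types (see the patch
        // above); otherwise, or on devices without the required sub-group support, fall back.
        if constexpr (std::is_trivially_copyable_v<_ValType>)
        {
            if (__is_gpu_with_sg_32)
                return __rts();
        }
        return __fallback();
    }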
From 8cc59df59bb163b5689a050c2743c5884d6b3361 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:25:34 -0700 Subject: [PATCH 22/31] clang-format Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 204b8058b61..056a472a356 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1311,8 +1311,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, + _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, + _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; @@ -1328,8 +1329,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, - _WriteOp{__binary_pred, __n}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, + _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From 80ceacdae7b395ca1b5ff6c0d507e75c071733cd Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:52:13 -0600 Subject: [PATCH 23/31] Introduce iterator based __pattern_reduce_by_segment * An iterator based __pattern_reduce_by_segment is added * Due to compiler issues prior to icpx 2025.0, the reduce-then-scan path is disabled and the previous handcrafted SYCL implementation is restored to prevent performance regressions with older compilers * The previous range-based fallback implementation has been moved to the SYCL backend along with the handcrafted SYCL version Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 16 +- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 40 ++ .../hetero/algorithm_ranges_impl_hetero.h | 150 +----- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 176 +++++++ .../parallel_backend_sycl_reduce_by_segment.h | 464 ++++++++++++++++++ 5 files changed, 692 insertions(+), 154 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 8a04717c5c0..aa718e3743f 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,7 +57,7 @@ #include "../pstl/utils_ranges.h" #include 
"../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" +#include "../pstl/hetero/algorithm_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. #endif @@ -193,19 +193,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol if (n == 0) return std::make_pair(result1, result2); - auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); - auto key_buf = keep_keys(first1, last1); - auto keep_values = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator2>(); - auto value_buf = keep_values(first2, first2 + n); - auto keep_key_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator1>(); - auto key_output_buf = keep_key_outputs(result1, result1 + n); - auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); - auto value_output_buf = keep_value_outputs(result2, result2 + n); - // number of unique keys - _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( - __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__pattern_reduce_by_segment( + __tag, std::forward(policy), first1, last1, first2, result1, result2, binary_pred, binary_op); return std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 1a51076c612..d853f057561 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -2003,6 +2003,46 @@ __pattern_shift_right(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec return __last - __res; } +template +struct __copy_keys_values_wrapper; + +template +typename std::iterator_traits<_Iterator3>::difference_type +__pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Iterator1 __keys_first, + _Iterator1 __keys_last, _Iterator2 __values_first, _Iterator3 __out_keys_first, + _Iterator4 __out_values_first, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + std::size_t __n = std::distance(__keys_first, __keys_last); + + if (__n == 0) + return 0; + + if (__n == 1) + { + __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_op{}; + + oneapi::dpl::__internal::__pattern_walk2_n( + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_wrapper>(__exec), + oneapi::dpl::make_zip_iterator(__keys_first, __values_first), 1, + oneapi::dpl::make_zip_iterator(__out_keys_first, __out_values_first), __copy_op); + + return 1; + } + + auto __keep_keys = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __keys = __keep_keys(__keys_first, __keys_last); + auto __keep_values = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); + auto __values = __keep_values(__values_first, __values_first + __n); + auto __keep_key_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>(); + auto __out_keys = __keep_key_outputs(__out_keys_first, __out_keys_first + __n); + auto __keep_value_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>(); + auto 
__out_values = __keep_value_outputs(__out_values_first, __out_values_first + __n); + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __keys.all_view(), __values.all_view(), + __out_keys.all_view(), __out_values.all_view(), __binary_pred, __binary_op); +} + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index fc04e0296ff..da7820b91a2 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -889,22 +889,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ //------------------------------------------------------------------------ template -struct __copy_keys_wrapper; - -template -struct __copy_values_wrapper; - -template -struct __reduce1_wrapper; - -template -struct __reduce2_wrapper; - -template -struct __assign_key1_wrapper; - -template -struct __assign_key2_wrapper; +struct __copy_keys_values_range_wrapper; template @@ -932,135 +917,18 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_range{}; oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range, - std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys)); - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - __copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values)); + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_range_wrapper>(__exec), + __copy_range, + oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values))); return 1; } - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if constexpr (std::is_trivially_copyable_v<__val_type>) - { - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. 
- return std::get<0>(__res.get()) + 1; - } - } -#endif - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__tmp_out_keys), - experimental::ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. - std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. 
Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), __k3, - __clipped_new_keys); - auto __view4 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__out_keys), - experimental::ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. - auto __result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4, - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end), - experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - - return __result_end; + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 056a472a356..431d58d75ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -32,12 +32,14 @@ #include "../../iterator_impl.h" #include "../../execution_impl.h" #include "../../utils_ranges.h" +#include "../../ranges_defs.h" #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" +#include "parallel_backend_sycl_reduce_by_segment.h" #include "parallel_backend_sycl_reduce_then_scan.h" #include "execution_sycl_defs.h" #include "sycl_iterator.h" @@ -2273,6 +2275,180 @@ __parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_ return __parallel_partial_sort_impl(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), 
__buf.all_view(), __partial_merge_kernel{__mid_idx}, __comp); } + +//------------------------------------------------------------------------ +// reduce_by_segment - sync pattern +//------------------------------------------------------------------------ + +// TODO: The non-identity fallback path of reduce-by-segment must currently be implemented synchronously due to the +// inability to create event dependency chains across separate parallel pattern calls. If we ever add support for +// cross parallel pattern dependencies, then we can implement this as an async pattern. +template +struct __reduce1_wrapper; + +template +struct __reduce2_wrapper; + +template +struct __assign_key1_wrapper; + +template +struct __assign_key2_wrapper; + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + // The algorithm reduces values in __values where the + // associated keys for the values are equal to the adjacent key. + // + // Example: __keys = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // __values = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // + // __out_keys = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } + // __out_values = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } + + const auto __n = __keys.size(); + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. +#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000 + if constexpr (std::is_trivially_copyable_v<__val_type>) + { + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } + } +#endif + if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) + { + return __parallel_reduce_by_segment_known_identity( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + } + else + { + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. 
+ auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. + auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. 
+ auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. + auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h new file mode 100644 index 00000000000..14860e3830a --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -0,0 +1,464 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_reduce_by_segment.h ---------------------------------===// +/* Copyright (c) Intel Corporation + * + *  Copyright 2008-2013 NVIDIA Corporation + * + *  Licensed under the Apache License, Version 2.0 (the "License"); + *  you may not use this file except in compliance with the License. + *  You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + *  Unless required by applicable law or agreed to in writing, software + *  distributed under the License is distributed on an "AS IS" BASIS, + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + *  See the License for the specific language governing permissions and + *  limitations under the License. + * + *  Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved. 
+ *  + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions are met: + *     * Redistributions of source code must retain the above copyright + *       notice, this list of conditions and the following disclaimer. + *     * Redistributions in binary form must reproduce the above copyright + *       notice, this list of conditions and the following disclaimer in the + *       documentation and/or other materials provided with the distribution. + *     * Neither the name of the NVIDIA CORPORATION nor the + *       names of its contributors may be used to endorse or promote products + *       derived from this software without specific prior written permission. + *  + *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  + *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  + *  ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H + +#include +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "utils_ranges_sycl.h" +#include "sycl_traits.h" + +#include "../../utils.h" +#include "../../../internal/scan_by_segment_impl.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __seg_reduce_count_kernel; +template +class __seg_reduce_offset_kernel; +template +class __seg_reduce_wg_kernel; +template +class __seg_reduce_prefix_kernel; + +namespace +{ +template +using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; +template +using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; +template +using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; +template +using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; +} // namespace + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceWgPhase, _CustomName, 
_ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const std::size_t __n = __keys.size(); + + constexpr std::uint16_t __vals_per_item = + 16; // Each work item serially processes 16 items. Best observed performance on gpu + + // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. + // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); + + // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use + // of SLM. + __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( + __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); + +#if _ONEDPL_COMPILE_KERNEL + auto __seg_reduce_count_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); + auto __seg_reduce_offset_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); + auto __seg_reduce_wg_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); + auto __seg_reduce_prefix_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); + __wgroup_size = + std::min({__wgroup_size, + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); +#endif + + std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); + + // intermediate reductions within a workgroup + auto __partials = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); + + auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); + + // the number of segment ends found in each work group + auto __seg_ends = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // buffer that stores an exclusive scan of the results + auto __seg_ends_scanned = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // 1. 
Count the segment ends in each workgroup + auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceCountKernel>( + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_count_kernel, +#endif + sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + // 1a. Work item scan to identify segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + // 1b. Work group reduction + std::size_t __num_segs = __dpl_sycl::__reduce_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 1c. First work item writes segment count to global memory + if (__local_id == 0) + __seg_ends_acc[__group_id] = __num_segs; + }); + }); + + // 1.5 Small single-group kernel + auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__seg_end_identification); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceOffsetKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_offset_kernel, +#endif + sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); + auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); + __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, + __diff_type(0), sycl::plus<__diff_type>()); + }); + }); + + // 2. Work group reduction + auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__single_group_scan); + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceWgKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_wg_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + std::array<__val_type, __vals_per_item> __loc_partials; + + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + // 2a. 
Lookup the number of prior segs + auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 2b. Perform a serial scan within the work item over assigned elements. Store partial + // reductions in work group local memory. + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + + std::size_t __max_end = 0; + std::size_t __item_segments = 0; + auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + __val_type __accumulator = __identity; + for (std::size_t __i = __start; __i < __end; ++__i) + { + __accumulator = __binary_op(__accumulator, __values[__i]); + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + __loc_partials[__i - __start] = __accumulator; + ++__item_segments; + __max_end = __local_id; + __accumulator = __identity; + } + } + + // 2c. Count the number of prior work segments cooperatively over group + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; + + // 2d. Find the greatest segment end less than the current index (inclusive) + std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( + __group, __max_end, __dpl_sycl::__maximum()); + + // __wg_segmented_scan is a derivative work and responsible for the third header copyright + __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( + __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, + __binary_op, __wgroup_size); + + // 2e. Update local partial reductions in first segment and write to global memory. + bool __apply_aggs = true; + std::size_t __item_offset = 0; + + // first item in group does not have any work-group aggregates to apply + if (__local_id == 0) + { + __apply_aggs = false; + if (__global_id == 0 && __n > 0) + { + // first segment identifier is always the first key + __out_keys[0] = __keys[0]; + } + } + + // apply the aggregates and copy the locally stored values to destination buffer + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __start_idx + __item_offset; + if (__apply_aggs) + { + __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); + __apply_aggs = false; + } + else + { + __out_values[__idx] = __loc_partials[__i - __start]; + } + if (__i != __n - 1) + { + __out_keys[__idx + 1] = __keys[__i + 1]; + } + ++__item_offset; + } + } + + // 2f. Output the work group aggregate and total number of segments for use in phase 3. + if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out + { + // If no segment ends in the item, the aggregates from previous work groups must be applied. + if (__max_end == 0) + { + // needs to be inclusive with last element + __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); + } + else + { + __partials_acc[__group_id] = __accumulator; + } + } + }); + }); + + // 3. 
Apply inter work-group aggregates + __exec.queue() + .submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __end_idx_acc = __end_idx.template get_access(__cgh); + + __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); + __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); + + __cgh.depends_on(__wg_reduce); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReducePrefixKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_prefix_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::int64_t __group_id = __item.get_group(0); + std::size_t __global_id = __item.get_global_id(0); + std::size_t __local_id = __item.get_local_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + std::int64_t __wg_agg_idx = __group_id - 1; + __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + bool __ag_exists = false; + // 3a. Check to see if an aggregate exists and compute that value in the first + // work item. + if (__group_id != 0) + { + __ag_exists = __start < __n; + // local reductions followed by a sweep + constexpr std::int32_t __vals_to_explore = 16; + bool __last_it = false; + __loc_seg_ends_acc[__local_id] = false; + __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + for (std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; + __i -= __wgroup_size * __vals_to_explore) + { + __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + // exploration phase + for (std::int32_t __j = __i; + __j > __dpl_sycl::__maximum{}(-1L, __i - __vals_to_explore); --__j) + { + __local_collector = __binary_op(__partials_acc[__j], __local_collector); + if (__seg_ends_acc[__j] || __j == 0) + { + __loc_seg_ends_acc[__local_id] = true; + break; + } + } + __loc_partials_acc[__local_id] = __local_collector; + __dpl_sycl::__group_barrier(__item); + // serial aggregate collection and synchronization + if (__local_id == 0) + { + for (std::size_t __j = 0; __j < __wgroup_size; ++__j) + { + __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); + if (__loc_seg_ends_acc[__j]) + { + __last_it = true; + break; + } + } + } + __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); + __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); + } + + // Check to see if aggregates exist. + // The last group must always stay to write the final index + __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); + if (!__ag_exists && __group_id != __n_groups - 1) + return; + } + // 3b. 
count the segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 3c. Determine prior index + std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 3d. Second pass over the keys, reidentifying end segments and applying work group + // aggregates if appropriate. Both the key and reduction value are written to the final output at the + // computed index + std::size_t __item_offset = 0; + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; + + // apply the aggregate if it is the first segment end in the workgroup only + if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) + __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); + + ++__item_offset; + // the last item must write the last index's position to return + if (__i == __n - 1) + __end_idx_acc[0] = __idx; + } + } + }); + }) + .wait(); + + return __end_idx.get_host_access()[0] + 1; +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif + From 74143b2936eb001c379e28c074a29a3dd03b7365 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:57:48 -0600 Subject: [PATCH 24/31] Revert "Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro" This reverts commit a4c783533fcd8675094f77681d6d00dc3d23c0f7. --- CMakeLists.txt | 10 ++++++++++ cmake/README.md | 1 + .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++++++++++++--- test/CMakeLists.txt | 5 +++++ .../numeric.ops/reduce_by_segment.pass.cpp | 20 ++++++++++++++++--- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c682e745f9a..3cb28a1693d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,6 +287,16 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() + if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") + else() + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") + endif() + endif() + # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 0683a377820..7335b7e2312 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,6 +18,7 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. 
AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO |
| ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON |
+| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results that may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | |

Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version):

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index 9c588d0bf52..b17c619de89 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -34,11 +34,21 @@ namespace unseq_backend
 //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace.
 //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard.

+template <typename _Tp>
+inline constexpr bool __can_use_known_identity =
+# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+    // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types
+    !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t));
+# else
+    true;
+# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+
 //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to
//std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = +using __has_known_identity = ::std::conditional_t< + __can_use_known_identity<_Tp>, # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -50,15 +60,16 @@ using __has_known_identity = ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 + ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e85e8e9f5f8..90eb3d5c737 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,6 +195,7 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") + set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -208,6 +209,10 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") + elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") + string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 2cee63239b6..f71f78ed26e 100644 --- 
a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -13,6 +13,14 @@
 //
 //===----------------------------------------------------------------------===//

+#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
+#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
 #include "support/test_config.h"

 #include "oneapi/dpl/execution"
@@ -298,10 +306,16 @@ void
 run_test_on_device()
 {
 #if TEST_DPCPP_BACKEND_PRESENT
-    if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+    // Skip testing of 64-bit types when the algorithm is broken and the workaround is not enabled
+#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+    if constexpr (sizeof(ValueType) != 8)
+#endif
     {
-        constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
-        test4buffers>();
+        if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+        {
+            constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
+            test4buffers>();
+        }
     }
 #endif // TEST_DPCPP_BACKEND_PRESENT
 }

From fdf6a3952d8fed112159962ea48dccfe3a1470f9 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 5 Nov 2024 14:00:41 -0600
Subject: [PATCH 25/31] Fix test bug where device allocation is always used for
 testing

Signed-off-by: Matthew Michel
---
 .../parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
index f71f78ed26e..80aa9f53d3c 100644
--- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -314,7 +314,7 @@ run_test_on_device()
     if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
     {
         constexpr sycl::usm::alloc allocation_type = use_device_alloc ?
sycl::usm::alloc::device : sycl::usm::alloc::shared; - test4buffers>(); + test4buffers>(); } } #endif // TEST_DPCPP_BACKEND_PRESENT From 5987df9c1c2f469a265cbf884b85c836d87d1e1e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 13:45:21 -0600 Subject: [PATCH 26/31] Separate each reduce_by_segment fallback path into their own functions Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 239 +++++++++--------- .../parallel_backend_sycl_reduce_by_segment.h | 7 +- 2 files changed, 129 insertions(+), 117 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 431d58d75ea..61975975363 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2295,6 +2295,126 @@ struct __assign_key1_wrapper; template struct __assign_key2_wrapper; +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, + /*known_identity=*/std::false_type) +{ + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const auto __n = __keys.size(); + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. + auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. 
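+        // (Editor's illustration, not part of the patch: with a hypothetical __wgroup_size of 4 and
+        // keys {a, a, a, a, a, a, b, b}, the predicate below selects index 0 (first element),
+        // index 4 (evenly divisible by the work-group size), and index 6 (key change), producing
+        // three pseudo-segments {a: [0, 4), a: [4, 6), b: [6, 8)}; round 2 later merges the two
+        // partial results for key 'a' into one.)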
+ auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. + auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
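+        // (Editor's illustration, not part of the patch: continuing the example above, round 1 left
+        // pseudo-segment keys {a, a, b} with one partial reduction each; this pass keeps only
+        // indices 0 and 2, where a segment truly begins, so the final reduce below combines both
+        // partial sums for key 'a'.)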
+ auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; +} + template oneapi::dpl::__internal::__difference_t<_Range3> @@ -2333,120 +2453,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe } } #endif - if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) - { - return __parallel_reduce_by_segment_known_identity( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - } - else - { - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. 
- std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - __n, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), - __k3, __clipped_new_keys); - auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
- auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), - __view3, __view4, __view3.size(), - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), - oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - return __result_end; - } + return __parallel_reduce_by_segment_fallback( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op, + oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{}); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index 14860e3830a..f3455d45779 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -89,9 +89,10 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, + /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; From d80377ebe28955b95310b64dde4906d56e31ef43 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 14:50:25 -0600 Subject: [PATCH 27/31] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 ++---- .../parallel_backend_sycl_reduce_by_segment.h | 21 +++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 61975975363..497790abff8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2312,8 +2312,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); auto __tmp_out_keys = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); auto __tmp_out_values = @@ -2359,8 +2358,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index f3455d45779..bc702137963 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -86,12 +86,13 @@ template using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace -template +template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; @@ -137,11 +138,10 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ auto __seg_reduce_prefix_kernel = __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); __wgroup_size = - std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); + std::min({__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, 
__seg_reduce_offset_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)});
 #endif

     std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item);
@@ -462,4 +462,3 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 } // namespace oneapi

 #endif
-

From 3c8154e66747e3a9a4279bcb0d33b72c318bea33 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Thu, 21 Nov 2024 16:40:45 -0600
Subject: [PATCH 28/31] Address comments in reduce-then-scan based
 implementation

Signed-off-by: Matthew Michel
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h    | 12 ++++++++++--
 .../implementation_details/device_copyable.pass.cpp |  6 +++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 497790abff8..2c8b5c472ae 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -802,6 +802,9 @@ struct __gen_red_by_seg_reduce_input
         const auto __in_keys = std::get<0>(__in_rng.tuple());
         const auto __in_vals = std::get<1>(__in_rng.tuple());
         using _ValueType = oneapi::dpl::__internal::__value_t;
+        // The first segment start (index 0) is not marked with a 1. This is because we need the first
+        // segment's key and value output index to be 0. We begin marking new segments only after the
+        // first.
         const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]});
     }
@@ -845,7 +848,7 @@ struct __gen_red_by_seg_scan_input
                 oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true,
                 __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder
         }
-        else
+        else // __id == 0
         {
             const _KeyType& __next_key = __in_keys[__id + 1];
             return oneapi::dpl::__internal::make_tuple(
@@ -867,7 +870,7 @@ struct __red_by_seg_op
     {
         using std::get;
         // The left-hand side has processed elements from the same segment, so update the reduction value.
-        if (std::get<0>(__rhs_tup) == 0)
+        if (get<0>(__rhs_tup) == 0)
         {
             return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup),
                                                        __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup)));
@@ -897,6 +900,11 @@ struct __write_red_by_seg
         const bool __is_seg_end = get<1>(__tup);
         const std::size_t __out_idx = get<0>(get<0>(__tup));

+        // With the exception of the first key, which is output at index 0, the first key in each segment is written
+        // by the work item that outputs the previous segment's reduction value. This is because the reduce_by_segment
+        // API requires that the first key in a segment is output, which matters when keys in a segment are not all
+        // equal (but satisfy the predicate). The last segment does not output a key as there are no future
+        // segments to process.
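+        // (Editor's illustration, not part of the patch: for keys {a, a, b, b}, the item at
+        // index 0 writes __out_keys[0] = a; the segment end at index 1 writes __out_values[0]
+        // and __out_keys[1] = b, the next segment's first key; the final segment end at
+        // index 3 writes only __out_values[1].)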
if (__id == 0) __out_keys[0] = __current_key; if (__is_seg_end) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 572c16b818a..1ae8489e35e 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -156,9 +156,9 @@ test_device_copyable() "__gen_transform_input is not device copyable with device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input static_assert( From db63d459f0e2114307b9012e5048bc136eccef7d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:36:52 -0600 Subject: [PATCH 29/31] Improve explanations of reduce-by-segment approach Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2c8b5c472ae..1d10b8ac29b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -864,6 +864,51 @@ struct __gen_red_by_seg_scan_input template struct __red_by_seg_op { + // Consider the following segment / value pairs that would be processed in reduce-then-scan by a sub-group of size 8: + // ---------------------------------------------------------- + // Keys: 0 0 1 1 2 2 2 2 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The reduce and scan input generation phase flags new segments (excluding index 0) for use in the sub-group scan + // operation. The above key, value pairs correspond to the following flag, value pairs: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The sub-group scan operation looks back by powers-of-2 applying encountered prefixes. The __red_by_seg_op + // operation performs a standard inclusive scan over the flags to compute output indices while performing a masked + // scan over values to avoid applying a previous segment's partial reduction. 
Previous value elements are reduced + // so long as the current index's flag is 0, indicating that input within its segment is still being processed + // ---------------------------------------------------------- + // Start: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // After step 1 (apply the i-1th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 1 1 0 0 + // Values: 1 2 1 2 1 2 2 2 + // ---------------------------------------------------------- + // After step 2 (apply the i-2th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 1 1 + // Values: 1 2 1 2 1 2 3 4 + // ---------------------------------------------------------- + // After step 3 (apply the i-4th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 2 2 + // Values: 1 2 1 2 1 2 3 4 + // ^ ^ ^ + // ---------------------------------------------------------- + // Note that the scan of segment flags results in the desired output index of the reduce_by_segment operation in + // each segment and the item corresponding to the final key in a segment contains its output reduction value. This + // operation is first applied within a sub-group and then across sub-groups, work-groups, and blocks to + // reduce-by-segment across the full input. The result of these operations combined with cached key data in + // __gen_red_by_seg_scan_input enables the write phase to output keys and reduction values. + // => + // Segments : 0 1 2 + // Values : 2 2 4 template auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const @@ -1326,10 +1371,15 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + // Flags new segments and passes input value through a 2-tuple using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; + // Operation that computes output indices and output reduction values per segment using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + // Returns 4-component tuple which contains flags, keys, value, and a flag to write output using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + // Returns the first component from scan input which is scanned over using _ScanInputTransform = __get_zeroth_element; + // Writes current segment's output reduction and the next segment's output key using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); From 3deed76b7eade1397fd0f584f414f979df95fca8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:54:08 -0600 Subject: [PATCH 30/31] Use binary_op[_non]_device_copyable where appropriate Signed-off-by: Matthew Michel --- .../device_copyable.pass.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 1ae8489e35e..474e4268b69 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -161,9 +161,9 @@ test_device_copyable() 
"__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -196,7 +196,7 @@ test_device_copyable() //__write_red_by_seg static_assert( - sycl::is_device_copyable_v>, + sycl::is_device_copyable_v>, "__write_red_by_seg is not device copyable with device copyable types"); // __early_exit_find_or @@ -418,14 +418,14 @@ test_non_device_copyable() "__gen_transform_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -457,9 +457,9 @@ test_non_device_copyable() "__write_to_id_if_else is device copyable with non device copyable types"); //__write_red_by_seg - static_assert( - !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_red_by_seg>, + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( From c641bd39dc0187237f7761ac6d7b71cd4e6cdb2d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 16:57:30 -0600 Subject: [PATCH 31/31] Address comments in fallback implementation Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_by_segment.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index bc702137963..62ae736782d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -175,7 +175,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ sycl::nd_item<1> __item) { auto __group = __item.get_group(); std::size_t __group_id = __item.get_group(0); - std::size_t __local_id = __item.get_local_id(0); + std::uint32_t __local_id = __item.get_local_id(0); std::size_t __global_id = __item.get_global_id(0); std::size_t __start = __global_id * __vals_per_item; @@ -267,12 +267,12 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // 2c. 
Count the number of prior work segments cooperatively over group std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); + __group, __item_segments, __dpl_sycl::__plus()); std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; // 2d. Find the greatest segment end less than the current index (inclusive) std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); + __group, __max_end, __dpl_sycl::__maximum()); // __wg_segmented_scan is a derivative work and responsible for the third header copyright __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( @@ -287,7 +287,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ if (__local_id == 0) { __apply_aggs = false; - if (__global_id == 0 && __n > 0) + if (__global_id == 0) { // first segment identifier is always the first key __out_keys[0] = __keys[0];
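
Editor's addendum (not part of the patch series): a minimal serial sketch of the reduce_by_segment
semantics the patches above implement, plus the segmented combine step described in PATCH 28/29.
All names below are hypothetical illustrations, not oneDPL API, and std::plus/std::equal_to stand
in for the generic __binary_op/__binary_pred.

#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

// Reduce runs of values whose adjacent keys satisfy the predicate; output the first key of each
// segment and the segment's reduction. Returns the number of segments written.
template <typename _Key, typename _Val, typename _BinaryPredicate, typename _BinaryOperator>
std::size_t
serial_reduce_by_segment(const std::vector<_Key>& keys, const std::vector<_Val>& vals,
                         std::vector<_Key>& out_keys, std::vector<_Val>& out_vals,
                         _BinaryPredicate pred, _BinaryOperator op)
{
    if (keys.empty())
        return 0;
    _Key seg_key = keys[0]; // first key of the current segment
    _Val acc = vals[0];     // running reduction of the current segment
    std::size_t n_segments = 0;
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        if (pred(keys[i - 1], keys[i]))
            acc = op(acc, vals[i]); // same segment: keep reducing
        else
        {
            out_keys.push_back(seg_key); // segment end: emit key and reduction
            out_vals.push_back(acc);
            ++n_segments;
            seg_key = keys[i];
            acc = vals[i];
        }
    }
    out_keys.push_back(seg_key);
    out_vals.push_back(acc);
    return ++n_segments;
}

// The combine step of __red_by_seg_op in serial form: elements are (segment flag count, value);
// the right value absorbs the left only while the right flag is 0 (still in the same segment).
template <typename _Val, typename _BinaryOperator>
std::pair<std::size_t, _Val>
red_by_seg_combine(std::pair<std::size_t, _Val> lhs, std::pair<std::size_t, _Val> rhs, _BinaryOperator op)
{
    if (rhs.first == 0)
        return {lhs.first, op(lhs.second, rhs.second)};
    return {lhs.first + rhs.first, rhs.second};
}

int main()
{
    // The walkthrough from PATCH 29: keys {0,0,1,1,2,2,2,2}, all values 1.
    std::vector<int> keys{0, 0, 1, 1, 2, 2, 2, 2};
    std::vector<int> vals(keys.size(), 1);
    std::vector<int> out_keys, out_vals;
    std::size_t n = serial_reduce_by_segment(keys, vals, out_keys, out_vals,
                                             std::equal_to<int>{}, std::plus<int>{});
    for (std::size_t i = 0; i < n; ++i)
        std::cout << out_keys[i] << ": " << out_vals[i] << '\n'; // prints 0: 2, 1: 2, 2: 4

    // An inclusive scan with red_by_seg_combine reproduces the final diagram row of PATCH 29:
    // flags {0,0,1,1,2,2,2,2} and values {1,2,1,2,1,2,3,4}.
    std::vector<std::pair<std::size_t, int>> scan{{0, 1}};
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        std::pair<std::size_t, int> x{std::size_t(keys[i] != keys[i - 1]), vals[i]};
        scan.push_back(red_by_seg_combine(scan.back(), x, std::plus<int>{}));
    }
    for (const auto& p : scan)
        std::cout << p.first << '/' << p.second << ' '; // 0/1 0/2 1/1 1/2 2/1 2/2 2/3 2/4
    std::cout << '\n';
    return 0;
}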