From 6c112719372f58dcb20df9f0b10f3ebaeebd5ce6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 27 Sep 2024 16:24:34 -0500 Subject: [PATCH 01/31] Initial commit of reduce_by_segment with the reduce-then-scan path Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 6 +- .../hetero/algorithm_ranges_impl_hetero.h | 7 ++ .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 60 +++++++++++--- .../parallel_backend_sycl_reduce_then_scan.h | 81 ++++++++++--------- 4 files changed, 103 insertions(+), 51 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index d0979554786..a22f2970b49 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,18 +191,19 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator, typename _KnownIdentity> oneapi::dpl::__internal::__difference_t<_Range3> __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::false_type /* has_known_identity */) + _KnownIdentity) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } +#if 0 template oneapi::dpl::__internal::__difference_t<_Range3> @@ -573,6 +574,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy return __end_idx.get_host_access()[0] + 1; } +#endif template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index b9cd154a044..f845c209cfe 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,6 +913,12 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op) + .wait(); + return 1; + #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
// @@ -1043,6 +1049,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& .__deferrable_wait(); return __result_end; + #endif } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 77b2e6c35f9..3141374ebcd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -773,9 +773,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -788,9 +788,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -804,9 +804,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(_InRng&& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -1012,9 +1012,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); } } @@ -1186,6 +1186,48 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t } } +template +auto +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + auto __n = __keys.size(); + auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + using _ValueType = oneapi::dpl::__internal::__value_t; + if (__idx == 0) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + }; + auto __reduce_op = [=](const auto& __lhs_tup, 
const auto& __rhs_tup) { + if (std::get<0>(__rhs_tup) == 0) + { + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + } + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), + std::get<1>(__rhs_tup)); + }; + auto __gen_scan_input = __gen_reduce_input; + auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; + auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { + // Will be present in L1 cache + if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + { + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + }; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); +} + template auto __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8c0762f2a38..d5b44d1abcc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename... _Rngs> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, + _LazyValueType& __sub_group_carry, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups) + std::uint32_t __active_subgroups, _Rngs&&... 
__rngs) { - using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; + using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? 
__start_id : __n - 1; - _GenInputType __v = __gen_input(__in_rng, __local_id); + _GenInputType __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } } else { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __local_id, __v); + __write_op(__local_id, __v, __rngs...); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__out_rng, __offset, __v); + __write_op(__offset, __v, __rngs...); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... 
__rngs) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... __rngs) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - __write_op.__assign(__in_rng[0], __out_rng[0]); + //__write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,15 +746,16 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename _IsUniquePattern, typename... 
_Rngs> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, + _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern) + _Inclusive, _IsUniquePattern, + std::size_t __n, _Rngs&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -780,7 +781,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - const std::size_t __n = __in_rng.size(); + //const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -848,11 +849,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From b7328e8e063cc6b3d4ec3804554289ca27a23e9a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 14 Oct 2024 13:36:40 -0700 Subject: [PATCH 02/31] Revert change to ranges and use zip_view over segments / values instead Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 47 ++++++----- .../parallel_backend_sycl_reduce_then_scan.h | 81 +++++++++---------- 2 files changed, 67 insertions(+), 61 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3141374ebcd..148c2565ba8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -773,9 +773,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -788,9 +788,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const + operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -804,9 +804,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -937,9 +937,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
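For orientation, the callback convention these hunks restore passes the ranges first and the flat element index after them: a gen-input callable is invoked as (in_rng, id) and a write callable as (in_rng, out_rng, id, value). A minimal sketch of a conforming pair, with hypothetical functor names that are not part of oneDPL:

#include <cstddef>

// Produces the scan input for element `id`; mirrors the (in_rng, id)
// signature restored above.
struct gen_input_sketch
{
    template <typename InRng>
    auto
    operator()(const InRng& in, std::size_t id) const
    {
        return in[id];
    }
};

// Writes the scanned value for element `id`; mirrors the
// (in_rng, out_rng, id, value) signature restored in these hunks.
struct write_out_sketch
{
    template <typename InRng, typename OutRng, typename ValueType>
    void
    operator()(const InRng&, OutRng& out, std::size_t id, const ValueType& v) const
    {
        out[id] = v;
    }
};
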
@@ -955,9 +955,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, @@ -1012,9 +1012,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); } } @@ -1185,7 +1185,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1194,7 +1193,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { auto __n = __keys.size(); - auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; if (__idx == 0) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); @@ -1213,8 +1214,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { - // Will be present in L1 cache + auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __out_keys = std::get<0>(__out_rng.tuple()); + auto&& __out_vals = std::get<1>(__out_rng.tuple()); + // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; @@ -1223,9 +1227,12 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + 
oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} + ); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d5b44d1abcc..09c0d754bd7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename _InRng, typename _OutRng> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, + _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups, _Rngs&&... __rngs) + std::uint32_t __active_subgroups) { - using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; + using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. 
for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? __start_id : __n - 1; - _GenInputType __v = __gen_input(__local_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } } else { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__local_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __local_id, __v); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__offset, __v, __rngs...); + __write_op(__in_rng, __out_rng, __offset, __v); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... 
__rngs) const + const std::size_t __block_num) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... __rngs) const + const std::size_t __block_num) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - //__write_op.__assign(__in_rng[0], __out_rng[0]); + __write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,16 +746,15 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename 
_IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _GenReduceInput __gen_reduce_input, + _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern, - std::size_t __n, _Rngs&&... __rngs) + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -781,7 +780,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - //const std::size_t __n = __in_rng.size(); + const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -849,11 +848,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From f017ffd753248c9cfe5b054bd2a0bd66eda4ec22 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 06:42:29 -0700 Subject: [PATCH 03/31] Implement correct return for reduce_by_segment Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 13 ++++++++----- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 ++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index f845c209cfe..7558e29b4d5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,11 +913,14 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op) - .wait(); - return 1; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + // TODO: this needs to be enabled if reduce then scan cannot be satisfied. #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
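For reference, the returned value can be modeled serially: the scan carries a (write index, partial value) pair per element, and the final carry's first component is the write index of the last segment. A sketch assuming a non-empty input and an equality-style predicate; count_segments_model is a hypothetical helper, not part of oneDPL:

#include <cstddef>
#include <vector>

template <typename Key, typename Val, typename Pred, typename Op>
std::size_t
count_segments_model(const std::vector<Key>& keys, const std::vector<Val>& vals, Pred pred, Op op)
{
    std::size_t seg = 0; // first tuple component: write index of the current segment
    Val acc = vals[0];   // second tuple component: the running segment reduction
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        if (pred(keys[i - 1], keys[i]))
            acc = op(acc, vals[i]); // same segment: fold the value in
        else
        {
            ++seg;                  // new segment: advance the write index
            acc = vals[i];
        }
    }
    // `seg` is the zero-based write index of the last segment, so the number
    // of segments is `seg + 1` -- the `+ 1` returned above.
    return seg + 1;
}
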
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 148c2565ba8..8d782dbc38c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1197,11 +1197,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0) + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) @@ -1231,8 +1229,8 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} - ); + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From 5cfa661b0505e7dc21433066845392c9604dfdc1 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 07:29:49 -0700 Subject: [PATCH 04/31] Add support for flag predicates Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8d782dbc38c..f26c4e097b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1185,6 +1185,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } + template auto @@ -1196,19 +1197,20 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, 
_ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1219,17 +1221,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; + using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From 4f8059afba9fd24da298095d79fa7aaa07c0952e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:16:44 -0700 Subject: [PATCH 05/31] Revert "Add support for flag predicates" This reverts commit 0e0d50e8eea85685d46ccbb32b74f1211bfbab20. 
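The reverted commit threaded each segment's key through the scan as a third tuple component. A serial model of that combine step follows (combine_model is a hypothetical name, not part of oneDPL); carrying the key widens the scanned state for every element, which the follow-up commit avoids by reading keys directly at write-out:

#include <tuple>

template <typename Count, typename Val, typename Key, typename Op>
std::tuple<Count, Val, Key>
combine_model(const std::tuple<Count, Val, Key>& lhs, const std::tuple<Count, Val, Key>& rhs, Op op)
{
    // rhs does not start a new segment: extend lhs's reduction, keep lhs's key
    if (std::get<0>(rhs) == 0)
        return {std::get<0>(lhs), op(std::get<1>(lhs), std::get<1>(rhs)), std::get<2>(lhs)};
    // rhs starts at least one new segment: its value and key take over
    return {std::get<0>(lhs) + std::get<0>(rhs), std::get<1>(rhs), std::get<2>(rhs)};
}
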
--- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f26c4e097b0..8d782dbc38c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1185,7 +1185,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1197,20 +1196,19 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); + std::get<1>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1221,18 +1219,17 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; - using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From d56bc5a11eb27356ab99191c75a182c523b284a8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:37:53 -0700 Subject: [PATCH 06/31] Re-implement support for flag predicates in a more performant manner Signed-off-by: Matthew Michel --- 
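The idea, modeled serially below (reduce_by_segment_model is a hypothetical helper, not part of oneDPL): a segment is identified by its first key, even for non-symmetric "flag" predicates, so it suffices to write the 0th key once and, at each segment end, the current segment's value together with the next segment's key. Only forward reads of the key range are needed, and the scanned tuple stays a (write index, value) pair:

#include <cstddef>
#include <vector>

template <typename Key, typename Val, typename Pred, typename Op>
std::size_t
reduce_by_segment_model(const std::vector<Key>& keys, const std::vector<Val>& vals,
                        std::vector<Key>& out_keys, std::vector<Val>& out_vals, Pred pred, Op op)
{
    // Assumes a non-empty input and outputs pre-sized to the segment count.
    const std::size_t n = keys.size();
    std::size_t seg = 0;   // write index of the segment currently being reduced
    Val acc = vals[0];
    out_keys[0] = keys[0]; // the first key always opens segment 0
    for (std::size_t i = 1; i < n; ++i)
    {
        if (pred(keys[i - 1], keys[i])) // same segment: keep reducing
            acc = op(acc, vals[i]);
        else
        {
            out_vals[seg] = acc;         // close the current segment...
            out_keys[seg + 1] = keys[i]; // ...and open the next one with its key
            acc = vals[i];
            ++seg;
        }
    }
    out_vals[seg] = acc; // close the final segment
    return seg + 1;      // number of segments written
}
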
.../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8d782dbc38c..ebb86585133 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1197,7 +1197,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; @@ -1216,10 +1216,16 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // Assuming this will be present in L1 cache - if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + // TODO: substantial improvement expected with special handling in kernel + // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. + if (__idx == 0) + __out_keys[0] = __in_keys[0]; + if (__idx == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; From 543e82f60ffffd3209f4c2754ac10c857433ad93 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 09:51:35 -0700 Subject: [PATCH 07/31] Add fallback and remove old SYCL implementation Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 388 +----------------- .../hetero/algorithm_ranges_impl_hetero.h | 26 +- 2 files changed, 23 insertions(+), 391 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index a22f2970b49..23259e5e550 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,391 +191,17 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator> oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - _KnownIdentity) +__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), 
::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } -#if 0 -template -oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::true_type /* has_known_identity */) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceWgPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - - const ::std::size_t __n = __keys.size(); - - constexpr ::std::uint16_t __vals_per_item = - 16; // Each work item serially processes 16 items. Best observed performance on gpu - - // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. - // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); - - // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use - // of SLM. 
- __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); - -#if _ONEDPL_COMPILE_KERNEL - auto __seg_reduce_count_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); - auto __seg_reduce_offset_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); - auto __seg_reduce_wg_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); - auto __seg_reduce_prefix_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); - __wgroup_size = - ::std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); -#endif - - ::std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); - - // intermediate reductions within a workgroup - auto __partials = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); - - auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); - - // the number of segment ends found in each work group - auto __seg_ends = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // buffer that stores an exclusive scan of the results - auto __seg_ends_scanned = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // 1. Count the segment ends in each workgroup - auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceCountKernel>( - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_count_kernel, -#endif - sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - // 1a. Work item scan to identify segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - // 1b. Work group reduction - ::std::size_t __num_segs = __dpl_sycl::__reduce_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 1c. 
First work item writes segment count to global memory - if (__local_id == 0) - __seg_ends_acc[__group_id] = __num_segs; - }); - }); - - // 1.5 Small single-group kernel - auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__seg_end_identification); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceOffsetKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_offset_kernel, -#endif - sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); - auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); - __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, - __diff_type(0), sycl::plus<__diff_type>()); - }); - }); - - // 2. Work group reduction - auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__single_group_scan); - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_wg_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - ::std::array<__val_type, __vals_per_item> __loc_partials; - - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - // 2a. Lookup the number of prior segs - auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 2b. Perform a serial scan within the work item over assigned elements. Store partial - // reductions in work group local memory. - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - - ::std::size_t __max_end = 0; - ::std::size_t __item_segments = 0; - auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - __val_type __accumulator = __identity; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - __accumulator = __binary_op(__accumulator, __values[__i]); - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - __loc_partials[__i - __start] = __accumulator; - ++__item_segments; - __max_end = __local_id; - __accumulator = __identity; - } - } - - // 2c. Count the number of prior work segments cooperatively over group - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - ::std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; - - // 2d. 
Find the greatest segment end less than the current index (inclusive) - ::std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); - - // __wg_segmented_scan is a derivative work and responsible for the third header copyright - __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( - __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, - __binary_op, __wgroup_size); - - // 2e. Update local partial reductions in first segment and write to global memory. - bool __apply_aggs = true; - ::std::size_t __item_offset = 0; - - // first item in group does not have any work-group aggregates to apply - if (__local_id == 0) - { - __apply_aggs = false; - if (__global_id == 0 && __n > 0) - { - // first segment identifier is always the first key - __out_keys[0] = __keys[0]; - } - } - - // apply the aggregates and copy the locally stored values to destination buffer - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __start_idx + __item_offset; - if (__apply_aggs) - { - __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); - __apply_aggs = false; - } - else - { - __out_values[__idx] = __loc_partials[__i - __start]; - } - if (__i != __n - 1) - { - __out_keys[__idx + 1] = __keys[__i + 1]; - } - ++__item_offset; - } - } - - // 2f. Output the work group aggregate and total number of segments for use in phase 3. - if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out - { - // If no segment ends in the item, the aggregates from previous work groups must be applied. - if (__max_end == 0) - { - // needs to be inclusive with last element - __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); - } - else - { - __partials_acc[__group_id] = __accumulator; - } - } - }); - }); - - // 3. 
Apply inter work-group aggregates - __exec.queue() - .submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __end_idx_acc = __end_idx.template get_access(__cgh); - - __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); - __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); - - __cgh.depends_on(__wg_reduce); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReducePrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_prefix_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::int64_t __group_id = __item.get_group(0); - ::std::size_t __global_id = __item.get_global_id(0); - ::std::size_t __local_id = __item.get_local_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - ::std::int64_t __wg_agg_idx = __group_id - 1; - __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - bool __ag_exists = false; - // 3a. Check to see if an aggregate exists and compute that value in the first - // work item. - if (__group_id != 0) - { - __ag_exists = __start < __n; - // local reductions followed by a sweep - constexpr ::std::int32_t __vals_to_explore = 16; - bool __last_it = false; - __loc_seg_ends_acc[__local_id] = false; - __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - for (::std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; - __i -= __wgroup_size * __vals_to_explore) - { - __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - // exploration phase - for (::std::int32_t __j = __i; - __j > __dpl_sycl::__maximum<::std::int32_t>{}(-1L, __i - __vals_to_explore); --__j) - { - __local_collector = __binary_op(__partials_acc[__j], __local_collector); - if (__seg_ends_acc[__j] || __j == 0) - { - __loc_seg_ends_acc[__local_id] = true; - break; - } - } - __loc_partials_acc[__local_id] = __local_collector; - __dpl_sycl::__group_barrier(__item); - // serial aggregate collection and synchronization - if (__local_id == 0) - { - for (::std::size_t __j = 0; __j < __wgroup_size; ++__j) - { - __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); - if (__loc_seg_ends_acc[__j]) - { - __last_it = true; - break; - } - } - } - __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); - __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); - } - - // Check to see if aggregates exist. - // The last group must always stay to write the final index - __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); - if (!__ag_exists && __group_id != __n_groups - 1) - return; - } - // 3b. 
count the segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 3c. Determine prior index - ::std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 3d. Second pass over the keys, reidentifying end segments and applying work group - // aggregates if appropriate. Both the key and reduction value are written to the final output at the - // computed index - ::std::size_t __item_offset = 0; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; - - // apply the aggregate if it is the first segment end in the workgroup only - if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) - __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); - - ++__item_offset; - // the last item must write the last index's position to return - if (__i == __n - 1) - __end_idx_acc[0] = __idx; - } - } - }); - }) - .wait(); - - return __end_idx.get_host_access()[0] + 1; -} -#endif - template ::std::pair @@ -615,9 +241,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol typename ::std::iterator_traits::value_type>::type; // number of unique keys - _CountType __n = __sycl_reduce_by_segment( - __tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op, has_known_identity{}); + _CountType __n = + __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), + key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); return ::std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7558e29b4d5..72782105d93 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,15 +913,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - // TODO: this needs to be enabled if reduce then scan cannot be satisfied. - #if 0 +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
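For context on what both the reduce-then-scan path and the fallback must produce, here is a usage sketch of the public entry point. It assumes the extension is reachable through <oneapi/dpl/execution> and <oneapi/dpl/algorithm>, as in current oneDPL releases; it is not part of this patch.

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <vector>

int main()
{
    std::vector<int> keys = {1, 1, 2, 2, 2, 3};
    std::vector<int> vals = {1, 2, 3, 4, 5, 6};
    std::vector<int> out_keys(keys.size()), out_vals(vals.size());

    // Default predicate is equality of adjacent keys, default reduction is plus.
    auto ends = oneapi::dpl::reduce_by_segment(oneapi::dpl::execution::dpcpp_default,
                                               keys.begin(), keys.end(), vals.begin(),
                                               out_keys.begin(), out_vals.begin());
    // out_keys = {1, 2, 3}, out_vals = {3, 12, 6}; ends points past the last segment.
    (void)ends;
}

This also explains the "+ 1" in the branch below: the scan's first tuple component is the index of the last write, so adding one yields the number of segments, i.e. the past-the-end position.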
+ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } +#endif // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. // @@ -1052,7 +1059,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& .__deferrable_wait(); return __result_end; - #endif } } // namespace __ranges From 7c92238a6994c86daa1e024d47743f34e2b2d3ac Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 11:42:03 -0700 Subject: [PATCH 08/31] Switch from using lambdas to functors Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 131 +++++++++++------- 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ebb86585133..ccd891b954b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -786,6 +786,68 @@ struct __gen_transform_input _UnaryOp __unary_op; }; +template +struct __gen_red_by_seg_input +{ + template + auto + operator()(const _InRng& __in_rng, std::size_t __id) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _ValueType = oneapi::dpl::__internal::__value_t; + if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]}); + } + _BinaryPred __binary_pred; +}; + +template +struct __red_by_seg_op +{ + template + auto + operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const + { + if (std::get<0>(__rhs_tup) == 0) + { + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + } + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), + std::get<1>(__rhs_tup)); + } + _BinaryOp __binary_op; +}; + +template +struct __write_red_by_seg +{ + template + void + operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __out_keys = std::get<0>(__out_rng.tuple()); + auto&& __out_values = std::get<1>(__out_rng.tuple()); + // TODO: substantial improvement expected with special handling in kernel + // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. 
+ if (__id == 0) + __out_keys[0] = __in_keys[0]; + if (__id == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + { + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + } + _BinaryPred __binary_pred; + std::size_t __n; +}; + struct __simple_write_to_id { template @@ -1185,58 +1247,29 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } -template + +template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - auto __n = __keys.size(); - auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - }; - auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { - if (std::get<0>(__rhs_tup) == 0) - { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); - } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); - }; - auto __gen_scan_input = __gen_reduce_input; - auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. 
- if (__idx == 0) - __out_keys[0] = __in_keys[0]; - if (__idx == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) - { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - } - }; +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + using _GenScanInput = _GenReduceInput; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + std::size_t __n = __keys.size(); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, - /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _WriteOp{__binary_pred, __n}, + oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From 6c4267be04faaa120e5ca5d6aadae5778337ef31 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:04:35 -0700 Subject: [PATCH 09/31] Add device copyable specializations for red-by-seg functors and update testing Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 29 ++++++++++++++++++ .../device_copyable.pass.cpp | 30 +++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ccd891b954b..5c22ff49dc9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -868,7 +868,7 @@ struct __gen_mask { template bool - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(_InRng&& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 2d0e88fd34b..40713cf8621 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -236,6 +236,9 @@ namespace oneapi::dpl::__par_backend_hetero template struct __gen_transform_input; +template +struct __gen_red_by_seg_input; + template struct __gen_mask; 
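These forward declarations exist only so the is_device_copyable specializations further down can name the functors. The pattern, in standalone form (my_functor and _Pred here are placeholders, not oneDPL types; assumes a SYCL 2020 implementation):

#include <sycl/sycl.hpp>
#include <functional>

template <typename _Pred>
struct my_functor
{
    _Pred __pred; // the only captured state
};

// Propagate device copyability from the member to the wrapper, exactly as
// the specializations in this patch do for the red-by-seg functors.
template <typename _Pred>
struct sycl::is_device_copyable<my_functor<_Pred>> : sycl::is_device_copyable<_Pred>
{
};

static_assert(sycl::is_device_copyable_v<my_functor<std::less<int>>>);

Named functors make these specializations possible at all; the lambdas they replace could not be forward declared or specialized on.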
@@ -254,12 +257,18 @@ struct __write_to_id_if; template struct __write_to_id_if_else; +template +struct __write_red_by_seg; + template struct __early_exit_find_or; template struct __leaf_sorter; +template +struct __red_by_seg_op; + } // namespace oneapi::dpl::__par_backend_hetero template @@ -276,6 +285,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, _BinaryPredicate)> @@ -309,6 +325,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_red_by_seg, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> @@ -323,6 +346,12 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__red_by_seg_op, _BinaryOp)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOp> +{ +}; + namespace oneapi::dpl::unseq_backend { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 9b1b09e8c2e..ee507860633 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -155,6 +155,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); + //__gen_red_by_seg_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is not device copyable with device copyable types"); + //__gen_mask static_assert(sycl::is_device_copyable_v>, "__gen_mask is not device copyable with device copyable types"); @@ -184,6 +189,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__write_to_id_if_else is not device copyable with device copyable types"); + //__write_red_by_seg + static_assert( + sycl::is_device_copyable_v>, + "__write_red_by_seg is not device copyable with device copyable types"); + // __early_exit_find_or static_assert( sycl::is_device_copyable_v< @@ -199,6 +209,11 @@ test_device_copyable() noop_device_copyable>>, "__leaf_sorter is not device copyable with device copyable types"); + //__red_by_seg_op + static_assert( + sycl::is_device_copyable_v>, + "__red_by_seg_op is not device copyable with device copyable types"); + //__not_pred static_assert(sycl::is_device_copyable_v>, "__not_pred is not device copyable with device copyable types"); @@ -397,6 +412,11 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); + //__gen_red_by_seg_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is device copyable with device copyable types"); + //__gen_mask static_assert(!sycl::is_device_copyable_v>, "__gen_mask is device copyable with non device copyable types"); @@ -426,6 +446,11 @@ test_non_device_copyable() 
oneapi::dpl::__par_backend_hetero::__write_to_id_if_else>, "__write_to_id_if_else is device copyable with non device copyable types"); + //__write_red_by_seg + static_assert( + !sycl::is_device_copyable_v>, + "__write_red_by_seg is device copyable with device copyable types"); + // __early_exit_find_or static_assert( !sycl::is_device_copyable_v>, "__leaf_sorter is device copyable with non device copyable types"); + //__red_by_seg_op + static_assert( + !sycl::is_device_copyable_v>, + "__red_by_seg_op is device copyable with device copyable types"); + //__not_pred static_assert(!sycl::is_device_copyable_v>, "__not_pred is device copyable with non device copyable types"); From 3eee2429ea170294fe6e814852be2d7ee7bca246 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:08:05 -0700 Subject: [PATCH 10/31] Fix typo in error message in device_copyable.pass.cpp Signed-off-by: Matthew Michel --- .../general/implementation_details/device_copyable.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index ee507860633..f7af1f9e9af 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -415,7 +415,7 @@ test_non_device_copyable() //__gen_red_by_seg_input static_assert( !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with device copyable types"); + "__gen_red_by_seg_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -449,7 +449,7 @@ test_non_device_copyable() //__write_red_by_seg static_assert( !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with device copyable types"); + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( @@ -465,7 +465,7 @@ test_non_device_copyable() //__red_by_seg_op static_assert( !sycl::is_device_copyable_v>, - "__red_by_seg_op is device copyable with device copyable types"); + "__red_by_seg_op is device copyable with non device copyable types"); //__not_pred static_assert(!sycl::is_device_copyable_v>, From b334fb010530edb69b3daa9b0c2c8cf2ce65e98e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 07:20:31 -0700 Subject: [PATCH 11/31] Introduce separate input generation for scan phase and update tests Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 66 +++++++++++++------ .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 14 +++- .../device_copyable.pass.cpp | 22 +++++-- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 72782105d93..2f42f5e4b60 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -919,7 +919,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& // whether or not to use reduce-then-scan here. 
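The gate used in the branch below checks for a GPU that supports sub-group size 32. A plausible standalone sketch of such a check follows; the real __is_gpu_with_sg_32 helper is not shown in this series and may query the device differently.

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

template <typename _ExecutionPolicy>
bool
is_gpu_with_sg_32(const _ExecutionPolicy& __exec)
{
    const sycl::device __d = __exec.queue().get_device();
    // sub_group_sizes reports every sub-group size the device can launch with.
    const auto __sizes = __d.get_info<sycl::info::device::sub_group_sizes>();
    return __d.is_gpu() && std::find(__sizes.begin(), __sizes.end(), std::size_t{32}) != __sizes.end();
}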
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, __binary_op); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5c22ff49dc9..5a08852b79a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -787,7 +787,7 @@ struct __gen_transform_input }; template -struct __gen_red_by_seg_input +struct __gen_red_by_seg_reduce_input { template auto @@ -796,11 +796,33 @@ struct __gen_red_by_seg_input auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]}); + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; + std::size_t __n; +}; + +template +struct __gen_red_by_seg_scan_input +{ + template + auto + operator()(const _InRng& __in_rng, std::size_t __id) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _ValueType = oneapi::dpl::__internal::__value_t; + // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + // Each last element in a segment is marked with an output flag to store its reduction in the write phase + bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); + const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; + return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + __output_mask, __candidate_key); + } + _BinaryPred __binary_pred; + std::size_t __n; }; template @@ -826,22 +848,26 @@ struct __write_red_by_seg { template void - operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { + using std::get; auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. + // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. 
+ // The first key must be output to __out_keys for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. For index zero we must special handle + // and write the first key from the current index. if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = __in_keys[0]; + // We are at the end of the input so there is no key to output for the next segment if (__id == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); + // Update the current segment's output value and the next segment's key value + else if (get<1>(__tup)) { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + __out_keys[get<0>(get<0>(__tup)) + 1] = get<2>(__tup); + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); } } _BinaryPred __binary_pred; @@ -1251,14 +1277,14 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; using _ReduceOp = __red_by_seg_op<_BinaryOperator>; - using _GenScanInput = _GenReduceInput; - using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + using _ScanInputTransform = __get_zeroth_element; using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); @@ -1266,7 +1292,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 40713cf8621..7d3fd829cc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,7 +237,10 @@ template struct __gen_transform_input; template -struct __gen_red_by_seg_input; +struct __gen_red_by_seg_reduce_input; + +template +struct __gen_red_by_seg_scan_input; template struct __gen_mask; @@ -286,7 +289,14 @@ struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input, _BinaryPred)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index f7af1f9e9af..572c16b818a 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -155,10 +155,15 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + + //__gen_red_by_seg_scan_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -412,10 +417,15 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + + //__gen_red_by_seg_reduce_input static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with non device copyable types"); + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, From d564b3306d473c86f624b241ce4d61134e592fe9 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:09:56 -0700 Subject: [PATCH 12/31] Improve code readability Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 65 +++++++++++-------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5a08852b79a..cf932d2df07 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -806,20 +806,28 @@ struct __gen_red_by_seg_reduce_input template struct __gen_red_by_seg_scan_input { + // Returns the following tuple: + // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) + // new_seg_mask : 1 for a start of a new segment, 0 otherwise + // value : Current element's value for reduction + // output_value : Whether this work-item should write an output + // candidate_key: The key of the next segment to write if output_value is true. 
template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); + auto __in_keys = std::get<0>(__in_rng.tuple()); + auto __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - // Each last element in a segment is marked with an output flag to store its reduction in the write phase - bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); - const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; - return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - __output_mask, __candidate_key); + if (__id == __n - 1) + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, + _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}); } _BinaryPred __binary_pred; std::size_t __n; @@ -832,13 +840,15 @@ struct __red_by_seg_op auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const { + using std::get; + // The left-hand side has processed elements from the same segment, so update the reduction value. if (std::get<0>(__rhs_tup) == 0) { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup), + __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup))); } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + // We are looking at elements from a previous segment so just update the output index. + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup) + get<0>(__rhs_tup), get<1>(__rhs_tup)); } _BinaryOp __binary_op; }; @@ -851,23 +861,24 @@ struct __write_red_by_seg operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. - // The first key must be output to __out_keys for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. For index zero we must special handle - // and write the first key from the current index. 
+        auto __in_keys = get<0>(__in_rng.tuple());
+        auto __out_keys = get<0>(__out_rng.tuple());
+        auto __out_values = get<1>(__out_rng.tuple());
+        using _KeyType = oneapi::dpl::__internal::__value_t;
+        using _ValType = oneapi::dpl::__internal::__value_t;
+
+        const _KeyType& __next_segment_key = get<2>(__tup);
+        const _ValType& __cur_segment_value = get<1>(get<0>(__tup));
+        const bool __is_seg_end = get<1>(__tup);
+        const std::size_t __out_idx = get<0>(get<0>(__tup));
+
         if (__id == 0)
-            __out_keys[0] = __in_keys[0];
+            __out_keys[0] = __in_keys[0];
         if (__is_seg_end)
         {
             __out_values[__out_idx] = __cur_segment_value;
             if (__id != __n - 1)
                 __out_keys[__out_idx + 1] = __next_segment_key;
         }
     }
     _BinaryPred __binary_pred;

From 358ec3be82665a4cbe8c9deb031dd147f39e91dd Mon Sep 17 00:00:00 2001
From: Matthew Michel 
Date: Fri, 18 Oct 2024 09:46:54 -0700
Subject: [PATCH 13/31] Add optional first key field to scan input and remove
 input range in write operations

Signed-off-by: Matthew Michel 
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index cf932d2df07..1260d5450a1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include <optional>
 
 #include "../../iterator_impl.h"
 #include "../../execution_impl.h"
@@ -793,8 +794,8 @@ struct __gen_red_by_seg_reduce_input
     auto
     operator()(const _InRng& __in_rng, std::size_t __id) const
     {
-        auto&& __in_keys = std::get<0>(__in_rng.tuple());
-        auto&& __in_vals = std::get<1>(__in_rng.tuple());
+        auto __in_keys = std::get<0>(__in_rng.tuple());
+        auto __in_vals = std::get<1>(__in_rng.tuple());
         using _ValueType = oneapi::dpl::__internal::__value_t;
         std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]});
@@ -819,15 +820,19 @@ struct __gen_red_by_seg_scan_input
         auto __in_keys = std::get<0>(__in_rng.tuple());
         auto __in_vals = std::get<1>(__in_rng.tuple());
         using _KeyType = oneapi::dpl::__internal::__value_t;
+        using _OptKeyType = std::optional<_KeyType>;
         using _ValueType = oneapi::dpl::__internal::__value_t;
+        _OptKeyType __first_key;
+        if (__id == 0)
+            __first_key = _OptKeyType{__in_keys[0]};
         std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         if (__id == __n - 1)
             return oneapi::dpl::__internal::make_tuple(
                 oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true,
-                _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder
+                _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder
         return oneapi::dpl::__internal::make_tuple(
             oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}),
+
!__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); } _BinaryPred __binary_pred; std::size_t __n; @@ -856,12 +861,11 @@ struct __red_by_seg_op template struct __write_red_by_seg { - template + template void - operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(_OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto __in_keys = get<0>(__in_rng.tuple()); auto __out_keys = get<0>(__out_rng.tuple()); auto __out_values = get<1>(__out_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; @@ -873,7 +877,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = *get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -887,9 +891,9 @@ struct __write_red_by_seg struct __simple_write_to_id { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -1036,9 +1040,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
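After this refactor every write functor receives only the output range, an index, and the scanned value; the input range is no longer threaded through, since the scan-input tuples already carry everything the write phase needs. A minimal stand-in showing the reduced signature (write_to_id is a placeholder name, not the oneDPL functor):

#include <cstddef>
#include <vector>

struct write_to_id
{
    template <typename _OutRng, typename _ValueType>
    void
    operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const
    {
        // The real functors additionally cast __v through oneDPL's internal
        // tuple type before storing, as the comment above describes.
        __out_rng[__id] = __v;
    }
};

int main()
{
    std::vector<int> __out(4, 0);
    write_to_id{}(__out, 2, 42); // __out == {0, 0, 42, 0}
}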
@@ -1054,9 +1058,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, From db0bc25056188a81d6979fdf38e3ac3b57c4e071 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:49:30 -0700 Subject: [PATCH 14/31] Update __write_op in reduce-then-scan Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 09c0d754bd7..8c0762f2a38 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -173,7 +173,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } if (__is_full_block) @@ -187,7 +187,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -202,7 +202,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -225,7 +225,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } } else @@ -235,7 +235,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) @@ -246,7 +246,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __local_id, __v); + __write_op(__out_rng, __local_id, __v); } } @@ -259,7 +259,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__offset < __n) - __write_op(__in_rng, __out_rng, __offset, __v); + __write_op(__out_rng, __offset, __v); } } } From 7abbff8abd0a02f4b826543f97a236c737e1a234 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 13:17:58 -0700 Subject: [PATCH 15/31] Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro Signed-off-by: Matthew Michel --- CMakeLists.txt | 10 
-------- cmake/README.md | 1 - .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++----------- test/CMakeLists.txt | 5 ---- .../numeric.ops/reduce_by_segment.pass.cpp | 23 ++++--------------- 5 files changed, 8 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3cb28a1693d..c682e745f9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,16 +287,6 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() - if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") - else() - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") - endif() - endif() - # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 7335b7e2312..0683a377820 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,7 +18,6 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO | | ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON | -| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results, which may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | | Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version): diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index b17c619de89..9c588d0bf52 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -34,21 +34,11 @@ namespace unseq_backend //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace. //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard. 
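The guard above matters because the backend seeds group reductions and scan carries from sycl::known_identity. A standalone illustration of that SYCL 2020 facility, which __has_known_identity builds on (not oneDPL code):

#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>

int main()
{
    // With the workaround removed, 64-bit arithmetic types may once again
    // report a known identity and take the optimized reduction paths.
    static_assert(sycl::has_known_identity_v<sycl::plus<std::uint64_t>, std::uint64_t>);
    std::cout << sycl::known_identity_v<sycl::plus<std::uint64_t>, std::uint64_t> << '\n'; // prints 0
}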
-template -inline constexpr bool __can_use_known_identity = -# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types - !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t)); -# else - true; -# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to //std::multiplies, std::bit_or, std::bit_and and std::bit_xor operations will be fixed. //std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = ::std::conditional_t< - __can_use_known_identity<_Tp>, +using __has_known_identity = # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -60,16 +50,15 @@ using __has_known_identity = ::std::conditional_t< ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 - ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 90eb3d5c737..e85e8e9f5f8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,7 +195,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") - set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -209,10 +208,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") - elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) - onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") - string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) - 
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 4de95e26e9b..c75be5f2694 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -13,14 +13,6 @@ // //===----------------------------------------------------------------------===// -#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - -#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - #include "support/test_config.h" #include "oneapi/dpl/execution" @@ -307,17 +299,12 @@ run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT // Skip 64-byte types testing when the algorithm is broken and there is no the workaround -#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - if constexpr (sizeof(ValueType) != 8) -#endif + if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) - { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory - test4buffers>(); - } + // Run tests for USM shared memory + test4buffers>(); + // Run tests for USM device memory + test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT } From 92cf29a88593eb740042d0b5cae71226e27e6d33 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:53:35 -0500 Subject: [PATCH 16/31] Alternate testing between usm shared and device to prevent excessive binary size Signed-off-by: Matthew Michel --- .../numeric.ops/reduce_by_segment.pass.cpp | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index c75be5f2694..2cee63239b6 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -293,17 +293,14 @@ test_flag_pred() } #endif -template +template void run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT - // Skip 64-byte types testing when the algorithm is broken and there is no the workaround if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory + constexpr sycl::usm::alloc allocation_type = use_device_alloc ? 
sycl::usm::alloc::device : sycl::usm::alloc::shared; test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT @@ -322,12 +319,12 @@ run_test_on_host() #endif // !_PSTL_ICC_TEST_SIMD_UDS_BROKEN && !_PSTL_ICPX_TEST_RED_BY_SEG_OPTIMIZER_CRASH } -template +template void run_test() { run_test_on_host(); - run_test_on_device(); + run_test_on_device(); } int @@ -337,7 +334,7 @@ main() // kernels. This is being filed to the compiler team. In the meantime, we can rearrange this test // to resolve the issue on our side. #if _PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif #if TEST_DPCPP_BACKEND_PRESENT @@ -347,17 +344,17 @@ main() #endif // TEST_DPCPP_BACKEND_PRESENT #if !_PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif - run_test, ::std::plus>(); - run_test, ::std::plus>(); - run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); // TODO investigate possible overflow: see issue #1416 - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); return TestUtils::done(); } From d292e07f16b4e984e9c76030b1e77947f20743f9 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:48:28 -0700 Subject: [PATCH 17/31] Performance tuning within scan input functor Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1260d5450a1..7895ace6730 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,7 +28,6 @@ #include #include #include -#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -808,11 +807,12 @@ template struct __gen_red_by_seg_scan_input { // Returns the following tuple: - // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) - // new_seg_mask : 1 for a start of a new segment, 0 otherwise - // value : Current element's value for reduction - // output_value : Whether this work-item should write an output - // candidate_key: The key of the next segment to write if output_value is true. + // ((new_seg_mask, value), output_value, next_key, current_key) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction + // bool output_value : Whether this work-item should write an output (end of segment) + // KeyType next_key : The key of the next segment to write if output_value is true + // KeyType current_key : The current element's key. 
This is only ever used by work-item 0 to write the first key template auto operator()(const _InRng& __in_rng, std::size_t __id) const @@ -820,21 +820,37 @@ struct __gen_red_by_seg_scan_input auto __in_keys = std::get<0>(__in_rng.tuple()); auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; - using _OptKeyType = std::optional<_KeyType>; using _ValueType = oneapi::dpl::__internal::__value_t; - _OptKeyType __first_key; - if (__id == 0) - __first_key = _OptKeyType{__in_keys[0]}; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - if (__id == __n - 1) + const _KeyType& __current_key = __in_keys[__id]; + // Ordering the most common condition first has yielded the best results. + if (__id > 0 && __id < __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + const _KeyType& __next_key = __in_keys[__id + 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), + __next_key, __current_key); + } + else if (__id == __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder - return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); + __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + } + else + { + const _KeyType& __next_key = __in_keys[__id + 1]; + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(std::size_t{0}, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); + } } _BinaryPred __binary_pred; + // For correctness of the function call operator, __n must be greater than 1. 
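+    // Illustrative trace (hypothetical input; std::equal_to as __binary_pred, __n == 3):
+    //   keys = {3, 3, 5}, vals = {1, 2, 7}
+    //   __id == 0 -> ((0, 1), false, 3, 3) : the same key follows, so no output is produced yet
+    //   __id == 1 -> ((0, 2), true,  5, 3) : the segment of 3s ends; the next segment's key is 5
+    //   __id == 2 -> ((1, 7), true,  5, 5) : a new segment starts and, as the last element, also ends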
std::size_t __n; }; @@ -877,7 +893,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = *get<3>(__tup); + __out_keys[0] = get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -1303,6 +1319,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); + // __gen_red_by_seg_scan_input requires that __n > 1 + assert(__n > 1); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), From e41298dfc84854324a8a01cddb3090344b45c1bf Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:49:44 -0700 Subject: [PATCH 18/31] Handle n=0, n=1 first in reduce_by_segment Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 2f42f5e4b60..7179e1f864a 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,22 +913,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - } -#endif // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. // @@ -960,6 +944,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
+ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } +#endif using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; From fb9a30635b5cec9041ef0e9ae9e1ffd9f7c6e69e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 08:50:00 -0700 Subject: [PATCH 19/31] Code cleanup Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 56 +++---------------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 20 +++---- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 23259e5e550..8a04717c5c0 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,9 +57,8 @@ #include "../pstl/utils_ranges.h" #include "../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/glue_algorithm_ranges_impl.h" +#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. 
-#include "scan_by_segment_impl.h" #endif namespace oneapi @@ -169,42 +168,9 @@ reduce_by_segment_impl(_Tag, Policy&& policy, InputIterator1 first1, InputIterat #if _ONEDPL_BACKEND_SYCL -template -class __seg_reduce_count_kernel; -template -class __seg_reduce_offset_kernel; -template -class __seg_reduce_wg_kernel; -template -class __seg_reduce_prefix_kernel; - -namespace -{ -template -using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; -template -using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; -template -using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; -template -using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; -} // namespace - -template -oneapi::dpl::__internal::__difference_t<_Range3> -__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - return oneapi::dpl::experimental::ranges::reduce_by_segment( - ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), - ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); -} - template -::std::pair +std::pair reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& policy, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator1 result1, OutputIterator2 result2, BinaryPred binary_pred, BinaryOperator binary_op) @@ -218,14 +184,14 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol // keys_result = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } -- result1 // values_result = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } -- result2 - using _CountType = ::std::uint64_t; + using _CountType = std::uint64_t; namespace __bknd = __par_backend_hetero; - const auto n = ::std::distance(first1, last1); + const auto n = std::distance(first1, last1); if (n == 0) - return ::std::make_pair(result1, result2); + return std::make_pair(result1, result2); auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); auto key_buf = keep_keys(first1, last1); @@ -236,16 +202,12 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); auto value_output_buf = keep_value_outputs(result2, result2 + n); - using has_known_identity = - typename unseq_backend::__has_known_identity::value_type>::type; - // number of unique keys - _CountType __n = - __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), - key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( + __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), + value_output_buf.all_view(), binary_pred, binary_op); - return ::std::make_pair(result1 + __n, result2 + __n); + return std::make_pair(result1 + __n, result2 + __n); } #endif } // namespace internal diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7895ace6730..54646e0b182 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -829,17 +829,16 
@@ struct __gen_red_by_seg_scan_input const _KeyType& __next_key = __in_keys[__id + 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__current_key, __next_key), - __next_key, __current_key); + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); } else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, + __current_key); // Passing __current_key as the next key for the last element is a placeholder } else { @@ -887,18 +886,19 @@ struct __write_red_by_seg using _KeyType = oneapi::dpl::__internal::__value_t; using _ValType = oneapi::dpl::__internal::__value_t; - const _KeyType& __next_segment_key = get<2>(__tup); - const _ValType& __cur_segment_value = get<1>(get<0>(__tup)); + const _KeyType& __next_key = get<2>(__tup); + const _KeyType& __current_key = get<3>(__tup); + const _ValType& __current_value = get<1>(get<0>(__tup)); const bool __is_seg_end = get<1>(__tup); const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = get<3>(__tup); + __out_keys[0] = __current_key; if (__is_seg_end) { - __out_values[__out_idx] = __cur_segment_value; + __out_values[__out_idx] = __current_value; if (__id != __n - 1) - __out_keys[__out_idx + 1] = __next_segment_key; + __out_keys[__out_idx + 1] = __next_key; } } _BinaryPred __binary_pred; From d32ea4d02dc37d02f3d6e5da3ee8eb5328e87e22 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 09:16:12 -0700 Subject: [PATCH 20/31] Improve comments and mark relevant variables as const Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 54646e0b182..204b8058b61 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -789,18 +789,21 @@ struct __gen_transform_input template struct __gen_red_by_seg_reduce_input { + // Returns the following tuple: + // (new_seg_mask, value) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); return oneapi::dpl::__internal::make_tuple(__new_seg_mask, 
_ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; - std::size_t __n; }; template @@ -817,8 +820,8 @@ struct __gen_red_by_seg_scan_input auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; const _KeyType& __current_key = __in_keys[__id]; @@ -827,7 +830,7 @@ struct __gen_red_by_seg_scan_input { const _KeyType& __prev_key = __in_keys[__id - 1]; const _KeyType& __next_key = __in_keys[__id + 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), !__binary_pred(__current_key, __next_key), __next_key, __current_key); @@ -835,7 +838,7 @@ struct __gen_red_by_seg_scan_input else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder @@ -1325,7 +1328,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); From aa4baafb931f6ba7c2e06b5c508438d377449116 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:16:18 -0700 Subject: [PATCH 21/31] Add condition to ensure value type is trivially copyable to call reduce-then-scan Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7179e1f864a..fc04e0296ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -944,26 +944,28 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; #if _ONEDPL_BACKEND_SYCL // We would normally dispatch to the parallel implementation which would make the decision to invoke // reduce-then-scan. 
However, since the fallback is implemented at the ranges level we must choose // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + if constexpr (std::is_trivially_copyable_v<__val_type>) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } } #endif - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. 
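As an aside, the dispatch shape introduced by the patch above can be summarized with a small
standalone sketch (all names here are hypothetical, not oneDPL API): the value type is gated at
compile time, and only then is the runtime device query consulted.

    #include <type_traits>

    template <typename _ValType, typename _RtsPath, typename _Fallback>
    auto
    __choose_rbs_path(bool __is_gpu_with_sg_32, _RtsPath __rts, _Fallback __fallback)
    {
        // Reduce-then-scan is only attempted for trivially copyable value types (see the patch
        // above); otherwise, or on devices without the required sub-group support, fall back.
        if constexpr (std::is_trivially_copyable_v<_ValType>)
        {
            if (__is_gpu_with_sg_32)
                return __rts();
        }
        return __fallback();
    }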
From 8cc59df59bb163b5689a050c2743c5884d6b3361 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:25:34 -0700 Subject: [PATCH 22/31] clang-format Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 204b8058b61..056a472a356 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1311,8 +1311,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, + _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, + _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; @@ -1328,8 +1329,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, - _WriteOp{__binary_pred, __n}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, + _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From 80ceacdae7b395ca1b5ff6c0d507e75c071733cd Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:52:13 -0600 Subject: [PATCH 23/31] Introduce iterator based __pattern_reduce_by_segment * An iterator based __pattern_reduce_by_segment is added * Due to compiler issues prior to icpx 2025.0, the reduce-then-scan path is disabled and the previous handcrafted SYCL implementation is restored to prevent performance regressions with older compilers * The previous range-based fallback implementation has been moved to the SYCL backend along with the handcrafted SYCL version Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 16 +- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 40 ++ .../hetero/algorithm_ranges_impl_hetero.h | 150 +----- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 176 +++++++ .../parallel_backend_sycl_reduce_by_segment.h | 464 ++++++++++++++++++ 5 files changed, 692 insertions(+), 154 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 8a04717c5c0..aa718e3743f 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,7 +57,7 @@ #include "../pstl/utils_ranges.h" #include 
"../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" +#include "../pstl/hetero/algorithm_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. #endif @@ -193,19 +193,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol if (n == 0) return std::make_pair(result1, result2); - auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); - auto key_buf = keep_keys(first1, last1); - auto keep_values = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator2>(); - auto value_buf = keep_values(first2, first2 + n); - auto keep_key_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator1>(); - auto key_output_buf = keep_key_outputs(result1, result1 + n); - auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); - auto value_output_buf = keep_value_outputs(result2, result2 + n); - // number of unique keys - _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( - __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__pattern_reduce_by_segment( + __tag, std::forward(policy), first1, last1, first2, result1, result2, binary_pred, binary_op); return std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 1a51076c612..d853f057561 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -2003,6 +2003,46 @@ __pattern_shift_right(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec return __last - __res; } +template +struct __copy_keys_values_wrapper; + +template +typename std::iterator_traits<_Iterator3>::difference_type +__pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Iterator1 __keys_first, + _Iterator1 __keys_last, _Iterator2 __values_first, _Iterator3 __out_keys_first, + _Iterator4 __out_values_first, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + std::size_t __n = std::distance(__keys_first, __keys_last); + + if (__n == 0) + return 0; + + if (__n == 1) + { + __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_op{}; + + oneapi::dpl::__internal::__pattern_walk2_n( + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_wrapper>(__exec), + oneapi::dpl::make_zip_iterator(__keys_first, __values_first), 1, + oneapi::dpl::make_zip_iterator(__out_keys_first, __out_values_first), __copy_op); + + return 1; + } + + auto __keep_keys = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __keys = __keep_keys(__keys_first, __keys_last); + auto __keep_values = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); + auto __values = __keep_values(__values_first, __values_first + __n); + auto __keep_key_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>(); + auto __out_keys = __keep_key_outputs(__out_keys_first, __out_keys_first + __n); + auto __keep_value_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>(); + auto 
__out_values = __keep_value_outputs(__out_values_first, __out_values_first + __n); + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __keys.all_view(), __values.all_view(), + __out_keys.all_view(), __out_values.all_view(), __binary_pred, __binary_op); +} + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index fc04e0296ff..da7820b91a2 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -889,22 +889,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ //------------------------------------------------------------------------ template -struct __copy_keys_wrapper; - -template -struct __copy_values_wrapper; - -template -struct __reduce1_wrapper; - -template -struct __reduce2_wrapper; - -template -struct __assign_key1_wrapper; - -template -struct __assign_key2_wrapper; +struct __copy_keys_values_range_wrapper; template @@ -932,135 +917,18 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_range{}; oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range, - std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys)); - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - __copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values)); + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_range_wrapper>(__exec), + __copy_range, + oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values))); return 1; } - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if constexpr (std::is_trivially_copyable_v<__val_type>) - { - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. 
- return std::get<0>(__res.get()) + 1; - } - } -#endif - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__tmp_out_keys), - experimental::ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. - std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. 
Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), __k3, - __clipped_new_keys); - auto __view4 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__out_keys), - experimental::ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. - auto __result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4, - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end), - experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - - return __result_end; + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 056a472a356..431d58d75ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -32,12 +32,14 @@ #include "../../iterator_impl.h" #include "../../execution_impl.h" #include "../../utils_ranges.h" +#include "../../ranges_defs.h" #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" +#include "parallel_backend_sycl_reduce_by_segment.h" #include "parallel_backend_sycl_reduce_then_scan.h" #include "execution_sycl_defs.h" #include "sycl_iterator.h" @@ -2273,6 +2275,180 @@ __parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_ return __parallel_partial_sort_impl(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), 
__buf.all_view(), __partial_merge_kernel{__mid_idx}, __comp); } + +//------------------------------------------------------------------------ +// reduce_by_segment - sync pattern +//------------------------------------------------------------------------ + +// TODO: The non-identity fallback path of reduce-by-segment must currently be implemented synchronously due to the +// inability to create event dependency chains across separate parallel pattern calls. If we ever add support for +// cross parallel pattern dependencies, then we can implement this as an async pattern. +template +struct __reduce1_wrapper; + +template +struct __reduce2_wrapper; + +template +struct __assign_key1_wrapper; + +template +struct __assign_key2_wrapper; + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + // The algorithm reduces values in __values where the + // associated keys for the values are equal to the adjacent key. + // + // Example: __keys = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // __values = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // + // __out_keys = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } + // __out_values = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } + + const auto __n = __keys.size(); + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. +#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000 + if constexpr (std::is_trivially_copyable_v<__val_type>) + { + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } + } +#endif + if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) + { + return __parallel_reduce_by_segment_known_identity( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + } + else + { + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. 
+ auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. + auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. 
+ auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. + auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h new file mode 100644 index 00000000000..14860e3830a --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -0,0 +1,464 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_reduce_by_segment.h ---------------------------------===// +/* Copyright (c) Intel Corporation + * + *  Copyright 2008-2013 NVIDIA Corporation + * + *  Licensed under the Apache License, Version 2.0 (the "License"); + *  you may not use this file except in compliance with the License. + *  You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + *  Unless required by applicable law or agreed to in writing, software + *  distributed under the License is distributed on an "AS IS" BASIS, + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + *  See the License for the specific language governing permissions and + *  limitations under the License. + * + *  Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved. 
+ *  + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions are met: + *     * Redistributions of source code must retain the above copyright + *       notice, this list of conditions and the following disclaimer. + *     * Redistributions in binary form must reproduce the above copyright + *       notice, this list of conditions and the following disclaimer in the + *       documentation and/or other materials provided with the distribution. + *     * Neither the name of the NVIDIA CORPORATION nor the + *       names of its contributors may be used to endorse or promote products + *       derived from this software without specific prior written permission. + *  + *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  + *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  + *  ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H + +#include +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "utils_ranges_sycl.h" +#include "sycl_traits.h" + +#include "../../utils.h" +#include "../../../internal/scan_by_segment_impl.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __seg_reduce_count_kernel; +template +class __seg_reduce_offset_kernel; +template +class __seg_reduce_wg_kernel; +template +class __seg_reduce_prefix_kernel; + +namespace +{ +template +using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; +template +using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; +template +using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; +template +using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; +} // namespace + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceWgPhase, _CustomName, 
_ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const std::size_t __n = __keys.size(); + + constexpr std::uint16_t __vals_per_item = + 16; // Each work item serially processes 16 items. Best observed performance on gpu + + // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. + // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); + + // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use + // of SLM. + __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( + __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); + +#if _ONEDPL_COMPILE_KERNEL + auto __seg_reduce_count_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); + auto __seg_reduce_offset_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); + auto __seg_reduce_wg_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); + auto __seg_reduce_prefix_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); + __wgroup_size = + std::min({__wgroup_size, + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); +#endif + + std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); + + // intermediate reductions within a workgroup + auto __partials = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); + + auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); + + // the number of segment ends found in each work group + auto __seg_ends = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // buffer that stores an exclusive scan of the results + auto __seg_ends_scanned = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // 1. 
Count the segment ends in each workgroup + auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceCountKernel>( + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_count_kernel, +#endif + sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + // 1a. Work item scan to identify segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + // 1b. Work group reduction + std::size_t __num_segs = __dpl_sycl::__reduce_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 1c. First work item writes segment count to global memory + if (__local_id == 0) + __seg_ends_acc[__group_id] = __num_segs; + }); + }); + + // 1.5 Small single-group kernel + auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__seg_end_identification); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceOffsetKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_offset_kernel, +#endif + sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); + auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); + __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, + __diff_type(0), sycl::plus<__diff_type>()); + }); + }); + + // 2. Work group reduction + auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__single_group_scan); + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceWgKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_wg_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + std::array<__val_type, __vals_per_item> __loc_partials; + + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + // 2a. 
Lookup the number of prior segs + auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 2b. Perform a serial scan within the work item over assigned elements. Store partial + // reductions in work group local memory. + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + + std::size_t __max_end = 0; + std::size_t __item_segments = 0; + auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + __val_type __accumulator = __identity; + for (std::size_t __i = __start; __i < __end; ++__i) + { + __accumulator = __binary_op(__accumulator, __values[__i]); + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + __loc_partials[__i - __start] = __accumulator; + ++__item_segments; + __max_end = __local_id; + __accumulator = __identity; + } + } + + // 2c. Count the number of prior work segments cooperatively over group + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; + + // 2d. Find the greatest segment end less than the current index (inclusive) + std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( + __group, __max_end, __dpl_sycl::__maximum()); + + // __wg_segmented_scan is a derivative work and responsible for the third header copyright + __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( + __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, + __binary_op, __wgroup_size); + + // 2e. Update local partial reductions in first segment and write to global memory. + bool __apply_aggs = true; + std::size_t __item_offset = 0; + + // first item in group does not have any work-group aggregates to apply + if (__local_id == 0) + { + __apply_aggs = false; + if (__global_id == 0 && __n > 0) + { + // first segment identifier is always the first key + __out_keys[0] = __keys[0]; + } + } + + // apply the aggregates and copy the locally stored values to destination buffer + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __start_idx + __item_offset; + if (__apply_aggs) + { + __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); + __apply_aggs = false; + } + else + { + __out_values[__idx] = __loc_partials[__i - __start]; + } + if (__i != __n - 1) + { + __out_keys[__idx + 1] = __keys[__i + 1]; + } + ++__item_offset; + } + } + + // 2f. Output the work group aggregate and total number of segments for use in phase 3. + if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out + { + // If no segment ends in the item, the aggregates from previous work groups must be applied. + if (__max_end == 0) + { + // needs to be inclusive with last element + __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); + } + else + { + __partials_acc[__group_id] = __accumulator; + } + } + }); + }); + + // 3. 
Apply inter work-group aggregates + __exec.queue() + .submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __end_idx_acc = __end_idx.template get_access(__cgh); + + __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); + __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); + + __cgh.depends_on(__wg_reduce); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReducePrefixKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_prefix_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::int64_t __group_id = __item.get_group(0); + std::size_t __global_id = __item.get_global_id(0); + std::size_t __local_id = __item.get_local_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + std::int64_t __wg_agg_idx = __group_id - 1; + __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + bool __ag_exists = false; + // 3a. Check to see if an aggregate exists and compute that value in the first + // work item. + if (__group_id != 0) + { + __ag_exists = __start < __n; + // local reductions followed by a sweep + constexpr std::int32_t __vals_to_explore = 16; + bool __last_it = false; + __loc_seg_ends_acc[__local_id] = false; + __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + for (std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; + __i -= __wgroup_size * __vals_to_explore) + { + __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + // exploration phase + for (std::int32_t __j = __i; + __j > __dpl_sycl::__maximum{}(-1L, __i - __vals_to_explore); --__j) + { + __local_collector = __binary_op(__partials_acc[__j], __local_collector); + if (__seg_ends_acc[__j] || __j == 0) + { + __loc_seg_ends_acc[__local_id] = true; + break; + } + } + __loc_partials_acc[__local_id] = __local_collector; + __dpl_sycl::__group_barrier(__item); + // serial aggregate collection and synchronization + if (__local_id == 0) + { + for (std::size_t __j = 0; __j < __wgroup_size; ++__j) + { + __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); + if (__loc_seg_ends_acc[__j]) + { + __last_it = true; + break; + } + } + } + __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); + __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); + } + + // Check to see if aggregates exist. + // The last group must always stay to write the final index + __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); + if (!__ag_exists && __group_id != __n_groups - 1) + return; + } + // 3b. 
count the segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 3c. Determine prior index + std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 3d. Second pass over the keys, reidentifying end segments and applying work group + // aggregates if appropriate. Both the key and reduction value are written to the final output at the + // computed index + std::size_t __item_offset = 0; + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; + + // apply the aggregate if it is the first segment end in the workgroup only + if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) + __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); + + ++__item_offset; + // the last item must write the last index's position to return + if (__i == __n - 1) + __end_idx_acc[0] = __idx; + } + } + }); + }) + .wait(); + + return __end_idx.get_host_access()[0] + 1; +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif + From 74143b2936eb001c379e28c074a29a3dd03b7365 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:57:48 -0600 Subject: [PATCH 24/31] Revert "Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro" This reverts commit a4c783533fcd8675094f77681d6d00dc3d23c0f7. --- CMakeLists.txt | 10 ++++++++++ cmake/README.md | 1 + .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++++++++++++--- test/CMakeLists.txt | 5 +++++ .../numeric.ops/reduce_by_segment.pass.cpp | 20 ++++++++++++++++--- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c682e745f9a..3cb28a1693d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,6 +287,16 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() + if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") + else() + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") + endif() + endif() + # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 0683a377820..7335b7e2312 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,6 +18,7 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. 
AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO |
| ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON |
+| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results that may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | |

Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version):

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index 9c588d0bf52..b17c619de89 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -34,11 +34,21 @@ namespace unseq_backend
 //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace.
 //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard.

+template <typename _Tp>
+inline constexpr bool __can_use_known_identity =
+# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+    // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types
+    !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t));
+# else
+    true;
+# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+
 //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to
//std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = +using __has_known_identity = ::std::conditional_t< + __can_use_known_identity<_Tp>, # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -50,15 +60,16 @@ using __has_known_identity = ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 + ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e85e8e9f5f8..90eb3d5c737 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,6 +195,7 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") + set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -208,6 +209,10 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") + elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") + string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 2cee63239b6..f71f78ed26e 100644 --- 
a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -13,6 +13,14 @@
 //
 //===----------------------------------------------------------------------===//

+#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
+#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
 #include "support/test_config.h"

 #include "oneapi/dpl/execution"
@@ -298,10 +306,16 @@ void
 run_test_on_device()
 {
 #if TEST_DPCPP_BACKEND_PRESENT
-    if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+    // Skip testing of 64-bit types when the algorithm is broken and the workaround is not enabled
+#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+    if constexpr (sizeof(ValueType) != 8)
+#endif
     {
-        constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
-        test4buffers>();
+        if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+        {
+            constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
+            test4buffers>();
+        }
     }
 #endif // TEST_DPCPP_BACKEND_PRESENT
 }

From fdf6a3952d8fed112159962ea48dccfe3a1470f9 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 5 Nov 2024 14:00:41 -0600
Subject: [PATCH 25/31] Fix test bug where device allocation is always used for
 testing

Signed-off-by: Matthew Michel
---
 .../parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
index f71f78ed26e..80aa9f53d3c 100644
--- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -314,7 +314,7 @@ run_test_on_device()
     if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
     {
         constexpr sycl::usm::alloc allocation_type = use_device_alloc ?
sycl::usm::alloc::device : sycl::usm::alloc::shared; - test4buffers>(); + test4buffers>(); } } #endif // TEST_DPCPP_BACKEND_PRESENT From 5987df9c1c2f469a265cbf884b85c836d87d1e1e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 13:45:21 -0600 Subject: [PATCH 26/31] Separate each reduce_by_segment fallback path into their own functions Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 239 +++++++++--------- .../parallel_backend_sycl_reduce_by_segment.h | 7 +- 2 files changed, 129 insertions(+), 117 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 431d58d75ea..61975975363 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2295,6 +2295,126 @@ struct __assign_key1_wrapper; template struct __assign_key2_wrapper; +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, + /*known_identity=*/std::false_type) +{ + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const auto __n = __keys.size(); + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. + auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. 
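+        // (Editor's illustration, not part of the patch: with a hypothetical __wgroup_size of 4 and
+        // keys {a, a, a, a, a, a, b, b}, the predicate below selects index 0 (first element),
+        // index 4 (evenly divisible by the work-group size), and index 6 (key change), producing
+        // three pseudo-segments {a: [0, 4), a: [4, 6), b: [6, 8)}; round 2 later merges the two
+        // partial results for key 'a' into one.)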
+ auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. + auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
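+        // (Editor's illustration, not part of the patch: continuing the example above, round 1 left
+        // pseudo-segment keys {a, a, b} with one partial reduction each; this pass keeps only
+        // indices 0 and 2, where a segment truly begins, so the final reduce below combines both
+        // partial sums for key 'a'.)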
+ auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; +} + template oneapi::dpl::__internal::__difference_t<_Range3> @@ -2333,120 +2453,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe } } #endif - if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) - { - return __parallel_reduce_by_segment_known_identity( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - } - else - { - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. 
- std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - __n, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), - __k3, __clipped_new_keys); - auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
- auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), - __view3, __view4, __view3.size(), - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), - oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - return __result_end; - } + return __parallel_reduce_by_segment_fallback( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op, + oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{}); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index 14860e3830a..f3455d45779 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -89,9 +89,10 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, + /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; From d80377ebe28955b95310b64dde4906d56e31ef43 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 14:50:25 -0600 Subject: [PATCH 27/31] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 ++---- .../parallel_backend_sycl_reduce_by_segment.h | 21 +++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 61975975363..497790abff8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2312,8 +2312,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); auto __tmp_out_keys = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); auto __tmp_out_values = @@ -2359,8 +2358,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index f3455d45779..bc702137963 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -86,12 +86,13 @@ template using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace -template +template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; @@ -137,11 +138,10 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ auto __seg_reduce_prefix_kernel = __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); __wgroup_size = - std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); + std::min({__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, 
__seg_reduce_offset_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)});
 #endif

     std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item);
@@ -462,4 +462,3 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 } // namespace oneapi

 #endif
-

From 3c8154e66747e3a9a4279bcb0d33b72c318bea33 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Thu, 21 Nov 2024 16:40:45 -0600
Subject: [PATCH 28/31] Address comments in reduce-then-scan based
 implementation

Signed-off-by: Matthew Michel
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h    | 12 ++++++++++--
 .../implementation_details/device_copyable.pass.cpp |  6 +++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 497790abff8..2c8b5c472ae 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -802,6 +802,9 @@ struct __gen_red_by_seg_reduce_input
         const auto __in_keys = std::get<0>(__in_rng.tuple());
         const auto __in_vals = std::get<1>(__in_rng.tuple());
         using _ValueType = oneapi::dpl::__internal::__value_t;
+        // The first segment start (index 0) is not marked with a 1. This is because we need the first
+        // segment's key and value output index to be 0. We begin marking new segments only after the
+        // first.
         const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]});
     }
@@ -845,7 +848,7 @@ struct __gen_red_by_seg_scan_input
                 oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true,
                 __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder
         }
-        else
+        else // __id == 0
         {
             const _KeyType& __next_key = __in_keys[__id + 1];
             return oneapi::dpl::__internal::make_tuple(
@@ -867,7 +870,7 @@ struct __red_by_seg_op
     {
         using std::get;
         // The left-hand side has processed elements from the same segment, so update the reduction value.
-        if (std::get<0>(__rhs_tup) == 0)
+        if (get<0>(__rhs_tup) == 0)
         {
             return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup),
                                                        __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup)));
@@ -897,6 +900,11 @@ struct __write_red_by_seg
         const bool __is_seg_end = get<1>(__tup);
         const std::size_t __out_idx = get<0>(get<0>(__tup));

+        // With the exception of the first key, which is output at index 0, the first key in each segment is written
+        // by the work item that outputs the previous segment's reduction value. This is because the reduce_by_segment
+        // API requires that the first key in a segment is output, which matters when keys in a segment are not all
+        // equal (but satisfy the predicate). The last segment does not output a key as there are no future
+        // segments to process.
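+        // (Editor's illustration, not part of the patch: for keys {a, a, b, b}, the item at
+        // index 0 writes __out_keys[0] = a; the segment end at index 1 writes __out_values[0]
+        // and __out_keys[1] = b, the next segment's first key; the final segment end at
+        // index 3 writes only __out_values[1].)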
if (__id == 0) __out_keys[0] = __current_key; if (__is_seg_end) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 572c16b818a..1ae8489e35e 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -156,9 +156,9 @@ test_device_copyable() "__gen_transform_input is not device copyable with device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input static_assert( From db63d459f0e2114307b9012e5048bc136eccef7d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:36:52 -0600 Subject: [PATCH 29/31] Improve explanations of reduce-by-segment approach Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2c8b5c472ae..1d10b8ac29b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -864,6 +864,51 @@ struct __gen_red_by_seg_scan_input template struct __red_by_seg_op { + // Consider the following segment / value pairs that would be processed in reduce-then-scan by a sub-group of size 8: + // ---------------------------------------------------------- + // Keys: 0 0 1 1 2 2 2 2 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The reduce and scan input generation phase flags new segments (excluding index 0) for use in the sub-group scan + // operation. The above key, value pairs correspond to the following flag, value pairs: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The sub-group scan operation looks back by powers-of-2 applying encountered prefixes. The __red_by_seg_op + // operation performs a standard inclusive scan over the flags to compute output indices while performing a masked + // scan over values to avoid applying a previous segment's partial reduction. 
Previous value elements are reduced + // so long as the current index's flag is 0, indicating that input within its segment is still being processed + // ---------------------------------------------------------- + // Start: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // After step 1 (apply the i-1th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 1 1 0 0 + // Values: 1 2 1 2 1 2 2 2 + // ---------------------------------------------------------- + // After step 2 (apply the i-2th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 1 1 + // Values: 1 2 1 2 1 2 3 4 + // ---------------------------------------------------------- + // After step 3 (apply the i-4th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 2 2 + // Values: 1 2 1 2 1 2 3 4 + // ^ ^ ^ + // ---------------------------------------------------------- + // Note that the scan of segment flags results in the desired output index of the reduce_by_segment operation in + // each segment and the item corresponding to the final key in a segment contains its output reduction value. This + // operation is first applied within a sub-group and then across sub-groups, work-groups, and blocks to + // reduce-by-segment across the full input. The result of these operations combined with cached key data in + // __gen_red_by_seg_scan_input enables the write phase to output keys and reduction values. + // => + // Segments : 0 1 2 + // Values : 2 2 4 template auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const @@ -1326,10 +1371,15 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + // Flags new segments and passes input value through a 2-tuple using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; + // Operation that computes output indices and output reduction values per segment using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + // Returns 4-component tuple which contains flags, keys, value, and a flag to write output using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + // Returns the first component from scan input which is scanned over using _ScanInputTransform = __get_zeroth_element; + // Writes current segment's output reduction and the next segment's output key using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); From 3deed76b7eade1397fd0f584f414f979df95fca8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:54:08 -0600 Subject: [PATCH 30/31] Use binary_op[_non]_device_copyable where appropriate Signed-off-by: Matthew Michel --- .../device_copyable.pass.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 1ae8489e35e..474e4268b69 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -161,9 +161,9 @@ test_device_copyable() 
"__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -196,7 +196,7 @@ test_device_copyable() //__write_red_by_seg static_assert( - sycl::is_device_copyable_v>, + sycl::is_device_copyable_v>, "__write_red_by_seg is not device copyable with device copyable types"); // __early_exit_find_or @@ -418,14 +418,14 @@ test_non_device_copyable() "__gen_transform_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -457,9 +457,9 @@ test_non_device_copyable() "__write_to_id_if_else is device copyable with non device copyable types"); //__write_red_by_seg - static_assert( - !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_red_by_seg>, + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( From c641bd39dc0187237f7761ac6d7b71cd4e6cdb2d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 16:57:30 -0600 Subject: [PATCH 31/31] Address comments in fallback implementation Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_by_segment.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index bc702137963..62ae736782d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -175,7 +175,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ sycl::nd_item<1> __item) { auto __group = __item.get_group(); std::size_t __group_id = __item.get_group(0); - std::size_t __local_id = __item.get_local_id(0); + std::uint32_t __local_id = __item.get_local_id(0); std::size_t __global_id = __item.get_global_id(0); std::size_t __start = __global_id * __vals_per_item; @@ -267,12 +267,12 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // 2c. 
Count the number of prior work segments cooperatively over group std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); + __group, __item_segments, __dpl_sycl::__plus()); std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; // 2d. Find the greatest segment end less than the current index (inclusive) std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); + __group, __max_end, __dpl_sycl::__maximum()); // __wg_segmented_scan is a derivative work and responsible for the third header copyright __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( @@ -287,7 +287,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ if (__local_id == 0) { __apply_aggs = false; - if (__global_id == 0 && __n > 0) + if (__global_id == 0) { // first segment identifier is always the first key __out_keys[0] = __keys[0];
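
Editor's addendum (not part of the patch series): a minimal serial sketch of the reduce_by_segment
semantics the patches above implement, plus the segmented combine step described in PATCH 28/29.
All names below are hypothetical illustrations, not oneDPL API, and std::plus/std::equal_to stand
in for the generic __binary_op/__binary_pred.

#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

// Reduce runs of values whose adjacent keys satisfy the predicate; output the first key of each
// segment and the segment's reduction. Returns the number of segments written.
template <typename _Key, typename _Val, typename _BinaryPredicate, typename _BinaryOperator>
std::size_t
serial_reduce_by_segment(const std::vector<_Key>& keys, const std::vector<_Val>& vals,
                         std::vector<_Key>& out_keys, std::vector<_Val>& out_vals,
                         _BinaryPredicate pred, _BinaryOperator op)
{
    if (keys.empty())
        return 0;
    _Key seg_key = keys[0]; // first key of the current segment
    _Val acc = vals[0];     // running reduction of the current segment
    std::size_t n_segments = 0;
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        if (pred(keys[i - 1], keys[i]))
            acc = op(acc, vals[i]); // same segment: keep reducing
        else
        {
            out_keys.push_back(seg_key); // segment end: emit key and reduction
            out_vals.push_back(acc);
            ++n_segments;
            seg_key = keys[i];
            acc = vals[i];
        }
    }
    out_keys.push_back(seg_key);
    out_vals.push_back(acc);
    return ++n_segments;
}

// The combine step of __red_by_seg_op in serial form: elements are (segment flag count, value);
// the right value absorbs the left only while the right flag is 0 (still in the same segment).
template <typename _Val, typename _BinaryOperator>
std::pair<std::size_t, _Val>
red_by_seg_combine(std::pair<std::size_t, _Val> lhs, std::pair<std::size_t, _Val> rhs, _BinaryOperator op)
{
    if (rhs.first == 0)
        return {lhs.first, op(lhs.second, rhs.second)};
    return {lhs.first + rhs.first, rhs.second};
}

int main()
{
    // The walkthrough from PATCH 29: keys {0,0,1,1,2,2,2,2}, all values 1.
    std::vector<int> keys{0, 0, 1, 1, 2, 2, 2, 2};
    std::vector<int> vals(keys.size(), 1);
    std::vector<int> out_keys, out_vals;
    std::size_t n = serial_reduce_by_segment(keys, vals, out_keys, out_vals,
                                             std::equal_to<int>{}, std::plus<int>{});
    for (std::size_t i = 0; i < n; ++i)
        std::cout << out_keys[i] << ": " << out_vals[i] << '\n'; // prints 0: 2, 1: 2, 2: 4

    // An inclusive scan with red_by_seg_combine reproduces the final diagram row of PATCH 29:
    // flags {0,0,1,1,2,2,2,2} and values {1,2,1,2,1,2,3,4}.
    std::vector<std::pair<std::size_t, int>> scan{{0, 1}};
    for (std::size_t i = 1; i < keys.size(); ++i)
    {
        std::pair<std::size_t, int> x{std::size_t(keys[i] != keys[i - 1]), vals[i]};
        scan.push_back(red_by_seg_combine(scan.back(), x, std::plus<int>{}));
    }
    for (const auto& p : scan)
        std::cout << p.first << '/' << p.second << ' '; // 0/1 0/2 1/1 1/2 2/1 2/2 2/3 2/4
    std::cout << '\n';
    return 0;
}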