Always use size 16 sub-groups in single work-group radix sort if supp…

…orted (#1833) * Only use sub-group sizes of 16 in one-wg radix sort This change is added to avoid a bug where IGC cannot compile SIMD32 kernels with -O0 compilation flags. No performance impact is observed. Signed-off-by: Matthew Michel <[email protected]> --------- Signed-off-by: Matthew Michel <[email protected]>
oneapi-src · Sep 13, 2024 · 87d85d5 · 87d85d5
1 parent 4f846e0
commit 87d85d5
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 7 deletions.
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -816,11 +816,11 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
     else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size)
         __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}(
             __exec.queue(), ::std::forward<_Range>(__in_rng), __proj);
-    // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED
-    // based upon the iters per item. For the below cases, register spills that result in runtime exceptions have
-    // been observed on accelerators that do not support the requested sub-group size of 16. For the above cases
-    // that request but may not receive a sub-group size of 16, inputs are small enough to avoid register
-    // spills on assessed hardware.
+    // In __subgroup_radix_sort, we request a sub-group size of 16 via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED
+    // for compilation targets that support this option. For the below cases, register spills that result in
+    // runtime exceptions have been observed on accelerators that do not support the requested sub-group size of 16.
+    // For the above cases that request but may not receive a sub-group size of 16, inputs are small enough to avoid
+    // register spills on assessed hardware.
     else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16)
         __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}(
             __exec.queue(), ::std::forward<_Range>(__in_rng), __proj);

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
@@ -30,8 +30,7 @@ template <typename... _Name>
 class __radix_sort_one_wg_kernel;
 
 template <typename _KernelNameBase, uint16_t __wg_size = 256 /*work group size*/, uint16_t __block_size = 16,
-          ::std::uint32_t __radix = 4, bool __is_asc = true,
-          uint16_t __req_sub_group_size = (__block_size < 4 ? 32 : 16)>
+          std::uint32_t __radix = 4, bool __is_asc = true>
 struct __subgroup_radix_sort
 {
     template <typename _RangeIn, typename _Proj>
@@ -147,6 +146,7 @@ struct __subgroup_radix_sort
         auto
         operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter)
         {
+            constexpr std::uint16_t __req_sub_group_size = 16;
             uint16_t __n = __src.size();
             assert(__n <= __block_size * __wg_size);