diff --git a/documentation/library_guide/kernel_templates/esimd/radix_sort.rst b/documentation/library_guide/kernel_templates/esimd/radix_sort.rst index c74ece79a27..3b41f9ed9e7 100644 --- a/documentation/library_guide/kernel_templates/esimd/radix_sort.rst +++ b/documentation/library_guide/kernel_templates/esimd/radix_sort.rst @@ -1,16 +1,17 @@ Radix Sort ########## ---------------------------------------------------- -radix_sort and radix_sort_by_key Function Templates ---------------------------------------------------- +----------------------------- +radix_sort Function Templates +----------------------------- -The ``radix_sort`` and ``radix_sort_by_key`` functions sort data using the radix sort algorithm. -The sorting is stable, ensuring the preservation of the relative order of elements with equal keys. -The functions implement a Onesweep* [#fnote1]_ algorithm variant. Both in-place and out-of-place -overloads are provided. For out-of-place overloads, the input data order is preserved. +The ``radix_sort`` and ``radix_sort_copy`` functions sort data using the radix sort algorithm. +The sorting is stable, preserving the relative order of elements with equal keys. +Both in-place and out-of-place overloads are provided. Out-of-place overloads do not alter the input sequence. -A synopsis of the ``radix_sort`` and ``radix_sort_by_key`` functions is provided below: +The functions implement a Onesweep* [#fnote1]_ algorithm variant. + +A synopsis of the ``radix_sort`` and ``radix_sort_copy`` functions are provided below: .. code:: cpp @@ -18,7 +19,7 @@ A synopsis of the ``radix_sort`` and ``radix_sort_by_key`` functions is provided namespace oneapi::dpl::experimental::kt::gpu::esimd { - // Sort a single sequence + // Sort in-place template sycl::event @@ -31,53 +32,19 @@ A synopsis of the ``radix_sort`` and ``radix_sort_by_key`` functions is provided radix_sort (sycl::queue q, Range&& r, KernelParam param); // (2) - // Sort a single sequence out-of-place + // Sort out-of-place template sycl::event - radix_sort (sycl::queue q, Iterator1 first, Iterator1 last, - Iterator2 first_out, KernelParam param) // (3) + radix_sort_copy (sycl::queue q, Iterator1 first, Iterator1 last, + Iterator2 first_out, KernelParam param); // (3) template sycl::event - radix_sort (sycl::queue q, Range1&& r, Range2&& r_out, - KernelParam param) // (4) - - - // Sort a sequence of keys and apply the same order to a sequence of values - template - sycl::event - radix_sort_by_key (sycl::queue q, Iterator1 keys_first, Iterator1 keys_last, - Iterator2 values_first, KernelParam param); // (5) - - template - sycl::event - radix_sort_by_key (sycl::queue q, KeysRng&& keys, - ValuesRng&& values, KernelParam param); // (6) - - - // Sort a sequence of keys and values out-of-place - template - sycl::event - radix_sort_by_key (sycl::queue q, KeysIterator1 keys_first, - KeysIterator1 keys_last, ValsIterator1 vals_first, - KeysIterator2 keys_out_first, ValsIterator2 vals_out_first, - KernelParam param) // (7) - - template - sycl::event - radix_sort_by_key (sycl::queue q, KeysRng1&& keys, ValsRng1&& values, - KeysRng2&& keys_out, ValsRng2&& vals_out, - KernelParam param) // (8) + radix_sort_copy (sycl::queue q, Range1&& r, Range2&& r_out, + KernelParam param); // (4) } @@ -99,22 +66,22 @@ Parameters +-----------------------------------------------+---------------------------------------------------------------------+ | Name | Description | +===============================================+=====================================================================+ -| ``q`` | The SYCL* queue where kernels are submitted. | +| ``q`` | The SYCL* queue where kernels are submitted. | +-----------------------------------------------+---------------------------------------------------------------------+ | | | | | The sequences to apply the algorithm to. | | - ``first``, ``last`` (1), | Supported sequence types: | | - ``r`` (2), | | -| - ``first``, ``last``, ``first_out`` (3), | - :ref:`USM pointers ` (1,3,5,7), | -| - ``r``, ``r_out`` (4), | - :ref:`oneapi::dpl::begin and oneapi::dpl::end | -| - ``keys_first``, ``keys_last``, | ` (1,3,5,7). | -| ``values_first`` (5), | - ``sycl::buffer`` (2,4,6,8), | -| - ``keys``, ``values`` (6), | - :ref:`oneapi::dpl::experimental::ranges::views::all | -| - ``keys_first``, ``keys_last``, | ` (2,4,6,8), | -| ``vals_first``, ``keys_out_first``, | - :ref:`oneapi::dpl::experimental::ranges::views::subrange | -| ``values_out_first`` (7) | ` (2,4,6,8), | -| - ``keys``, ``values``, | | -| ``keys_out``, ``values_out`` (8), | | +| - ``first``, ``last``, ``first_out`` (3), | - :ref:`USM pointers ` (1,3), | +| - ``r``, ``r_out`` (4). | - :ref:`oneapi::dpl::begin and oneapi::dpl::end | +| | ` (1,3). | +| | - ``sycl::buffer`` (2,4), | +| | - :ref:`oneapi::dpl::experimental::ranges::views::all | +| | ` (2,4), | +| | - :ref:`oneapi::dpl::experimental::ranges::views::subrange | +| | ` (2,4). | +| | | +| | | | | | +-----------------------------------------------+---------------------------------------------------------------------+ | ``param`` | A :doc:`kernel_param <../kernel_configuration>` object. | @@ -147,8 +114,8 @@ Usage Examples -------------- -radix_sort In-Place Example ---------------------------- +In-Place Example +---------------- .. code:: cpp @@ -192,72 +159,14 @@ radix_sort In-Place Example 5 3 3 3 2 1 -radix_sort_by_key In-Place Example ----------------------------------- - -.. code:: cpp - - // possible build and run commands: - // icpx -fsycl radix_sort_by_key.cpp -o radix_sort_by_key -I /path/to/oneDPL/include && ./radix_sort_by_key - - #include - #include - #include - - #include - - namespace kt = oneapi::dpl::experimental::kt; - - int main() - { - std::size_t n = 6; - sycl::queue q{sycl::gpu_selector_v}; - sycl::buffer keys{sycl::range<1>(n)}; - sycl::buffer values{sycl::range<1>(n)}; - - // initialize - { - sycl::host_accessor k_acc{keys, sycl::write_only}; - k_acc[0] = 3, k_acc[1] = 2, k_acc[2] = 1, k_acc[3] = 5, k_acc[4] = 3, k_acc[5] = 3; - - sycl::host_accessor v_acc{values, sycl::write_only}; - v_acc[0] = 'r', v_acc[1] = 'o', v_acc[2] = 's', v_acc[3] = 'd', v_acc[4] = 't', v_acc[5] = 'e'; - } - - // sort - auto e = kt::gpu::esimd::radix_sort_by_key(q, keys, values, kt::kernel_param<96, 64>{}); // (6) - e.wait(); - - // print - { - sycl::host_accessor k_acc{keys, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << k_acc[i] << ' '; - std::cout << '\n'; - - sycl::host_accessor v_acc{values, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << v_acc[i] << ' '; - std::cout << '\n'; - } - - return 0; - } - -**Output:** - -.. code:: none - - 1 2 3 3 3 5 - s o r t e d -radix_sort Out-of-Place Example -------------------------------- +Out-of-Place Example +-------------------- .. code:: cpp // possible build and run commands: - // icpx -fsycl radix_sort.cpp -o radix_sort -I /path/to/oneDPL/include && ./radix_sort + // icpx -fsycl radix_sort_copy.cpp -o radix_sort_copy -I /path/to/oneDPL/include && ./radix_sort_copy #include #include @@ -278,7 +187,7 @@ radix_sort Out-of-Place Example keys[0] = 3, keys[1] = 2, keys[2] = 1, keys[3] = 5, keys[4] = 3, keys[5] = 3; // sort - auto e = kt::gpu::esimd::radix_sort(q, keys, keys + n, keys_out, kt::kernel_param<416, 64>{}); // (3) + auto e = kt::gpu::esimd::radix_sort_copy(q, keys, keys + n, keys_out, kt::kernel_param<416, 64>{}); // (3) e.wait(); // print @@ -301,92 +210,16 @@ radix_sort Out-of-Place Example 3 2 1 5 3 3 5 3 3 3 2 1 -radix_sort_by_key Out-of-Place Example --------------------------------------- - -.. code:: cpp - - // possible build and run commands: - // icpx -fsycl radix_sort_by_key.cpp -o radix_sort_by_key -I /path/to/oneDPL/include && ./radix_sort_by_key - - #include - #include - #include - - #include - - namespace kt = oneapi::dpl::experimental::kt; - - int main() - { - std::size_t n = 6; - sycl::queue q{sycl::gpu_selector_v}; - sycl::buffer keys{sycl::range<1>(n)}; - sycl::buffer keys_out{sycl::range<1>(n)}; - sycl::buffer values{sycl::range<1>(n)}; - sycl::buffer values_out{sycl::range<1>(n)}; - - - // initialize - { - sycl::host_accessor k_acc{keys, sycl::write_only}; - k_acc[0] = 3, k_acc[1] = 2, k_acc[2] = 1, k_acc[3] = 5, k_acc[4] = 3, k_acc[5] = 3; - - sycl::host_accessor v_acc{values, sycl::write_only}; - v_acc[0] = 'r', v_acc[1] = 'o', v_acc[2] = 's', v_acc[3] = 'd', v_acc[4] = 't', v_acc[5] = 'e'; - } - - // sort - auto e = kt::gpu::esimd::radix_sort_by_key(q, keys, values, keys_out, values_out, - kt::kernel_param<96, 64>{}); // (8) - e.wait(); - - // print - { - sycl::host_accessor k_acc{keys, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << k_acc[i] << ' '; - std::cout << '\n'; - - sycl::host_accessor v_acc{values, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << v_acc[i] << ' '; - std::cout << "\n\n"; - - sycl::host_accessor k_out_acc{keys_out, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << k_out_acc[i] << ' '; - std::cout << '\n'; - - sycl::host_accessor v_out_acc{values_out, sycl::read_only}; - for(std::size_t i = 0; i < n; ++i) - std::cout << v_out_acc[i] << ' '; - std::cout << '\n'; - } - return 0; - } - -**Output:** - -.. code:: none - - 3 2 1 5 3 3 - r o s d t e - - 1 2 3 3 3 5 - s o r t e d - - -.. _memory-requirements: +.. _radix-sort-memory-requirements: ------------------- Memory Requirements ------------------- -The algorithms use global and local device memory (see `SYCL 2020 Specification +The algorithm uses global and local device memory (see `SYCL 2020 Specification `_) -for intermediate data storage. For the algorithms to operate correctly, there must be enough memory +for intermediate data storage. For the algorithm to operate correctly, there must be enough memory on the device; otherwise, the behavior is undefined. The amount of memory that is required depends on input data and configuration parameters, as described below. @@ -396,12 +229,9 @@ Global Memory Requirements Global memory is used for copying the input sequence(s) and storing internal data such as radix value counters. The used amount depends on many parameters; below is an upper bound approximation: -:``radix_sort``: N\ :sub:`keys` + C * N\ :sub:`keys` - -:``radix_sort_by_key``: N\ :sub:`keys` + N\ :sub:`values` + C * N\ :sub:`keys` + N\ :sub:`keys` + C * N\ :sub:`keys` -where the sequence with keys takes N\ :sub:`keys` space, the sequence with values takes N\ :sub:`values` space, -and the additional space is C * N\ :sub:`keys`. +where the sequence with keys takes N\ :sub:`keys` space, and the additional space is C * N\ :sub:`keys`. The value of `C` depends on ``param.data_per_workitem``, ``param.workgroup_size``, and ``RadixBits``. For ``param.data_per_workitem`` set to `32`, ``param.workgroup_size`` to `64`, and ``RadixBits`` to `8`, @@ -412,36 +242,27 @@ Incrementing ``RadixBits`` increases `C` up to twice, while doubling either .. note:: If the number of elements to sort does not exceed ``param.data_per_workitem * param.workgroup_size``, - ``radix_sort`` is executed by a single work-group and does not use any global memory. - For ``radix_sort_by_key`` there is no single work-group implementation yet. + ``radix_sort[_copy]`` is executed by a single work-group and does not use any global memory. .. The estimation above is not very precise and it seems it is not necessary for the global memory. The C coefficient base is actually 0.53 instead of 1. An increment of RadixBits multiplies C by the factor of ~1.5 on average. - Additionally, C exceeds 1 for radix_sort_by_key, - when N is small and the global histogram takes more space than the sequences. - This space is small, single WG implementation will be added, therefore this is neglected. - -.. _local-memory: Local Memory Requirements ------------------------- -Local memory is used for reordering keys or key-value pairs within a work-group, +Local memory is used for reordering keys within a work-group, and for storing internal data such as radix value counters. The used amount depends on many parameters; below is an upper bound approximation: -:``radix_sort``: N\ :sub:`keys_per_workgroup` + C - -:``radix_sort_by_key``: N\ :sub:`keys_per_workgroup` + N\ :sub:`values_per_workgroup` + C + N\ :sub:`keys_per_workgroup` + C -where N\ :sub:`keys_per_workgroup` and N\ :sub:`values_per_workgroup` are the amounts of memory -to store keys and values, respectively. `C` is some additional space for storing internal data. +where N\ :sub:`keys_per_workgroup` is the amounts of memory to store keys. +`C` is some additional space for storing internal data. N\ :sub:`keys_per_workgroup` equals to ``sizeof(key_type) * param.data_per_workitem * param.workgroup_size``, -N\ :sub:`values_per_workgroup` equals to ``sizeof(value_type) * param.data_per_workitem * param.workgroup_size``, `C` does not exceed `4KB`. .. @@ -469,7 +290,7 @@ The initial configuration may be selected according to these high-level guidelin Increasing ``param.data_per_workitem`` should usually be preferred to increasing ``param.workgroup_size``, to avoid extra synchronization overhead within a work-group. -- When the number of elements to sort (N) is small (~16K or less) and the algorithm is ``radix_sort``, +- When the number of elements to sort (N) is small (~16K or less) and the algorithm is ``radix_sort[_copy]``, generally sorting is done more efficiently by a single work-group. Increase the ``param`` values to make ``N <= param.data_per_workitem * param.workgroup_size``. @@ -484,7 +305,7 @@ The initial configuration may be selected according to these high-level guidelin .. warning:: Avoid setting too large ``param.data_per_workitem`` and ``param.workgroup_size`` values. - Make sure that :ref:`Memory requirements ` are satisfied. + Make sure that :ref:`Memory requirements ` are satisfied. .. note:: diff --git a/documentation/library_guide/kernel_templates/esimd/radix_sort_by_key.rst b/documentation/library_guide/kernel_templates/esimd/radix_sort_by_key.rst new file mode 100755 index 00000000000..9b8cd721e92 --- /dev/null +++ b/documentation/library_guide/kernel_templates/esimd/radix_sort_by_key.rst @@ -0,0 +1,362 @@ +Radix Sort By Key +################# + +------------------------------------ +radix_sort_by_key Function Templates +------------------------------------ + +The ``radix_sort_by_key`` and ``radix_sort_copy_by_key`` functions sort keys using the radix sort algorithm, +applying the same order to the corresponding values. +The sorting is stable, preserving the relative order of elements with equal keys. +Both in-place and out-of-place APIs are provided. Out-of-place APIs do not alter the input sequences. + +The functions implement a Onesweep* [#fnote1]_ algorithm variant. + +A synopsis of the ``radix_sort_by_key`` and ``radix_sort_copy_by_key`` functions are provided below: + +.. code:: cpp + + // defined in + + namespace oneapi::dpl::experimental::kt::gpu::esimd { + + // Sort in-place + template + sycl::event + radix_sort_by_key (sycl::queue q, Iterator1 keys_first, Iterator1 keys_last, + Iterator2 values_first, KernelParam param); // (1) + + template + sycl::event + radix_sort_by_key (sycl::queue q, KeysRng&& keys, + ValuesRng&& values, KernelParam param); // (2) + + + // Sort out-of-place + template + sycl::event + radix_sort_copy_by_key (sycl::queue q, KeysIterator1 keys_first, + KeysIterator1 keys_last, ValuesIterator1 values_first, + KeysIterator2 keys_out_first, ValuesIterator2 values_out_first, + KernelParam param); // (3) + + template + sycl::event + radix_sort_copy_by_key (sycl::queue q, KeysRng1&& keys, ValuesRng1&& values, + KeysRng2&& keys_out, ValuesRng2&& values_out, + KernelParam param); // (4) + } + + +Template Parameters +-------------------- + ++-----------------------------+---------------------------------------------------------------------------------------+ +| Name | Description | ++=============================+=======================================================================================+ +| ``bool IsAscending`` | The sort order. Ascending: ``true``; Descending: ``false``. | ++-----------------------------+---------------------------------------------------------------------------------------+ +| ``std::uint8_t RadixBits`` | The number of bits to sort for each radix sort algorithm pass. | ++-----------------------------+---------------------------------------------------------------------------------------+ + + +Parameters +---------- + ++-----------------------------------------------+---------------------------------------------------------------------+ +| Name | Description | ++===============================================+=====================================================================+ +| ``q`` | The SYCL* queue where kernels are submitted. | ++-----------------------------------------------+---------------------------------------------------------------------+ +| | | +| | The sequences to apply the algorithm to. | +| - ``keys_first``, ``keys_last``, | Supported sequence types: | +| ``values_first`` (1), | | +| - ``keys``, ``values`` (2), | - :ref:`USM pointers ` (1,3), | +| - ``keys_first``, ``keys_last``, | - :ref:`oneapi::dpl::begin and oneapi::dpl::end | +| ``values_first``, ``keys_out_first``, | ` (1,3). | +| ``values_out_first`` (3) | - ``sycl::buffer`` (2,4), | +| - ``keys``, ``values``, | - :ref:`oneapi::dpl::experimental::ranges::views::all | +| ``keys_out``, ``values_out`` (4). | ` (2,4), | +| | - :ref:`oneapi::dpl::experimental::ranges::views::subrange | +| | ` (2,4). | +| | | +| | | +| | | ++-----------------------------------------------+---------------------------------------------------------------------+ +| ``param`` | A :doc:`kernel_param <../kernel_configuration>` object. | +| | Its ``data_per_workitem`` must be a positive multiple of 32. | +| | | +| | | ++-----------------------------------------------+---------------------------------------------------------------------+ + + +**Type Requirements**: + +- The element type of sequence(s) to sort must be a C++ integral or floating-point type + other than ``bool`` with a width of up to 64 bits. + +.. note:: + + Current limitations: + + - Number of elements to sort must not exceed `2^30`. + - ``RadixBits`` can only be `8`. + - ``param.workgroup_size`` can only be `64`. + +Return Value +------------ + +A ``sycl::event`` object representing the status of the algorithm execution. + +-------------- +Usage Examples +-------------- + + +In-Place Example +---------------- + +.. code:: cpp + + // possible build and run commands: + // icpx -fsycl radix_sort_by_key.cpp -o radix_sort_by_key -I /path/to/oneDPL/include && ./radix_sort_by_key + + #include + #include + #include + + #include + + namespace kt = oneapi::dpl::experimental::kt; + + int main() + { + std::size_t n = 6; + sycl::queue q{sycl::gpu_selector_v}; + sycl::buffer keys{sycl::range<1>(n)}; + sycl::buffer values{sycl::range<1>(n)}; + + // initialize + { + sycl::host_accessor k_acc{keys, sycl::write_only}; + k_acc[0] = 3, k_acc[1] = 2, k_acc[2] = 1, k_acc[3] = 5, k_acc[4] = 3, k_acc[5] = 3; + + sycl::host_accessor v_acc{values, sycl::write_only}; + v_acc[0] = 'r', v_acc[1] = 'o', v_acc[2] = 's', v_acc[3] = 'd', v_acc[4] = 't', v_acc[5] = 'e'; + } + + // sort + auto e = kt::gpu::esimd::radix_sort_by_key(q, keys, values, kt::kernel_param<96, 64>{}); // (2) + e.wait(); + + // print + { + sycl::host_accessor k_acc{keys, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << k_acc[i] << ' '; + std::cout << '\n'; + + sycl::host_accessor v_acc{values, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << v_acc[i] << ' '; + std::cout << '\n'; + } + + return 0; + } + +**Output:** + +.. code:: none + + 1 2 3 3 3 5 + s o r t e d + +Out-of-Place Example +-------------------- + +.. code:: cpp + + // possible build and run commands: + // icpx -fsycl radix_sort_copy_by_key.cpp -o radix_sort_copy_by_key -I /path/to/oneDPL/include && ./radix_sort_copy_by_key + + #include + #include + #include + + #include + + namespace kt = oneapi::dpl::experimental::kt; + + int main() + { + std::size_t n = 6; + sycl::queue q{sycl::gpu_selector_v}; + sycl::buffer keys{sycl::range<1>(n)}; + sycl::buffer keys_out{sycl::range<1>(n)}; + sycl::buffer values{sycl::range<1>(n)}; + sycl::buffer values_out{sycl::range<1>(n)}; + + + // initialize + { + sycl::host_accessor k_acc{keys, sycl::write_only}; + k_acc[0] = 3, k_acc[1] = 2, k_acc[2] = 1, k_acc[3] = 5, k_acc[4] = 3, k_acc[5] = 3; + + sycl::host_accessor v_acc{values, sycl::write_only}; + v_acc[0] = 'r', v_acc[1] = 'o', v_acc[2] = 's', v_acc[3] = 'd', v_acc[4] = 't', v_acc[5] = 'e'; + } + + // sort + auto e = kt::gpu::esimd::radix_sort_copy_by_key(q, keys, values, keys_out, values_out, + kt::kernel_param<96, 64>{}); // (4) + e.wait(); + + // print + { + sycl::host_accessor k_acc{keys, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << k_acc[i] << ' '; + std::cout << '\n'; + + sycl::host_accessor v_acc{values, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << v_acc[i] << ' '; + std::cout << "\n\n"; + + sycl::host_accessor k_out_acc{keys_out, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << k_out_acc[i] << ' '; + std::cout << '\n'; + + sycl::host_accessor v_out_acc{values_out, sycl::read_only}; + for(std::size_t i = 0; i < n; ++i) + std::cout << v_out_acc[i] << ' '; + std::cout << '\n'; + } + + return 0; + } + +**Output:** + +.. code:: none + + 3 2 1 5 3 3 + r o s d t e + + 1 2 3 3 3 5 + s o r t e d + + +.. _radix-sort-by-key-memory-requirements: + +------------------- +Memory Requirements +------------------- + +The algorithm uses global and local device memory (see `SYCL 2020 Specification +`_) +for intermediate data storage. For the algorithm to operate correctly, there must be enough memory +on the device; otherwise, the behavior is undefined. The amount of memory that is required +depends on input data and configuration parameters, as described below. + +Global Memory Requirements +-------------------------- + +Global memory is used for copying the input sequence(s) and storing internal data such as radix value counters. +The used amount depends on many parameters; below is an upper bound approximation: + + N\ :sub:`keys` + N\ :sub:`values` + C * N\ :sub:`keys` + +where the sequence with keys takes N\ :sub:`keys` space, the sequence with values takes N\ :sub:`values` space, +and the additional space is C * N\ :sub:`keys`. + +The value of `C` depends on ``param.data_per_workitem``, ``param.workgroup_size``, and ``RadixBits``. +For ``param.data_per_workitem`` set to `32`, ``param.workgroup_size`` to `64`, and ``RadixBits`` to `8`, +`C` approximately equals to `1`. +Incrementing ``RadixBits`` increases `C` up to twice, while doubling either +``param.data_per_workitem`` or ``param.workgroup_size`` leads to a halving of `C`. + +.. + The estimation above is not very precise and it seems it is not necessary for the global memory. + The C coefficient base is actually 0.53 instead of 1. + An increment of RadixBits multiplies C by the factor of ~1.5 on average. + + Additionally, C exceeds 1 for radix_sort[_copy]_by_key, + when N is small and the global histogram takes more space than the sequences. + This space is small, single WG implementation will be added, therefore this is neglected. + +Local Memory Requirements +------------------------- + +Local memory is used for reordering key-value pairs within a work-group, +and for storing internal data such as radix value counters. +The used amount depends on many parameters; below is an upper bound approximation: + + N\ :sub:`keys_per_workgroup` + N\ :sub:`values_per_workgroup` + C + +where N\ :sub:`keys_per_workgroup` and N\ :sub:`values_per_workgroup` are the amounts of memory +to store keys and values, respectively. `C` is some additional space for storing internal data. + +N\ :sub:`keys_per_workgroup` equals to ``sizeof(key_type) * param.data_per_workitem * param.workgroup_size``, +N\ :sub:`values_per_workgroup` equals to ``sizeof(value_type) * param.data_per_workitem * param.workgroup_size``, +`C` does not exceed `4KB`. + +.. + C as 4KB stands on these points: + 1) Extra space is needed to store a histogram to distribute keys. It's size is 4 * (2^RadixBits). + The estimation is correct for RadixBits 9 (2KB) and smaller. Support of larger RadixBits is not expected. + 1) N_keys + N_values is rounded up at 2KB border (temporarily as a workaround for a GPU driver bug). + +.. + The estimation assumes that reordering keys/pairs takes more space than ranking keys. + The ranking takes approximatelly "2 * workgroup_size * (2^RadixBits)" bytes. + It suprpasses Intel Data Center GPU Max SLM capacity in only marginal cases, + e.g., when RadixBits is 10 and workgroup_size is 64, or when RadixBits is 9 and workgroup_size is 128. + It is ignored as an unrealistic case. + +----------------------------------------- +Recommended Settings for Best Performance +----------------------------------------- + +The general advice is to choose kernel parameters based on performance measurements and profiling information. +The initial configuration may be selected according to these high-level guidelines: + +.. + TODO: add this part when param.workgroup_size supports more than one value: + Increasing ``param.data_per_workitem`` should usually be preferred to increasing ``param.workgroup_size``, + to avoid extra synchronization overhead within a work-group. + +- When the number of elements to sort ``N`` is less than 1M, utilizing all available + compute cores is key for better performance. Allow creating enough work chunks to feed all + X\ :sup:`e`-cores [#fnote2]_ on a GPU: ``param.data_per_workitem * param.workgroup_size ≈ N / xe_core_count``. + +- When the number of elements to sort is large (more than ~1M), maximizing the number of elements + processed by a work-group, which equals to ``param.data_per_workitem * param.workgroup_size``, + reduces synchronization overheads between work-groups and usually benefits the overall performance. + +.. warning:: + + Avoid setting too large ``param.data_per_workitem`` and ``param.workgroup_size`` values. + Make sure that :ref:`Memory requirements ` are satisfied. + +.. note:: + + ``param.data_per_workitem`` is the only available parameter to tune the performance, + since ``param.workgroup_size`` currently supports only one value (`64`). + + +.. [#fnote1] Andy Adinets and Duane Merrill (2022). Onesweep: A Faster Least Significant Digit Radix Sort for GPUs. Retrieved from https://arxiv.org/abs/2206.01784. +.. [#fnote2] The X\ :sup:`e`-core term is described in the `oneAPI GPU Optimization Guide + `_. + Check the number of cores in the device specification, such as `Intel® Data Center GPU Max specification + `_. diff --git a/documentation/library_guide/kernel_templates/esimd_main.rst b/documentation/library_guide/kernel_templates/esimd_main.rst index c717e46d1e0..73c4e50d7b9 100644 --- a/documentation/library_guide/kernel_templates/esimd_main.rst +++ b/documentation/library_guide/kernel_templates/esimd_main.rst @@ -8,7 +8,8 @@ This technology only supports Intel GPU devices. These templates are available in the ``oneapi::dpl::experimental::kt::gpu::esimd`` namespace. The following are implemented: -* :doc:`radix_sort and radix_sort_by_key ` +* :doc:`radix_sort ` +* :doc:`radix_sort_by_key ` .. toctree:: :maxdepth: 2 @@ -17,6 +18,7 @@ These templates are available in the ``oneapi::dpl::experimental::kt::gpu::esimd :hidden: esimd/radix_sort + esimd/radix_sort_by_key ------------------- System Requirements diff --git a/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h b/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h index 6d12e300990..285cc38010f 100644 --- a/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h +++ b/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h @@ -88,8 +88,8 @@ radix_sort_by_key(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __k template -std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, _KernelParam __param = {}) +sycl::event +radix_sort_copy(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); if (__keys_rng.size() == 0) @@ -103,9 +103,9 @@ radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, template -std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, _KeysIterator2 __keys_out_first, - _KernelParam __param = {}) +sycl::event +radix_sort_copy(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, + _KeysIterator2 __keys_out_first, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -125,9 +125,9 @@ radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_l template -std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rng, _KeysRng2&& __keys_out_rng, - _ValsRng2&& __vals_out_rng, _KernelParam __param = {}) +sycl::event +radix_sort_copy_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rng, _KeysRng2&& __keys_out_rng, + _ValsRng2&& __vals_out_rng, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); if (__keys_rng.size() == 0) @@ -143,9 +143,10 @@ radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rn template -std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, _ValsIterator1 __vals_first, - _KeysIterator2 __keys_out_first, _ValsIterator2 __vals_out_first, _KernelParam __param = {}) +sycl::event +radix_sort_copy_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, + _ValsIterator1 __vals_first, _KeysIterator2 __keys_out_first, _ValsIterator2 __vals_out_first, + _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -170,6 +171,62 @@ radix_sort_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 _ ::std::move(__pack_out), __param); } +template +[[deprecated("Use of oneapi::dpl::experimental::kt::gpu::radix_sort " + "API for out of place operations is deprecated " + "and will be removed in a future release. " + "Use oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy instead")]] +std::enable_if_t, sycl::event> +radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, _KernelParam __param = {}) +{ + return radix_sort_copy(__q, ::std::forward<_KeysRng1>(__keys_rng), ::std::forward<_KeysRng2>(__keys_rng_out), + __param); +} + +template +[[deprecated("Use of oneapi::dpl::experimental::kt::gpu::radix_sort " + "API for out of place operations is deprecated " + "and will be removed in a future release. " + "Use oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy instead")]] +std::enable_if_t, sycl::event> +radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, + _KeysIterator2 __keys_out_first, _KernelParam __param = {}) +{ + return radix_sort_copy(__q, __keys_first, __keys_last, __keys_out_first, __param); +} + +template +[[deprecated("Use of oneapi::dpl::experimental::kt::gpu::radix_sort_by_key " + "API for out of place operations is deprecated " + "and will be removed in a future release. " + "Use oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy_by_key instead")]] +std::enable_if_t, sycl::event> +radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rng, _KeysRng2&& __keys_out_rng, + _ValsRng2&& __vals_out_rng, _KernelParam __param = {}) +{ + return radix_sort_copy_by_key(__q, ::std::forward<_KeysRng1>(__keys_rng), ::std::forward<_ValsRng1>(__vals_rng), + ::std::forward<_KeysRng2>(__keys_out_rng), ::std::forward<_ValsRng2>(__vals_out_rng), + __param); +} + +template +[[deprecated("Use of oneapi::dpl::experimental::kt::gpu::radix_sort_by_key " + "API for out of place operations is deprecated " + "and will be removed in a future release. " + "Use oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy_by_key instead")]] +std::enable_if_t, sycl::event> +radix_sort_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, + _ValsIterator1 __vals_first, _KeysIterator2 __keys_out_first, _ValsIterator2 __vals_out_first, + _KernelParam __param = {}) +{ + return radix_sort_copy_by_key(__q, __keys_first, __keys_last, __vals_first, __keys_out_first, __vals_out_first, + __param); +} + } // namespace oneapi::dpl::experimental::kt::gpu::esimd namespace oneapi::dpl::experimental::kt diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 3e312108819..a36578b5f42 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -97,8 +97,8 @@ endfunction() # 5. Number of elements. # Parameters #1 and #4 were selected for generation to allow compilation and execution of a test within ~5 minutes function(_generate_esimd_sort_tests _key_value_pairs) - set(_base_file_all "esimd_radix_sort" "esimd_radix_sort_out_of_place") - set(_base_file_by_key_all "esimd_radix_sort_by_key" "esimd_radix_sort_by_key_out_of_place") + set(_base_file_all "esimd_radix_sort" "esimd_radix_sort_copy") + set(_base_file_by_key_all "esimd_radix_sort_by_key" "esimd_radix_sort_copy_by_key") set(_data_per_work_item_all "32" "64" "96" "128" "160" "192" "224" "256" "288" "320" "352" "384" "416" "448" "480" "512") set(_work_group_size_all "32" "64") set(_type_all "char" "uint16_t" "int" "uint64_t" "float" "double") diff --git a/test/kt/esimd_radix_sort_out_of_place.cpp b/test/kt/esimd_radix_sort_copy.cpp similarity index 93% rename from test/kt/esimd_radix_sort_out_of_place.cpp rename to test/kt/esimd_radix_sort_copy.cpp index ae1c6b3ecd2..ac3e362ee88 100644 --- a/test/kt/esimd_radix_sort_out_of_place.cpp +++ b/test/kt/esimd_radix_sort_copy.cpp @@ -1,5 +1,5 @@ // -*- C++ -*- -//===-- esimd_radix_sort_out_of_place.cpp ---------------------------------===// +//===-- esimd_radix_sort_copy.cpp ---------------------------------===// // // Copyright (C) 2023 Intel Corporation // @@ -55,7 +55,7 @@ test_all_view(sycl::queue q, std::size_t size, KernelParam param) sycl::buffer buf_out(output.data(), output.size()); oneapi::dpl::experimental::ranges::all_view view(buf); oneapi::dpl::experimental::ranges::all_view view_out(buf_out); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort(q, view, view_out, param).wait(); + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy(q, view, view_out, param).wait(); } std::string msg = "input modified with all_view, n: " + std::to_string(size); @@ -85,7 +85,7 @@ test_subrange_view(sycl::queue q, std::size_t size, KernelParam param) oneapi::dpl::experimental::ranges::views::subrange view_in(dt_input.get_data(), dt_input.get_data() + size); oneapi::dpl::experimental::ranges::views::subrange view_out(dt_output.get_data(), dt_output.get_data() + size); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort(q, view_in, view_out, param).wait(); + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy(q, view_in, view_out, param).wait(); std::vector output_actual(size); std::vector input_actual(input_ref); @@ -119,7 +119,7 @@ test_usm(sycl::queue q, std::size_t size, KernelParam param) std::stable_sort(output_ref.begin(), output_ref.end(), Compare{}); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort( + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy( q, dt_input.get_data(), dt_input.get_data() + size, dt_output.get_data(), param) .wait(); @@ -151,7 +151,7 @@ test_sycl_iterators(sycl::queue q, std::size_t size, KernelParam param) { sycl::buffer buf(input.data(), input.size()); sycl::buffer buf_out(output.data(), output.size()); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort( + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy( q, oneapi::dpl::begin(buf), oneapi::dpl::end(buf), oneapi::dpl::begin(buf_out), param) .wait(); } @@ -179,7 +179,7 @@ test_sycl_buffer(sycl::queue q, std::size_t size, KernelParam param) { sycl::buffer buf(input.data(), input.size()); sycl::buffer buf_out(output.data(), output.size()); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort(q, buf, buf_out, param).wait(); + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy(q, buf, buf_out, param).wait(); } std::string msg = "modified input data with sycl::buffer, n: " + std::to_string(size); @@ -199,7 +199,7 @@ test_small_sizes(sycl::queue q, KernelParam param) std::vector output(size, T{9}); std::vector output_ref(size, T{9}); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort( + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy( q, oneapi::dpl::begin(input), oneapi::dpl::begin(input), oneapi::dpl::begin(output), param) .wait(); EXPECT_EQ_RANGES(ref, input, "sort modified input data when size == 0"); diff --git a/test/kt/esimd_radix_sort_by_key_out_of_place.cpp b/test/kt/esimd_radix_sort_copy_by_key.cpp similarity index 95% rename from test/kt/esimd_radix_sort_by_key_out_of_place.cpp rename to test/kt/esimd_radix_sort_copy_by_key.cpp index 1ef7757090a..9bb238d57c5 100644 --- a/test/kt/esimd_radix_sort_by_key_out_of_place.cpp +++ b/test/kt/esimd_radix_sort_copy_by_key.cpp @@ -1,5 +1,5 @@ // -*- C++ -*- -//===-- esimd_radix_sort_by_key_out_of_place.cpp --------------------------===// +//===-- esimd_radix_sort_copy_by_key.cpp --------------------------===// // // Copyright (C) 2023 Intel Corporation // @@ -50,7 +50,7 @@ test_sycl_buffer(sycl::queue q, std::size_t size, KernelParam param) sycl::buffer keys_out(actual_keys_out.data(), actual_keys_out.size()); sycl::buffer values_out(actual_values_out.data(), actual_values_out.size()); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key(q, keys, values, keys_out, + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy_by_key(q, keys, values, keys_out, values_out, param) .wait(); } @@ -101,7 +101,7 @@ test_usm(sycl::queue q, std::size_t size, KernelParam param) auto expected_first = oneapi::dpl::make_zip_iterator(std::begin(expected_keys), std::begin(expected_values)); std::stable_sort(expected_first, expected_first + size, CompareKey{}); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key( + oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_copy_by_key( q, keys.get_data(), keys.get_data() + size, values.get_data(), keys_out.get_data(), values_out.get_data(), param) .wait();