Move sycl::event and __result_and_scratch_storage into __future #1774

Status: Draft. Wants to merge 12 commits into base: main.
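Every file in this diff applies the same refactor: rather than wrapping a sycl::event in __future(...) by hand at each return site, a submitter now either declares __future<sycl::event> as its return type (so a bare event converts once, at the function boundary) or calls a __make_future helper that deduces the wrapped types. A minimal sketch of the shape this relies on follows; it is an illustration only, not oneDPL's actual definition, and the member list and signatures are assumptions:

#include <sycl/sycl.hpp>
#include <tuple>
#include <utility>

// Hypothetical shape of __future: it owns the event plus whatever must
// outlive kernel submission (e.g. a __result_and_scratch_storage).
template <typename _Event, typename... _Args>
class __future
{
    _Event __my_event;
    std::tuple<_Args...> __data;

  public:
    // Deliberately non-explicit: a submitter declared as returning
    // __future<sycl::event> can simply `return __q.submit(...);`.
    __future(_Event __e, _Args... __args) : __my_event(std::move(__e)), __data(std::move(__args)...) {}

    void
    wait()
    {
        __my_event.wait();
    }

    // Rewrap the event and kept-alive data together with a result value,
    // e.g. the end iterator handed back by the async patterns below.
    template <typename _Tp>
    auto
    __make_future(_Tp __t) &&
    {
        return std::apply(
            [&](auto&&... __kept) {
                return __future<_Event, _Tp, _Args...>(std::move(__my_event), std::move(__t),
                                                       std::forward<decltype(__kept)>(__kept)...);
            },
            std::move(__data));
    }
};

// Free-function helper matching return sites such as
// `return __make_future(std::move(__final_event), std::move(__result_and_scratch));`.
template <typename _Event, typename... _Args>
__future<_Event, _Args...>
__make_future(_Event __e, _Args... __args)
{
    return __future<_Event, _Args...>(std::move(__e), std::move(__args)...);
}

The design point is that the future owns everything whose lifetime must extend past kernel submission, so call sites can no longer forget to keep scratch storage alive.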
28 changes: 13 additions & 15 deletions include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
@@ -65,11 +65,10 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>();
auto __buf2 = __keep2(__first2, __first2 + __n);

- auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
- _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
- unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view());
-
- return __future.__make_future(__first2 + __n);
+ return oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
+ unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f},
+ __n, __buf1.all_view(), __buf2.all_view())
+ .__make_future(__first2 + __n);
}

template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2,
@@ -91,12 +90,11 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>();
auto __buf3 = __keep3(__first3, __first3 + __n);

- auto __future =
- oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
- unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n,
- __buf1.all_view(), __buf2.all_view(), __buf3.all_view());
-
- return __future.__make_future(__first3 + __n);
+ return oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
+ unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f},
+ __n, __buf1.all_view(), __buf2.all_view(),
+ __buf3.all_view())
+ .__make_future(__first3 + __n);
}

template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2,
@@ -201,10 +199,10 @@ __pattern_transform_scan_base_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&
auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>();
auto __buf2 = __keep2(__result, __result + __n);

- auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(
- _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n, __unary_op,
- __init, __binary_op, _Inclusive{});
- return __res.__make_future(__result + __n);
+ return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(
+ _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n,
+ __unary_op, __init, __binary_op, _Inclusive{})
+ .__make_future(__result + __n);
}

template <typename _BackendTag, typename _ExecutionPolicy, typename _Iterator1, typename _Iterator2,
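For the async patterns above, the change is purely structural: the backend's future is rewrapped in place via the chained .__make_future(end_iterator) call instead of going through a named temporary, and the caller still receives one object carrying the event, the end iterator, and the buffer keep-alives. A caller-side sketch of how such a future is consumed, modeled on oneDPL's experimental async API (the exact entry point, USM setup, and sizes here are illustrative assumptions):

#include <oneapi/dpl/execution>
#include <oneapi/dpl/async>
#include <sycl/sycl.hpp>
#include <cstddef>

void
__example(sycl::queue __q)
{
    const std::size_t __n = 1024;
    int* __in = sycl::malloc_shared<int>(__n, __q);
    int* __out = sycl::malloc_shared<int>(__n, __q);

    auto __policy = oneapi::dpl::execution::make_device_policy(__q);
    auto __fut = oneapi::dpl::experimental::transform_async(__policy, __in, __in + __n, __out,
                                                            [](int __x) { return __x + 1; });

    auto __end = __fut.get(); // blocks, then yields the end iterator (__first2 + __n above)
    (void)__end;              // e.g. __end == __out + __n
    sycl::free(__in, __q);
    sycl::free(__out, __q);
}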
17 changes: 8 additions & 9 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -228,12 +228,12 @@ template <typename... _Name>
struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
- auto
+ __future<sycl::event>
operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
- auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
+ return __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);

@@ -242,7 +242,6 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
__brick(__idx, __rngs...);
});
});
- return __future(__event);
}
};
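Note the two-line mechanics of the submitter change above: the return type is now spelled __future<sycl::event> instead of auto, and the body returns the result of queue().submit() directly, so the event is wrapped exactly once at the function boundary rather than via an explicit __future(__event) at the end. Reduced to a standalone sketch (reusing the hypothetical __future from the top; the no-op kernel is a placeholder):

__future<sycl::event>
__submit_noop(sycl::queue& __q)
{
    // queue::submit returns a sycl::event; the declared return type converts it
    // through __future's non-explicit constructor.
    return __q.submit([](sycl::handler& __cgh) { __cgh.single_task([]() {}); });
}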

@@ -372,7 +371,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
});
});

- return __future(__final_event, __result_and_scratch);
+ return __make_future(std::move(__final_event), std::move(__result_and_scratch));
}
};
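This scan hunk is where the PR title becomes concrete: the __result_and_scratch storage is moved into the future rather than wrapped at the call site, tying the scratch allocation's lifetime to whoever consumes the event. Below is a sketch of the hazard that pattern guards against; the shared_ptr scratch, kernel body, and names are stand-ins, not oneDPL code:

#include <sycl/sycl.hpp>
#include <memory>
#include <utility>

// BUG (in this sketch): the scratch allocation dies at end of scope while the
// asynchronous kernel may still be writing to it.
sycl::event
__broken_submit(sycl::queue& __q)
{
    std::shared_ptr<int> __scratch(sycl::malloc_device<int>(1024, __q),
                                   [__q](int* __p) { sycl::free(__p, __q); });
    int* __p = __scratch.get();
    return __q.submit([&](sycl::handler& __cgh) { __cgh.single_task([=]() { __p[0] = 42; }); });
} // __scratch released here, possibly before the kernel has run

// The PR's pattern: hand the storage to the future so it stays alive until the
// caller consumes the result (uses the hypothetical __make_future from the top).
auto
__safe_submit(sycl::queue& __q)
{
    std::shared_ptr<int> __scratch(sycl::malloc_device<int>(1024, __q),
                                   [__q](int* __p) { sycl::free(__p, __q); });
    int* __p = __scratch.get();
    sycl::event __e = __q.submit([&](sycl::handler& __cgh) { __cgh.single_task([=]() { __p[0] = 42; }); });
    return __make_future(std::move(__e), std::move(__scratch));
}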

@@ -644,7 +643,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
}
});
});
- return __future(__event, __result);
+ return __make_future(std::move(__event), std::move(__result));
}
};

@@ -700,7 +699,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
/* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()(
::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op);
- return __future(__event, __dummy_result_and_scratch);
+ return __make_future(std::move(__event), std::move(__dummy_result_and_scratch));
};
if (__n <= 16)
return __single_group_scan_f(std::integral_constant<::std::uint16_t, 16>{});
@@ -734,7 +733,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
__parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()(
std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size);
- return __future(__event, __dummy_result_and_scratch);
+ return __make_future(std::move(__event), std::move(__dummy_result_and_scratch));
}
}

@@ -1806,7 +1805,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo
__internal::__optional_kernel_name<_CopyBackName...>>
{
template <typename _BackendTag, typename _ExecutionPolicy, typename _Range, typename _Merge, typename _Compare>
- auto
+ __future<sycl::event>
operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) const
{
using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
@@ -1866,7 +1865,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo
});
}
// return future and extend lifetime of temporary buffer
- return __future(__event1);
+ return __event1;
}
};

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h
@@ -56,14 +56,14 @@ template <typename... _Name>
struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
- auto
+ __future<sycl::event>
operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...);
assert(__n > 0);

_PRINT_INFO_IN_DEBUG_MODE(__exec);
- auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
+ return __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);

@@ -75,7 +75,6 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...
}
});
});
- return __future(__event);
}
};

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h
@@ -497,7 +497,7 @@ __histogram_general_private_global_atomics(oneapi::dpl::__internal::__device_bac

template <::std::uint16_t __iters_per_work_item, typename _ExecutionPolicy, typename _Range1, typename _Range2,
typename _BinHashMgr>
- auto
+ __future<sycl::event>
__parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
_ExecutionPolicy&& __exec, const sycl::event& __init_event, _Range1&& __input,
_Range2&& __bins, const _BinHashMgr& __binhash_manager)
@@ -516,19 +516,18 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag
// if bins fit into registers, use register private accumulation
if (__num_bins <= __max_work_item_private_bins)
{
- return __future(
- __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
- ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+ return __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>(
+ __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
+ ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager);
}
// if bins fit into SLM, use local atomics
else if (__num_bins * sizeof(_local_histogram_type) +
__binhash_manager.get_required_SLM_elements() * sizeof(_extra_memory_type) <
__local_mem_size)
{
- return __future(__histogram_general_local_atomics<__iters_per_work_item>(
+ return __histogram_general_local_atomics<__iters_per_work_item>(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size,
- ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+ ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager);
}
else // otherwise, use global atomics (private copies per workgroup)
{
@@ -537,9 +536,9 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag
// suggestion which but global memory limitations may increase this value to be able to fit the workgroup
// private copies of the histogram bins in global memory. No unrolling is taken advantage of here because it
// is a runtime argument.
- return __future(__histogram_general_private_global_atomics(
+ return __histogram_general_private_global_atomics(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item,
- __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager));
+ __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager);
}
}

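The branch above is a three-tier fallback driven by where the bins can live: per-work-item registers when __num_bins fits under __max_work_item_private_bins, SLM local atomics when the histogram plus the bin mapper's extra memory fits in local memory, and per-workgroup private copies in global memory otherwise. Condensed into a standalone sketch (every name here is hypothetical; only the decision structure mirrors the code above):

#include <cstddef>

enum class __histo_strategy
{
    __registers,     // per-work-item private bins
    __local_atomics, // bins accumulated in SLM
    __global_atomics // per-workgroup private copies in global memory
};

__histo_strategy
__select_histo_strategy(std::size_t __num_bins, std::size_t __bin_bytes, std::size_t __extra_slm_bytes,
                        std::size_t __local_mem_size, std::size_t __max_work_item_private_bins)
{
    if (__num_bins <= __max_work_item_private_bins)
        return __histo_strategy::__registers;
    if (__num_bins * __bin_bytes + __extra_slm_bytes < __local_mem_size)
        return __histo_strategy::__local_atomics;
    return __histo_strategy::__global_atomics;
}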
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -137,7 +137,7 @@ template <typename _IdType, typename... _Name>
struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
- auto
+ __future<sycl::event>
operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const
{
const _IdType __n1 = __rng1.size();
@@ -153,7 +153,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N

const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);

- auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
+ return __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3);
__cgh.parallel_for<_Name...>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
const _IdType __i_elem = __item_id.get_linear_id() * __chunk;
@@ -162,7 +162,6 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N
__comp);
});
});
- return __future(__event);
}
};

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
@@ -212,7 +212,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le
__internal::__optional_kernel_name<_CopyBackName...>>
{
template <typename _ExecutionPolicy, typename _Range, typename _Compare, typename _LeafSorter>
- auto
+ __future<sycl::event>
operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) const
{
using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
@@ -303,7 +303,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le
});
}

- return __future(__event1);
+ return __event1;
}
};
