From 5d184e9838083023aacaf158ae578e89e21b5267 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 7 Oct 2024 13:01:38 +0100 Subject: [PATCH 1/4] [SYCL][Graph] Implement dynamic command-groups Implement Dynamic Command-Group feature specified in PR [[SYCL][Graph] Add specification for kernel binary updates](https://github.com/intel/llvm/pull/14896) This feature enables updating `ur_kernel_handle_t` objects in graph nodes between executions as well as parameters and execution range of nodes. This functionality is currently supported on CUDA & HIP which are used for testing in the new E2E tests. Level Zero support will follow shortly, resulting in the removal of the `XFAIL` labels from the E2E tests. The code for adding nodes to a graph has been refactored to split out verification of edges, and marking memory objects used in a node, as separate helper functions. This allows path for adding a command-group node to do this functions over each CG in the list before creating the node itself. The `dynamic_parameter_impl` code has also been refactored so the code is shared for updating a dynamic parameter used in both a regular kernel node and a dynamic command-group node. See the addition to the design doc for further details on the implementation. --- sycl/doc/design/CommandGraph.md | 24 + .../sycl/ext/oneapi/experimental/graph.hpp | 24 + sycl/include/sycl/handler.hpp | 14 +- sycl/source/detail/cg.hpp | 5 +- sycl/source/detail/graph_impl.cpp | 419 +++++++++++++++--- sycl/source/detail/graph_impl.hpp | 265 ++++++----- sycl/source/detail/handler_impl.hpp | 4 - sycl/source/detail/scheduler/commands.cpp | 62 ++- sycl/source/handler.cpp | 23 +- .../Graph/Update/dyn_cgf_accessor.cpp | 60 +++ .../Graph/Update/dyn_cgf_accessor_deps.cpp | 74 ++++ .../Graph/Update/dyn_cgf_accessor_deps2.cpp | 85 ++++ .../Graph/Update/dyn_cgf_accessor_spv.cpp | 81 ++++ .../Update/dyn_cgf_different_arg_nums.cpp | 148 +++++++ .../Graph/Update/dyn_cgf_event_deps.cpp | 73 +++ .../test-e2e/Graph/Update/dyn_cgf_ndrange.cpp | 72 +++ .../Graph/Update/dyn_cgf_ndrange_3D.cpp | 80 ++++ .../Graph/Update/dyn_cgf_overwrite_range.cpp | 59 +++ .../Graph/Update/dyn_cgf_parameters.cpp | 70 +++ .../Graph/Update/dyn_cgf_shared_nodes.cpp | 74 ++++ .../Update/dyn_cgf_update_before_finalize.cpp | 48 ++ sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp | 55 +++ .../Update/dyn_cgf_with_all_dyn_params.cpp | 121 +++++ ...dyn_cgf_with_different_type_dyn_params.cpp | 137 ++++++ .../Update/dyn_cgf_with_some_dyn_params.cpp | 107 +++++ .../Graph/Update/update_ndrange_to_range.cpp | 55 +++ .../Graph/Update/update_range_to_ndrange.cpp | 56 +++ .../Graph/Update/whole_update_dynamic_cgf.cpp | 75 ++++ .../Update/whole_update_dynamic_param.cpp | 2 - sycl/test/abi/sycl_symbols_linux.dump | 5 + sycl/test/abi/sycl_symbols_windows.dump | 9 + .../Extensions/CommandGraph/Exceptions.cpp | 163 +++++++ .../Extensions/CommandGraph/Update.cpp | 29 -- 33 files changed, 2309 insertions(+), 269 deletions(-) create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp create mode 100644 sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp create mode 100644 sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp create mode 100644 sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 2f83d42a3c57c..d7587113a4615 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -282,6 +282,30 @@ requirements for these new accessors to correctly trigger allocations before updating. This is similar to how individual graph commands are enqueued when accessors are used in a graph node. +### Dynamic Command-Group + +To implement the `dynamic_command_group` class for updating the command-groups (CG) +associated with nodes, the CG member of the node implementation class changes +from a `std::unique_ptr` to a `std::shared_ptr` so that multiple nodes and the +`dynamic_command_group_impl` object can share the same CG object. This avoids +the overhead of having to allocate and free copies of the CG when a new active +CG is selected. + +The `dynamic_command_group_impl` class contains weak pointers to the nodes which +have been created with it, so that when a new active CG is selected it can +propagate the change to those nodes. The `node_impl` class also contains a +reference to the dynamic command-group that created it, so that when the graph +is finalized each node can use the list of kernels in its dynamic command-group +as part of the `urCommandBufferAppendKernelLaunchExp` call to pass the possible +alternative kernels. + +The `sycl::detail::CGExecKernel` class has been added to, so that if the +object was created from an element in the dynamic command-group list, the class +stores a vector of weak pointers to the other alternative command-groups created +from the same dynamic command-group object. This allows the DPC++ scheduler to +access the list of alternative kernels when calling the UR API to append a +kernel command to a command-buffer. + ## Optimizations ### Interactions with Profiling diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index d18cf3ebc4b3d..2bc3ef1d921ab 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -96,6 +96,7 @@ class node_impl; class graph_impl; class exec_graph_impl; class dynamic_parameter_impl; +class dynamic_command_group_impl; } // namespace detail enum class node_type { @@ -216,6 +217,23 @@ class depends_on_all_leaves : public ::sycl::detail::DataLessProperty< } // namespace node } // namespace property +class __SYCL_EXPORT dynamic_command_group { +public: + dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList); + + size_t get_active_cgf() const; + void set_active_cgf(size_t Index); + +private: + template + friend const decltype(Obj::impl) & + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + std::shared_ptr impl; +}; + namespace detail { // Templateless modifiable command-graph base class. class __SYCL_EXPORT modifiable_command_graph { @@ -337,6 +355,12 @@ class __SYCL_EXPORT modifiable_command_graph { modifiable_command_graph(const std::shared_ptr &Impl) : impl(Impl) {} + /// Template-less implementation of add() for dynamic command-group nodes. + /// @param DynCGF Dynamic Command-group function object to add. + /// @param Dep List of predecessor nodes. + /// @return Node added to the graph. + node addImpl(dynamic_command_group &DynCGF, const std::vector &Dep); + /// Template-less implementation of add() for CGF nodes. /// @param CGF Command-group function to add. /// @param Dep List of predecessor nodes. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 8abb4323ab3e1..4e8f62d53c36d 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1175,7 +1175,6 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(Wrapper)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); #endif } else #endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && @@ -1198,7 +1197,6 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); #endif #else (void)KernelFunc; @@ -1249,7 +1247,6 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); - setNDRangeUsed(true); #endif } @@ -1272,7 +1269,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems)); processLaunchProperties(Props); setType(detail::CGType::Kernel); - setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -1298,7 +1294,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NDRange)); processLaunchProperties(Props); setType(detail::CGType::Kernel); - setNDRangeUsed(true); extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -1339,7 +1334,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true); StoreLambda(std::move(KernelFunc)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); #endif // __SYCL_DEVICE_ONLY__ } @@ -1971,7 +1965,6 @@ class __SYCL_EXPORT handler { StoreLambda( std::move(KernelFunc)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); #endif } @@ -2069,7 +2062,6 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems, WorkItemOffset); setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); extractArgsAndReqs(); MKernelName = getKernelName(); #endif @@ -2148,7 +2140,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2189,7 +2180,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); - setNDRangeUsed(false); if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -2229,7 +2219,6 @@ class __SYCL_EXPORT handler { setNDRangeDescriptor(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CGType::Kernel); - setNDRangeUsed(true); if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); @@ -3357,6 +3346,7 @@ class __SYCL_EXPORT handler { size_t Size, bool Block = false); friend class ext::oneapi::experimental::detail::graph_impl; friend class ext::oneapi::experimental::detail::dynamic_parameter_impl; + friend class ext::oneapi::experimental::detail::dynamic_command_group_impl; bool DisableRangeRounding(); @@ -3626,8 +3616,10 @@ class __SYCL_EXPORT handler { } #endif +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Set that an ND Range was used during a call to parallel_for void setNDRangeUsed(bool Value); +#endif inline void internalProfilingTagImpl() { throwIfActionIsCreated(); diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 1799bbedd4903..f0dadad99dac5 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -257,6 +257,9 @@ class CGExecKernel : public CG { std::string MKernelName; std::vector> MStreams; std::vector> MAuxiliaryResources; + /// Used to implement ext_oneapi_graph dynamic_command_group. Stores the list + /// of command-groups that a kernel command can be updated to. + std::vector> MAlternativeKernels; ur_kernel_cache_config_t MKernelCacheConfig; bool MKernelIsCooperative = false; bool MKernelUsesClusterLaunch = false; @@ -277,7 +280,7 @@ class CGExecKernel : public CG { MKernelBundle(std::move(KernelBundle)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MStreams(std::move(Streams)), MAuxiliaryResources(std::move(AuxiliaryResources)), - MKernelCacheConfig(std::move(KernelCacheConfig)), + MAlternativeKernels{}, MKernelCacheConfig(std::move(KernelCacheConfig)), MKernelIsCooperative(KernelIsCooperative), MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) { assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG."); diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index e725fc4ce0c82..ad10a3bdeefaa 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -352,11 +352,76 @@ void graph_impl::removeRoot(const std::shared_ptr &Root) { MRoots.erase(Root); } -std::shared_ptr -graph_impl::add(const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; +std::set> graph_impl::getCGEdges( + const std::shared_ptr &CommandGroup) const { + const auto &Requirements = CommandGroup->getRequirements(); + if (!MAllowBuffers && Requirements.size()) { + throw sycl::exception(make_error_code(errc::invalid), + "Cannot use buffers in a graph without passing the " + "assume_buffer_outlives_graph property on " + "Graph construction."); + } + + if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { + auto CGKernel = + static_cast(CommandGroup.get()); + if (CGKernel->hasStreams()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Using sycl streams in a graph node is unsupported."); + } + } + + // Add any nodes specified by event dependencies into the dependency list + std::set> UniqueDeps; + for (auto &Dep : CommandGroup->getEvents()) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl == MEventsMap.end()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } else { + UniqueDeps.insert(NodeImpl->second); + } + } + + // A unique set of dependencies obtained by checking requirements and events + for (auto &Req : Requirements) { + // Look through the graph for nodes which share this requirement + for (auto &Node : MNodeStorage) { + if (Node->hasRequirementDependency(Req)) { + bool ShouldAddDep = true; + // If any of this node's successors have this requirement then we skip + // adding the current node as a dependency. + for (auto &Succ : Node->MSuccessors) { + if (Succ.lock()->hasRequirementDependency(Req)) { + ShouldAddDep = false; + break; + } + } + if (ShouldAddDep) { + UniqueDeps.insert(Node); + } + } + } + } + + return std::move(UniqueDeps); +} + +void graph_impl::markCGMemObjs( + const std::shared_ptr &CommandGroup) { + const auto &Requirements = CommandGroup->getRequirements(); + for (auto &Req : Requirements) { + auto MemObj = static_cast(Req->MSYCLMemObj); + bool WasInserted = MMemObjs.insert(MemObj).second; + if (WasInserted) { + MemObj->markBeingUsedInGraph(); + } + } +} +std::shared_ptr +graph_impl::add(std::vector> &Deps) { const std::shared_ptr &NodeImpl = std::make_shared(); MNodeStorage.push_back(NodeImpl); @@ -370,7 +435,7 @@ graph_impl::add(const std::vector> &Dep) { std::shared_ptr graph_impl::add(std::function CGF, const std::vector &Args, - const std::vector> &Dep) { + std::vector> &Deps) { (void)Args; sycl::handler Handler{shared_from_this()}; @@ -401,8 +466,8 @@ graph_impl::add(std::function CGF, Handler.getType()); auto NodeImpl = - this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Dep); - NodeImpl->MNDRangeUsed = Handler.impl->MNDRangeUsed; + this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Deps); + // Add an event associated with this explicit node for mixed usage addEventForNode(std::make_shared(), NodeImpl); @@ -444,67 +509,15 @@ graph_impl::add(const std::vector Events) { std::shared_ptr graph_impl::add(node_type NodeType, - std::unique_ptr CommandGroup, - const std::vector> &Dep) { - // Copy deps so we can modify them - auto Deps = Dep; + std::shared_ptr CommandGroup, + std::vector> &Deps) { // A unique set of dependencies obtained by checking requirements and events - std::set> UniqueDeps; - const auto &Requirements = CommandGroup->getRequirements(); - if (!MAllowBuffers && Requirements.size()) { - throw sycl::exception(make_error_code(errc::invalid), - "Cannot use buffers in a graph without passing the " - "assume_buffer_outlives_graph property on " - "Graph construction."); - } - - if (CommandGroup->getType() == sycl::detail::CGType::Kernel) { - auto CGKernel = - static_cast(CommandGroup.get()); - if (CGKernel->hasStreams()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Using sycl streams in a graph node is unsupported."); - } - } + std::set> UniqueDeps = getCGEdges(CommandGroup); - for (auto &Req : Requirements) { - // Track and mark the memory objects being used by the graph. - auto MemObj = static_cast(Req->MSYCLMemObj); - bool WasInserted = MMemObjs.insert(MemObj).second; - if (WasInserted) { - MemObj->markBeingUsedInGraph(); - } - // Look through the graph for nodes which share this requirement - for (auto &Node : MNodeStorage) { - if (Node->hasRequirementDependency(Req)) { - bool ShouldAddDep = true; - // If any of this node's successors have this requirement then we skip - // adding the current node as a dependency. - for (auto &Succ : Node->MSuccessors) { - if (Succ.lock()->hasRequirementDependency(Req)) { - ShouldAddDep = false; - break; - } - } - if (ShouldAddDep) { - UniqueDeps.insert(Node); - } - } - } - } + // Track and mark the memory objects being used by the graph. + markCGMemObjs(CommandGroup); - // Add any nodes specified by event dependencies into the dependency list - for (auto &Dep : CommandGroup->getEvents()) { - if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { - UniqueDeps.insert(NodeImpl->second); - } else { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Event dependency from handler::depends_on does " - "not correspond to a node within the graph"); - } - } // Add any deps determined from requirements and events into the dependency // list Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); @@ -518,6 +531,42 @@ graph_impl::add(node_type NodeType, return NodeImpl; } +std::shared_ptr +graph_impl::add(std::shared_ptr &DynCGImpl, + std::vector> &Deps) { + // Set of Dependent nodes based on CG event and accessor dependencies. + std::set> DynCGDeps = + getCGEdges(DynCGImpl->MKernels[0]); + for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) { + auto &CG = DynCGImpl->MKernels[i]; + auto CGEdges = getCGEdges(CG); + if (CGEdges != DynCGDeps) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Command-groups in dynamic command-group don't have" + "equivalent dependencies to other graph nodes."); + } + } + + // Track and mark the memory objects being used by the graph. + for (auto &CG : DynCGImpl->MKernels) { + markCGMemObjs(CG); + } + + // Get active dynamic command-group CG and use to create a node object + const auto &ActiveKernel = DynCGImpl->getActiveKernel(); + std::shared_ptr NodeImpl = + add(node_type::kernel, ActiveKernel, Deps); + + // Add an event associated with this explicit node for mixed usage + addEventForNode(std::make_shared(), NodeImpl); + + // Track the dynamic command-group used inside the node object + DynCGImpl->MNodes.push_back(NodeImpl); + NodeImpl->MDynCG = DynCGImpl; + + return NodeImpl; +} + bool graph_impl::clearQueues() { bool AnyQueuesCleared = false; for (auto &Queue : MRecordingQueues) { @@ -1568,6 +1617,27 @@ modifiable_command_graph::modifiable_command_graph( : impl(std::make_shared( SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {} +node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF, + const std::vector &Deps) { + impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); + auto DynCGFImpl = sycl::detail::getSyclObjImpl(DynCGF); + + if (DynCGFImpl->MGraph != impl) { + throw sycl::exception(make_error_code(sycl::errc::invalid), + "Graph does not match the graph associated with " + "dynamic command-group."); + } + + std::vector> DepImpls; + for (auto &D : Deps) { + DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); + } + + graph_impl::WriteLock Lock(impl->MMutex); + std::shared_ptr NodeImpl = impl->add(DynCGFImpl, DepImpls); + return sycl::detail::createSyclObjFromImpl(NodeImpl); +} + node modifiable_command_graph::addImpl(const std::vector &Deps) { impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function"); std::vector> DepImpls; @@ -1775,6 +1845,208 @@ void dynamic_parameter_base::updateAccessor( impl->updateAccessor(Acc); } +void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue, + size_t Size) { + // Number of bytes is taken from member of raw_kernel_arg object rather + // than using the size parameter which represents sizeof(raw_kernel_arg). + std::ignore = Size; + size_t RawArgSize = NewRawValue->MArgSize; + const void *RawArgData = NewRawValue->MArgData; + + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue( + NodeShared->MCommandGroup, ArgIndex, RawArgData, RawArgSize); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, + RawArgData, RawArgSize); + } + } + + std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); +} + +void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGArgValue(NodeShared->MCommandGroup, + ArgIndex, NewValue, Size); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, NewValue, + Size); + } + } + + std::memcpy(MValueStorage.data(), NewValue, Size); +} + +void dynamic_parameter_impl::updateAccessor( + const sycl::detail::AccessorBaseHost *Acc) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + // Should we fail here if the node isn't alive anymore? + if (NodeShared) { + dynamic_parameter_impl::updateCGAccessor(NodeShared->MCommandGroup, + ArgIndex, Acc); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGAccessor(CG, DynCGInfo.ArgIndex, Acc); + } + } + + std::memcpy(MValueStorage.data(), Acc, + sizeof(sycl::detail::AccessorBaseHost)); +} + +void dynamic_parameter_impl::updateCGArgValue( + std::shared_ptr CG, int ArgIndex, const void *NewValue, + size_t Size) { + auto &Args = static_cast(CG.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MSize == static_cast(Size)); + // MPtr may be a pointer into arg storage so we memcpy the contents of + // NewValue rather than assign it directly + std::memcpy(Arg.MPtr, NewValue, Size); + break; + } +} + +void dynamic_parameter_impl::updateCGAccessor( + std::shared_ptr CG, int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc) { + auto &Args = static_cast(CG.get())->MArgs; + + auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); + + // Find old accessor in accessor storage and replace with new one + if (static_cast(NewAccImpl->MSYCLMemObj) + ->needsWriteBack()) { + throw sycl::exception( + make_error_code(errc::invalid), + "Accessors to buffers which have write_back enabled " + "are not allowed to be used in command graphs."); + } + + // All accessors passed to this function will be placeholders, so we must + // perform steps similar to what happens when handler::require() is + // called here. + sycl::detail::Requirement *NewReq = NewAccImpl.get(); + if (NewReq->MAccessMode != sycl::access_mode::read) { + auto SYCLMemObj = + static_cast(NewReq->MSYCLMemObj); + SYCLMemObj->handleWriteAccessorCreation(); + } + + for (auto &Acc : CG->getAccStorage()) { + if (auto OldAcc = static_cast(Arg.MPtr); + Acc.get() == OldAcc) { + Acc = NewAccImpl; + } + } + + for (auto &Req : CG->getRequirements()) { + if (auto OldReq = static_cast(Arg.MPtr); + Req == OldReq) { + Req = NewReq; + } + } + Arg.MPtr = NewAccImpl.get(); + break; + } +} + +dynamic_command_group_impl::dynamic_command_group_impl( + const command_graph &Graph) + : MGraph{sycl::detail::getSyclObjImpl(Graph)}, MActiveCGF(0) {} + +void dynamic_command_group_impl::finalizeCGFList( + const std::vector> &CGFList) { + // True if kernels use sycl::nd_range, and false if using sycl::range + for (size_t CGFIndex = 0; CGFIndex < CGFList.size(); CGFIndex++) { + const auto &CGF = CGFList[CGFIndex]; + // Handler defined inside the loop so it doesn't appear to the runtime + // as a single command-group with multiple commands inside. + sycl::handler Handler{MGraph}; + CGF(Handler); + + if (Handler.getType() != sycl::detail::CGType::Kernel) { + throw sycl::exception( + make_error_code(errc::invalid), + "The only type of command-groups that can be used in " + "dynamic command-groups is kernels."); + } + + Handler.finalize(); + + // Take unique_ptr object from handler and convert to + // shared_ptr to store + sycl::detail::CG *RawCGPtr = Handler.impl->MGraphNodeCG.release(); + auto RawCGExecPtr = static_cast(RawCGPtr); + auto CGExecSP = std::shared_ptr(RawCGExecPtr); + MKernels.push_back(CGExecSP); + + // Track dynamic_parameter usage in command-list + auto &DynamicParams = Handler.impl->MDynamicParameters; + for (auto &[DynamicParam, ArgIndex] : DynamicParams) { + DynamicParam->registerDynCG(shared_from_this(), CGFIndex, ArgIndex); + } + } + + // For each CGExecKernel store the list of alternative kernels, not + // including itself. + using CGExecKernelSP = std::shared_ptr; + using CGExecKernelWP = std::weak_ptr; + for (auto KernelCG : MKernels) { + std::vector Alternatives; + std::copy_if( + MKernels.begin(), MKernels.end(), std::back_inserter(Alternatives), + [&KernelCG](const CGExecKernelSP &K) { return K != KernelCG; }); + + KernelCG->MAlternativeKernels = std::move(Alternatives); + } +} + +void dynamic_command_group_impl::setActiveIndex(size_t Index) { + if (Index >= getNumCGs()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Index is out of range."); + } + MActiveCGF = Index; + + // Update nodes using the dynamic command-group to use the new active CG + for (auto &Node : MNodes) { + if (auto NodeSP = Node.lock()) { + NodeSP->MCommandGroup = getActiveKernel(); + } + } +} } // namespace detail node_type node::get_type() const { return impl->MNodeType; } @@ -1813,6 +2085,25 @@ template <> __SYCL_EXPORT void node::update_range<2>(range<2> Range) { template <> __SYCL_EXPORT void node::update_range<3>(range<3> Range) { impl->updateRange(Range); } + +dynamic_command_group::dynamic_command_group( + const command_graph &Graph, + const std::vector> &CGFList) + : impl(std::make_shared(Graph)) { + if (CGFList.empty()) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Dynamic command-group cannot be created with an " + "empty CGF list."); + } + impl->finalizeCGFList(CGFList); +} + +size_t dynamic_command_group::get_active_cgf() const { + return impl->getActiveIndex(); +} +void dynamic_command_group::set_active_cgf(size_t Index) { + return impl->setActiveIndex(Index); +} } // namespace experimental } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 4ee34830f39a2..3b1fc3fa01641 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -95,10 +95,12 @@ class node_impl : public std::enable_shared_from_this { /// User facing type of the node. node_type MNodeType = node_type::empty; /// Command group object which stores all args etc needed to enqueue the node - std::unique_ptr MCommandGroup; + std::shared_ptr MCommandGroup; /// Stores the executable graph impl associated with this node if it is a /// subgraph node. std::shared_ptr MSubGraphImpl; + /// Dynamic command-group object used in node, if any. + std::shared_ptr MDynCG; /// Used for tracking visited status during cycle checks. bool MVisited = false; @@ -108,9 +110,6 @@ class node_impl : public std::enable_shared_from_this { /// cannot be used to find out the partion of a node outside of this process. int MPartitionNum = -1; - /// Track whether an ND-Range was used for kernel nodes - bool MNDRangeUsed = false; - /// Add successor to the node. /// @param Node Node to add as a successor. void registerSuccessor(const std::shared_ptr &Node) { @@ -143,10 +142,9 @@ class node_impl : public std::enable_shared_from_this { /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. - node_impl(node_type NodeType, - std::unique_ptr &&CommandGroup) + node_impl(node_type NodeType, std::shared_ptr CommandGroup) : MCGType(CommandGroup->getType()), MNodeType(NodeType), - MCommandGroup(std::move(CommandGroup)) { + MCommandGroup(CommandGroup) { if (NodeType == node_type::subgraph) { MSubGraphImpl = static_cast(MCommandGroup.get()) @@ -160,7 +158,7 @@ class node_impl : public std::enable_shared_from_this { : enable_shared_from_this(Other), MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), MCommandGroup(Other.getCGCopy()), - MSubGraphImpl(Other.MSubGraphImpl) {} + MSubGraphImpl(Other.MSubGraphImpl), MDynCG(Other.MDynCG) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -172,6 +170,7 @@ class node_impl : public std::enable_shared_from_this { MNodeType = Other.MNodeType; MCommandGroup = Other.getCGCopy(); MSubGraphImpl = Other.MSubGraphImpl; + MDynCG = Other.MDynCG; } return *this; } @@ -405,75 +404,6 @@ class node_impl : public std::enable_shared_from_this { return (ReqSrc->MDims > 1) || (ReqDst->MDims > 1); } - /// Update the value of an accessor inside this node. Accessors must be - /// handled specifically compared to other argument values. - /// @param ArgIndex The index of the accessor arg to be updated - /// @param Acc Pointer to the new accessor value - void updateAccessor(int ArgIndex, const sycl::detail::AccessorBaseHost *Acc) { - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc); - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor); - - // Find old accessor in accessor storage and replace with new one - if (static_cast(NewAccImpl->MSYCLMemObj) - ->needsWriteBack()) { - throw sycl::exception( - make_error_code(errc::invalid), - "Accessors to buffers which have write_back enabled " - "are not allowed to be used in command graphs."); - } - - // All accessors passed to this function will be placeholders, so we must - // perform steps similar to what happens when handler::require() is - // called here. - sycl::detail::Requirement *NewReq = NewAccImpl.get(); - if (NewReq->MAccessMode != sycl::access_mode::read) { - auto SYCLMemObj = - static_cast(NewReq->MSYCLMemObj); - SYCLMemObj->handleWriteAccessorCreation(); - } - - for (auto &Acc : MCommandGroup->getAccStorage()) { - if (auto OldAcc = - static_cast(Arg.MPtr); - Acc.get() == OldAcc) { - Acc = NewAccImpl; - } - } - - for (auto &Req : MCommandGroup->getRequirements()) { - if (auto OldReq = - static_cast(Arg.MPtr); - Req == OldReq) { - Req = NewReq; - } - } - Arg.MPtr = NewAccImpl.get(); - break; - } - } - - void updateArgValue(int ArgIndex, const void *NewValue, size_t Size) { - - auto &Args = - static_cast(MCommandGroup.get())->MArgs; - for (auto &Arg : Args) { - if (Arg.MIndex != ArgIndex) { - continue; - } - assert(Arg.MSize == static_cast(Size)); - // MPtr may be a pointer into arg storage so we memcpy the contents of - // NewValue rather than assign it directly - std::memcpy(Arg.MPtr, NewValue, Size); - break; - } - } - template void updateNDRange(nd_range ExecutionRange) { if (MCGType != sycl::detail::CGType::Kernel) { @@ -481,11 +411,6 @@ class node_impl : public std::enable_shared_from_this { sycl::errc::invalid, "Cannot update execution range of nodes which are not kernel nodes"); } - if (!MNDRangeUsed) { - throw sycl::exception(sycl::errc::invalid, - "Cannot update node which was created with a " - "sycl::range with a sycl::nd_range"); - } auto &NDRDesc = static_cast(MCommandGroup.get()) @@ -507,11 +432,6 @@ class node_impl : public std::enable_shared_from_this { sycl::errc::invalid, "Cannot update execution range of nodes which are not kernel nodes"); } - if (MNDRangeUsed) { - throw sycl::exception(sycl::errc::invalid, - "Cannot update node which was created with a " - "sycl::nd_range with a sycl::range"); - } auto &NDRDesc = static_cast(MCommandGroup.get()) @@ -535,6 +455,7 @@ class node_impl : public std::enable_shared_from_this { ExecCG->MArgs = OtherExecCG->MArgs; ExecCG->MNDRDesc = OtherExecCG->MNDRDesc; + ExecCG->MKernelName = OtherExecCG->MKernelName; ExecCG->getAccStorage() = OtherExecCG->getAccStorage(); ExecCG->getRequirements() = OtherExecCG->getRequirements(); @@ -888,30 +809,40 @@ class graph_impl : public std::enable_shared_from_this { /// @param Root Node to remove from list of root nodes. void removeRoot(const std::shared_ptr &Root); + /// Verifies the CG is valid to add to the graph and returns set of + /// dependent nodes if so. + /// @param CommandGroup The command group to verify and retrieve edges for. + /// @return Set of dependent nodes in the graph. + std::set> + getCGEdges(const std::shared_ptr &CommandGroup) const; + + /// Identifies the sycl buffers used in the command-group and marks them + /// as used in the graph. + /// @param CommandGroup The command-group to check for buffer usage in. + void markCGMemObjs(const std::shared_ptr &CommandGroup); + /// Create a kernel node in the graph. /// @param NodeType User facing type of the node. /// @param CommandGroup The CG which stores all information for this node. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(node_type NodeType, std::unique_ptr CommandGroup, - const std::vector> &Dep = {}); + std::shared_ptr add(node_type NodeType, + std::shared_ptr CommandGroup, + std::vector> &Deps); /// Create a CGF node in the graph. /// @param CGF Command-group function to create node with. /// @param Args Node arguments. - /// @param Dep Dependencies of the created node. + /// @param Deps Dependencies of the created node. /// @return Created node in the graph. - std::shared_ptr - add(std::function CGF, - const std::vector &Args, - const std::vector> &Dep = {}); + std::shared_ptr add(std::function CGF, + const std::vector &Args, + std::vector> &Deps); /// Create an empty node in the graph. - /// @param Dep List of predecessor nodes. + /// @param Deps List of predecessor nodes. /// @return Created node in the graph. - std::shared_ptr - add(const std::vector> &Dep = {}); + std::shared_ptr add(std::vector> &Deps); /// Create an empty node in the graph. /// @param Events List of events associated to this node. @@ -919,6 +850,14 @@ class graph_impl : public std::enable_shared_from_this { std::shared_ptr add(const std::vector Events); + /// Create a dynamic command-group node in the graph. + /// @param DynCGImpl Dynamic command-group used to create node. + /// @param Deps List of predecessor nodes. + /// @return Created node in the graph. + std::shared_ptr + add(std::shared_ptr &DynCGImpl, + std::vector> &Deps); + /// Add a queue to the set of queues which are currently recording to this /// graph. /// @param RecordingQueue Queue to add to set. @@ -1236,7 +1175,12 @@ class graph_impl : public std::enable_shared_from_this { /// @param Node The node to add deps for /// @param Deps List of dependent nodes void addDepsToNode(std::shared_ptr Node, - const std::vector> &Deps) { + std::vector> &Deps) { + // Remove empty shared pointers from the list + auto EmptyElementIter = + std::remove(Deps.begin(), Deps.end(), std::shared_ptr()); + Deps.erase(EmptyElementIter, Deps.end()); + if (!Deps.empty()) { for (auto &N : Deps) { N->registerSuccessor(Node); @@ -1520,65 +1464,110 @@ class dynamic_parameter_impl { MNodes.emplace_back(NodeImpl, ArgIndex); } + /// Struct detailing an instance of the usage of the dynamic parameter in a + /// dynamic CG. + struct DynamicCGInfo { + /// Dynamic command-group that uses this dynamic parameter. + std::weak_ptr DynCG; + /// Index of the CG in the Dynamic CG that uses this dynamic parameter. + size_t CGIndex; + /// The arg index in the kernel the dynamic parameter is used. + int ArgIndex; + }; + + /// Registers a dynamic command-group with this dynamic parameter. + /// @param DynCG The dynamic command-group to register. + /// @param CGIndex Index of the CG in DynCG using this dynamic parameter. + /// @param ArgIndex The arg index in the kernel the dynamic parameter is used. + void registerDynCG(std::shared_ptr DynCG, + size_t CGIndex, int ArgIndex) { + MDynCGs.emplace_back(DynamicCGInfo{DynCG, CGIndex, ArgIndex}); + } + /// Get a pointer to the internal value of this dynamic parameter void *getValue() { return MValueStorage.data(); } /// Update sycl_ext_oneapi_raw_kernel_arg parameter /// @param NewRawValue Pointer to a raw_kernel_arg object. /// @param Size Parameter is ignored. - void updateValue(const raw_kernel_arg *NewRawValue, size_t Size) { - // Number of bytes is taken from member of raw_kernel_arg object rather - // than using the size parameter which represents sizeof(raw_kernel_arg). - std::ignore = Size; - size_t RawArgSize = NewRawValue->MArgSize; - const void *RawArgData = NewRawValue->MArgData; - - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, RawArgData, RawArgSize); - } - } - std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); - } + void updateValue(const raw_kernel_arg *NewRawValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. + /// of this parameter in all registered nodes and dynamic CGs. /// @param NewValue Pointer to the new value /// @param Size Size of the data pointer to by NewValue - void updateValue(const void *NewValue, size_t Size) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - NodeShared->updateArgValue(ArgIndex, NewValue, Size); - } - } - std::memcpy(MValueStorage.data(), NewValue, Size); - } + void updateValue(const void *NewValue, size_t Size); /// Update the internal value of this dynamic parameter as well as the value - /// of this parameter in all registered nodes. Should only be called for - /// accessor dynamic_parameters. + /// of this parameter in all registered nodes and dynamic CGs. Should only be + /// called for accessor dynamic_parameters. /// @param Acc The new accessor value - void updateAccessor(const sycl::detail::AccessorBaseHost *Acc) { - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - // Should we fail here if the node isn't alive anymore? - if (NodeShared) { - NodeShared->updateAccessor(ArgIndex, Acc); - } - } - std::memcpy(MValueStorage.data(), Acc, - sizeof(sycl::detail::AccessorBaseHost)); - } + void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + + /// Static helper function for updating command-group value arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param NewValue Pointer to the new value. + /// @param Size Size of the data pointer to by NewValue + static void updateCGArgValue(std::shared_ptr CG, + int ArgIndex, const void *NewValue, size_t Size); + + /// Static helper function for updating command-group accessor arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param Acc The new accessor value + static void updateCGAccessor(std::shared_ptr CG, + int ArgIndex, + const sycl::detail::AccessorBaseHost *Acc); // Weak ptrs to node_impls which will be updated std::vector, int>> MNodes; + // Dynamic command-groups which will be updated + std::vector MDynCGs; std::shared_ptr MGraph; std::vector MValueStorage; }; +class dynamic_command_group_impl + : public std::enable_shared_from_this { +public: + dynamic_command_group_impl( + const command_graph &Graph); + + /// Returns the index of the active command-group + size_t getActiveIndex() const { return MActiveCGF; } + + /// Returns the number of CGs in the dynamic command-group. + size_t getNumCGs() const { return MKernels.size(); } + + /// Set the index of the active command-group. + /// @param Index The new index. + void setActiveIndex(size_t Index); + + /// Instantiates a command-group object for each CGF in the list. + /// @param CGFList List of CGFs to finalize with a handler into CG objects. + void + finalizeCGFList(const std::vector> &CGFList); + + /// Retrieve CG at the currently active index + /// @param Shared pointer to the active CG object. + std::shared_ptr getActiveKernel() const { + return MKernels[MActiveCGF]; + } + + /// Graph this dynamic command-group is associated with. + std::shared_ptr MGraph; + + /// Index of active command-group + std::atomic MActiveCGF; + + /// List of kernel command-groups for dynamic command-group nodes + std::vector> MKernels; + + /// List of nodes using this dynamic command-group. + std::vector> MNodes; +}; } // namespace detail } // namespace experimental } // namespace oneapi diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 2c2a4963bed98..e452eca0c8a6d 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -152,10 +152,6 @@ class handler_impl { ext::oneapi::experimental::detail::dynamic_parameter_impl *, int>> MDynamicParameters; - // Track whether an NDRange was used when submitting a kernel (as opposed to a - // range), needed for graph update - bool MNDRangeUsed = false; - /// The storage for the arguments passed. /// We need to store a copy of values that are passed explicitly through /// set_arg, require and so on, because we need them to be alive after diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3785e1dd351f9..87b170c94c4bb 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2487,11 +2487,22 @@ ur_result_t enqueueImpCommandBufferKernel( const std::function &getMemAllocationFunc) { auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); + + const std::vector> + &AlternativeKernels = CommandGroup.MAlternativeKernels; + + // UR kernel and program for 'CommandGroup' ur_kernel_handle_t UrKernel = nullptr; ur_program_handle_t UrProgram = nullptr; + + // Impl objects created when 'CommandGroup' is from a kernel bundle std::shared_ptr SyclKernelImpl = nullptr; std::shared_ptr DeviceImageImpl = nullptr; + // List of ur objects to be released after UR call + std::vector UrKernelsToRelease; + std::vector UrProgramsToRelease; + auto Kernel = CommandGroup.MSyclKernel; auto KernelBundleImplPtr = CommandGroup.MKernelBundle; const KernelArgMask *EliminatedArgMask = nullptr; @@ -2520,6 +2531,42 @@ ur_result_t enqueueImpCommandBufferKernel( std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, CommandGroup.MKernelName); + UrKernelsToRelease.push_back(UrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + // Build up the list of UR kernel handles that the UR command could be + // updated to use. + std::vector AltUrKernels; + for (const auto &AltCGKernelWP : AlternativeKernels) { + auto AltCGKernel = AltCGKernelWP.lock(); + assert(AltCGKernel != nullptr); + + ur_kernel_handle_t AltUrKernel = nullptr; + if (auto KernelBundleImplPtr = AltCGKernel->MKernelBundle; + KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + auto KernelName = AltCGKernel->MKernelName; + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); + AltUrKernel = detail::getSyclObjImpl(SyclKernel)->getHandleRef(); + } else if (AltCGKernel->MSyclKernel != nullptr) { + AltUrKernel = Kernel->getHandleRef(); + } else { + ur_program_handle_t UrProgram = nullptr; + std::tie(AltUrKernel, std::ignore, std::ignore, UrProgram) = + sycl::detail::ProgramManager::getInstance().getOrCreateKernel( + ContextImpl, DeviceImpl, AltCGKernel->MKernelName); + UrKernelsToRelease.push_back(AltUrKernel); + UrProgramsToRelease.push_back(UrProgram); + } + + if (AltUrKernel != UrKernel) { + // Don't include command-group 'CommandGroup' in the list to pass to UR, + // as this will be used for the primary ur kernel parameter. + AltUrKernels.push_back(AltUrKernel); + } } auto SetFunc = [&Adapter, &UrKernel, &DeviceImageImpl, &Ctx, @@ -2572,14 +2619,17 @@ ur_result_t enqueueImpCommandBufferKernel( ur_result_t Res = Adapter->call_nocheck( CommandBuffer, UrKernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], LocalSize, 0, nullptr, SyncPoints.size(), - SyncPoints.size() ? SyncPoints.data() : nullptr, 0, nullptr, - OutSyncPoint, nullptr, + &NDRDesc.GlobalSize[0], LocalSize, AltUrKernels.size(), + AltUrKernels.size() ? AltUrKernels.data() : nullptr, + SyncPoints.size(), SyncPoints.size() ? SyncPoints.data() : nullptr, 0, + nullptr, OutSyncPoint, nullptr, CommandBufferDesc.isUpdatable ? OutCommand : nullptr); - if (!SyclKernelImpl && !Kernel) { - Adapter->call(UrKernel); - Adapter->call(UrProgram); + for (auto &Kernel : UrKernelsToRelease) { + Adapter->call(Kernel); + } + for (auto &Program : UrProgramsToRelease) { + Adapter->call(Program); } if (Res != UR_RESULT_SUCCESS) { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 61ea7ee7be0a0..adb1e2ee50796 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -555,11 +555,9 @@ event handler::finalize() { // Find the last node added to the graph from this queue, so our new // node can set it as a predecessor. auto DependentNode = GraphImpl->getLastInorderNode(MQueue); - - NodeImpl = DependentNode - ? GraphImpl->add(NodeType, std::move(CommandGroup), - {DependentNode}) - : GraphImpl->add(NodeType, std::move(CommandGroup)); + std::vector> + Deps = {DependentNode}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); // If we are recording an in-order queue remember the new node, so it // can be used as a dependency for any more nodes recorded from this @@ -567,12 +565,9 @@ event handler::finalize() { GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue); - if (LastBarrierRecordedFromQueue) { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), - {LastBarrierRecordedFromQueue}); - } else { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); - } + std::vector> + Deps = {LastBarrierRecordedFromQueue}; + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) { GraphImpl->setBarrierDep(MQueue, NodeImpl); @@ -582,8 +577,6 @@ event handler::finalize() { // Associate an event with this new node and return the event. GraphImpl->addEventForNode(EventImpl, NodeImpl); - NodeImpl->MNDRangeUsed = impl->MNDRangeUsed; - return detail::createSyclObjFromImpl(EventImpl); } @@ -2008,7 +2001,9 @@ std::tuple, bool> handler::getMaxWorkGroups_v2() { return {std::array{0, 0, 0}, false}; } -void handler::setNDRangeUsed(bool Value) { impl->MNDRangeUsed = Value; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +void handler::setNDRangeUsed(bool Value) { (void)Value; } +#endif void handler::registerDynamicParameter( ext::oneapi::experimental::detail::dynamic_parameter_base &DynamicParamBase, diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp new file mode 100644 index 0000000000000..2b5f378d8bed7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using dynamic command-group objects with buffer accessors + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + auto Acc = Buf.get_access(); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.require(Acc); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(Acc, HostData.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp new file mode 100644 index 0000000000000..5ce7a4bf40df1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + auto RootNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 1; }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for( + N, [=](item<1> Item) { Ptr[Item.get_id()] = Acc[Item.get_id()]; }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternA + 1)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (PatternB + 1)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp new file mode 100644 index 0000000000000..8d50b8b26e0c2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using buffer +// accessors for the node edges, but where different command-groups +// use different buffers that create identical edges. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + const size_t N = 1024; + int *Ptr = (int *)sycl::malloc_device(N, Queue); + std::vector HostData(N, 0); + buffer BufA{sycl::range<1>(N)}; + buffer BufB{sycl::range<1>(N)}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + int InitA = 4; + int InitB = -4; + auto RootNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + AccA[Item.get_id()] = InitA; + AccB[Item.get_id()] = InitB; + }); + }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto LeafNode = Graph.add([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = AccA[Item.get_id()] + AccB[Item.get_id()]; + }); + }); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternA)); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == (InitA + InitB + PatternB)); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp new file mode 100644 index 0000000000000..80556f60fc75f --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out %S/../Inputs/Kernels/update_with_indices_accessor.spv 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// REQUIRES: level_zero +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating an accessor argument to a graph node created from SPIR-V +// using dynamic command-groups. + +#include "../graph_common.hpp" + +int main(int, char **argv) { + queue Queue{}; + sycl::kernel_bundle KernelBundle = loadKernelsFromFile(Queue, argv[1]); + const auto getKernel = + [](sycl::kernel_bundle &bundle, + const std::string &name) { + return bundle.ext_oneapi_get_kernel(name); + }; + + kernel kernel = getKernel( + KernelBundle, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_"); + + const size_t N = 1024; + + exp_ext::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + std::vector HostDataA(N, 0); + std::vector HostDataB(N, 0); + + buffer BufA{HostDataA}; + buffer BufB{HostDataB}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + auto AccA = BufA.get_access(); + auto AccB = BufB.get_access(); + + auto CGFA = [&](handler &CGH) { + CGH.require(AccA); + CGH.set_arg(0, AccA); + CGH.single_task(kernel); + }; + + auto CGFB = [&](handler &CGH) { + CGH.require(AccB); + CGH.set_arg(0, AccB); + CGH.single_task(kernel); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Queue, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); + Queue.copy(BufB.get_access(), HostDataB.data()).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp new file mode 100644 index 0000000000000..e1602864b44a0 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp @@ -0,0 +1,148 @@ +// RUN: %{build} -o %t.out +// RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group with command-groups containing a +// different number of arguments. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + // 3 kernel arguments: Ptr, PatternA, PatternB + int PatternA = 42; + int PatternB = 0xA; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for( + N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA + PatternB; }); + }; + + // 2 kernel arguments: Ptr, MyPatternStruct + struct PatternStruct { + int PatternA; + int PatternB; + }; + PatternStruct MyPatternStruct{PatternA + 1, PatternB + 1}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = MyPatternStruct.PatternA + MyPatternStruct.PatternB; + }); + }; + + // 1 kernel argument: Ptr + auto CGFC = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 42 - 0xA; }); + }; + + // 4 kernel argument: Ptr + int PatternC = -12; + auto CGFD = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { + Ptr[Item.get_id()] = PatternA + PatternB + PatternC; + }); + }; + + // CHECK: <--- urKernelSetArgPointer( + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1:[0-9a-fA-Fx]+]] + // CHECL-SAME: .argIndex = 0 + + // CHECK: <--- urKernelSetArgValue + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .argIndex = 1 + + // CHECK: <--- urKernelSetArgValue + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .argIndex = 2 + + // CHECK: <--- urCommandBufferAppendKernelLaunchExp + // CHECK-SAME: .hKernel = [[KERNEL_HANDLE1]] + // CHECK-SAME: .numKernelAlternatives = 3 + // CHECK-SAME: .phKernelAlternatives = {[[KERNEL_HANDLE2:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE3:[0-9a-fA-Fx]+]], [[KERNEL_HANDLE4:[0-9a-fA-Fx]+]]} + auto DynamicCG = + exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC, CGFD}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Verify CGFA works with 3 arguments + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + int Ref = PatternA + PatternB; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFB works with 2 arguments + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE2]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 1 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 1 + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = (PatternA + 1) + (PatternB + 1); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFC works with 1 argument + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE3]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = PatternA - PatternB; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + // Verify CGFD works with 4 arguments + // CHECK: <--- urCommandBufferUpdateKernelLaunchExp + // CHECK-SAME: .hNewKernel = [[KERNEL_HANDLE4]] + // CHECK-SAME: .numNewMemObjArgs = 0 + // CHECK-SAME: .numNewPointerArgs = 1 + // CHECK-SAME: .numNewValueArgs = 3 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC + // CHECK-SAME: .argIndex = 0 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 1 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 2 + // CHECK-SAME: .stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC + // CHECK-SAME: .argIndex = 3 + DynamicCG.set_active_cgf(3); + ExecGraph.update(DynamicCGNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + Ref = PatternA + PatternB + PatternC; + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp new file mode 100644 index 0000000000000..11e28a033a4c2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp @@ -0,0 +1,73 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests adding a dynamic command-group node to a graph using graph limited +// events for dependencies. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + std::vector HostData(N); + + Graph.begin_recording(Queue); + int PatternA = 42; + auto EventA = Queue.fill(PtrA, PatternA, N); + int PatternB = 0xA; + auto EventB = Queue.fill(PtrB, PatternB, N); + Graph.end_recording(Queue); + + auto CGFA = [&](handler &CGH) { + CGH.depends_on({EventA, EventB}); + CGH.parallel_for(N, [=](item<1> Item) { + auto I = Item.get_id(); + PtrC[I] = PtrA[I] * PtrB[I]; + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.depends_on({EventA, EventB}); + CGH.parallel_for(N, [=](item<1> Item) { + auto I = Item.get_id(); + PtrC[I] = PtrA[I] + PtrB[I]; + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrC, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA * PatternB); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrC, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA + PatternB); + } + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp new file mode 100644 index 0000000000000..f4717210bb35e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp @@ -0,0 +1,72 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node where the dynamic command-groups +// have different ranges/nd-ranges + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + auto RootNode = + Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + sycl::range<1> RangeA{512}; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(RangeA, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + size_t UpdatedN = 256; + sycl::nd_range<1> RangeB{sycl::range{UpdatedN}, sycl::range{16}}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for( + RangeB, [=](nd_item<1> Item) { Ptr[Item.get_global_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < RangeA.get(0)) { + assert(HostData[i] == PatternA); + } else { + assert(HostData[i] == 0); + } + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < UpdatedN) { + assert(HostData[i] == PatternB); + } else { + assert(HostData[i] == 0); + } + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp new file mode 100644 index 0000000000000..f6390df64303a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node where the dynamic command-groups +// have different ranges/nd-ranges + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 64; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + auto RootNode = + Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + sycl::range<1> RangeA{N}; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(RangeA, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + sycl::nd_range<3> RangeB{sycl::range{4, 4, 4}, sycl::range{2, 2, 2}}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(RangeB, [=](nd_item<3> Item) { + Ptr[Item.get_global_linear_id()] = PatternB; + }); + }; + + int PatternC = 7; + sycl::range<2> RangeC{8, 8}; + auto CGFC = [&](handler &CGH) { + CGH.parallel_for( + RangeC, [=](item<2> Item) { Ptr[Item.get_linear_id()] = PatternC; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternC); + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp new file mode 100644 index 0000000000000..3ba2500cd6189 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp @@ -0,0 +1,59 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests how the nd-range of a node is overwritten by the active command-group + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + std::vector HostData(N); + int *Ptr = malloc_device(N, Queue); + Queue.memset(Ptr, 0, N * sizeof(int)).wait(); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + size_t NewRange = 512; + sycl::range<1> UpdateRange(NewRange); + DynamicCGNode.update_range(UpdateRange); + + DynamicCG.set_active_cgf(1); + + // Check that the UpdateRange from active CGF 0 is preserved + DynamicCG.set_active_cgf(0); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + if (i < NewRange) { + assert(HostData[i] == PatternA); + } else { + assert(HostData[i] == 0); + } + } + + sycl::free(Ptr, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp new file mode 100644 index 0000000000000..0c46672869c7d --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp @@ -0,0 +1,70 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating kernel code using dynamic command-groups that have different +// parameters in each command-group. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.wait(); + + int PatternA = 0xA; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = PatternA; }); + }; + + int PatternB = 42; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == PatternA); + assert(HostDataB[i] == 0); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == PatternA); + assert(HostDataB[i] == PatternB); + } + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp new file mode 100644 index 0000000000000..f9b0728d8ea67 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp @@ -0,0 +1,74 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using the same dynamic command-group in more than one graph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + std::vector HostData(N); + int *Ptr = malloc_device(N, Queue); + + auto RootNode = + Graph.add([&](handler &CGH) { CGH.memset(Ptr, 0, N * sizeof(int)); }); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto Node1 = + Graph.add(DynamicCG, exp_ext::property::node::depends_on(RootNode)); + + auto Node2 = Graph.add( + [&](handler &cgh) { + cgh.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] *= 2; }); + }, + exp_ext::property::node::depends_on(Node1)); + + auto Node3 = Graph.add(DynamicCG, exp_ext::property::node::depends_on(Node2)); + + // This ND-Range affects Node 1 as well, as the range is tied to the node. + sycl::range<1> Node3Range(512); + Node3.update_range(Node3Range); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + int Ref = (i < Node3Range.get(0)) ? (PatternA * 3) : 0; + assert(HostData[i] == Ref); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(Node1); + ExecGraph.update(Node3); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + int Ref = (PatternB * 3); + assert(HostData[i] == Ref); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp new file mode 100644 index 0000000000000..a9109d000eb17 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp @@ -0,0 +1,48 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating a dynamic command-group node after it has been added to +// a graph but before the graph has been finalized + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + DynamicCG.set_active_cgf(1); + auto ExecGraph = Graph.finalize(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp new file mode 100644 index 0000000000000..79db8ebe67c57 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp @@ -0,0 +1,55 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests updating usm kernel code using dynamic command-groups + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp new file mode 100644 index 0000000000000..cb9bdf15f76b8 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp @@ -0,0 +1,121 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object with dynamic parameters inside it + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + exp_ext::dynamic_parameter DynParam1(Graph, PtrA); + exp_ext::dynamic_parameter DynParam2(Graph, PtrC); + + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParam1); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParam1); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFC = [&](handler &CGH) { + CGH.set_arg(0, DynParam2); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = i; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto KernelNode = Graph.add( + [&](handler &CGH) { + CGH.set_arg(0, DynParam2); + // TODO: Use the free function kernel extension instead of regular + // kernels when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] += i; + } + }); + }, + exp_ext::property::node::depends_on(DynamicCGNode)); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (A ? i : 0)); + assert(HostDataB[i] == (B ? i : 0)); + assert(HostDataC[i] == (C ? (2 * i) : i)); + } + }; + ExecuteGraphAndVerifyResults(true, false, false); + + DynParam1.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + DynamicCG.set_active_cgf(2); + // Should be ignored as DynParam1 not used in active node + DynParam1.update(PtrA); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, false, true); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp new file mode 100644 index 0000000000000..15f815664a740 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp @@ -0,0 +1,137 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object with dynamic parameters of +// different types + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + int ScalarValue = 17; + exp_ext::dynamic_parameter DynParamScalar(Graph, ScalarValue); + exp_ext::dynamic_parameter DynParamPtr(Graph, PtrA); + + // Kernel has 2 dynamic parameters, one of scalar type & one of ptr type + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParamPtr); + CGH.set_arg(1, DynParamScalar); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + // Kernel has a single argument, a dynamic parameter of ptr type + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParamPtr); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + // Kernel has a two arguments, an immutable ptr type argument and a + // dynamic parameter of scalar type. + auto CGFC = [&](handler &CGH) { + CGH.set_arg(1, DynParamScalar); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = ScalarValue; + } + }); + }; + + // Kernel has a single argument, of immutable pointer type + auto CGFD = [&](handler &CGH) { + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = ScalarValue; + } + }); + }; + + auto DynamicCG = + exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC, CGFD}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](int A, int B, int C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == A); + assert(HostDataB[i] == B); + assert(HostDataC[i] == C); + } + }; + // CGFA using PtrA and ScalarValue in its dynamic parameters + ExecuteGraphAndVerifyResults(ScalarValue, 0, 0); + + // CGFA using PtrB and UpdatedScalarValue in its dynamic parameters + DynParamPtr.update(PtrB); + int UpdatedScalarValue = 42; + DynParamScalar.update(UpdatedScalarValue); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, UpdatedScalarValue, 0); + + // CGFB using PtrB in its dynamic parameter and immutable ScalarValue + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, ScalarValue, false); + + // CGFC using immutable PtrC and UpdatedScalarValue in its dynamic parameter + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(0, 0, UpdatedScalarValue); + + // CGFD using immutable PtrA and immutable ScalarValue for arguments + DynamicCG.set_active_cgf(3); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(ScalarValue, 0, 0); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp new file mode 100644 index 0000000000000..264c1b6849689 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp @@ -0,0 +1,107 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests using a dynamic command-group object where some but not all the +// command-groups use dynamic parameters. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + int *PtrC = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + std::vector HostDataC(N); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + auto CGFA = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.set_arg(0, DynParam); + // TODO: Use the free function kernel extension instead of regular kernels + // when available. + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrA[i] = i; + } + }); + }; + + auto CGFC = [&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < N; i++) { + PtrC[i] = i; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB, CGFC}); + auto DynamicCGNode = Graph.add(DynamicCG); + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { + Queue.memset(PtrA, 0, N * sizeof(int)); + Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N); + Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrC, HostDataC.data(), N); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (A ? i : 0)); + assert(HostDataB[i] == (B ? i : 0)); + assert(HostDataC[i] == (C ? i : 0)); + } + }; + // CGFA with DynParam using PtrA + ExecuteGraphAndVerifyResults(true, false, false); + + // CGFA with DynParam using PtrB + DynParam.update(PtrB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + // CGFB with DynParam using PtrB + DynamicCG.set_active_cgf(1); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, false); + + // CGFC unconditionally using PtrC + DynamicCG.set_active_cgf(2); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, false, true); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp new file mode 100644 index 0000000000000..0f1c10e5142bf --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp @@ -0,0 +1,55 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests updating a graph node from sycl::nd_range to sycl::range + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + nd_range<1> NDRange{range{N}, range{32}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + KernelNode.update_range(range<1>{512}); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp new file mode 100644 index 0000000000000..9489d20c6a916 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp @@ -0,0 +1,56 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests updating a graph node from using a sycl::range to a sycl::nd_range + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + const size_t N = 1024; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + + std::vector HostDataA(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + + range<1> Range{1024}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(Range, [=](item<1> Item) { + size_t GlobalID = Item.get_id(); + PtrA[GlobalID] += GlobalID; + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // first half of PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i); + } + + // Update NDRange to target first half only + nd_range<1> NDRange{range{512}, range{32}}; + KernelNode.update_nd_range(NDRange); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + } + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp new file mode 100644 index 0000000000000..03a0e19f8c51e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp @@ -0,0 +1,75 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// XFAIL: level_zero +// XFAIL-TRACKER: OFNAAO-307 + +// Tests interaction of whole graph update and dynamic command-groups + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + + const size_t N = 1024; + int *Ptr = malloc_device(N, Queue); + std::vector HostData(N); + + int PatternA = 42; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + }; + + int PatternB = 0xA; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + }; + + auto DynamicCGA = exp_ext::dynamic_command_group(GraphA, {CGFA, CGFB}); + auto DynamicCGNodeA = GraphA.add(DynamicCGA); + + auto DynamicCGB = exp_ext::dynamic_command_group(GraphB, {CGFA, CGFB}); + auto DynamicCGNodeB = GraphB.add(DynamicCGB); + DynamicCGB.set_active_cgf(1); // Check if doesn't affect GraphA + + auto ExecGraph = GraphA.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternA); + } + + // Graph B has CGF B as active, while Graph A has CGF A as active. + // Different command-groups should error due to being different + // kernels. + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + ExecGraph.update(GraphB); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + // Both ExecGraph and Graph B have CGFB as active, so + // whole graph update should be valid as graphs match. + DynamicCGA.set_active_cgf(1); + ExecGraph.update(DynamicCGNodeA); + ExecGraph.update(GraphB); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(Ptr, HostData.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostData[i] == PatternB); + } + + sycl::free(Ptr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp index c9a4922e7fd46..b894685a8bd87 100644 --- a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp @@ -5,8 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// REQUIRES: aspect-usm_shared_allocations - // Tests that whole graph update works when using dynamic parameters. #include "../graph_common.hpp" diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 11d85801727c7..621765568d50c 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3023,6 +3023,9 @@ _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_desc _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmRKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental20pitched_alloc_deviceEPmmmjRKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_group14set_active_cgfEm +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC1ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE +_ZN4sycl3_V13ext6oneapi12experimental21dynamic_command_groupC2ERKNS3_13command_graphILNS3_11graph_stateE0EEERKSt6vectorISt8functionIFvRNS0_7handlerEEESaISF_EE _ZN4sycl3_V13ext6oneapi12experimental21get_composite_devicesEv _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental22get_image_channel_typeENS3_16image_mem_handleERKNS0_6deviceERKNS0_7contextE @@ -3081,6 +3084,7 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_re _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph15begin_recordingERNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph24addGraphLeafDependenciesENS3_4nodeE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERKSt6vectorINS3_4nodeESaIS7_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplERNS3_21dynamic_command_groupERKSt6vectorINS3_4nodeESaIS9_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISC_EE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graph9make_edgeERNS3_4nodeES7_ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5queueERKNS0_13property_listE @@ -3597,6 +3601,7 @@ _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem10get_deviceEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem11get_contextEv _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem3mapEmmNS3_19address_access_modeEm _ZNK4sycl3_V13ext6oneapi12experimental12physical_mem4sizeEv +_ZNK4sycl3_V13ext6oneapi12experimental21dynamic_command_group14get_active_cgfEv _ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2b6dbdb2fe52b..b0b7fc3f0112d 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -326,6 +326,15 @@ ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z +?get_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEAVdynamic_command_group@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV?$command_graph@$0A@@12345@AEBV?$vector@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V?$allocator@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@@2@@std@@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +?set_active_cgf@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z +??1dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 1542b5f34d7dc..90d95975a0245 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -700,3 +700,166 @@ TEST_F(CommandGraphTest, RecordingWrongGraphDep) { }), sycl::exception); } + +// Error when a dynamic command-group is used with a graph belonging to a +// different graph. +TEST_F(CommandGraphTest, DynamicCommandGroupWrongGraph) { + experimental::command_graph Graph1{Queue.get_context(), Queue.get_device()}; + experimental::command_graph Graph2{Queue.get_context(), Queue.get_device()}; + auto CGF = [&](sycl::handler &CGH) { + CGH.single_task>([]() {}); + }; + + experimental::dynamic_command_group DynCG(Graph2, {CGF}); + ASSERT_THROW(Graph1.add(DynCG), sycl::exception); +} + +// Error when a non-kernel command-group is included in a dynamic command-group +TEST_F(CommandGraphTest, DynamicCommandGroupNotKernel) { + int *Ptr = malloc_device(1, Queue); + auto CGF = [&](sycl::handler &CGH) { CGH.memset(Ptr, 1, 0); }; + + experimental::command_graph Graph{Queue}; + ASSERT_THROW(experimental::dynamic_command_group DynCG(Graph, {CGF}), + sycl::exception); + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using graph limited events to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchEventEdges) { + size_t N = 32; + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = 1; }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + CGH.depends_on(EventA); + CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); +} + +// Test that an exception is thrown when a graph isn't created with buffer +// property, but buffers are used. +TEST_F(CommandGraphTest, DynamicCommandGroupBufferThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + auto CGFA = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +} + +// Test and exception is thrown when using a host-accessor to a buffer +// used in a non active CGF node in the graph. +TEST_F(CommandGraphTest, DynamicCommandGroupBufferHostAccThrows) { + size_t N = 32; + std::vector HostData(N, 0); + buffer Buf{HostData}; + Buf.set_write_back(false); + + int *Ptr = malloc_device(N, Queue); + + { + ext::oneapi::experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_NO_THROW(Graph.add(DynCG)); + + ASSERT_THROW({ host_accessor HostAcc{Buf}; }, sycl::exception); + } + + sycl::free(Ptr, Queue); +} + +// Error if edges are not the same for all command-groups in dynamic command +// group, test using accessors to create edges +TEST_F(CommandGraphTest, DynamicCommandGroupMismatchAccessorEdges) { + size_t N = 32; + std::vector HostData(N, 0); + buffer BufA{HostData}; + buffer BufB{HostData}; + BufA.set_write_back(false); + BufB.set_write_back(false); + + experimental::command_graph Graph{ + Queue.get_context(), + Queue.get_device(), + {experimental::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] = 1; }); + }); + + Queue.submit([&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] = 4; }); + }); + + Graph.end_recording(); + + auto CGFA = [&](handler &CGH) { + auto AccA = BufA.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += 2; }); + }; + + auto CGFB = [&](handler &CGH) { + auto AccB = BufB.get_access(CGH); + CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += 0xA; }); + }; + + experimental::dynamic_command_group DynCG(Graph, {CGFA, CGFB}); + ASSERT_THROW(Graph.add(DynCG), sycl::exception); +} diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp index ff66af25d9e83..676e3bead1416 100644 --- a/sycl/unittests/Extensions/CommandGraph/Update.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -134,35 +134,6 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { })); } -TEST_F(CommandGraphTest, UpdateRangeErrors) { - // Test that the correct errors are throw when trying to update node ranges - - nd_range<1> NDRange{range{128}, range{32}}; - range<1> Range{128}; - auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { - cgh.parallel_for>(NDRange, [](nd_item<1>) {}); - }); - - // OK - EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); - // Can't update an nd_range node with a range - EXPECT_ANY_THROW(NodeNDRange.update_range(Range)); - // Can't update with a different number of dimensions - EXPECT_ANY_THROW(NodeNDRange.update_nd_range( - nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); - - auto NodeRange = Graph.add([&](sycl::handler &cgh) { - cgh.parallel_for>(range<1>{128}, [](item<1>) {}); - }); - - // OK - EXPECT_NO_THROW(NodeRange.update_range(Range)); - // Can't update a range node with an nd_range - EXPECT_ANY_THROW(NodeRange.update_nd_range(NDRange)); - // Can't update with a different number of dimensions - EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); -} - class WholeGraphUpdateTest : public CommandGraphTest { protected: From e78f24997c2a276b7a1d1559944f472cde42c7c4 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 23 Oct 2024 10:32:15 +0100 Subject: [PATCH 2/4] Refactor scheduler code Create helper function for getting UR details out of CG. --- sycl/source/detail/scheduler/commands.cpp | 101 ++++++++++------------ 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 87b170c94c4bb..52e8783c4f28d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2477,34 +2477,16 @@ static ur_result_t SetKernelParamsAndLaunch( return Error; } -ur_result_t enqueueImpCommandBufferKernel( - context Ctx, DeviceImplPtr DeviceImpl, - ur_exp_command_buffer_handle_t CommandBuffer, - const CGExecKernel &CommandGroup, - std::vector &SyncPoints, - ur_exp_command_buffer_sync_point_t *OutSyncPoint, - ur_exp_command_buffer_command_handle_t *OutCommand, - const std::function &getMemAllocationFunc) { - auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); - const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); - - const std::vector> - &AlternativeKernels = CommandGroup.MAlternativeKernels; +namespace { +std::tuple, + const KernelArgMask *> +getCGKernelInfo(const CGExecKernel &CommandGroup, ContextImplPtr ContextImpl, + DeviceImplPtr DeviceImpl, + std::vector &UrKernelsToRelease, + std::vector &UrProgramsToRelease) { - // UR kernel and program for 'CommandGroup' ur_kernel_handle_t UrKernel = nullptr; - ur_program_handle_t UrProgram = nullptr; - - // Impl objects created when 'CommandGroup' is from a kernel bundle - std::shared_ptr SyclKernelImpl = nullptr; std::shared_ptr DeviceImageImpl = nullptr; - - // List of ur objects to be released after UR call - std::vector UrKernelsToRelease; - std::vector UrProgramsToRelease; - - auto Kernel = CommandGroup.MSyclKernel; - auto KernelBundleImplPtr = CommandGroup.MKernelBundle; const KernelArgMask *EliminatedArgMask = nullptr; // Use kernel_bundle if available unless it is interop. @@ -2512,63 +2494,74 @@ ur_result_t enqueueImpCommandBufferKernel( // in interop kernel bundles (if any) do not have kernel_id // and can therefore not be looked up, but since they are self-contained // they can simply be launched directly. - if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { + if (auto KernelBundleImplPtr = CommandGroup.MKernelBundle; + KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { auto KernelName = CommandGroup.MKernelName; kernel_id KernelID = detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel SyclKernel = KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); - SyclKernelImpl = detail::getSyclObjImpl(SyclKernel); + + auto SyclKernelImpl = detail::getSyclObjImpl(SyclKernel); UrKernel = SyclKernelImpl->getHandleRef(); DeviceImageImpl = SyclKernelImpl->getDeviceImage(); - UrProgram = DeviceImageImpl->get_ur_program_ref(); EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); - } else if (Kernel != nullptr) { + } else if (auto Kernel = CommandGroup.MSyclKernel; Kernel != nullptr) { UrKernel = Kernel->getHandleRef(); - UrProgram = Kernel->getProgramRef(); EliminatedArgMask = Kernel->getKernelArgMask(); } else { + ur_program_handle_t UrProgram = nullptr; std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, CommandGroup.MKernelName); UrKernelsToRelease.push_back(UrKernel); UrProgramsToRelease.push_back(UrProgram); } + return std::make_tuple(UrKernel, DeviceImageImpl, EliminatedArgMask); +} +} // anonymous namespace + +ur_result_t enqueueImpCommandBufferKernel( + context Ctx, DeviceImplPtr DeviceImpl, + ur_exp_command_buffer_handle_t CommandBuffer, + const CGExecKernel &CommandGroup, + std::vector &SyncPoints, + ur_exp_command_buffer_sync_point_t *OutSyncPoint, + ur_exp_command_buffer_command_handle_t *OutCommand, + const std::function &getMemAllocationFunc) { + // List of ur objects to be released after UR call. We don't do anything + // with the ur_program_handle_t objects, but need to update their reference + // count. + std::vector UrKernelsToRelease; + std::vector UrProgramsToRelease; + + ur_kernel_handle_t UrKernel = nullptr; + std::shared_ptr DeviceImageImpl = nullptr; + const KernelArgMask *EliminatedArgMask = nullptr; + + auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx); + std::tie(UrKernel, DeviceImageImpl, EliminatedArgMask) = + getCGKernelInfo(CommandGroup, ContextImpl, DeviceImpl, UrKernelsToRelease, + UrProgramsToRelease); // Build up the list of UR kernel handles that the UR command could be // updated to use. std::vector AltUrKernels; + const std::vector> + &AlternativeKernels = CommandGroup.MAlternativeKernels; for (const auto &AltCGKernelWP : AlternativeKernels) { auto AltCGKernel = AltCGKernelWP.lock(); assert(AltCGKernel != nullptr); ur_kernel_handle_t AltUrKernel = nullptr; - if (auto KernelBundleImplPtr = AltCGKernel->MKernelBundle; - KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { - auto KernelName = AltCGKernel->MKernelName; - kernel_id KernelID = - detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); - kernel SyclKernel = - KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); - AltUrKernel = detail::getSyclObjImpl(SyclKernel)->getHandleRef(); - } else if (AltCGKernel->MSyclKernel != nullptr) { - AltUrKernel = Kernel->getHandleRef(); - } else { - ur_program_handle_t UrProgram = nullptr; - std::tie(AltUrKernel, std::ignore, std::ignore, UrProgram) = - sycl::detail::ProgramManager::getInstance().getOrCreateKernel( - ContextImpl, DeviceImpl, AltCGKernel->MKernelName); - UrKernelsToRelease.push_back(AltUrKernel); - UrProgramsToRelease.push_back(UrProgram); - } - - if (AltUrKernel != UrKernel) { - // Don't include command-group 'CommandGroup' in the list to pass to UR, - // as this will be used for the primary ur kernel parameter. - AltUrKernels.push_back(AltUrKernel); - } + std::tie(AltUrKernel, std::ignore, std::ignore) = + getCGKernelInfo(*AltCGKernel.get(), ContextImpl, DeviceImpl, + UrKernelsToRelease, UrProgramsToRelease); + AltUrKernels.push_back(AltUrKernel); } + const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); auto SetFunc = [&Adapter, &UrKernel, &DeviceImageImpl, &Ctx, &getMemAllocationFunc](sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) { From 89193f01f1b39c6985e4b64a93252319a847f7cd Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 23 Oct 2024 12:29:19 +0100 Subject: [PATCH 3/4] Reintroduce unittest for range update errors --- sycl/doc/design/CommandGraph.md | 15 ++++---- sycl/source/detail/graph_impl.cpp | 1 - sycl/source/detail/graph_impl.hpp | 9 ++--- .../Graph/Update/dyn_cgf_accessor.cpp | 15 ++++---- .../Graph/Update/dyn_cgf_accessor_deps.cpp | 33 +++++++++-------- .../Graph/Update/dyn_cgf_accessor_deps2.cpp | 31 ++++++++-------- .../Graph/Update/dyn_cgf_accessor_spv.cpp | 16 ++++----- .../Update/dyn_cgf_different_arg_nums.cpp | 30 ++++++++-------- .../Graph/Update/dyn_cgf_event_deps.cpp | 25 +++++++------ .../test-e2e/Graph/Update/dyn_cgf_ndrange.cpp | 26 +++++++------- .../Graph/Update/dyn_cgf_ndrange_3D.cpp | 2 +- .../Graph/Update/dyn_cgf_overwrite_range.cpp | 19 +++++----- .../Graph/Update/dyn_cgf_parameters.cpp | 31 ++++++++-------- .../Graph/Update/dyn_cgf_shared_nodes.cpp | 27 +++++++------- .../Update/dyn_cgf_update_before_finalize.cpp | 15 ++++---- sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp | 19 +++++----- .../Update/dyn_cgf_with_all_dyn_params.cpp | 35 +++++++++---------- ...dyn_cgf_with_different_type_dyn_params.cpp | 35 +++++++++---------- .../Update/dyn_cgf_with_some_dyn_params.cpp | 33 +++++++++-------- .../Graph/Update/update_ndrange_to_range.cpp | 23 ++++++------ .../Graph/Update/update_range_to_ndrange.cpp | 23 ++++++------ .../Graph/Update/whole_update_dynamic_cgf.cpp | 19 +++++----- .../Extensions/CommandGraph/Update.cpp | 31 ++++++++++++++++ 23 files changed, 274 insertions(+), 239 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index d7587113a4615..23d32fd9ff7e2 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -291,18 +291,17 @@ from a `std::unique_ptr` to a `std::shared_ptr` so that multiple nodes and the the overhead of having to allocate and free copies of the CG when a new active CG is selected. -The `dynamic_command_group_impl` class contains weak pointers to the nodes which -have been created with it, so that when a new active CG is selected it can -propagate the change to those nodes. The `node_impl` class also contains a -reference to the dynamic command-group that created it, so that when the graph -is finalized each node can use the list of kernels in its dynamic command-group -as part of the `urCommandBufferAppendKernelLaunchExp` call to pass the possible -alternative kernels. +The `dynamic_command_group_impl` class contains a list of weak pointers to the +nodes which have been created with it, so that when a new active CG is selected +it can propagate the change to those nodes. The `dynamic_parameter_impl` class +also contains a list of weak pointers, but to the `dynamic_command_group_impl` +instances of any dynamic command-groups where they are used. This allows +updating the dynamic parameter to propagate to dynamic command-group nodes. The `sycl::detail::CGExecKernel` class has been added to, so that if the object was created from an element in the dynamic command-group list, the class stores a vector of weak pointers to the other alternative command-groups created -from the same dynamic command-group object. This allows the DPC++ scheduler to +from the same dynamic command-group object. This allows the SYCL runtime to access the list of alternative kernels when calling the UR API to append a kernel command to a command-buffer. diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index ad10a3bdeefaa..0c9394045acfa 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -562,7 +562,6 @@ graph_impl::add(std::shared_ptr &DynCGImpl, // Track the dynamic command-group used inside the node object DynCGImpl->MNodes.push_back(NodeImpl); - NodeImpl->MDynCG = DynCGImpl; return NodeImpl; } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 3b1fc3fa01641..11b432f208ea1 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -99,8 +99,6 @@ class node_impl : public std::enable_shared_from_this { /// Stores the executable graph impl associated with this node if it is a /// subgraph node. std::shared_ptr MSubGraphImpl; - /// Dynamic command-group object used in node, if any. - std::shared_ptr MDynCG; /// Used for tracking visited status during cycle checks. bool MVisited = false; @@ -158,7 +156,7 @@ class node_impl : public std::enable_shared_from_this { : enable_shared_from_this(Other), MSuccessors(Other.MSuccessors), MPredecessors(Other.MPredecessors), MCGType(Other.MCGType), MNodeType(Other.MNodeType), MCommandGroup(Other.getCGCopy()), - MSubGraphImpl(Other.MSubGraphImpl), MDynCG(Other.MDynCG) {} + MSubGraphImpl(Other.MSubGraphImpl) {} /// Copy-assignment operator. This will perform a deep-copy of the /// command group object associated with this node. @@ -170,7 +168,6 @@ class node_impl : public std::enable_shared_from_this { MNodeType = Other.MNodeType; MCommandGroup = Other.getCGCopy(); MSubGraphImpl = Other.MSubGraphImpl; - MDynCG = Other.MDynCG; } return *this; } @@ -420,7 +417,7 @@ class node_impl : public std::enable_shared_from_this { throw sycl::exception(sycl::errc::invalid, "Cannot update execution range of a node with an " "execution range of different dimensions than what " - "the node was originall created with."); + "the node was original created with."); } NDRDesc = sycl::detail::NDRDescT{ExecutionRange}; @@ -441,7 +438,7 @@ class node_impl : public std::enable_shared_from_this { throw sycl::exception(sycl::errc::invalid, "Cannot update execution range of a node with an " "execution range of different dimensions than what " - "the node was originall created with."); + "the node was original created with."); } NDRDesc = sycl::detail::NDRDescT{ExecutionRange}; diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp index 2b5f378d8bed7..a5e5a1ea78b87 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor.cpp @@ -14,9 +14,7 @@ int main() { queue Queue{}; - const size_t N = 1024; - std::vector HostData(N, 0); - buffer Buf{HostData}; + buffer Buf{sycl::range<1>(Size)}; Buf.set_write_back(false); auto Acc = Buf.get_access(); @@ -28,13 +26,15 @@ int main() { int PatternA = 42; auto CGFA = [&](handler &CGH) { CGH.require(Acc); - CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Acc[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { CGH.require(Acc); - CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Acc[Item.get_id()] = PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -42,8 +42,9 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); + std::vector HostData(Size, 0); Queue.copy(Acc, HostData.data()).wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternA); } @@ -52,7 +53,7 @@ int main() { Queue.ext_oneapi_graph(ExecGraph).wait(); Queue.copy(Acc, HostData.data()).wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternB); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp index 5ce7a4bf40df1..7b477edacff98 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps.cpp @@ -15,10 +15,8 @@ int main() { queue Queue{}; - const size_t N = 1024; - int *Ptr = (int *)sycl::malloc_device(N, Queue); - std::vector HostData(N, 0); - buffer Buf{HostData}; + int *Ptr = (int *)sycl::malloc_device(Size, Queue); + buffer Buf{sycl::range<1>(Size)}; Buf.set_write_back(false); exp_ext::command_graph Graph{ @@ -28,19 +26,21 @@ int main() { auto RootNode = Graph.add([&](handler &CGH) { auto Acc = Buf.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] = 1; }); + CGH.parallel_for(Size, [=](item<1> Item) { Acc[Item.get_id()] = 1; }); }); int PatternA = 42; auto CGFA = [&](handler &CGH) { auto Acc = Buf.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Acc[Item.get_id()] += PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { auto Acc = Buf.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { Acc[Item.get_id()] += PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Acc[Item.get_id()] += PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -49,23 +49,28 @@ int main() { auto LeafNode = Graph.add([&](handler &CGH) { auto Acc = Buf.get_access(CGH); CGH.parallel_for( - N, [=](item<1> Item) { Ptr[Item.get_id()] = Acc[Item.get_id()]; }); + Size, [=](item<1> Item) { Ptr[Item.get_id()] = Acc[Item.get_id()]; }); }); auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - assert(HostData[i] == (PatternA + 1)); + + std::vector HostData(Size, 0); + Queue.copy(Ptr, HostData.data(), Size).wait(); + + int Ref = PatternA + 1; + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == Ref); } DynamicCG.set_active_cgf(1); ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - assert(HostData[i] == (PatternB + 1)); + Queue.copy(Ptr, HostData.data(), Size).wait(); + Ref = PatternB + 1; + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == Ref); } sycl::free(Ptr, Queue); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp index 8d50b8b26e0c2..a420d7deb58de 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_deps2.cpp @@ -16,11 +16,9 @@ int main() { queue Queue{}; - const size_t N = 1024; - int *Ptr = (int *)sycl::malloc_device(N, Queue); - std::vector HostData(N, 0); - buffer BufA{sycl::range<1>(N)}; - buffer BufB{sycl::range<1>(N)}; + int *Ptr = (int *)sycl::malloc_device(Size, Queue); + buffer BufA{sycl::range<1>(Size)}; + buffer BufB{sycl::range<1>(Size)}; BufA.set_write_back(false); BufB.set_write_back(false); @@ -34,7 +32,7 @@ int main() { auto RootNode = Graph.add([&](handler &CGH) { auto AccA = BufA.get_access(CGH); auto AccB = BufB.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { AccA[Item.get_id()] = InitA; AccB[Item.get_id()] = InitB; }); @@ -43,13 +41,15 @@ int main() { int PatternA = 42; auto CGFA = [&](handler &CGH) { auto AccA = BufA.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { AccA[Item.get_id()] += PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { AccA[Item.get_id()] += PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { auto AccB = BufB.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { AccB[Item.get_id()] += PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { AccB[Item.get_id()] += PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -58,15 +58,17 @@ int main() { auto LeafNode = Graph.add([&](handler &CGH) { auto AccA = BufA.get_access(CGH); auto AccB = BufB.get_access(CGH); - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { Ptr[Item.get_id()] = AccA[Item.get_id()] + AccB[Item.get_id()]; }); }); auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + + std::vector HostData(Size, 0); + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == (InitA + InitB + PatternA)); } @@ -74,9 +76,10 @@ int main() { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - assert(HostData[i] == (InitA + InitB + PatternB)); + Queue.copy(Ptr, HostData.data(), Size).wait(); + int Ref = InitA + InitB + PatternB; + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == Ref); } sycl::free(Ptr, Queue); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp index 80556f60fc75f..0eaa714463670 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_accessor_spv.cpp @@ -26,17 +26,13 @@ int main(int, char **argv) { kernel kernel = getKernel( KernelBundle, "_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_"); - const size_t N = 1024; - exp_ext::command_graph Graph{ Queue.get_context(), Queue.get_device(), {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; - std::vector HostDataA(N, 0); - std::vector HostDataB(N, 0); - buffer BufA{HostDataA}; - buffer BufB{HostDataB}; + buffer BufA{sycl::range<1>(Size)}; + buffer BufB{sycl::range<1>(Size)}; BufA.set_write_back(false); BufB.set_write_back(false); @@ -60,9 +56,12 @@ int main(int, char **argv) { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostDataA(Size, 0); + std::vector HostDataB(Size, 0); Queue.copy(BufA.get_access(), HostDataA.data()).wait(); Queue.copy(BufB.get_access(), HostDataB.data()).wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == i); assert(HostDataB[i] == 0); } @@ -71,9 +70,10 @@ int main(int, char **argv) { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.copy(BufA.get_access(), HostDataA.data()).wait(); Queue.copy(BufB.get_access(), HostDataB.data()).wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == i); assert(HostDataB[i] == i); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp index e1602864b44a0..795dc074d882e 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_different_arg_nums.cpp @@ -17,16 +17,15 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *Ptr = malloc_device(N, Queue); - std::vector HostData(N); + int *Ptr = malloc_device(Size, Queue); + std::vector HostData(Size); // 3 kernel arguments: Ptr, PatternA, PatternB int PatternA = 42; int PatternB = 0xA; auto CGFA = [&](handler &CGH) { CGH.parallel_for( - N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA + PatternB; }); + Size, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA + PatternB; }); }; // 2 kernel arguments: Ptr, MyPatternStruct @@ -36,20 +35,21 @@ int main() { }; PatternStruct MyPatternStruct{PatternA + 1, PatternB + 1}; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { Ptr[Item.get_id()] = MyPatternStruct.PatternA + MyPatternStruct.PatternB; }); }; // 1 kernel argument: Ptr auto CGFC = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = 42 - 0xA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = 42 - 0xA; }); }; // 4 kernel argument: Ptr int PatternC = -12; auto CGFD = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA + PatternB + PatternC; }); }; @@ -77,9 +77,9 @@ int main() { // Verify CGFA works with 3 arguments Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); + Queue.copy(Ptr, HostData.data(), Size).wait(); int Ref = PatternA + PatternB; - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == Ref); } @@ -96,9 +96,9 @@ int main() { DynamicCG.set_active_cgf(1); ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); + Queue.copy(Ptr, HostData.data(), Size).wait(); Ref = (PatternA + 1) + (PatternB + 1); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == Ref); } @@ -113,9 +113,9 @@ int main() { DynamicCG.set_active_cgf(2); ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); + Queue.copy(Ptr, HostData.data(), Size).wait(); Ref = PatternA - PatternB; - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == Ref); } @@ -136,9 +136,9 @@ int main() { DynamicCG.set_active_cgf(3); ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); + Queue.copy(Ptr, HostData.data(), Size).wait(); Ref = PatternA + PatternB + PatternC; - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == Ref); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp index 11e28a033a4c2..0964f6e0c354e 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_event_deps.cpp @@ -17,22 +17,21 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *PtrA = malloc_device(N, Queue); - int *PtrB = malloc_device(N, Queue); - int *PtrC = malloc_device(N, Queue); - std::vector HostData(N); + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); + std::vector HostData(Size); Graph.begin_recording(Queue); int PatternA = 42; - auto EventA = Queue.fill(PtrA, PatternA, N); + auto EventA = Queue.fill(PtrA, PatternA, Size); int PatternB = 0xA; - auto EventB = Queue.fill(PtrB, PatternB, N); + auto EventB = Queue.fill(PtrB, PatternB, Size); Graph.end_recording(Queue); auto CGFA = [&](handler &CGH) { CGH.depends_on({EventA, EventB}); - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { auto I = Item.get_id(); PtrC[I] = PtrA[I] * PtrB[I]; }); @@ -40,7 +39,7 @@ int main() { auto CGFB = [&](handler &CGH) { CGH.depends_on({EventA, EventB}); - CGH.parallel_for(N, [=](item<1> Item) { + CGH.parallel_for(Size, [=](item<1> Item) { auto I = Item.get_id(); PtrC[I] = PtrA[I] + PtrB[I]; }); @@ -51,8 +50,8 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrC, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(PtrC, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternA * PatternB); } @@ -60,8 +59,8 @@ int main() { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrC, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(PtrC, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternA + PatternB); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp index f4717210bb35e..237e9173f253e 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange.cpp @@ -17,23 +17,23 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *Ptr = malloc_device(N, Queue); - std::vector HostData(N); + int *Ptr = malloc_device(Size, Queue); + std::vector HostData(Size); auto RootNode = - Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, N * sizeof(int)); }); + Graph.add([&](handler &cgh) { cgh.memset(Ptr, 0, Size * sizeof(int)); }); int PatternA = 42; - sycl::range<1> RangeA{512}; + size_t ItemsA = Size / 2; + sycl::range<1> RangeA{ItemsA}; auto CGFA = [&](handler &CGH) { CGH.parallel_for(RangeA, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; - size_t UpdatedN = 256; - sycl::nd_range<1> RangeB{sycl::range{UpdatedN}, sycl::range{16}}; + size_t ItemsB = Size / 4; + sycl::nd_range<1> RangeB{sycl::range{ItemsB}, sycl::range{16}}; auto CGFB = [&](handler &CGH) { CGH.parallel_for( RangeB, [=](nd_item<1> Item) { Ptr[Item.get_global_id()] = PatternB; }); @@ -45,9 +45,9 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - if (i < RangeA.get(0)) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + if (i < ItemsA) { assert(HostData[i] == PatternA); } else { assert(HostData[i] == 0); @@ -58,9 +58,9 @@ int main() { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - if (i < UpdatedN) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + if (i < ItemsB) { assert(HostData[i] == PatternB); } else { assert(HostData[i] == 0); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp index f6390df64303a..261ac6ecf5c3b 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_ndrange_3D.cpp @@ -9,7 +9,7 @@ // XFAIL-TRACKER: OFNAAO-307 // Tests updating a dynamic command-group node where the dynamic command-groups -// have different ranges/nd-ranges +// have different range/nd-range dimensions #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp index 3ba2500cd6189..c34bd45f0f46e 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp @@ -16,25 +16,26 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - std::vector HostData(N); - int *Ptr = malloc_device(N, Queue); - Queue.memset(Ptr, 0, N * sizeof(int)).wait(); + std::vector HostData(Size); + int *Ptr = malloc_device(Size, Queue); + Queue.memset(Ptr, 0, Size * sizeof(int)).wait(); int PatternA = 42; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); auto DynamicCGNode = Graph.add(DynamicCG); - size_t NewRange = 512; + size_t NewRange = Size / 2; sycl::range<1> UpdateRange(NewRange); DynamicCGNode.update_range(UpdateRange); @@ -45,8 +46,8 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { if (i < NewRange) { assert(HostData[i] == PatternA); } else { diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp index 0c46672869c7d..7049b5bdde305 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_parameters.cpp @@ -17,25 +17,26 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *PtrA = malloc_device(N, Queue); - int *PtrB = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); - std::vector HostDataA(N); - std::vector HostDataB(N); + std::vector HostDataA(Size); + std::vector HostDataB(Size); - Queue.memset(PtrA, 0, N * sizeof(int)); - Queue.memset(PtrB, 0, N * sizeof(int)); + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); Queue.wait(); int PatternA = 0xA; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { PtrA[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { PtrA[Item.get_id()] = PatternA; }); }; int PatternB = 42; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { PtrB[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { PtrB[Item.get_id()] = PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -43,10 +44,10 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N); - Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); Queue.wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == PatternA); assert(HostDataB[i] == 0); } @@ -55,10 +56,10 @@ int main() { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N); - Queue.copy(PtrB, HostDataB.data(), N); + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); Queue.wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == PatternA); assert(HostDataB[i] == PatternB); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp index f9b0728d8ea67..1f98200791b6c 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_shared_nodes.cpp @@ -16,21 +16,22 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - std::vector HostData(N); - int *Ptr = malloc_device(N, Queue); + std::vector HostData(Size); + int *Ptr = malloc_device(Size, Queue); auto RootNode = - Graph.add([&](handler &CGH) { CGH.memset(Ptr, 0, N * sizeof(int)); }); + Graph.add([&](handler &CGH) { CGH.memset(Ptr, 0, Size * sizeof(int)); }); int PatternA = 42; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] += PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] += PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] += PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -39,20 +40,20 @@ int main() { auto Node2 = Graph.add( [&](handler &cgh) { - cgh.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] *= 2; }); + cgh.parallel_for(Size, [=](item<1> Item) { Ptr[Item.get_id()] *= 2; }); }, exp_ext::property::node::depends_on(Node1)); auto Node3 = Graph.add(DynamicCG, exp_ext::property::node::depends_on(Node2)); // This ND-Range affects Node 1 as well, as the range is tied to the node. - sycl::range<1> Node3Range(512); + sycl::range<1> Node3Range(Size / 2); Node3.update_range(Node3Range); auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { int Ref = (i < Node3Range.get(0)) ? (PatternA * 3) : 0; assert(HostData[i] == Ref); } @@ -62,9 +63,9 @@ int main() { ExecGraph.update(Node3); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - int Ref = (PatternB * 3); + Queue.copy(Ptr, HostData.data(), Size).wait(); + int Ref = (PatternB * 3); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == Ref); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp index a9109d000eb17..2038c94610dfc 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp @@ -17,18 +17,19 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *Ptr = malloc_device(N, Queue); - std::vector HostData(N); + int *Ptr = malloc_device(Size, Queue); + std::vector HostData(Size); int PatternA = 42; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -37,8 +38,8 @@ int main() { auto ExecGraph = Graph.finalize(); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternB); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp index 79db8ebe67c57..100701f7b62aa 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_usm.cpp @@ -16,18 +16,19 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *Ptr = malloc_device(N, Queue); - std::vector HostData(N); + int *Ptr = malloc_device(Size, Queue); + std::vector HostData(Size); int PatternA = 42; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); }; auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); @@ -35,8 +36,8 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternA); } @@ -44,8 +45,8 @@ int main() { ExecGraph.update(DynamicCGNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternB); } diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp index cb9bdf15f76b8..53b34d1add289 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_all_dyn_params.cpp @@ -16,14 +16,13 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *PtrA = malloc_device(N, Queue); - int *PtrB = malloc_device(N, Queue); - int *PtrC = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); - std::vector HostDataA(N); - std::vector HostDataB(N); - std::vector HostDataC(N); + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataC(Size); exp_ext::dynamic_parameter DynParam1(Graph, PtrA); exp_ext::dynamic_parameter DynParam2(Graph, PtrC); @@ -33,7 +32,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = i; } }); @@ -44,7 +43,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = i; } }); @@ -55,7 +54,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrC[i] = i; } }); @@ -69,7 +68,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular // kernels when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrC[i] += i; } }); @@ -79,19 +78,19 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { - Queue.memset(PtrA, 0, N * sizeof(int)); - Queue.memset(PtrB, 0, N * sizeof(int)); - Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); + Queue.memset(PtrC, 0, Size * sizeof(int)); Queue.wait(); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N); - Queue.copy(PtrB, HostDataB.data(), N); - Queue.copy(PtrC, HostDataC.data(), N); + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); + Queue.copy(PtrC, HostDataC.data(), Size); Queue.wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == (A ? i : 0)); assert(HostDataB[i] == (B ? i : 0)); assert(HostDataC[i] == (C ? (2 * i) : i)); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp index 15f815664a740..00482185ebc27 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_different_type_dyn_params.cpp @@ -17,14 +17,13 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *PtrA = malloc_device(N, Queue); - int *PtrB = malloc_device(N, Queue); - int *PtrC = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); - std::vector HostDataA(N); - std::vector HostDataB(N); - std::vector HostDataC(N); + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataC(Size); int ScalarValue = 17; exp_ext::dynamic_parameter DynParamScalar(Graph, ScalarValue); @@ -37,7 +36,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = ScalarValue; } }); @@ -49,7 +48,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = ScalarValue; } }); @@ -62,7 +61,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrC[i] = ScalarValue; } }); @@ -73,7 +72,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = ScalarValue; } }); @@ -86,19 +85,19 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); auto ExecuteGraphAndVerifyResults = [&](int A, int B, int C) { - Queue.memset(PtrA, 0, N * sizeof(int)); - Queue.memset(PtrB, 0, N * sizeof(int)); - Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); + Queue.memset(PtrC, 0, Size * sizeof(int)); Queue.wait(); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N); - Queue.copy(PtrB, HostDataB.data(), N); - Queue.copy(PtrC, HostDataC.data(), N); + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); + Queue.copy(PtrC, HostDataC.data(), Size); Queue.wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == A); assert(HostDataB[i] == B); assert(HostDataC[i] == C); diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp index 264c1b6849689..3213fc4eec2fe 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_some_dyn_params.cpp @@ -17,14 +17,13 @@ int main() { queue Queue{}; exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *PtrA = malloc_device(N, Queue); - int *PtrB = malloc_device(N, Queue); - int *PtrC = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); - std::vector HostDataA(N); - std::vector HostDataB(N); - std::vector HostDataC(N); + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataC(Size); exp_ext::dynamic_parameter DynParam(Graph, PtrA); @@ -33,7 +32,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = i; } }); @@ -44,7 +43,7 @@ int main() { // TODO: Use the free function kernel extension instead of regular kernels // when available. CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrA[i] = i; } }); @@ -52,7 +51,7 @@ int main() { auto CGFC = [&](handler &CGH) { CGH.single_task([=]() { - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { PtrC[i] = i; } }); @@ -63,19 +62,19 @@ int main() { auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, bool C) { - Queue.memset(PtrA, 0, N * sizeof(int)); - Queue.memset(PtrB, 0, N * sizeof(int)); - Queue.memset(PtrC, 0, N * sizeof(int)); + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); + Queue.memset(PtrC, 0, Size * sizeof(int)); Queue.wait(); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N); - Queue.copy(PtrB, HostDataB.data(), N); - Queue.copy(PtrC, HostDataC.data(), N); + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); + Queue.copy(PtrC, HostDataC.data(), Size); Queue.wait(); - for (size_t i = 0; i < N; i++) { + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == (A ? i : 0)); assert(HostDataB[i] == (B ? i : 0)); assert(HostDataC[i] == (C ? i : 0)); diff --git a/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp index 0f1c10e5142bf..43db9d172e618 100644 --- a/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp +++ b/sycl/test-e2e/Graph/Update/update_ndrange_to_range.cpp @@ -13,17 +13,15 @@ int main() { queue Queue{}; - const size_t N = 1024; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - int *PtrA = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); - std::vector HostDataA(N); + std::vector HostDataA(Size); - Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); - nd_range<1> NDRange{range{N}, range{32}}; + nd_range<1> NDRange{range{Size}, range{32}}; auto KernelNode = Graph.add([&](handler &cgh) { cgh.parallel_for(NDRange, [=](nd_item<1> Item) { @@ -37,19 +35,20 @@ int main() { // first half of PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == i); } // Update NDRange to target first half only - KernelNode.update_range(range<1>{512}); + size_t UpdateSize = Size / 2; + KernelNode.update_range(range<1>{UpdateSize}); ExecGraph.update(KernelNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == (i >= UpdateSize ? i : i * 2)); } return 0; } diff --git a/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp index 9489d20c6a916..94052c8379b58 100644 --- a/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp +++ b/sycl/test-e2e/Graph/Update/update_range_to_ndrange.cpp @@ -13,17 +13,15 @@ int main() { queue Queue{}; - const size_t N = 1024; - exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; - int *PtrA = malloc_device(N, Queue); + int *PtrA = malloc_device(Size, Queue); - std::vector HostDataA(N); + std::vector HostDataA(Size); - Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); - range<1> Range{1024}; + range<1> Range{Size}; auto KernelNode = Graph.add([&](handler &cgh) { cgh.parallel_for(Range, [=](item<1> Item) { @@ -37,20 +35,21 @@ int main() { // first half of PtrA should be filled with values Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostDataA[i] == i); } // Update NDRange to target first half only - nd_range<1> NDRange{range{512}, range{32}}; + size_t UpdateSize = Size / 2; + nd_range<1> NDRange{range{UpdateSize}, range{32}}; KernelNode.update_nd_range(NDRange); ExecGraph.update(KernelNode); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(PtrA, HostDataA.data(), N).wait(); - for (size_t i = 0; i < N; i++) { - assert(HostDataA[i] == (i >= 512 ? i : i * 2)); + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == (i >= UpdateSize ? i : i * 2)); } return 0; } diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp index 03a0e19f8c51e..0e8b87c0725f2 100644 --- a/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp +++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_cgf.cpp @@ -17,18 +17,19 @@ int main() { exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; - const size_t N = 1024; - int *Ptr = malloc_device(N, Queue); - std::vector HostData(N); + int *Ptr = malloc_device(Size, Queue); + std::vector HostData(Size); int PatternA = 42; auto CGFA = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternA; }); }; int PatternB = 0xA; auto CGFB = [&](handler &CGH) { - CGH.parallel_for(N, [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); + CGH.parallel_for(Size, + [=](item<1> Item) { Ptr[Item.get_id()] = PatternB; }); }; auto DynamicCGA = exp_ext::dynamic_command_group(GraphA, {CGFA, CGFB}); @@ -41,8 +42,8 @@ int main() { auto ExecGraph = GraphA.finalize(exp_ext::property::graph::updatable{}); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternA); } @@ -64,8 +65,8 @@ int main() { ExecGraph.update(GraphB); Queue.ext_oneapi_graph(ExecGraph).wait(); - Queue.copy(Ptr, HostData.data(), N).wait(); - for (size_t i = 0; i < N; i++) { + Queue.copy(Ptr, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { assert(HostData[i] == PatternB); } diff --git a/sycl/unittests/Extensions/CommandGraph/Update.cpp b/sycl/unittests/Extensions/CommandGraph/Update.cpp index 676e3bead1416..b149cc08ccc88 100644 --- a/sycl/unittests/Extensions/CommandGraph/Update.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Update.cpp @@ -134,6 +134,37 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) { })); } +TEST_F(CommandGraphTest, UpdateRangeErrors) { + // Test that the correct errors are throw when trying to update node ranges + nd_range<1> NDRange{range{128}, range{32}}; + range<1> Range{128}; + auto NodeNDRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(NDRange, [](nd_item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeNDRange.update_nd_range(NDRange)); + // OK to update an nd_range node with a range of the same dimension + EXPECT_NO_THROW(NodeNDRange.update_range(Range)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeNDRange.update_nd_range( + nd_range<2>{range<2>{128, 128}, range<2>{32, 32}})); + EXPECT_ANY_THROW(NodeNDRange.update_range(range<3>{32, 32, 1})); + + auto NodeRange = Graph.add([&](sycl::handler &cgh) { + cgh.parallel_for>(range<1>{128}, [](item<1>) {}); + }); + + // OK + EXPECT_NO_THROW(NodeRange.update_range(Range)); + // OK to update a range node with an nd_range of the same dimension + EXPECT_NO_THROW(NodeRange.update_nd_range(NDRange)); + // Can't update with a different number of dimensions + EXPECT_ANY_THROW(NodeRange.update_range(range<2>{128, 128})); + EXPECT_ANY_THROW(NodeRange.update_nd_range( + nd_range<3>{range<3>{8, 8, 8}, range<3>{8, 8, 8}})); +} + class WholeGraphUpdateTest : public CommandGraphTest { protected: From 3737c98de2feaed47629982f327f0f2e7cdc969c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 31 Oct 2024 09:54:10 +0000 Subject: [PATCH 4/4] Refactor raw arg update --- sycl/source/detail/graph_impl.cpp | 24 +++---------------- sycl/source/detail/graph_impl.hpp | 9 ++----- sycl/source/handler.cpp | 11 +++++++-- .../Graph/Update/dyn_cgf_overwrite_range.cpp | 3 --- .../Update/dyn_cgf_update_before_finalize.cpp | 3 --- 5 files changed, 14 insertions(+), 36 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 0c9394045acfa..861ec2a883601 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -405,7 +405,7 @@ std::set> graph_impl::getCGEdges( } } - return std::move(UniqueDeps); + return UniqueDeps; } void graph_impl::markCGMemObjs( @@ -1563,7 +1563,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { } } - UpdateDesc.hNewKernel = nullptr; + UpdateDesc.hNewKernel = UrKernel; UpdateDesc.numNewMemObjArgs = MemobjDescs.size(); UpdateDesc.pNewMemObjArgList = MemobjDescs.data(); UpdateDesc.numNewPointerArgs = PtrDescs.size(); @@ -1852,24 +1852,7 @@ void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue, size_t RawArgSize = NewRawValue->MArgSize; const void *RawArgData = NewRawValue->MArgData; - for (auto &[NodeWeak, ArgIndex] : MNodes) { - auto NodeShared = NodeWeak.lock(); - if (NodeShared) { - dynamic_parameter_impl::updateCGArgValue( - NodeShared->MCommandGroup, ArgIndex, RawArgData, RawArgSize); - } - } - - for (auto &DynCGInfo : MDynCGs) { - auto DynCG = DynCGInfo.DynCG.lock(); - if (DynCG) { - auto &CG = DynCG->MKernels[DynCGInfo.CGIndex]; - dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, - RawArgData, RawArgSize); - } - } - - std::memcpy(MValueStorage.data(), RawArgData, RawArgSize); + updateValue(RawArgData, RawArgSize); } void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) { @@ -1987,7 +1970,6 @@ dynamic_command_group_impl::dynamic_command_group_impl( void dynamic_command_group_impl::finalizeCGFList( const std::vector> &CGFList) { - // True if kernels use sycl::nd_range, and false if using sycl::range for (size_t CGFIndex = 0; CGFIndex < CGFList.size(); CGFIndex++) { const auto &CGF = CGFList[CGFIndex]; // Handler defined inside the loop so it doesn't appear to the runtime diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 11b432f208ea1..6144e3f51b9da 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -417,7 +417,7 @@ class node_impl : public std::enable_shared_from_this { throw sycl::exception(sycl::errc::invalid, "Cannot update execution range of a node with an " "execution range of different dimensions than what " - "the node was original created with."); + "the node was originally created with."); } NDRDesc = sycl::detail::NDRDescT{ExecutionRange}; @@ -438,7 +438,7 @@ class node_impl : public std::enable_shared_from_this { throw sycl::exception(sycl::errc::invalid, "Cannot update execution range of a node with an " "execution range of different dimensions than what " - "the node was original created with."); + "the node was originally created with."); } NDRDesc = sycl::detail::NDRDescT{ExecutionRange}; @@ -1173,11 +1173,6 @@ class graph_impl : public std::enable_shared_from_this { /// @param Deps List of dependent nodes void addDepsToNode(std::shared_ptr Node, std::vector> &Deps) { - // Remove empty shared pointers from the list - auto EmptyElementIter = - std::remove(Deps.begin(), Deps.end(), std::shared_ptr()); - Deps.erase(EmptyElementIter, Deps.end()); - if (!Deps.empty()) { for (auto &N : Deps) { N->registerSuccessor(Node); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index adb1e2ee50796..a7ac73f9e4c34 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -556,7 +556,10 @@ event handler::finalize() { // node can set it as a predecessor. auto DependentNode = GraphImpl->getLastInorderNode(MQueue); std::vector> - Deps = {DependentNode}; + Deps; + if (DependentNode) { + Deps.push_back(DependentNode); + } NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); // If we are recording an in-order queue remember the new node, so it @@ -566,7 +569,11 @@ event handler::finalize() { } else { auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue); std::vector> - Deps = {LastBarrierRecordedFromQueue}; + Deps; + + if (LastBarrierRecordedFromQueue) { + Deps.push_back(LastBarrierRecordedFromQueue); + } NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) { diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp index c34bd45f0f46e..04697077bec36 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_overwrite_range.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests how the nd-range of a node is overwritten by the active command-group #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp index 2038c94610dfc..8c0c705960ef6 100644 --- a/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_update_before_finalize.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-307 - // Tests updating a dynamic command-group node after it has been added to // a graph but before the graph has been finalized