Skip to content

[SYCL][Graph] Optimize graph enqueue for in-order queues #18792

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7a5c5fd
Optimization enqueue work in progress
fabiomestre May 26, 2025
0241111
Fix Unit test failure
fabiomestre Jun 4, 2025
d3445ef
Fix command-buffer dependencies on the legacy adapter when immediate …
fabiomestre Jun 5, 2025
dd8f6d1
Fix data race in multiple_exec_graphs test
fabiomestre Jun 6, 2025
240c952
Let L0 event implementation handler dependencies for in-order queue
fabiomestre Jun 6, 2025
d70fc37
Wait for command-buffer execution before destroying
fabiomestre Jun 6, 2025
109bc20
Don't rely on default context being the same for ext_oneapi_enqueue_f…
fabiomestre Jun 6, 2025
91c84a7
Remove commented code
fabiomestre Jun 6, 2025
da3720a
Revert changes to graph_common
fabiomestre Jun 6, 2025
4894aca
Try to remove extra sync in V2 adapter
fabiomestre Jun 6, 2025
76e58f8
Add unit-tests for eventless path
fabiomestre Jun 10, 2025
1582ab7
Address review comments
fabiomestre Jun 10, 2025
74d22e5
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 10, 2025
597dd41
Update new functions to not use shared_ptr argument for queue
fabiomestre Jun 10, 2025
9343cf9
Address review comments
fabiomestre Jun 11, 2025
c4cafcf
Fix typo
fabiomestre Jun 11, 2025
a9e3270
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 12, 2025
97fa20e
Revert opencl adapter changes
fabiomestre Jun 12, 2025
a25ad87
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 16, 2025
71cc56f
Workaround HIP limitations
fabiomestre Jun 16, 2025
03fe4b6
Update comment for new hip variable
fabiomestre Jun 17, 2025
92cf34f
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 17, 2025
758cecf
[HIP] Enqueue event wait instead of waiting on the host
fabiomestre Jun 17, 2025
fc2ad25
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 17, 2025
a034bb8
Fix build failures after rebase
fabiomestre Jun 17, 2025
d31495e
Merge remote-tracking branch 'origin/sycl' into fabio/eventless_graph…
fabiomestre Jun 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
471 changes: 318 additions & 153 deletions sycl/source/detail/graph_impl.cpp

Large diffs are not rendered by default.

113 changes: 98 additions & 15 deletions sycl/source/detail/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -771,16 +771,28 @@ class partition {
std::unordered_map<sycl::device, ur_exp_command_buffer_handle_t>
MCommandBuffers;
/// List of predecessors to this partition.
std::vector<std::shared_ptr<partition>> MPredecessors;
std::vector<partition *> MPredecessors;

/// List of successors to this partition.
std::vector<partition *> MSuccessors;

/// List of requirements for this partition.
std::vector<sycl::detail::AccessorImplHost *> MRequirements;

/// Storage for accessors which are used by this partition.
std::vector<AccessorImplPtr> MAccessors;

/// True if the graph of this partition is a single path graph
/// and in-order optmization can be applied on it.
bool MIsInOrderGraph = false;

/// @return True if the partition contains a host task
bool isHostTask() const {
return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
sycl::detail::CGType::CodeplayHostTask));
}
/// True if this partition contains only one node which is a host_task.
bool MIsHostTask = false;

// Submission event for the partition. Used during enqueue to define
// dependencies between this partition and its successors. This event is
// replaced every time the partition is executed.
EventImplPtr MEvent;

/// Checks if the graph is single path, i.e. each node has a single successor.
/// @return True if the graph is a single path
Expand Down Expand Up @@ -1330,9 +1342,17 @@ class exec_graph_impl {
/// execution.
/// @param Queue Command-queue to schedule execution on.
/// @param CGData Command-group data provided by the sycl::handler
/// @return Event associated with the execution of the graph.
sycl::event enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData);
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return Returns an event if EventNeeded is true. Returns nullptr
/// otherwise.
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData,
bool EventNeeded);

/// Iterates through all the nodes in the graph to build the list of
/// accessor requirements for the whole graph and for each partition.
void buildRequirements();

/// Turns the internal graph representation into UR command-buffers for a
/// device.
Expand Down Expand Up @@ -1366,13 +1386,17 @@ class exec_graph_impl {
return MPartitions;
}

/// Query whether the graph contains any host-task nodes.
/// @return True if the graph contains any host-task nodes. False otherwise.
bool containsHostTask() const { return MContainsHostTask; }

/// Checks if the previous submissions of this graph have been completed
/// This function checks the status of events associated to the previous graph
/// submissions.
/// @return true if all previous submissions have been completed, false
/// otherwise.
bool previousSubmissionCompleted() const {
for (auto Event : MExecutionEvents) {
for (auto Event : MSchedulerDependencies) {
if (!Event->isCompleted()) {
return false;
}
Expand Down Expand Up @@ -1447,6 +1471,65 @@ class exec_graph_impl {
ur_exp_command_buffer_handle_t CommandBuffer,
std::shared_ptr<node_impl> Node);

/// Enqueues a host-task partition (i.e. a partition that contains only a
/// single node and that node is a host-task).
/// @param Partition The partition to enqueue.
/// @param Queue Command-queue to schedule execution on.
/// @param CGData Command-group data used for initializing the host-task
/// command-group.
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return If EventNeeded is true returns the event resulting from enqueueing
/// the host-task through the scheduler. Returns nullptr otherwise.
EventImplPtr enqueueHostTaskPartition(
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);

/// Enqueues a graph partition that contains no host-tasks using the
/// scheduler.
/// @param Partition The partition to enqueue.
/// @param Queue Command-queue to schedule execution on.
/// @param CGData Command-group data used for initializing the command-buffer
/// command-group.
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return If EventNeeded is true returns the event resulting from enqueueing
/// the command-buffer through the scheduler. Returns nullptr otherwise.
EventImplPtr enqueuePartitionWithScheduler(
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);

/// Enqueues a graph partition that contains no host-tasks by directly calling
/// the unified-runtime API (i.e. avoids scheduler overhead).
/// @param Partition The partition to enqueue.
/// @param Queue Command-queue to schedule execution on.
/// @param WaitEvents List of events to wait on. All the events on this list
/// must be safe for scheduler bypass. Only events containing a valid UR event
/// handle will be waited for.
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return If EventNeeded is true returns the event resulting from enqueueing
/// the command-buffer. Returns nullptr otherwise.
EventImplPtr enqueuePartitionDirectly(
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
std::vector<detail::EventImplPtr> &WaitEvents, bool EventNeeded);

/// Enqueues all the partitions in a graph.
/// @param Queue Command-queue to schedule execution on.
/// @param CGData Command-group data that contains the dependencies and
/// accessor requirements needed to enqueue this graph.
/// @param IsCGDataSafeForSchedulerBypass Whether CGData contains any events
/// that require enqueuing through the scheduler (e.g. requirements or
/// host-task events).
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return If EventNeeded is true returns the event resulting from enqueueing
/// the command-buffer. Returns nullptr otherwise.
EventImplPtr enqueuePartitions(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper &CGData,
bool IsCGDataSafeForSchedulerBypass,
bool EventNeeded);

/// Iterates back through predecessors to find the real dependency.
/// @param[out] Deps Found dependencies.
/// @param[in] CurrentNode Node to find dependencies for.
Expand Down Expand Up @@ -1541,11 +1624,9 @@ class exec_graph_impl {
/// List of requirements for enqueueing this command graph, accumulated from
/// all nodes enqueued to the graph.
std::vector<sycl::detail::AccessorImplHost *> MRequirements;
/// Storage for accessors which are used by this graph, accumulated from
/// all nodes enqueued to the graph.
std::vector<sycl::detail::AccessorImplPtr> MAccessors;
/// List of all execution events returned from command buffer enqueue calls.
std::vector<sycl::detail::EventImplPtr> MExecutionEvents;
/// List of dependencies that enqueue or update commands need to wait on
/// when using the scheduler path.
std::vector<sycl::detail::EventImplPtr> MSchedulerDependencies;
/// List of the partitions that compose the exec graph.
std::vector<std::shared_ptr<partition>> MPartitions;
/// Storage for copies of nodes from the original modifiable graph.
Expand All @@ -1554,6 +1635,8 @@ class exec_graph_impl {
std::unordered_map<std::shared_ptr<node_impl>,
ur_exp_command_buffer_command_handle_t>
MCommandMap;
/// List of partition without any predecessors in this exec graph.
std::vector<std::weak_ptr<partition>> MRootPartitions;
/// True if this graph can be updated (set with property::updatable)
bool MIsUpdatable;
/// If true, the graph profiling is enabled.
Expand Down
13 changes: 5 additions & 8 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,15 +336,12 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,

HandlerImpl->MEventMode = SubmitInfo.EventMode();

auto isHostTask = Type == CGType::CodeplayHostTask;

// TODO: this shouldn't be needed but without this
// the legacy adapter doesn't synchronize the operations properly
// when non-immediate command lists are used.
auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
auto isHostTask = Type == CGType::CodeplayHostTask ||
(Type == CGType::ExecCommandBuffer &&
HandlerImpl->MExecGraph->containsHostTask());

auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
auto noLastEventPath = !isHostTask && !isGraphSubmission &&
auto noLastEventPath = !isHostTask &&
MNoLastEventMode.load(std::memory_order_acquire) &&
!requiresPostProcess;

Expand All @@ -363,7 +360,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
} else {
std::unique_lock<std::mutex> Lock(MMutex);

if (!isGraphSubmission && trySwitchingToNoEventsMode()) {
if (trySwitchingToNoEventsMode()) {
EventImpl = finalizeHandlerInOrderNoEventsUnlocked(Handler);
} else {
EventImpl = finalizeHandlerInOrderWithDepsUnlocked(Handler);
Expand Down
13 changes: 5 additions & 8 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
detail::EventImplPtr
finalizeHandlerInOrderHostTaskUnlocked(HandlerType &Handler) {
assert(isInOrder());
assert(Handler.getType() == CGType::CodeplayHostTask);
assert(Handler.getType() == CGType::CodeplayHostTask ||
(Handler.getType() == CGType::ExecCommandBuffer &&
getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));

auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
Expand Down Expand Up @@ -791,13 +793,8 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
finalizeHandlerInOrderWithDepsUnlocked(HandlerType &Handler) {
// this is handled by finalizeHandlerInOrderHostTask
assert(Handler.getType() != CGType::CodeplayHostTask);

if (Handler.getType() == CGType::ExecCommandBuffer && MNoLastEventMode) {
// TODO: this shouldn't be needed but without this
// the legacy adapter doesn't synchronize the operations properly
// when non-immediate command lists are used.
Handler.depends_on(insertHelperBarrier(Handler));
}
assert(!(Handler.getType() == CGType::ExecCommandBuffer &&
getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));

auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
: MExtGraphDeps.LastEventPtr;
Expand Down
17 changes: 11 additions & 6 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -798,15 +798,20 @@ event handler::finalize() {
nullptr, impl->MExecGraph, std::move(impl->CGData)));

} else {
event GraphCompletionEvent =
impl->MExecGraph->enqueue(impl->get_queue(), std::move(impl->CGData));

detail::queue_impl &Queue = impl->get_queue();
bool DiscardEvent = !impl->MEventNeeded &&
Queue.supportsDiscardingPiEvents() &&
!impl->MExecGraph->containsHostTask();
detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue(
Queue, std::move(impl->CGData), !DiscardEvent);
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
MLastEvent = getSyclObjImpl(GraphCompletionEvent);
return GraphCompletionEvent;
#else
MLastEvent = GraphCompletionEvent;
return sycl::detail::createSyclObjFromImpl<sycl::event>(
GraphCompletionEvent
? GraphCompletionEvent
: sycl::detail::event_impl::create_discarded_event());
#endif
return MLastEvent;
}
} break;
case detail::CGType::CopyImage:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

// Tests the enqueue free function kernel shortcuts.

#include "../graph_common.hpp"
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
#include <sycl/properties/all_properties.hpp>

int main() {
device Device{};
context Context{Device};

queue InOrderQueue{Context, Device, property::queue::in_order{}};
queue OtherQueue{Context, Device, property::queue::in_order{}};

using T = int;

T *PtrA = malloc_device<T>(Size, InOrderQueue);
T *PtrB = malloc_device<T>(Size, InOrderQueue);
T *PtrC = malloc_device<T>(Size, InOrderQueue);

exp_ext::command_graph Graph{InOrderQueue};
Graph.begin_recording(InOrderQueue);

T Pattern = 42;
exp_ext::fill(InOrderQueue, PtrA, Pattern, Size);

exp_ext::single_task(InOrderQueue, [=]() {
for (size_t i = 0; i < Size; ++i) {
PtrB[i] = i;
}
});

exp_ext::parallel_for(
InOrderQueue, sycl::range<1>{Size},
[=](sycl::item<1> Item) { PtrC[Item] += PtrA[Item] * PtrB[Item]; });

std::vector<T> Output(Size);
exp_ext::copy(InOrderQueue, PtrC, Output.data(), Size);

Graph.end_recording();

auto GraphExec = Graph.finalize();

const size_t MemsetValue = 12;
sycl::event Event =
exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) {
exp_ext::single_task(CGH, [=]() {
for (size_t I = 0; I < Size; ++I)
PtrC[I] = MemsetValue;
});
});

exp_ext::submit(InOrderQueue, [&](sycl::handler &CGH) {
CGH.depends_on(Event);
exp_ext::execute_graph(CGH, GraphExec);
});

InOrderQueue.wait_and_throw();

free(PtrA, InOrderQueue);
free(PtrB, InOrderQueue);
free(PtrC, InOrderQueue);

for (size_t i = 0; i < Size; i++) {
T Ref = Pattern * i + MemsetValue;
assert(check_value(i, Ref, Output[i], "Output"));
}

return 0;
}
10 changes: 5 additions & 5 deletions sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,11 +417,11 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) {
auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
auto PartitionsList = GraphExecImpl->getPartitions();
ASSERT_EQ(PartitionsList.size(), 5ul);
ASSERT_FALSE(PartitionsList[0]->isHostTask());
ASSERT_TRUE(PartitionsList[1]->isHostTask());
ASSERT_FALSE(PartitionsList[2]->isHostTask());
ASSERT_TRUE(PartitionsList[3]->isHostTask());
ASSERT_FALSE(PartitionsList[4]->isHostTask());
ASSERT_FALSE(PartitionsList[0]->MIsHostTask);
ASSERT_TRUE(PartitionsList[1]->MIsHostTask);
ASSERT_FALSE(PartitionsList[2]->MIsHostTask);
ASSERT_TRUE(PartitionsList[3]->MIsHostTask);
ASSERT_FALSE(PartitionsList[4]->MIsHostTask);
}

TEST_F(CommandGraphTest, GetNodeFromEvent) {
Expand Down
Loading
Loading