Skip to content

Commit b643b8b

Browse files
authored
[SYCL][Graph] Optimize graph enqueue for in-order queues (#18792)
Optimizes the `enqueue()` function of sycl graphs to bypass the scheduler whenever possible and avoid creating events when not needed. * Refactors the executable graph `enqueue()` to have different paths depending on workload: * The direct path will be used when there are no host-tasks or accessor requirements in the graph and the execution dependencies are considered safe to bypass the scheduler. * The scheduler path will be used when there are requirements in the graph but no host-tasks or, if the execution dependencies require using the scheduler. * The multiple partitions path will be used when the graph contains `host-tasks` which requires scheduling multiple graph partitions. The implementation was also changed to avoid adding unnecessary event dependencies to partition executions and avoiding copying `CGData` when possible. * Extends the changes in #18277 to sycl graphs. This means that no implicit events will be created when using in-order queues and graphs without `host-tasks`. Also updates the handler to only request events from the graph `enqueue()` when they are needed.
1 parent 56d06a6 commit b643b8b

File tree

12 files changed

+725
-231
lines changed

12 files changed

+725
-231
lines changed

sycl/source/detail/graph_impl.cpp

Lines changed: 318 additions & 153 deletions
Large diffs are not rendered by default.

sycl/source/detail/graph_impl.hpp

Lines changed: 98 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -771,16 +771,28 @@ class partition {
771771
std::unordered_map<sycl::device, ur_exp_command_buffer_handle_t>
772772
MCommandBuffers;
773773
/// List of predecessors to this partition.
774-
std::vector<std::shared_ptr<partition>> MPredecessors;
774+
std::vector<partition *> MPredecessors;
775+
776+
/// List of successors to this partition.
777+
std::vector<partition *> MSuccessors;
778+
779+
/// List of requirements for this partition.
780+
std::vector<sycl::detail::AccessorImplHost *> MRequirements;
781+
782+
/// Storage for accessors which are used by this partition.
783+
std::vector<AccessorImplPtr> MAccessors;
784+
775785
/// True if the graph of this partition is a single path graph
776786
/// and in-order optmization can be applied on it.
777787
bool MIsInOrderGraph = false;
778788

779-
/// @return True if the partition contains a host task
780-
bool isHostTask() const {
781-
return (MRoots.size() && ((*MRoots.begin()).lock()->MCGType ==
782-
sycl::detail::CGType::CodeplayHostTask));
783-
}
789+
/// True if this partition contains only one node which is a host_task.
790+
bool MIsHostTask = false;
791+
792+
// Submission event for the partition. Used during enqueue to define
793+
// dependencies between this partition and its successors. This event is
794+
// replaced every time the partition is executed.
795+
EventImplPtr MEvent;
784796

785797
/// Checks if the graph is single path, i.e. each node has a single successor.
786798
/// @return True if the graph is a single path
@@ -1330,9 +1342,17 @@ class exec_graph_impl {
13301342
/// execution.
13311343
/// @param Queue Command-queue to schedule execution on.
13321344
/// @param CGData Command-group data provided by the sycl::handler
1333-
/// @return Event associated with the execution of the graph.
1334-
sycl::event enqueue(sycl::detail::queue_impl &Queue,
1335-
sycl::detail::CG::StorageInitHelper CGData);
1345+
/// @param EventNeeded Whether an event signalling the completion of this
1346+
/// operation needs to be returned.
1347+
/// @return Returns an event if EventNeeded is true. Returns nullptr
1348+
/// otherwise.
1349+
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
1350+
sycl::detail::CG::StorageInitHelper CGData,
1351+
bool EventNeeded);
1352+
1353+
/// Iterates through all the nodes in the graph to build the list of
1354+
/// accessor requirements for the whole graph and for each partition.
1355+
void buildRequirements();
13361356

13371357
/// Turns the internal graph representation into UR command-buffers for a
13381358
/// device.
@@ -1366,13 +1386,17 @@ class exec_graph_impl {
13661386
return MPartitions;
13671387
}
13681388

1389+
/// Query whether the graph contains any host-task nodes.
1390+
/// @return True if the graph contains any host-task nodes. False otherwise.
1391+
bool containsHostTask() const { return MContainsHostTask; }
1392+
13691393
/// Checks if the previous submissions of this graph have been completed
13701394
/// This function checks the status of events associated to the previous graph
13711395
/// submissions.
13721396
/// @return true if all previous submissions have been completed, false
13731397
/// otherwise.
13741398
bool previousSubmissionCompleted() const {
1375-
for (auto Event : MExecutionEvents) {
1399+
for (auto Event : MSchedulerDependencies) {
13761400
if (!Event->isCompleted()) {
13771401
return false;
13781402
}
@@ -1447,6 +1471,65 @@ class exec_graph_impl {
14471471
ur_exp_command_buffer_handle_t CommandBuffer,
14481472
std::shared_ptr<node_impl> Node);
14491473

1474+
/// Enqueues a host-task partition (i.e. a partition that contains only a
1475+
/// single node and that node is a host-task).
1476+
/// @param Partition The partition to enqueue.
1477+
/// @param Queue Command-queue to schedule execution on.
1478+
/// @param CGData Command-group data used for initializing the host-task
1479+
/// command-group.
1480+
/// @param EventNeeded Whether an event signalling the completion of this
1481+
/// operation needs to be returned.
1482+
/// @return If EventNeeded is true returns the event resulting from enqueueing
1483+
/// the host-task through the scheduler. Returns nullptr otherwise.
1484+
EventImplPtr enqueueHostTaskPartition(
1485+
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
1486+
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
1487+
1488+
/// Enqueues a graph partition that contains no host-tasks using the
1489+
/// scheduler.
1490+
/// @param Partition The partition to enqueue.
1491+
/// @param Queue Command-queue to schedule execution on.
1492+
/// @param CGData Command-group data used for initializing the command-buffer
1493+
/// command-group.
1494+
/// @param EventNeeded Whether an event signalling the completion of this
1495+
/// operation needs to be returned.
1496+
/// @return If EventNeeded is true returns the event resulting from enqueueing
1497+
/// the command-buffer through the scheduler. Returns nullptr otherwise.
1498+
EventImplPtr enqueuePartitionWithScheduler(
1499+
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
1500+
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
1501+
1502+
/// Enqueues a graph partition that contains no host-tasks by directly calling
1503+
/// the unified-runtime API (i.e. avoids scheduler overhead).
1504+
/// @param Partition The partition to enqueue.
1505+
/// @param Queue Command-queue to schedule execution on.
1506+
/// @param WaitEvents List of events to wait on. All the events on this list
1507+
/// must be safe for scheduler bypass. Only events containing a valid UR event
1508+
/// handle will be waited for.
1509+
/// @param EventNeeded Whether an event signalling the completion of this
1510+
/// operation needs to be returned.
1511+
/// @return If EventNeeded is true returns the event resulting from enqueueing
1512+
/// the command-buffer. Returns nullptr otherwise.
1513+
EventImplPtr enqueuePartitionDirectly(
1514+
std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
1515+
std::vector<detail::EventImplPtr> &WaitEvents, bool EventNeeded);
1516+
1517+
/// Enqueues all the partitions in a graph.
1518+
/// @param Queue Command-queue to schedule execution on.
1519+
/// @param CGData Command-group data that contains the dependencies and
1520+
/// accessor requirements needed to enqueue this graph.
1521+
/// @param IsCGDataSafeForSchedulerBypass Whether CGData contains any events
1522+
/// that require enqueuing through the scheduler (e.g. requirements or
1523+
/// host-task events).
1524+
/// @param EventNeeded Whether an event signalling the completion of this
1525+
/// operation needs to be returned.
1526+
/// @return If EventNeeded is true returns the event resulting from enqueueing
1527+
/// the command-buffer. Returns nullptr otherwise.
1528+
EventImplPtr enqueuePartitions(sycl::detail::queue_impl &Queue,
1529+
sycl::detail::CG::StorageInitHelper &CGData,
1530+
bool IsCGDataSafeForSchedulerBypass,
1531+
bool EventNeeded);
1532+
14501533
/// Iterates back through predecessors to find the real dependency.
14511534
/// @param[out] Deps Found dependencies.
14521535
/// @param[in] CurrentNode Node to find dependencies for.
@@ -1541,11 +1624,9 @@ class exec_graph_impl {
15411624
/// List of requirements for enqueueing this command graph, accumulated from
15421625
/// all nodes enqueued to the graph.
15431626
std::vector<sycl::detail::AccessorImplHost *> MRequirements;
1544-
/// Storage for accessors which are used by this graph, accumulated from
1545-
/// all nodes enqueued to the graph.
1546-
std::vector<sycl::detail::AccessorImplPtr> MAccessors;
1547-
/// List of all execution events returned from command buffer enqueue calls.
1548-
std::vector<sycl::detail::EventImplPtr> MExecutionEvents;
1627+
/// List of dependencies that enqueue or update commands need to wait on
1628+
/// when using the scheduler path.
1629+
std::vector<sycl::detail::EventImplPtr> MSchedulerDependencies;
15491630
/// List of the partitions that compose the exec graph.
15501631
std::vector<std::shared_ptr<partition>> MPartitions;
15511632
/// Storage for copies of nodes from the original modifiable graph.
@@ -1554,6 +1635,8 @@ class exec_graph_impl {
15541635
std::unordered_map<std::shared_ptr<node_impl>,
15551636
ur_exp_command_buffer_command_handle_t>
15561637
MCommandMap;
1638+
/// List of partition without any predecessors in this exec graph.
1639+
std::vector<std::weak_ptr<partition>> MRootPartitions;
15571640
/// True if this graph can be updated (set with property::updatable)
15581641
bool MIsUpdatable;
15591642
/// If true, the graph profiling is enabled.

sycl/source/detail/queue_impl.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -336,15 +336,12 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
336336

337337
HandlerImpl->MEventMode = SubmitInfo.EventMode();
338338

339-
auto isHostTask = Type == CGType::CodeplayHostTask;
340-
341-
// TODO: this shouldn't be needed but without this
342-
// the legacy adapter doesn't synchronize the operations properly
343-
// when non-immediate command lists are used.
344-
auto isGraphSubmission = Type == CGType::ExecCommandBuffer;
339+
auto isHostTask = Type == CGType::CodeplayHostTask ||
340+
(Type == CGType::ExecCommandBuffer &&
341+
HandlerImpl->MExecGraph->containsHostTask());
345342

346343
auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
347-
auto noLastEventPath = !isHostTask && !isGraphSubmission &&
344+
auto noLastEventPath = !isHostTask &&
348345
MNoLastEventMode.load(std::memory_order_acquire) &&
349346
!requiresPostProcess;
350347

@@ -369,7 +366,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
369366
} else {
370367
std::unique_lock<std::mutex> Lock(MMutex);
371368

372-
if (!isGraphSubmission && trySwitchingToNoEventsMode()) {
369+
if (trySwitchingToNoEventsMode()) {
373370
EventImpl = finalizeHandlerInOrderNoEventsUnlocked(Handler);
374371
} else {
375372
EventImpl = finalizeHandlerInOrderWithDepsUnlocked(Handler);

sycl/source/detail/queue_impl.hpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
764764
detail::EventImplPtr
765765
finalizeHandlerInOrderHostTaskUnlocked(HandlerType &Handler) {
766766
assert(isInOrder());
767-
assert(Handler.getType() == CGType::CodeplayHostTask);
767+
assert(Handler.getType() == CGType::CodeplayHostTask ||
768+
(Handler.getType() == CGType::ExecCommandBuffer &&
769+
getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
768770

769771
auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
770772
: MExtGraphDeps.LastEventPtr;
@@ -798,13 +800,8 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
798800
finalizeHandlerInOrderWithDepsUnlocked(HandlerType &Handler) {
799801
// this is handled by finalizeHandlerInOrderHostTask
800802
assert(Handler.getType() != CGType::CodeplayHostTask);
801-
802-
if (Handler.getType() == CGType::ExecCommandBuffer && MNoLastEventMode) {
803-
// TODO: this shouldn't be needed but without this
804-
// the legacy adapter doesn't synchronize the operations properly
805-
// when non-immediate command lists are used.
806-
Handler.depends_on(insertHelperBarrier(Handler));
807-
}
803+
assert(!(Handler.getType() == CGType::ExecCommandBuffer &&
804+
getSyclObjImpl(Handler)->MExecGraph->containsHostTask()));
808805

809806
auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
810807
: MExtGraphDeps.LastEventPtr;

sycl/source/handler.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -798,15 +798,20 @@ event handler::finalize() {
798798
nullptr, impl->MExecGraph, std::move(impl->CGData)));
799799

800800
} else {
801-
event GraphCompletionEvent =
802-
impl->MExecGraph->enqueue(impl->get_queue(), std::move(impl->CGData));
803-
801+
detail::queue_impl &Queue = impl->get_queue();
802+
bool DiscardEvent = !impl->MEventNeeded &&
803+
Queue.supportsDiscardingPiEvents() &&
804+
!impl->MExecGraph->containsHostTask();
805+
detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue(
806+
Queue, std::move(impl->CGData), !DiscardEvent);
804807
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
805-
MLastEvent = getSyclObjImpl(GraphCompletionEvent);
808+
return GraphCompletionEvent;
806809
#else
807-
MLastEvent = GraphCompletionEvent;
810+
return sycl::detail::createSyclObjFromImpl<sycl::event>(
811+
GraphCompletionEvent
812+
? GraphCompletionEvent
813+
: sycl::detail::event_impl::create_discarded_event());
808814
#endif
809-
return MLastEvent;
810815
}
811816
} break;
812817
case detail::CGType::CopyImage:
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// Tests the enqueue free function kernel shortcuts.
9+
10+
#include "../graph_common.hpp"
11+
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
12+
#include <sycl/properties/all_properties.hpp>
13+
14+
int main() {
15+
device Device{};
16+
context Context{Device};
17+
18+
queue InOrderQueue{Context, Device, property::queue::in_order{}};
19+
queue OtherQueue{Context, Device, property::queue::in_order{}};
20+
21+
using T = int;
22+
23+
T *PtrA = malloc_device<T>(Size, InOrderQueue);
24+
T *PtrB = malloc_device<T>(Size, InOrderQueue);
25+
T *PtrC = malloc_device<T>(Size, InOrderQueue);
26+
27+
exp_ext::command_graph Graph{InOrderQueue};
28+
Graph.begin_recording(InOrderQueue);
29+
30+
T Pattern = 42;
31+
exp_ext::fill(InOrderQueue, PtrA, Pattern, Size);
32+
33+
exp_ext::single_task(InOrderQueue, [=]() {
34+
for (size_t i = 0; i < Size; ++i) {
35+
PtrB[i] = i;
36+
}
37+
});
38+
39+
exp_ext::parallel_for(
40+
InOrderQueue, sycl::range<1>{Size},
41+
[=](sycl::item<1> Item) { PtrC[Item] += PtrA[Item] * PtrB[Item]; });
42+
43+
std::vector<T> Output(Size);
44+
exp_ext::copy(InOrderQueue, PtrC, Output.data(), Size);
45+
46+
Graph.end_recording();
47+
48+
auto GraphExec = Graph.finalize();
49+
50+
const size_t MemsetValue = 12;
51+
sycl::event Event =
52+
exp_ext::submit_with_event(OtherQueue, [&](sycl::handler &CGH) {
53+
exp_ext::single_task(CGH, [=]() {
54+
for (size_t I = 0; I < Size; ++I)
55+
PtrC[I] = MemsetValue;
56+
});
57+
});
58+
59+
exp_ext::submit(InOrderQueue, [&](sycl::handler &CGH) {
60+
CGH.depends_on(Event);
61+
exp_ext::execute_graph(CGH, GraphExec);
62+
});
63+
64+
InOrderQueue.wait_and_throw();
65+
66+
free(PtrA, InOrderQueue);
67+
free(PtrB, InOrderQueue);
68+
free(PtrC, InOrderQueue);
69+
70+
for (size_t i = 0; i < Size; i++) {
71+
T Ref = Pattern * i + MemsetValue;
72+
assert(check_value(i, Ref, Output[i], "Output"));
73+
}
74+
75+
return 0;
76+
}

sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -417,11 +417,11 @@ TEST_F(CommandGraphTest, GraphPartitionsMerging) {
417417
auto GraphExecImpl = sycl::detail::getSyclObjImpl(GraphExec);
418418
auto PartitionsList = GraphExecImpl->getPartitions();
419419
ASSERT_EQ(PartitionsList.size(), 5ul);
420-
ASSERT_FALSE(PartitionsList[0]->isHostTask());
421-
ASSERT_TRUE(PartitionsList[1]->isHostTask());
422-
ASSERT_FALSE(PartitionsList[2]->isHostTask());
423-
ASSERT_TRUE(PartitionsList[3]->isHostTask());
424-
ASSERT_FALSE(PartitionsList[4]->isHostTask());
420+
ASSERT_FALSE(PartitionsList[0]->MIsHostTask);
421+
ASSERT_TRUE(PartitionsList[1]->MIsHostTask);
422+
ASSERT_FALSE(PartitionsList[2]->MIsHostTask);
423+
ASSERT_TRUE(PartitionsList[3]->MIsHostTask);
424+
ASSERT_FALSE(PartitionsList[4]->MIsHostTask);
425425
}
426426

427427
TEST_F(CommandGraphTest, GetNodeFromEvent) {

0 commit comments

Comments
 (0)