Commit 54a67eb

[SYCL][Graph] Avoid unnecessary inter-partition dependencies (#12680)
Improves the management of inter-partition dependencies so that only required dependencies are added. Since removing these dependencies can result in multiple execution paths, a map has been added to track all events returned from submitted partitions. All of these events are linked to the main event returned to the user. Adds tests.
1 parent 1a98c4c commit 54a67eb

11 files changed: +417 −2 lines

sycl/doc/design/CommandGraph.md (26 additions, 0 deletions)

@@ -192,6 +192,32 @@ illustrated in the following diagrams:
 ![Graph partition illustration step 10b.](images/SYCL-Graph-partitions_step11.jpg)
 ![Graph partition illustration step 11b.](images/SYCL-Graph-partitions_step12.jpg)
 
+### Multiple Roots Execution Flow
+
+The following diagram shows the partitions of a graph with two roots
+and a host-task in each branch.
+
+![Multiple roots graph partition illustration.](images/SYCL-Graph-multiple_roots_partitions.jpg)
+
+Previously, when executing this graph, the partitions were enqueued
+one after the other, with each partition waiting for the previous one
+to complete (see the top of the following diagram).
+However, for a multi-root graph, this behavior adds unnecessary
+dependencies between partitions, slowing down the execution of the
+whole graph.
+Now, we keep track of the actual predecessors of each partition and
+only enforce dependencies between partitions when necessary.
+In our example, the extra dependency is therefore removed and
+both branches can be executed concurrently.
+However, as we can see in this diagram, this new approach can result
+in multiple execution tails, which makes it difficult to know when
+the graph execution has finished.
+To cope with this issue, the events associated with the completion of
+each partition are linked to the event returned to the user.
+Hence, when the returned event is complete, we can guarantee that
+all work associated with the graph has been completed.
+
+![Multiple roots graph partition execution flow.](images/SYCL-Graph-partition_execution_flow.jpg)
+
 ## Memory handling: Buffer and Accessor
 
 There is no extra support for graph-specific USM allocations in the current
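To make the scenario above concrete, here is a minimal sketch of a two-root graph with a host-task in each branch, written against the explicit graph-building API of the sycl_ext_oneapi_graph extension. The pointer names and sizes are illustrative assumptions, not taken from the repository:

```cpp
#include <sycl/sycl.hpp>
namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Queue{};
  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  constexpr size_t Size = 1024;
  int *PtrA = sycl::malloc_shared<int>(Size, Queue);
  int *PtrB = sycl::malloc_shared<int>(Size, Queue);

  // Two independent roots, each followed by a host-task. The host-tasks
  // force each branch into its own partition.
  auto RootA = Graph.add([&](sycl::handler &CGH) {
    CGH.parallel_for(sycl::range<1>(Size),
                     [=](sycl::item<1> Id) { PtrA[Id] = 1; });
  });
  Graph.add([&](sycl::handler &CGH) { CGH.host_task([=]() { PtrA[0] += 1; }); },
            {exp_ext::property::node::depends_on(RootA)});

  auto RootB = Graph.add([&](sycl::handler &CGH) {
    CGH.parallel_for(sycl::range<1>(Size),
                     [=](sycl::item<1> Id) { PtrB[Id] = 2; });
  });
  Graph.add([&](sycl::handler &CGH) { CGH.host_task([=]() { PtrB[0] += 1; }); },
            {exp_ext::property::node::depends_on(RootB)});

  auto GraphExec = Graph.finalize();

  // With this commit, the two partitions no longer depend on each other and
  // can run concurrently, yet the single returned event still only completes
  // once every partition has finished.
  sycl::event Event = Queue.submit(
      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  Event.wait();

  sycl::free(PtrA, Queue);
  sycl::free(PtrB, Queue);
  return 0;
}
```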

sycl/source/detail/graph_impl.cpp (36 additions, 2 deletions)

@@ -281,6 +281,18 @@ void exec_graph_impl::makePartitions() {
                             Partition->MSchedule.end());
   }
 
+  // Compute partition dependencies
+  for (auto Partition : MPartitions) {
+    for (auto const &Root : Partition->MRoots) {
+      auto RootNode = Root.lock();
+      for (const auto &Dep : RootNode->MPredecessors) {
+        auto NodeDep = Dep.lock();
+        Partition->MPredecessors.push_back(
+            MPartitions[MPartitionNodes[NodeDep]]);
+      }
+    }
+  }
+
   // Reset node groups (if nodes have to be re-processed - e.g. subgraph)
   for (auto &Node : MNodeStorage) {
     Node->MPartitionNum = -1;

@@ -762,7 +774,22 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
   });
 
   sycl::detail::EventImplPtr NewEvent;
-  for (auto CurrentPartition : MPartitions) {
+  std::vector<sycl::detail::EventImplPtr> BackupCGDataMEvents;
+  if (MPartitions.size() > 1) {
+    BackupCGDataMEvents = CGData.MEvents;
+  }
+  for (uint32_t currentPartitionsNum = 0;
+       currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) {
+    auto CurrentPartition = MPartitions[currentPartitionsNum];
+    // Restore the initial MEvents to add only the needed additional
+    // dependencies
+    if (currentPartitionsNum > 0) {
+      CGData.MEvents = BackupCGDataMEvents;
+    }
+
+    for (auto const &DepPartition : CurrentPartition->MPredecessors) {
+      CGData.MEvents.push_back(MPartitionsExecutionEvents[DepPartition]);
+    }
+
     auto CommandBuffer =
         CurrentPartition->MPiCommandBuffers[Queue->get_device()];
 
@@ -902,12 +929,19 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
       NewEvent->setStateIncomplete();
       NewEvent->getPreparedDepsEvents() = ScheduledEvents;
     }
-    CGData.MEvents.push_back(NewEvent);
+    MPartitionsExecutionEvents[CurrentPartition] = NewEvent;
   }
 
   // Keep track of this execution event so we can make sure it's completed in
   // the destructor.
   MExecutionEvents.push_back(NewEvent);
+  // Attach the events of previous partitions to ensure that, when the
+  // returned event is complete, all execution associated with the graph has
+  // completed.
+  for (auto const &Elem : MPartitionsExecutionEvents) {
+    if (Elem.second != NewEvent) {
+      NewEvent->attachEventToComplete(Elem.second);
+    }
+  }
   sycl::event QueueEvent =
       sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
   return QueueEvent;
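The per-partition bookkeeping in `enqueue` boils down to the following pattern. This is a simplified standalone sketch with stand-in `Partition` and `Event` types (assumptions, not the real `CGData`/`EventImplPtr` structures), showing how the caller-supplied dependency list is restored on each iteration and extended only with the events of a partition's actual predecessors:

```cpp
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Partition {
  std::vector<Partition *> Predecessors;
};

// Stand-in for sycl::detail::EventImplPtr.
using Event = std::shared_ptr<std::string>;

// Placeholder for the real command-buffer enqueue; returns the partition's
// completion event.
static Event submitPartition(Partition &, const std::vector<Event> &Deps) {
  return std::make_shared<std::string>("event with " +
                                       std::to_string(Deps.size()) + " deps");
}

int main() {
  std::vector<Partition> Partitions(3);
  // Partition 2 depends only on partition 0; partition 1 is independent.
  Partitions[2].Predecessors = {&Partitions[0]};

  std::vector<Event> BaseDeps; // dependencies supplied by the caller
  std::map<Partition *, Event> PartitionEvents;

  for (auto &P : Partitions) {
    // Restore the caller-supplied dependency list on every iteration...
    std::vector<Event> Deps = BaseDeps;
    // ...then append only the events of this partition's real predecessors,
    // instead of chaining every partition after the previous one.
    for (Partition *Pred : P.Predecessors)
      Deps.push_back(PartitionEvents[Pred]);
    PartitionEvents[&P] = submitPartition(P, Deps);
  }
  return 0;
}
```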

sycl/source/detail/graph_impl.hpp (5 additions, 0 deletions)

@@ -548,6 +548,8 @@ class partition {
   /// Map of devices to command buffers.
   std::unordered_map<sycl::device, sycl::detail::pi::PiExtCommandBuffer>
       MPiCommandBuffers;
+  /// List of predecessors to this partition.
+  std::vector<std::shared_ptr<partition>> MPredecessors;
 
   /// @return True if the partition contains a host task
   bool isHostTask() const {

@@ -1188,6 +1190,9 @@ class exec_graph_impl {
   std::vector<sycl::detail::EventImplPtr> MExecutionEvents;
   /// List of the partitions that compose the exec graph.
   std::vector<std::shared_ptr<partition>> MPartitions;
+  /// Map of the partitions to their execution events.
+  std::unordered_map<std::shared_ptr<partition>, sycl::detail::EventImplPtr>
+      MPartitionsExecutionEvents;
   /// Storage for copies of nodes from the original modifiable graph.
   std::vector<std::shared_ptr<node_impl>> MNodeStorage;
 };
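A note on the new `MPartitionsExecutionEvents` member: keying an `std::unordered_map` by `std::shared_ptr<partition>` relies on the standard `std::hash<std::shared_ptr<T>>` specialization from `<memory>`, which hashes the stored raw pointer, so lookups are by partition object identity. A minimal sketch with illustrative stand-in types:

```cpp
#include <cassert>
#include <memory>
#include <unordered_map>

struct partition {};
struct event {};

int main() {
  // std::hash<std::shared_ptr<T>> hashes the raw pointer, so the map is
  // keyed by partition object identity, not by partition contents.
  std::unordered_map<std::shared_ptr<partition>, std::shared_ptr<event>>
      PartitionEvents;
  auto P = std::make_shared<partition>();
  PartitionEvents[P] = std::make_shared<event>();
  assert(PartitionEvents.count(P) == 1);
  return 0;
}
```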
New test file (11 additions, 0 deletions):

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/host_task2_multiple_roots.cpp"
New test file (11 additions, 0 deletions):

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/host_task_multiple_roots.cpp"
New test file (174 additions, 0 deletions):

// This test uses a host_task when adding a command_graph node for a graph
// with multiple roots.

#include "../graph_common.hpp"

int main() {
  queue Queue{};

  if (!are_graphs_supported(Queue)) {
    return 0;
  }

  using T = int;

  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
    return 0;
  }

  const T ModValue = T{7};
  std::vector<T> DataA(Size), DataB(Size), DataC(Size), Res2(Size);

  std::iota(DataA.begin(), DataA.end(), 1);
  std::iota(DataB.begin(), DataB.end(), 10);
  std::iota(DataC.begin(), DataC.end(), 1000);

  std::vector<T> Reference(DataC);
  for (unsigned n = 0; n < Iterations; n++) {
    for (size_t i = 0; i < Size; i++) {
      Reference[i] = (((DataA[i] + DataB[i]) * ModValue) + 1) * DataB[i];
    }
  }

  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  T *PtrA = malloc_device<T>(Size, Queue);
  T *PtrB = malloc_device<T>(Size, Queue);
  T *PtrC = malloc_shared<T>(Size, Queue);
  T *PtrA2 = malloc_device<T>(Size, Queue);
  T *PtrB2 = malloc_device<T>(Size, Queue);
  T *PtrC2 = malloc_shared<T>(Size, Queue);

  Queue.copy(DataA.data(), PtrA, Size);
  Queue.copy(DataB.data(), PtrB, Size);
  Queue.copy(DataC.data(), PtrC, Size);
  Queue.copy(DataA.data(), PtrA2, Size);
  Queue.copy(DataB.data(), PtrB2, Size);
  Queue.copy(DataC.data(), PtrC2, Size);
  Queue.wait_and_throw();

  // Vector add to output
  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrC[id] = PtrA[id]; });
  });

  // Vector add to output
  auto NodeB = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeA);
        CGH.parallel_for(range<1>(Size),
                         [=](item<1> id) { PtrC[id] += PtrB[id]; });
      },
      NodeA);

  // Modify the output values in a host_task
  auto NodeC = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeB);
        CGH.host_task([=]() {
          for (size_t i = 0; i < Size; i++) {
            PtrC[i] *= ModValue;
          }
        });
      },
      NodeB);

  // Modify temp buffer and write to output buffer
  auto NodeD = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeC);
        CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrC[id] += 1; });
      },
      NodeC);

  // Modify temp buffer and write to output buffer
  add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeD);
        CGH.parallel_for(range<1>(Size),
                         [=](item<1> id) { PtrC[id] *= PtrB[id]; });
      },
      NodeD);

  // Vector add to output
  auto NodeA2 = add_node(Graph, Queue, [&](handler &CGH) {
    CGH.parallel_for(range<1>(Size),
                     [=](item<1> id) { PtrC2[id] = PtrA2[id]; });
  });

  // Vector add to output
  auto NodeB2 = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeA2);
        CGH.parallel_for(range<1>(Size),
                         [=](item<1> id) { PtrC2[id] += PtrB2[id]; });
      },
      NodeA2);

  // Modify the output values in a host_task
  auto NodeC2 = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeB2);
        CGH.host_task([=]() {
          for (size_t i = 0; i < Size; i++) {
            PtrC2[i] *= ModValue;
          }
        });
      },
      NodeB2);

  // Modify temp buffer and write to output buffer
  auto NodeD2 = add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeC2);
        CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrC2[id] += 1; });
      },
      NodeC2);

  // Modify temp buffer and write to output buffer
  add_node(
      Graph, Queue,
      [&](handler &CGH) {
        depends_on_helper(CGH, NodeD2);
        CGH.parallel_for(range<1>(Size),
                         [=](item<1> id) { PtrC2[id] *= PtrB2[id]; });
      },
      NodeD2);

  auto GraphExec = Graph.finalize();

  event Event;
  for (unsigned n = 0; n < Iterations; n++) {
    Event = Queue.submit([&](handler &CGH) {
      CGH.depends_on(Event);
      CGH.ext_oneapi_graph(GraphExec);
    });
    Event.wait();
  }
  Queue.wait_and_throw();

  Queue.copy(PtrC, DataC.data(), Size);
  Queue.copy(PtrC2, Res2.data(), Size);
  Queue.wait_and_throw();

  free(PtrA, Queue);
  free(PtrB, Queue);
  free(PtrC, Queue);
  free(PtrA2, Queue);
  free(PtrB2, Queue);
  free(PtrC2, Queue);

  for (size_t i = 0; i < Size; i++) {
    assert(check_value(i, Reference[i], DataC[i], "DataC"));
    assert(check_value(i, Reference[i], Res2[i], "Res2"));
  }

  return 0;
}
