Skip to content

Commit 0eeae2a

Browse files
mfrancepilloisEwanCkbenzie
authored
[SYCL][Graph] Update design doc for copy optimization and add test (#13051)
- Update UR tag to include L0 command-buffer copy engine optimization - Add test which mixes copy and kernel commands - Update design doc to detail copy engine optimization --------- Co-authored-by: Ben Tracy <[email protected]> Co-authored-by: Ewan Crawford <[email protected]> Co-authored-by: Kenneth Benzie (Benie) <[email protected]>
1 parent cedb48f commit 0eeae2a

File tree

3 files changed

+126
-6
lines changed

3 files changed

+126
-6
lines changed

sycl/doc/design/CommandGraph.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,24 @@ Level Zero:
438438
Future work will include exploring L0 API extensions to improve the mapping of
439439
UR command-buffer to L0 command-list.
440440

441+
#### Copy Engine

For performance considerations, the Unified Runtime Level Zero adapter uses
different Level Zero command-queues to submit compute kernels and memory
operations when the device has a dedicated copy engine. To take advantage of the
copy engine when available, the graph workload can also be split between memory
operations and compute kernels. To achieve this, two graph workload
command-lists live simultaneously in a command-buffer.

When the command-buffer is finalized, memory operations (e.g. buffer copy,
buffer fill, ...) are enqueued in the *copy* command-list while the other
commands are enqueued in the compute command-list. On submission, if not empty,
the *copy* command-list is sent to the main copy command-queue while the compute
command-list is sent to the compute command-queue.

Both are executed concurrently. Synchronization between the command-lists is
handled by Level Zero events.
458+
441459
### CUDA
442460

443461
The SYCL Graph CUDA backend relies on the

sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
110110

111111
fetch_adapter_source(level_zero
112112
${UNIFIED_RUNTIME_REPO}
113-
# commit 2c86cd84a86761204f302a1c5148a8455561b8e6
114-
# Merge: f23ee23a a4617787
113+
# commit b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555
114+
# Merge: 0cd127ad 30f8ac50
115115
# Author: Kenneth Benzie (Benie) <[email protected]>
116-
# Date: Fri Jun 14 10:54:08 2024 +0100
117-
# Merge pull request #1749 from nrspruit/fix_NonBlocking_LastCommand
118-
# [L0] Maintain Lock of Queue while syncing the Last Command Event and update Last Command Event only if matching
119-
2c86cd84a86761204f302a1c5148a8455561b8e6
116+
# Date: Fri Jun 14 14:26:17 2024 +0100
117+
# Merge pull request #1738 from Bensuo/cmd-buf-copy-queue
118+
# [CMDBUF][L0] Use copy engine to optimize cmd-buffer usage
119+
b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555
120120
)
121121

122122
fetch_adapter_source(opencl
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

// Tests that the optimization to use the L0 Copy Engine for memory commands
// does not interfere with the linear graph optimization

#include "../graph_common.hpp"

#include <sycl/properties/queue_properties.hpp>

int main() {
  // In-order queue so the recorded commands form a single linear dependency
  // chain, which is what the linear-graph optimization targets.
  queue Queue{{sycl::property::queue::in_order{}}};

  using T = int;

  const T ModValue = 7;
  std::vector<T> DataA(Size), DataB(Size), DataC(Size);

  std::iota(DataA.begin(), DataA.end(), 1);
  std::iota(DataB.begin(), DataB.end(), 10);
  std::iota(DataC.begin(), DataC.end(), 1000);

  // Compute reference data on the host for the expected final output:
  // per iteration, A += Mod; B = A - Mod; C = B + Mod.
  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
  for (size_t i = 0; i < Iterations; i++) {
    for (size_t j = 0; j < Size; j++) {
      ReferenceA[j] += ModValue;
      ReferenceB[j] = ReferenceA[j];
      ReferenceB[j] -= ModValue;
      ReferenceC[j] = ReferenceB[j];
      ReferenceC[j] += ModValue;
    }
  }

  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
                                                 Queue.get_device()};

  T *PtrA = malloc_device<T>(Size, Queue);
  T *PtrB = malloc_device<T>(Size, Queue);
  T *PtrC = malloc_device<T>(Size, Queue);

  // Initialize device allocations before recording begins.
  Queue.copy(DataA.data(), PtrA, Size);
  Queue.copy(DataB.data(), PtrB, Size);
  Queue.copy(DataC.data(), PtrC, Size);
  Queue.wait_and_throw();

  // Record a graph that alternates kernel and memcpy commands, so that on
  // Level Zero the workload is split between compute and copy command-lists.
  Graph.begin_recording(Queue);
  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrA[LinID] += ModValue;
    });
  });

  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });

  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrB[LinID] -= ModValue;
    });
  });

  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });

  Queue.submit([&](handler &CGH) {
    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
      auto LinID = id.get_linear_id();
      PtrC[LinID] += ModValue;
    });
  });

  Graph.end_recording();

  auto GraphExec = Graph.finalize();

  // Submit the executable graph `Iterations` times; the in-order queue
  // serializes successive submissions.
  event Event;
  for (unsigned n = 0; n < Iterations; n++) {
    Event =
        Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
  }

  // Copy results back once the final graph submission has completed.
  Queue.copy(PtrA, DataA.data(), Size, Event);
  Queue.copy(PtrB, DataB.data(), Size, Event);
  Queue.copy(PtrC, DataC.data(), Size, Event);
  Queue.wait_and_throw();

  free(PtrA, Queue);
  free(PtrB, Queue);
  free(PtrC, Queue);

  for (size_t i = 0; i < Size; i++) {
    assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
    assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
    assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
  }
}

0 commit comments

Comments
 (0)