Skip to content

Commit 438dc49

Browse files
[SYCL] Clean up finished command nodes of the execution graph (#1066)
Delete completed command nodes that are neither memory allocations nor leaves for any memory object. This cleanup is performed at each host-device synchronization point in order to avoid execution graph bloat. Signed-off-by: Sergey Semenov <[email protected]>
1 parent 360b25b commit 438dc49

File tree

12 files changed

+268
-101
lines changed

12 files changed

+268
-101
lines changed

sycl/doc/SYCLEnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ subject to change. Do not rely on these variables in production code.
1818
| SYCL_USE_KERNEL_SPV | Path to the SPIR-V binary | Load device image from the specified file. If runtime is unable to read the file, `cl::sycl::runtime_error` exception is thrown.|
1919
| SYCL_DUMP_IMAGES | Any(*) | Dump device image binaries to file. Control has no effect if SYCL_USE_KERNEL_SPV is set. |
2020
| SYCL_PRINT_EXECUTION_GRAPH | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
21+
| SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP | Any(*) | Disable cleanup of finished command nodes at host-device synchronization points. |
2122
| SYCL_THROW_ON_BLOCK | Any(*) | Throw an exception on attempt to wait for a blocked command. |
2223
| SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
2324
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the SYCL RT will select only those devices which satisfy the regex. |

sycl/include/CL/sycl/detail/scheduler/commands.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <atomic>
1212
#include <memory>
13+
#include <unordered_set>
1314
#include <vector>
1415

1516
#include <CL/sycl/access/access.hpp>
@@ -98,7 +99,7 @@ class Command {
9899

99100
void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); }
100101

101-
void addUser(Command *NewUser) { MUsers.push_back(NewUser); }
102+
void addUser(Command *NewUser) { MUsers.insert(NewUser); }
102103

103104
// Return type of the command, e.g. Allocate, MemoryCopy.
104105
CommandType getType() const { return MType; }
@@ -149,11 +150,13 @@ class Command {
149150
// Contains list of dependencies(edges)
150151
std::vector<DepDesc> MDeps;
151152
// Contains list of commands that depend on the command
152-
std::vector<Command *> MUsers;
153+
std::unordered_set<Command *> MUsers;
153154
// Indicates whether the command can be blocked from enqueueing
154155
bool MIsBlockable = false;
155156
// Indicates whether the command is blocked from enqueueing
156157
std::atomic<bool> MCanEnqueue;
158+
// Counts the number of memory objects this command is a leaf for
159+
unsigned MLeafCounter = 0;
157160

158161
const char *MBlockReason = "Unknown";
159162
};

sycl/include/CL/sycl/detail/scheduler/scheduler.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ class Scheduler {
7777
// sycl::image destructors.
7878
void removeMemoryObject(detail::SYCLMemObjI *MemObj);
7979

80+
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
81+
// that all its commands have been waited for).
82+
void cleanupFinishedCommands(Command *FinishedCmd);
83+
8084
// Creates nodes in the graph, that update Req with the pointer to the host
8185
// memory which contains the latest data of the memory object. New operations
8286
// with the same memory object that have side effects are blocked until
@@ -125,8 +129,9 @@ class Scheduler {
125129
// Event passed and its dependencies.
126130
void optimize(EventImplPtr Event);
127131

128-
// Removes unneeded commands from the graph.
129-
void cleanupCommands(bool CleanupReleaseCommands = false);
132+
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
133+
// that all its commands have been waited for).
134+
void cleanupFinishedCommands(Command *FinishedCmd);
130135

131136
// Reschedules command passed using Queue provided. this can lead to
132137
// rescheduling of all dependent commands. This can be used when user
@@ -140,6 +145,9 @@ class Scheduler {
140145
MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
141146
Requirement *Req);
142147

148+
// Decrements leaf counters for all leaves of the record.
149+
void decrementLeafCountersForRecord(MemObjRecord *Record);
150+
143151
// Removes commands that use given MemObjRecord from the graph.
144152
void cleanupCommandsForRecord(MemObjRecord *Record);
145153

sycl/source/detail/config.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@
1111
// underscore(__).
1212

1313
CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
14+
CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
1415
CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
15-

sycl/source/detail/event_impl.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include <CL/sycl/detail/queue_impl.hpp>
1414
#include <CL/sycl/detail/scheduler/scheduler.hpp>
1515

16+
#include "detail/config.hpp"
17+
1618
#include <chrono>
1719

1820
__SYCL_INLINE namespace cl {
@@ -100,6 +102,9 @@ void event_impl::wait(
100102
waitInternal();
101103
else if (MCommand)
102104
detail::Scheduler::getInstance().waitForEvent(std::move(Self));
105+
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
106+
detail::Scheduler::getInstance().cleanupFinishedCommands(
107+
static_cast<Command *>(MCommand));
103108
}
104109

105110
void event_impl::wait_and_throw(

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,15 @@ void Scheduler::GraphBuilder::UpdateLeaves(const std::set<Command *> &Cmds,
138138
if (ReadOnlyReq)
139139
return;
140140

141-
for (const Command *Cmd : Cmds) {
141+
for (Command *Cmd : Cmds) {
142142
auto NewEnd = std::remove(Record->MReadLeaves.begin(),
143143
Record->MReadLeaves.end(), Cmd);
144+
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MReadLeaves.end());
144145
Record->MReadLeaves.erase(NewEnd, Record->MReadLeaves.end());
145146

146147
NewEnd = std::remove(Record->MWriteLeaves.begin(),
147148
Record->MWriteLeaves.end(), Cmd);
149+
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MWriteLeaves.end());
148150
Record->MWriteLeaves.erase(NewEnd, Record->MWriteLeaves.end());
149151
}
150152
}
@@ -166,8 +168,10 @@ void Scheduler::GraphBuilder::AddNodeToLeaves(MemObjRecord *Record,
166168
Dep.MDepCommand = OldLeaf;
167169
Cmd->addDep(Dep);
168170
OldLeaf->addUser(Cmd);
171+
--(OldLeaf->MLeafCounter);
169172
}
170173
Leaves.push_back(Cmd);
174+
++(Cmd->MLeafCounter);
171175
}
172176

173177
UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
@@ -560,6 +564,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
560564

561565
Record->MAllocaCommands.push_back(AllocaCmd);
562566
Record->MWriteLeaves.push_back(AllocaCmd);
567+
++(AllocaCmd->MLeafCounter);
563568
}
564569
return AllocaCmd;
565570
}
@@ -633,6 +638,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
633638
return NewCmd.release();
634639
}
635640

641+
void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
642+
MemObjRecord *Record) {
643+
for (Command *Cmd : Record->MReadLeaves) {
644+
--(Cmd->MLeafCounter);
645+
}
646+
for (Command *Cmd : Record->MWriteLeaves) {
647+
--(Cmd->MLeafCounter);
648+
}
649+
}
650+
636651
void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
637652
std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
638653
if (AllocaCommands.empty())
@@ -683,9 +698,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
683698
for (auto DepCmdIt : ShouldBeUpdated) {
684699
if (!DepCmdIt.second)
685700
continue;
686-
std::vector<Command *> &DepUsers = DepCmdIt.first->MUsers;
687-
DepUsers.erase(std::remove(DepUsers.begin(), DepUsers.end(), Cmd),
688-
DepUsers.end());
701+
DepCmdIt.first->MUsers.erase(Cmd);
689702
}
690703

691704
// If all dependencies have been removed this way, mark the command for
@@ -702,8 +715,48 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
702715
}
703716
}
704717

705-
void Scheduler::GraphBuilder::cleanupCommands(bool CleanupReleaseCommands) {
706-
// TODO: Implement.
718+
void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
719+
std::queue<Command *> CmdsToVisit({FinishedCmd});
720+
std::set<Command *> Visited;
721+
722+
// Traverse the graph using BFS
723+
while (!CmdsToVisit.empty()) {
724+
Command *Cmd = CmdsToVisit.front();
725+
CmdsToVisit.pop();
726+
727+
if (!Visited.insert(Cmd).second)
728+
continue;
729+
730+
for (const DepDesc &Dep : Cmd->MDeps) {
731+
if (Dep.MDepCommand)
732+
CmdsToVisit.push(Dep.MDepCommand);
733+
}
734+
735+
// Do not clean up the node if it is a leaf for any memory object
736+
if (Cmd->MLeafCounter > 0)
737+
continue;
738+
// Do not clean up allocation commands
739+
Command::CommandType CmdT = Cmd->getType();
740+
if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF)
741+
continue;
742+
743+
for (Command *UserCmd : Cmd->MUsers) {
744+
for (DepDesc &Dep : UserCmd->MDeps) {
745+
// Link the users of the command to the alloca command(s) instead
746+
if (Dep.MDepCommand == Cmd) {
747+
Dep.MDepCommand = Dep.MAllocaCmd;
748+
Dep.MDepCommand->MUsers.insert(UserCmd);
749+
}
750+
}
751+
}
752+
// Update dependency users
753+
for (DepDesc &Dep : Cmd->MDeps) {
754+
Command *DepCmd = Dep.MDepCommand;
755+
DepCmd->MUsers.erase(Cmd);
756+
}
757+
Cmd->getEvent()->setCommand(nullptr);
758+
delete Cmd;
759+
}
707760
}
708761

709762
void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
117117
GraphProcessor::waitForEvent(std::move(Event));
118118
}
119119

120+
void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) {
121+
std::lock_guard<std::mutex> lock(MGraphLock);
122+
MGraphBuilder.cleanupFinishedCommands(FinishedCmd);
123+
}
124+
120125
void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
121126
std::lock_guard<std::mutex> lock(MGraphLock);
122127

@@ -125,6 +130,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
125130
// No operations were performed on the mem object
126131
return;
127132
waitForRecordToFinish(Record);
133+
MGraphBuilder.decrementLeafCountersForRecord(Record);
128134
MGraphBuilder.cleanupCommandsForRecord(Record);
129135
MGraphBuilder.removeRecordForMemObj(MemObj);
130136
}

sycl/test/scheduler/FakeCommand.hpp

Lines changed: 0 additions & 23 deletions
This file was deleted.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// RUN: %clangxx -fsycl %s -o %t.out
2+
// RUN: %t.out
3+
#include <CL/sycl.hpp>
4+
5+
#include <algorithm>
6+
#include <vector>
7+
8+
#include "SchedulerTestUtils.hpp"
9+
10+
using namespace cl::sycl;
11+
12+
// This test checks regular execution graph cleanup at host-device
13+
// synchronization points
14+
int main() {
15+
TestScheduler TS;
16+
queue Queue;
17+
buffer<int, 1> BufA(range<1>(1));
18+
buffer<int, 1> BufB(range<1>(1));
19+
buffer<int, 1> BufC(range<1>(1));
20+
detail::Requirement FakeReqA = getFakeRequirement(BufA);
21+
detail::Requirement FakeReqB = getFakeRequirement(BufB);
22+
detail::Requirement FakeReqC = getFakeRequirement(BufC);
23+
detail::MemObjRecord *RecC =
24+
TS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Queue), &FakeReqC);
25+
26+
// Create a graph and check that all inner nodes have been deleted and
27+
// their users have had the corresponding dependency replaced with a
28+
// dependency on the alloca. The graph should undergo the following
29+
// transformation:
30+
// +---------+ +---------+ +---------++---------+
31+
// | LeafA | <-- | InnerA | | LeafA || LeafB |
32+
// +---------+ +---------+ +---------++---------+
33+
// | | | |
34+
// | | ===> | |
35+
// v v v v
36+
// +---------+ +---------+ +---------++---------+
37+
// | InnerC | | InnerB | | AllocaA || AllocaB |
38+
// +---------+ +---------+ +---------++---------+
39+
// | |
40+
// | |
41+
// v v
42+
// +---------+ +---------+
43+
// | AllocaA | | LeafB |
44+
// +---------+ +---------+
45+
// |
46+
// |
47+
// v
48+
// +---------+
49+
// | AllocaB |
50+
// +---------+
51+
detail::AllocaCommand AllocaA{detail::getSyclObjImpl(Queue), FakeReqA};
52+
detail::AllocaCommand AllocaB{detail::getSyclObjImpl(Queue), FakeReqB};
53+
54+
int NInnerCommandsAlive = 3;
55+
std::function<void()> Callback = [&]() { --NInnerCommandsAlive; };
56+
57+
FakeCommand *InnerC = new FakeCommandWithCallback(
58+
detail::getSyclObjImpl(Queue), FakeReqA, Callback);
59+
addEdge(InnerC, &AllocaA, &AllocaA);
60+
61+
FakeCommand LeafB{detail::getSyclObjImpl(Queue), FakeReqB};
62+
addEdge(&LeafB, &AllocaB, &AllocaB);
63+
TS.AddNodeToLeaves(RecC, &LeafB);
64+
65+
FakeCommand LeafA{detail::getSyclObjImpl(Queue), FakeReqA};
66+
addEdge(&LeafA, InnerC, &AllocaA);
67+
TS.AddNodeToLeaves(RecC, &LeafA);
68+
69+
FakeCommand *InnerB = new FakeCommandWithCallback(
70+
detail::getSyclObjImpl(Queue), FakeReqB, Callback);
71+
addEdge(InnerB, &LeafB, &AllocaB);
72+
73+
FakeCommand *InnerA = new FakeCommandWithCallback(
74+
detail::getSyclObjImpl(Queue), FakeReqA, Callback);
75+
addEdge(InnerA, &LeafA, &AllocaA);
76+
addEdge(InnerA, InnerB, &AllocaB);
77+
78+
TS.cleanupFinishedCommands(InnerA);
79+
TS.removeRecordForMemObj(detail::getSyclObjImpl(BufC).get());
80+
81+
assert(NInnerCommandsAlive == 0);
82+
83+
assert(LeafA.MDeps.size() == 1);
84+
assert(LeafA.MDeps[0].MDepCommand == &AllocaA);
85+
assert(AllocaA.MUsers.size() == 1);
86+
assert(*AllocaA.MUsers.begin() == &LeafA);
87+
88+
assert(LeafB.MDeps.size() == 1);
89+
assert(LeafB.MDeps[0].MDepCommand == &AllocaB);
90+
assert(AllocaB.MUsers.size() == 1);
91+
assert(*AllocaB.MUsers.begin() == &LeafB);
92+
}

0 commit comments

Comments
 (0)