Skip to content

Commit 197353f

Browse files
[SYCL] Clean up finished command nodes of the execution graph
Delete completed command nodes that are neither memory allocations nor leaves for any memory object. This cleanup is performed at each host-device synchronization point in order to avoid execution graph bloat. Signed-off-by: Sergey Semenov <[email protected]>
1 parent 38ace4d commit 197353f

File tree

8 files changed

+92
-13
lines changed

8 files changed

+92
-13
lines changed

sycl/doc/SYCLEnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ subject to change. Do not rely on these variables in production code.
1818
| SYCL_USE_KERNEL_SPV | Path to the SPIR-V binary | Load device image from the specified file. If runtime is unable to read the file, `cl::sycl::runtime_error` exception is thrown.|
1919
| SYCL_DUMP_IMAGES | Any(*) | Dump device image binaries to file. Control has no effect if SYCL_USE_KERNEL_SPV is set. |
2020
| SYCL_PRINT_EXECUTION_GRAPH | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
21+
| SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP | Any(*) | Disable cleanup of finished command nodes at host-device synchronization points. |
2122
| SYCL_THROW_ON_BLOCK | Any(*) | Throw an exception on attempt to wait for a blocked command. |
2223
| SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
2324
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the SYCL RT will select only those devices which satisfy the regex. |

sycl/include/CL/sycl/detail/scheduler/commands.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <atomic>
1212
#include <memory>
13+
#include <unordered_set>
1314
#include <vector>
1415

1516
#include <CL/sycl/access/access.hpp>
@@ -98,7 +99,7 @@ class Command {
9899

99100
void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); }
100101

101-
void addUser(Command *NewUser) { MUsers.push_back(NewUser); }
102+
void addUser(Command *NewUser) { MUsers.insert(NewUser); }
102103

103104
// Return type of the command, e.g. Allocate, MemoryCopy.
104105
CommandType getType() const { return MType; }
@@ -149,11 +150,13 @@ class Command {
149150
// Contains list of dependencies(edges)
150151
std::vector<DepDesc> MDeps;
151152
// Contains list of commands that depend on the command
152-
std::vector<Command *> MUsers;
153+
std::unordered_set<Command *> MUsers;
153154
// Indicates whether the command can be blocked from enqueueing
154155
bool MIsBlockable = false;
155156
// Indicates whether the command is blocked from enqueueing
156157
std::atomic<bool> MCanEnqueue;
158+
// Counts the number of memory objects this command is a leaf for
159+
unsigned MLeafCounter = 0;
157160

158161
const char *MBlockReason = "Unknown";
159162
};

sycl/include/CL/sycl/detail/scheduler/scheduler.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ class Scheduler {
7777
// sycl::image destructors.
7878
void removeMemoryObject(detail::SYCLMemObjI *MemObj);
7979

80+
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
81+
// that all its commands have been waited for).
82+
void cleanupFinishedCommands(Command *FinishedCmd);
83+
8084
// Creates nodes in the graph, that update Req with the pointer to the host
8185
// memory which contains the latest data of the memory object. New operations
8286
// with the same memory object that have side effects are blocked until
@@ -125,8 +129,9 @@ class Scheduler {
125129
// Event passed and its dependencies.
126130
void optimize(EventImplPtr Event);
127131

128-
// Removes unneeded commands from the graph.
129-
void cleanupCommands(bool CleanupReleaseCommands = false);
132+
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
133+
// that all its commands have been waited for).
134+
void cleanupFinishedCommands(Command *FinishedCmd);
130135

131136
// Reschedules command passed using Queue provided. this can lead to
132137
// rescheduling of all dependent commands. This can be used when user
@@ -140,6 +145,9 @@ class Scheduler {
140145
MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
141146
Requirement *Req);
142147

148+
// Decrements leaf counters for all leaves of the record.
149+
void updateLeafCountersForRecord(MemObjRecord *Record);
150+
143151
// Removes commands that use given MemObjRecord from the graph.
144152
void cleanupCommandsForRecord(MemObjRecord *Record);
145153

sycl/source/detail/config.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@
1111
// underscore(__).
1212

1313
CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
14+
CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
1415
CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
15-

sycl/source/detail/event_impl.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include <CL/sycl/detail/queue_impl.hpp>
1414
#include <CL/sycl/detail/scheduler/scheduler.hpp>
1515

16+
#include "detail/config.hpp"
17+
1618
#include <chrono>
1719

1820
__SYCL_INLINE namespace cl {
@@ -100,6 +102,9 @@ void event_impl::wait(
100102
waitInternal();
101103
else if (MCommand)
102104
detail::Scheduler::getInstance().waitForEvent(std::move(Self));
105+
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
106+
detail::Scheduler::getInstance().cleanupFinishedCommands(
107+
static_cast<Command *>(MCommand));
103108
}
104109

105110
void event_impl::wait_and_throw(

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,15 @@ void Scheduler::GraphBuilder::UpdateLeaves(const std::set<Command *> &Cmds,
138138
if (ReadOnlyReq)
139139
return;
140140

141-
for (const Command *Cmd : Cmds) {
141+
for (Command *Cmd : Cmds) {
142142
auto NewEnd = std::remove(Record->MReadLeaves.begin(),
143143
Record->MReadLeaves.end(), Cmd);
144+
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MReadLeaves.end());
144145
Record->MReadLeaves.erase(NewEnd, Record->MReadLeaves.end());
145146

146147
NewEnd = std::remove(Record->MWriteLeaves.begin(),
147148
Record->MWriteLeaves.end(), Cmd);
149+
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MWriteLeaves.end());
148150
Record->MWriteLeaves.erase(NewEnd, Record->MWriteLeaves.end());
149151
}
150152
}
@@ -166,8 +168,10 @@ void Scheduler::GraphBuilder::AddNodeToLeaves(MemObjRecord *Record,
166168
Dep.MDepCommand = OldLeaf;
167169
Cmd->addDep(Dep);
168170
OldLeaf->addUser(Cmd);
171+
--(OldLeaf->MLeafCounter);
169172
}
170173
Leaves.push_back(Cmd);
174+
++(Cmd->MLeafCounter);
171175
}
172176

173177
UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
@@ -560,6 +564,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
560564

561565
Record->MAllocaCommands.push_back(AllocaCmd);
562566
Record->MWriteLeaves.push_back(AllocaCmd);
567+
++(AllocaCmd->MLeafCounter);
563568
}
564569
return AllocaCmd;
565570
}
@@ -633,6 +638,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
633638
return NewCmd.release();
634639
}
635640

641+
void Scheduler::GraphBuilder::updateLeafCountersForRecord(
642+
MemObjRecord *Record) {
643+
for (Command *Cmd : Record->MReadLeaves) {
644+
--(Cmd->MLeafCounter);
645+
}
646+
for (Command *Cmd : Record->MWriteLeaves) {
647+
--(Cmd->MLeafCounter);
648+
}
649+
}
650+
636651
void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
637652
std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
638653
if (AllocaCommands.empty())
@@ -683,9 +698,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
683698
for (auto DepCmdIt : ShouldBeUpdated) {
684699
if (!DepCmdIt.second)
685700
continue;
686-
std::vector<Command *> &DepUsers = DepCmdIt.first->MUsers;
687-
DepUsers.erase(std::remove(DepUsers.begin(), DepUsers.end(), Cmd),
688-
DepUsers.end());
701+
DepCmdIt.first->MUsers.erase(Cmd);
689702
}
690703

691704
// If all dependencies have been removed this way, mark the command for
@@ -702,8 +715,48 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
702715
}
703716
}
704717

705-
void Scheduler::GraphBuilder::cleanupCommands(bool CleanupReleaseCommands) {
706-
// TODO: Implement.
718+
void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
719+
std::queue<Command *> CmdsToVisit({FinishedCmd});
720+
std::set<Command *> Visited;
721+
722+
// Traverse the graph using BFS
723+
while (!CmdsToVisit.empty()) {
724+
Command *Cmd = CmdsToVisit.front();
725+
CmdsToVisit.pop();
726+
727+
if (!Visited.insert(Cmd).second)
728+
continue;
729+
730+
for (const DepDesc &Dep : Cmd->MDeps) {
731+
if (Dep.MDepCommand)
732+
CmdsToVisit.push(Dep.MDepCommand);
733+
}
734+
735+
// Do not clean up the node if it is a leaf for any memory object
736+
if (Cmd->MLeafCounter > 0)
737+
continue;
738+
// Do not clean up allocation commands
739+
Command::CommandType CmdT = Cmd->getType();
740+
if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF)
741+
continue;
742+
743+
for (Command *UserCmd : Cmd->MUsers) {
744+
for (DepDesc &Dep : UserCmd->MDeps) {
745+
// Link the users of the command to the alloca command(s) instead
746+
if (Dep.MDepCommand == Cmd) {
747+
Dep.MDepCommand = Dep.MAllocaCmd;
748+
Dep.MDepCommand->MUsers.insert(UserCmd);
749+
}
750+
}
751+
}
752+
// Update dependency users
753+
for (DepDesc &Dep : Cmd->MDeps) {
754+
Command *DepCmd = Dep.MDepCommand;
755+
DepCmd->MUsers.erase(Cmd);
756+
}
757+
Cmd->getEvent()->setCommand(nullptr);
758+
delete Cmd;
759+
}
707760
}
708761

709762
void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
117117
GraphProcessor::waitForEvent(std::move(Event));
118118
}
119119

120+
void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) {
121+
std::lock_guard<std::mutex> lock(MGraphLock);
122+
MGraphBuilder.cleanupFinishedCommands(FinishedCmd);
123+
}
124+
120125
void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
121126
std::lock_guard<std::mutex> lock(MGraphLock);
122127

@@ -125,6 +130,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
125130
// No operations were performed on the mem object
126131
return;
127132
waitForRecordToFinish(Record);
133+
MGraphBuilder.updateLeafCountersForRecord(Record);
128134
MGraphBuilder.cleanupCommandsForRecord(Record);
129135
MGraphBuilder.removeRecordForMemObj(MemObj);
130136
}

sycl/test/scheduler/LeafLimit.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// RUN: %t.out
33
#include <CL/sycl.hpp>
44

5+
#include <algorithm>
56
#include <cstddef>
67
#include <memory>
78
#include <vector>
@@ -70,7 +71,9 @@ int main() {
7071
FakeCommand *OldestLeaf = LeavesToAdd.front();
7172
FakeCommand *NewestLeaf = LeavesToAdd.back();
7273
assert(OldestLeaf->MUsers.size() == 1);
73-
assert(OldestLeaf->MUsers[0] == NewestLeaf);
74+
assert(OldestLeaf->MUsers.count(NewestLeaf));
7475
assert(NewestLeaf->MDeps.size() == 2);
75-
assert(NewestLeaf->MDeps[1].MDepCommand == OldestLeaf);
76+
assert(std::any_of(
77+
NewestLeaf->MDeps.begin(), NewestLeaf->MDeps.end(),
78+
[&](const detail::DepDesc &DD) { return DD.MDepCommand == OldestLeaf; }));
7679
}

0 commit comments

Comments
 (0)