Skip to content

[SYCL] Clean up finished command nodes of the execution graph #1066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/doc/SYCLEnvironmentVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ subject to change. Do not rely on these variables in production code.
| SYCL_USE_KERNEL_SPV | Path to the SPIR-V binary | Load device image from the specified file. If runtime is unable to read the file, `cl::sycl::runtime_error` exception is thrown.|
| SYCL_DUMP_IMAGES | Any(*) | Dump device image binaries to file. Control has no effect if SYCL_USE_KERNEL_SPV is set. |
| SYCL_PRINT_EXECUTION_GRAPH | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. |
| SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP | Any(*) | Disable cleanup of finished command nodes at host-device synchronization points. |
| SYCL_THROW_ON_BLOCK | Any(*) | Throw an exception on attempt to wait for a blocked command. |
| SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the SYCL RT will select only those devices which satisfy the regex. |
Expand Down
7 changes: 5 additions & 2 deletions sycl/include/CL/sycl/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <atomic>
#include <memory>
#include <unordered_set>
#include <vector>

#include <CL/sycl/access/access.hpp>
Expand Down Expand Up @@ -98,7 +99,7 @@ class Command {

void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); }

void addUser(Command *NewUser) { MUsers.push_back(NewUser); }
void addUser(Command *NewUser) { MUsers.insert(NewUser); }

// Return type of the command, e.g. Allocate, MemoryCopy.
CommandType getType() const { return MType; }
Expand Down Expand Up @@ -149,11 +150,13 @@ class Command {
// Contains list of dependencies(edges)
std::vector<DepDesc> MDeps;
// Contains list of commands that depend on the command
std::vector<Command *> MUsers;
std::unordered_set<Command *> MUsers;
// Indicates whether the command can be blocked from enqueueing
bool MIsBlockable = false;
// Indicates whether the command is blocked from enqueueing
std::atomic<bool> MCanEnqueue;
// Counts the number of memory objects this command is a leaf for
unsigned MLeafCounter = 0;

const char *MBlockReason = "Unknown";
};
Expand Down
12 changes: 10 additions & 2 deletions sycl/include/CL/sycl/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ class Scheduler {
// sycl::image destructors.
void removeMemoryObject(detail::SYCLMemObjI *MemObj);

// Removes finished non-leaf non-alloca commands from the subgraph (assuming
// that all its commands have been waited for).
void cleanupFinishedCommands(Command *FinishedCmd);

// Creates nodes in the graph, that update Req with the pointer to the host
// memory which contains the latest data of the memory object. New operations
// with the same memory object that have side effects are blocked until
Expand Down Expand Up @@ -125,8 +129,9 @@ class Scheduler {
// Event passed and its dependencies.
void optimize(EventImplPtr Event);

// Removes unneeded commands from the graph.
void cleanupCommands(bool CleanupReleaseCommands = false);
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
// that all its commands have been waited for).
void cleanupFinishedCommands(Command *FinishedCmd);

// Reschedules command passed using Queue provided. this can lead to
// rescheduling of all dependent commands. This can be used when user
Expand All @@ -140,6 +145,9 @@ class Scheduler {
MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
Requirement *Req);

// Decrements leaf counters for all leaves of the record.
void decrementLeafCountersForRecord(MemObjRecord *Record);

// Removes commands that use given MemObjRecord from the graph.
void cleanupCommandsForRecord(MemObjRecord *Record);

Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/config.def
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@
// underscore(__).

CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH)
CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP)
CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)

5 changes: 5 additions & 0 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include <CL/sycl/detail/queue_impl.hpp>
#include <CL/sycl/detail/scheduler/scheduler.hpp>

#include "detail/config.hpp"

#include <chrono>

__SYCL_INLINE namespace cl {
Expand Down Expand Up @@ -100,6 +102,9 @@ void event_impl::wait(
waitInternal();
else if (MCommand)
detail::Scheduler::getInstance().waitForEvent(std::move(Self));
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
detail::Scheduler::getInstance().cleanupFinishedCommands(
static_cast<Command *>(MCommand));
}

void event_impl::wait_and_throw(
Expand Down
65 changes: 59 additions & 6 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,15 @@ void Scheduler::GraphBuilder::UpdateLeaves(const std::set<Command *> &Cmds,
if (ReadOnlyReq)
return;

for (const Command *Cmd : Cmds) {
for (Command *Cmd : Cmds) {
auto NewEnd = std::remove(Record->MReadLeaves.begin(),
Record->MReadLeaves.end(), Cmd);
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MReadLeaves.end());
Record->MReadLeaves.erase(NewEnd, Record->MReadLeaves.end());

NewEnd = std::remove(Record->MWriteLeaves.begin(),
Record->MWriteLeaves.end(), Cmd);
Cmd->MLeafCounter -= std::distance(NewEnd, Record->MWriteLeaves.end());
Record->MWriteLeaves.erase(NewEnd, Record->MWriteLeaves.end());
}
}
Expand All @@ -166,8 +168,10 @@ void Scheduler::GraphBuilder::AddNodeToLeaves(MemObjRecord *Record,
Dep.MDepCommand = OldLeaf;
Cmd->addDep(Dep);
OldLeaf->addUser(Cmd);
--(OldLeaf->MLeafCounter);
}
Leaves.push_back(Cmd);
++(Cmd->MLeafCounter);
}

UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
Expand Down Expand Up @@ -560,6 +564,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(

Record->MAllocaCommands.push_back(AllocaCmd);
Record->MWriteLeaves.push_back(AllocaCmd);
++(AllocaCmd->MLeafCounter);
}
return AllocaCmd;
}
Expand Down Expand Up @@ -633,6 +638,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
return NewCmd.release();
}

void Scheduler::GraphBuilder::decrementLeafCountersForRecord(
MemObjRecord *Record) {
for (Command *Cmd : Record->MReadLeaves) {
--(Cmd->MLeafCounter);
}
for (Command *Cmd : Record->MWriteLeaves) {
--(Cmd->MLeafCounter);
}
}

void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
if (AllocaCommands.empty())
Expand Down Expand Up @@ -683,9 +698,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
for (auto DepCmdIt : ShouldBeUpdated) {
if (!DepCmdIt.second)
continue;
std::vector<Command *> &DepUsers = DepCmdIt.first->MUsers;
DepUsers.erase(std::remove(DepUsers.begin(), DepUsers.end(), Cmd),
DepUsers.end());
DepCmdIt.first->MUsers.erase(Cmd);
}

// If all dependencies have been removed this way, mark the command for
Expand All @@ -702,8 +715,48 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
}
}

void Scheduler::GraphBuilder::cleanupCommands(bool CleanupReleaseCommands) {
// TODO: Implement.
void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
std::queue<Command *> CmdsToVisit({FinishedCmd});
std::set<Command *> Visited;

// Traverse the graph using BFS
while (!CmdsToVisit.empty()) {
Command *Cmd = CmdsToVisit.front();
CmdsToVisit.pop();

if (!Visited.insert(Cmd).second)
continue;

for (const DepDesc &Dep : Cmd->MDeps) {
if (Dep.MDepCommand)
CmdsToVisit.push(Dep.MDepCommand);
}

// Do not clean up the node if it is a leaf for any memory object
if (Cmd->MLeafCounter > 0)
continue;
// Do not clean up allocation commands
Command::CommandType CmdT = Cmd->getType();
if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF)
continue;

for (Command *UserCmd : Cmd->MUsers) {
for (DepDesc &Dep : UserCmd->MDeps) {
// Link the users of the command to the alloca command(s) instead
if (Dep.MDepCommand == Cmd) {
Dep.MDepCommand = Dep.MAllocaCmd;
Dep.MDepCommand->MUsers.insert(UserCmd);
}
}
}
// Update dependency users
for (DepDesc &Dep : Cmd->MDeps) {
Command *DepCmd = Dep.MDepCommand;
DepCmd->MUsers.erase(Cmd);
}
Cmd->getEvent()->setCommand(nullptr);
delete Cmd;
}
}

void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
Expand Down
6 changes: 6 additions & 0 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
GraphProcessor::waitForEvent(std::move(Event));
}

void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) {
std::lock_guard<std::mutex> lock(MGraphLock);
MGraphBuilder.cleanupFinishedCommands(FinishedCmd);
}

void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
std::lock_guard<std::mutex> lock(MGraphLock);

Expand All @@ -125,6 +130,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
// No operations were performed on the mem object
return;
waitForRecordToFinish(Record);
MGraphBuilder.decrementLeafCountersForRecord(Record);
MGraphBuilder.cleanupCommandsForRecord(Record);
MGraphBuilder.removeRecordForMemObj(MemObj);
}
Expand Down
23 changes: 0 additions & 23 deletions sycl/test/scheduler/FakeCommand.hpp

This file was deleted.

92 changes: 92 additions & 0 deletions sycl/test/scheduler/FinishedCmdCleanup.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %t.out
#include <CL/sycl.hpp>

#include <algorithm>
#include <vector>

#include "SchedulerTestUtils.hpp"

using namespace cl::sycl;

// This test checks regular execution graph cleanup at host-device
// synchronization points
int main() {
TestScheduler TS;
queue Queue;
buffer<int, 1> BufA(range<1>(1));
buffer<int, 1> BufB(range<1>(1));
buffer<int, 1> BufC(range<1>(1));
detail::Requirement FakeReqA = getFakeRequirement(BufA);
detail::Requirement FakeReqB = getFakeRequirement(BufB);
detail::Requirement FakeReqC = getFakeRequirement(BufC);
detail::MemObjRecord *RecC =
TS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Queue), &FakeReqC);

// Create a graph and check that all inner nodes have been deleted and
// their users have had the corresponding dependency replaced with a
// dependency on the alloca. The graph should undergo the following
// transformation:
// +---------+ +---------+ +---------++---------+
// | LeafA | <-- | InnerA | | LeafA || LeafB |
// +---------+ +---------+ +---------++---------+
// | | | |
// | | ===> | |
// v v v v
// +---------+ +---------+ +---------++---------+
// | InnerC | | InnerB | | AllocaA || AllocaB |
// +---------+ +---------+ +---------++---------+
// | |
// | |
// v v
// +---------+ +---------+
// | AllocaA | | LeafB |
// +---------+ +---------+
// |
// |
// v
// +---------+
// | AllocaB |
// +---------+
detail::AllocaCommand AllocaA{detail::getSyclObjImpl(Queue), FakeReqA};
detail::AllocaCommand AllocaB{detail::getSyclObjImpl(Queue), FakeReqB};

int NInnerCommandsAlive = 3;
std::function<void()> Callback = [&]() { --NInnerCommandsAlive; };

FakeCommand *InnerC = new FakeCommandWithCallback(
detail::getSyclObjImpl(Queue), FakeReqA, Callback);
addEdge(InnerC, &AllocaA, &AllocaA);

FakeCommand LeafB{detail::getSyclObjImpl(Queue), FakeReqB};
addEdge(&LeafB, &AllocaB, &AllocaB);
TS.AddNodeToLeaves(RecC, &LeafB);

FakeCommand LeafA{detail::getSyclObjImpl(Queue), FakeReqA};
addEdge(&LeafA, InnerC, &AllocaA);
TS.AddNodeToLeaves(RecC, &LeafA);

FakeCommand *InnerB = new FakeCommandWithCallback(
detail::getSyclObjImpl(Queue), FakeReqB, Callback);
addEdge(InnerB, &LeafB, &AllocaB);

FakeCommand *InnerA = new FakeCommandWithCallback(
detail::getSyclObjImpl(Queue), FakeReqA, Callback);
addEdge(InnerA, &LeafA, &AllocaA);
addEdge(InnerA, InnerB, &AllocaB);

TS.cleanupFinishedCommands(InnerA);
TS.removeRecordForMemObj(detail::getSyclObjImpl(BufC).get());

assert(NInnerCommandsAlive == 0);

assert(LeafA.MDeps.size() == 1);
assert(LeafA.MDeps[0].MDepCommand == &AllocaA);
assert(AllocaA.MUsers.size() == 1);
assert(*AllocaA.MUsers.begin() == &LeafA);

assert(LeafB.MDeps.size() == 1);
assert(LeafB.MDeps[0].MDepCommand == &AllocaB);
assert(AllocaB.MUsers.size() == 1);
assert(*AllocaB.MUsers.begin() == &LeafB);
}
Loading