Skip to content

[SYCL] Fix memory leak in reduction resources #5653

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ class CGExecKernel : public CG {
std::string MKernelName;
detail::OSModuleHandle MOSModuleHandle;
std::vector<std::shared_ptr<detail::stream_impl>> MStreams;
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;

CGExecKernel(NDRDescT NDRDesc, std::unique_ptr<HostKernelBase> HKernel,
std::shared_ptr<detail::kernel_impl> SyclKernel,
Expand All @@ -259,14 +260,16 @@ class CGExecKernel : public CG {
std::vector<ArgDesc> Args, std::string KernelName,
detail::OSModuleHandle OSModuleHandle,
std::vector<std::shared_ptr<detail::stream_impl>> Streams,
std::vector<std::shared_ptr<const void>> AuxiliaryResources,
CGTYPE Type, detail::code_location loc = {})
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
std::move(SharedPtrStorage), std::move(Requirements),
std::move(Events), std::move(loc)),
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)),
MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle),
MStreams(std::move(Streams)) {
MStreams(std::move(Streams)),
MAuxiliaryResources(std::move(AuxiliaryResources)) {
assert((getType() == RunOnHostIntel || getType() == Kernel) &&
"Wrong type of exec kernel CG.");
}
Expand All @@ -277,6 +280,10 @@ class CGExecKernel : public CG {
return MStreams;
}

std::vector<std::shared_ptr<const void>> getAuxiliaryResources() const {
return MAuxiliaryResources;
}

std::shared_ptr<detail::kernel_bundle_impl> getKernelBundle() {
const std::shared_ptr<std::vector<ExtendedMemberT>> &ExtendedMembers =
getExtendedMembers();
Expand All @@ -291,6 +298,9 @@ class CGExecKernel : public CG {

void clearStreams() { MStreams.clear(); }
bool hasStreams() { return !MStreams.empty(); }

void clearAuxiliaryResources() { MAuxiliaryResources.clear(); }
bool hasAuxiliaryResources() { return !MAuxiliaryResources.empty(); }
};

/// "Copy memory" command group class.
Expand Down
6 changes: 2 additions & 4 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,12 +472,9 @@ class __SYCL_EXPORT handler {
/// Saves buffers created by handling reduction feature in handler.
/// They are then forwarded to command group and destroyed only after
/// the command group finishes the work on device/host.
/// The 'MSharedPtrStorage' suits that need.
///
/// @param ReduObj is a pointer to object that must be stored.
void addReduction(const std::shared_ptr<const void> &ReduObj) {
MSharedPtrStorage.push_back(ReduObj);
}
void addReduction(const std::shared_ptr<const void> &ReduObj);

~handler() = default;

Expand Down Expand Up @@ -1280,6 +1277,7 @@ class __SYCL_EXPORT handler {
}

std::shared_ptr<detail::handler_impl> getHandlerImpl() const;
std::shared_ptr<detail::handler_impl> evictHandlerImpl() const;

void setStateExplicitKernelBundle();
void setStateSpecConstSet();
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/sycl/ext/oneapi/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,7 @@ class reduction_impl : private reduction_impl_base {
auto RWReduVal = std::make_shared<T>(MIdentity);
CGH.addReduction(RWReduVal);
MOutBufPtr = std::make_shared<buffer<T, 1>>(RWReduVal.get(), range<1>(1));
MOutBufPtr->set_final_data();
CGH.addReduction(MOutBufPtr);
return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
}
Expand All @@ -728,6 +729,7 @@ class reduction_impl : private reduction_impl_base {
auto CounterMem = std::make_shared<int>(0);
CGH.addReduction(CounterMem);
auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
CounterBuf->set_final_data();
CGH.addReduction(CounterBuf);
return {*CounterBuf, CGH};
}
Expand Down
3 changes: 3 additions & 0 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ class handler_impl {
/// equal to the queue associated with the handler if the corresponding
/// submission is a fallback from a previous submission.
std::shared_ptr<queue_impl> MSubmissionSecondaryQueue;

// Stores auxiliary resources used by internal operations.
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
};

} // namespace detail
Expand Down
20 changes: 18 additions & 2 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1378,11 +1378,23 @@ std::vector<StreamImplPtr> ExecCGCommand::getStreams() const {
return {};
}

std::vector<std::shared_ptr<const void>>
ExecCGCommand::getAuxiliaryResources() const {
if (MCommandGroup->getType() == CG::Kernel)
return ((CGExecKernel *)MCommandGroup.get())->getAuxiliaryResources();
return {};
}

void ExecCGCommand::clearStreams() {
if (MCommandGroup->getType() == CG::Kernel)
((CGExecKernel *)MCommandGroup.get())->clearStreams();
}

void ExecCGCommand::clearAuxiliaryResources() {
if (MCommandGroup->getType() == CG::Kernel)
((CGExecKernel *)MCommandGroup.get())->clearAuxiliaryResources();
}

cl_int UpdateHostRequirementCommand::enqueueImp() {
waitForPreparedHostEvents();
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
Expand Down Expand Up @@ -1673,7 +1685,9 @@ ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
static_cast<detail::CGHostTask *>(MCommandGroup.get())->MQueue;
MEvent->setNeedsCleanupAfterWait(true);
} else if (MCommandGroup->getType() == CG::CGTYPE::Kernel &&
(static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams())
(static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() ||
static_cast<CGExecKernel *>(MCommandGroup.get())
->hasAuxiliaryResources()))
MEvent->setNeedsCleanupAfterWait(true);

emitInstrumentationDataProxy();
Expand Down Expand Up @@ -2481,7 +2495,9 @@ bool ExecCGCommand::supportsPostEnqueueCleanup() const {
return Command::supportsPostEnqueueCleanup() &&
(MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask) &&
(MCommandGroup->getType() != CG::CGTYPE::Kernel ||
!(static_cast<CGExecKernel *>(MCommandGroup.get()))->hasStreams());
(!static_cast<CGExecKernel *>(MCommandGroup.get())->hasStreams() &&
!static_cast<CGExecKernel *>(MCommandGroup.get())
->hasAuxiliaryResources()));
}

} // namespace detail
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,8 +541,10 @@ class ExecCGCommand : public Command {
ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue);

std::vector<StreamImplPtr> getStreams() const;
std::vector<std::shared_ptr<const void>> getAuxiliaryResources() const;

void clearStreams();
void clearAuxiliaryResources();

void printDot(std::ostream &Stream) const final;
void emitInstrumentationData() final;
Expand Down
25 changes: 23 additions & 2 deletions sycl/source/detail/scheduler/graph_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,8 @@ void Scheduler::GraphBuilder::decrementLeafCountersForRecord(

void Scheduler::GraphBuilder::cleanupCommandsForRecord(
MemObjRecord *Record,
std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate,
std::vector<std::shared_ptr<const void>> &AuxResourcesToDeallocate) {
std::vector<AllocaCommandBase *> &AllocaCommands = Record->MAllocaCommands;
if (AllocaCommands.empty())
return;
Expand Down Expand Up @@ -1097,10 +1098,19 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(
// Collect stream objects for a visited command.
if (Cmd->getType() == Command::CommandType::RUN_CG) {
auto ExecCmd = static_cast<ExecCGCommand *>(Cmd);

// Transfer ownership of stream implementations.
std::vector<std::shared_ptr<stream_impl>> Streams = ExecCmd->getStreams();
ExecCmd->clearStreams();
StreamsToDeallocate.insert(StreamsToDeallocate.end(), Streams.begin(),
Streams.end());

// Transfer ownership of auxiliary resources.
std::vector<std::shared_ptr<const void>> AuxResources =
ExecCmd->getAuxiliaryResources();
ExecCmd->clearAuxiliaryResources();
AuxResourcesToDeallocate.insert(AuxResourcesToDeallocate.end(),
AuxResources.begin(), AuxResources.end());
}

for (Command *UserCmd : Cmd->MUsers)
Expand Down Expand Up @@ -1160,6 +1170,7 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) {
if (ExecCGCmd->getCG().getType() == CG::CGTYPE::Kernel) {
auto *ExecKernelCG = static_cast<CGExecKernel *>(&ExecCGCmd->getCG());
assert(!ExecKernelCG->hasStreams());
assert(!ExecKernelCG->hasAuxiliaryResources());
}
}
#endif
Expand Down Expand Up @@ -1191,7 +1202,8 @@ void Scheduler::GraphBuilder::cleanupCommand(Command *Cmd) {

void Scheduler::GraphBuilder::cleanupFinishedCommands(
Command *FinishedCmd,
std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate) {
std::vector<std::shared_ptr<stream_impl>> &StreamsToDeallocate,
std::vector<std::shared_ptr<const void>> &AuxResourcesToDeallocate) {
assert(MCmdsToVisit.empty());
MCmdsToVisit.push(FinishedCmd);
MVisitedCmds.clear();
Expand All @@ -1207,10 +1219,19 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(
// Collect stream objects for a visited command.
if (Cmd->getType() == Command::CommandType::RUN_CG) {
auto ExecCmd = static_cast<ExecCGCommand *>(Cmd);

// Transfer ownership of stream implementations.
std::vector<std::shared_ptr<stream_impl>> Streams = ExecCmd->getStreams();
ExecCmd->clearStreams();
StreamsToDeallocate.insert(StreamsToDeallocate.end(), Streams.begin(),
Streams.end());

// Transfer ownership of auxiliary resources.
std::vector<std::shared_ptr<const void>> AuxResources =
ExecCmd->getAuxiliaryResources();
ExecCmd->clearAuxiliaryResources();
AuxResourcesToDeallocate.insert(AuxResourcesToDeallocate.end(),
AuxResources.begin(), AuxResources.end());
}

for (const DepDesc &Dep : Cmd->MDeps) {
Expand Down
16 changes: 14 additions & 2 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,11 @@ void Scheduler::cleanupFinishedCommands(EventImplPtr FinishedEvent) {
// objects, this is needed to guarantee that streamed data is printed and
// resources are released.
std::vector<std::shared_ptr<stream_impl>> StreamsToDeallocate;
// Similar to streams, we also collect the auxiliary resources used by the
// commands. Cleanup will make sure the commands do not own the resources
// anymore, so we just need them to survive the graph lock then they can die
// as they go out of scope.
std::vector<std::shared_ptr<const void>> AuxResourcesToDeallocate;
{
// Avoiding deadlock situation, where one thread is in the process of
// enqueueing (with a locked mutex) a currently blocked task that waits for
Expand All @@ -249,7 +254,8 @@ void Scheduler::cleanupFinishedCommands(EventImplPtr FinishedEvent) {
// The command might have been cleaned up (and set to nullptr) by another
// thread
if (FinishedCmd)
MGraphBuilder.cleanupFinishedCommands(FinishedCmd, StreamsToDeallocate);
MGraphBuilder.cleanupFinishedCommands(FinishedCmd, StreamsToDeallocate,
AuxResourcesToDeallocate);
}
}
deallocateStreams(StreamsToDeallocate);
Expand All @@ -261,6 +267,11 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
// objects, this is needed to guarantee that streamed data is printed and
// resources are released.
std::vector<std::shared_ptr<stream_impl>> StreamsToDeallocate;
// Similar to streams, we also collect the auxiliary resources used by the
// commands. Cleanup will make sure the commands do not own the resources
// anymore, so we just need them to survive the graph lock then they can die
// as they go out of scope.
std::vector<std::shared_ptr<const void>> AuxResourcesToDeallocate;

{
MemObjRecord *Record = nullptr;
Expand All @@ -282,7 +293,8 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
WriteLockT Lock(MGraphLock, std::defer_lock);
acquireWriteLock(Lock);
MGraphBuilder.decrementLeafCountersForRecord(Record);
MGraphBuilder.cleanupCommandsForRecord(Record, StreamsToDeallocate);
MGraphBuilder.cleanupCommandsForRecord(Record, StreamsToDeallocate,
AuxResourcesToDeallocate);
MGraphBuilder.removeRecordForMemObj(MemObj);
}
}
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,8 @@ class Scheduler {
/// (assuming that all its commands have been waited for).
void cleanupFinishedCommands(
Command *FinishedCmd,
std::vector<std::shared_ptr<cl::sycl::detail::stream_impl>> &);
std::vector<std::shared_ptr<cl::sycl::detail::stream_impl>> &,
std::vector<std::shared_ptr<const void>> &);

/// Reschedules the command passed using Queue provided.
///
Expand All @@ -540,7 +541,8 @@ class Scheduler {
/// Removes commands that use the given MemObjRecord from the graph.
void cleanupCommandsForRecord(
MemObjRecord *Record,
std::vector<std::shared_ptr<cl::sycl::detail::stream_impl>> &);
std::vector<std::shared_ptr<cl::sycl::detail::stream_impl>> &,
std::vector<std::shared_ptr<const void>> &);

/// Removes the MemObjRecord for the memory object passed.
void removeRecordForMemObj(SYCLMemObjI *MemObject);
Expand Down
49 changes: 37 additions & 12 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,40 @@ handler::handler(std::shared_ptr<detail::queue_impl> Queue,
MSharedPtrStorage.push_back(std::move(ExtendedMembers));
}

static detail::ExtendedMemberT &getHandlerImplMember(
std::vector<std::shared_ptr<const void>> &SharedPtrStorage) {
assert(!SharedPtrStorage.empty());
std::shared_ptr<std::vector<detail::ExtendedMemberT>> ExtendedMembersVec =
detail::convertToExtendedMembers(SharedPtrStorage[0]);
assert(ExtendedMembersVec->size() > 0);
auto &HandlerImplMember = (*ExtendedMembersVec)[0];
assert(detail::ExtendedMembersType::HANDLER_IMPL == HandlerImplMember.MType);
return HandlerImplMember;
}

/// Gets the handler_impl at the start of the extended members.
std::shared_ptr<detail::handler_impl> handler::getHandlerImpl() const {
std::lock_guard<std::mutex> Lock(
detail::GlobalHandler::instance().getHandlerExtendedMembersMutex());
return std::static_pointer_cast<detail::handler_impl>(
getHandlerImplMember(MSharedPtrStorage).MData);
}

assert(!MSharedPtrStorage.empty());

std::shared_ptr<std::vector<detail::ExtendedMemberT>> ExtendedMembersVec =
detail::convertToExtendedMembers(MSharedPtrStorage[0]);

assert(ExtendedMembersVec->size() > 0);

auto HandlerImplMember = (*ExtendedMembersVec)[0];
/// Gets the handler_impl at the start of the extended members and removes it.
std::shared_ptr<detail::handler_impl> handler::evictHandlerImpl() const {
std::lock_guard<std::mutex> Lock(
detail::GlobalHandler::instance().getHandlerExtendedMembersMutex());
auto &HandlerImplMember = getHandlerImplMember(MSharedPtrStorage);
auto Impl =
std::static_pointer_cast<detail::handler_impl>(HandlerImplMember.MData);

assert(detail::ExtendedMembersType::HANDLER_IMPL == HandlerImplMember.MType);
// Reset the data of the member.
// NOTE: We let it stay because removing the front can be expensive. This will
// be improved when the impl is made a member of handler. In fact eviction is
// likely to not be needed when that happens.
HandlerImplMember.MData.reset();

return std::static_pointer_cast<detail::handler_impl>(
HandlerImplMember.MData);
return Impl;
}

// Sets the submission state to indicate that an explicit kernel bundle has been
Expand Down Expand Up @@ -281,6 +297,10 @@ event handler::finalize() {
return MLastEvent;
}

// Evict handler_impl from extended members to make sure the command group
// does not keep it alive.
std::shared_ptr<detail::handler_impl> Impl = evictHandlerImpl();

std::unique_ptr<detail::CG> CommandGroup;
switch (type) {
case detail::CG::Kernel:
Expand All @@ -293,7 +313,8 @@ event handler::finalize() {
std::move(MArgsStorage), std::move(MAccStorage),
std::move(MSharedPtrStorage), std::move(MRequirements),
std::move(MEvents), std::move(MArgs), MKernelName, MOSModuleHandle,
std::move(MStreamStorage), MCGType, MCodeLoc));
std::move(MStreamStorage), std::move(Impl->MAuxiliaryResources),
MCGType, MCodeLoc));
break;
}
case detail::CG::CodeplayInteropTask:
Expand Down Expand Up @@ -382,6 +403,10 @@ event handler::finalize() {
return MLastEvent;
}

void handler::addReduction(const std::shared_ptr<const void> &ReduObj) {
getHandlerImpl()->MAuxiliaryResources.push_back(ReduObj);
}

void handler::associateWithHandler(detail::AccessorBaseHost *AccBase,
access::target AccTarget) {
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
Expand Down
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3994,6 +3994,7 @@ _ZN2cl4sycl7handler10depends_onERKSt6vectorINS0_5eventESaIS3_EE
_ZN2cl4sycl7handler10mem_adviseEPKvmi
_ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
_ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb
_ZN2cl4sycl7handler12addReductionERKSt10shared_ptrIKvE
_ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
_ZN2cl4sycl7handler17use_kernel_bundleERKNS0_13kernel_bundleILNS0_12bundle_stateE2EEE
_ZN2cl4sycl7handler18RangeRoundingTraceEv
Expand Down Expand Up @@ -4390,6 +4391,7 @@ _ZNK2cl4sycl7context8get_infoILNS0_4info7contextE65552EEENS3_12param_traitsIS4_X
_ZNK2cl4sycl7context8get_infoILNS0_4info7contextE65553EEENS3_12param_traitsIS4_XT_EE11return_typeEv
_ZNK2cl4sycl7context9getNativeEv
_ZNK2cl4sycl7handler14getHandlerImplEv
_ZNK2cl4sycl7handler16evictHandlerImplEv
_ZNK2cl4sycl7handler27isStateExplicitKernelBundleEv
_ZNK2cl4sycl7handler30getOrInsertHandlerKernelBundleEb
_ZNK2cl4sycl7program10get_kernelENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -1754,6 +1754,7 @@
?erfc@__host_std@cl@@YA?AVhalf@half_impl@detail@sycl@2@V34562@@Z
?erfc@__host_std@cl@@YAMM@Z
?erfc@__host_std@cl@@YANN@Z
?evictHandlerImpl@handler@sycl@cl@@AEBA?AV?$shared_ptr@Vhandler_impl@detail@sycl@cl@@@std@@XZ
?exp10@__host_std@cl@@YA?AV?$vec@M$00@sycl@2@V342@@Z
?exp10@__host_std@cl@@YA?AV?$vec@M$01@sycl@2@V342@@Z
?exp10@__host_std@cl@@YA?AV?$vec@M$02@sycl@2@V342@@Z
Expand Down
Loading