Skip to content

Commit a33b0a9

Browse files
authored
[SYCL][KernelFusion] Ensure queue_impl is not unnecessarily referenced (#10948)
The KernelFusionCmd keeps a references to the queue on which it was submitted. However these commands are also kept on the side in order to ensure event validity and are only reset when start fusion is called again on the same queue. This forces queue_impl and events to be kept alive as the command keeps the last reference until the sycl shutdown procedure. The patch forces 1) the command to reset it pointers to the queue once enqueued and 2) to clean up the KernelFusionCmd attached to the queue to free up events. Note: there is no new test as kernel fusion already cover this and I'm not too sure on how to approach it. I'm happy to try out any suggestion. --------- Signed-off-by: Victor Lomuller <[email protected]>
1 parent 0989c8a commit a33b0a9

File tree

7 files changed

+76
-6
lines changed

7 files changed

+76
-6
lines changed

sycl/source/detail/queue_impl.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,10 @@ pi_native_handle queue_impl::getNative(int32_t &NativeHandleDesc) const {
558558
return Handle;
559559
}
560560

561+
void queue_impl::cleanup_fusion_cmd() {
562+
detail::Scheduler::getInstance().cleanUpCmdFusion(this);
563+
}
564+
561565
bool queue_impl::ext_oneapi_empty() const {
562566
// If we have in-order queue where events are not discarded then just check
563567
// the status of the last event.

sycl/source/detail/queue_impl.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ class queue_impl {
313313
#endif
314314
throw_asynchronous();
315315
if (!MHostQueue) {
316+
cleanup_fusion_cmd();
316317
getPlugin()->call<PiApiKind::piQueueRelease>(MQueues[0]);
317318
}
318319
}
@@ -694,6 +695,9 @@ class queue_impl {
694695
}
695696

696697
protected:
698+
// Hook to the scheduler to clean up any fusion command held on destruction.
699+
void cleanup_fusion_cmd();
700+
697701
// template is needed for proper unit testing
698702
template <typename HandlerType = handler>
699703
void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,

sycl/source/detail/scheduler/commands.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3159,13 +3159,24 @@ pi_int32 KernelFusionCommand::enqueueImp() {
31593159
waitForPreparedHostEvents();
31603160
waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
31613161

3162+
// We need to release the queue here because KernelFusionCommands are
3163+
// held back by the scheduler thus prevent the deallocation of the queue.
3164+
resetQueue();
31623165
return PI_SUCCESS;
31633166
}
31643167

31653168
void KernelFusionCommand::setFusionStatus(FusionStatus Status) {
31663169
MStatus = Status;
31673170
}
31683171

3172+
void KernelFusionCommand::resetQueue() {
3173+
assert(MStatus != FusionStatus::ACTIVE &&
3174+
"Cannot release the queue attached to the KernelFusionCommand if it "
3175+
"is active.");
3176+
MQueue.reset();
3177+
MWorkerQueue.reset();
3178+
}
3179+
31693180
void KernelFusionCommand::emitInstrumentationData() {
31703181
#ifdef XPTI_ENABLE_INSTRUMENTATION
31713182
constexpr uint16_t NotificationTraceType = xpti::trace_node_create;

sycl/source/detail/scheduler/commands.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,11 @@ class KernelFusionCommand : public Command {
727727
/// only be called under the protection of the scheduler write-lock.
728728
void setFusionStatus(FusionStatus Status);
729729

730+
/// Reset the queue. This can be required as the command is held in order
731+
/// to maintain events alive, however this prevent the normal destruction of
732+
/// the queue.
733+
void resetQueue();
734+
730735
bool isActive() const { return MStatus == FusionStatus::ACTIVE; }
731736

732737
bool readyForDeletion() const { return MStatus == FusionStatus::DELETED; }

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG(
948948
// they create any requirement or event dependency on any of the kernels in
949949
// the fusion list, this will lead to cancellation of the fusion in the
950950
// GraphProcessor.
951-
auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
951+
auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
952952
if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) {
953953
auto *FusionCmd = findFusionList(QUniqueID)->second.get();
954954

@@ -1361,7 +1361,14 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
13611361
}
13621362

13631363
void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
1364-
auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1364+
cleanUpCmdFusion(Queue.get());
1365+
auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
1366+
MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
1367+
}
1368+
1369+
void Scheduler::GraphBuilder::cleanUpCmdFusion(
1370+
sycl::detail::queue_impl *Queue) {
1371+
auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue);
13651372
if (isInFusionMode(QUniqueID)) {
13661373
throw sycl::exception{sycl::make_error_code(sycl::errc::invalid),
13671374
"Queue already in fusion mode"};
@@ -1377,7 +1384,6 @@ void Scheduler::GraphBuilder::startFusion(QueueImplPtr Queue) {
13771384
cleanupCommand(OldFusionCmd->second.release());
13781385
MFusionMap.erase(OldFusionCmd);
13791386
}
1380-
MFusionMap.emplace(QUniqueID, std::make_unique<KernelFusionCommand>(Queue));
13811387
}
13821388

13831389
void Scheduler::GraphBuilder::removeNodeFromGraph(
@@ -1416,7 +1422,7 @@ void Scheduler::GraphBuilder::removeNodeFromGraph(
14161422

14171423
void Scheduler::GraphBuilder::cancelFusion(QueueImplPtr Queue,
14181424
std::vector<Command *> &ToEnqueue) {
1419-
auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1425+
auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
14201426
if (!isInFusionMode(QUniqueID)) {
14211427
return;
14221428
}
@@ -1504,7 +1510,7 @@ EventImplPtr
15041510
Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
15051511
std::vector<Command *> &ToEnqueue,
15061512
const property_list &PropList) {
1507-
auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
1513+
auto QUniqueID = std::hash<sycl::detail::queue_impl *>()(Queue.get());
15081514
#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
15091515
if (!isInFusionMode(QUniqueID)) {
15101516
auto InactiveFusionList = findFusionList(QUniqueID);

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,13 +569,22 @@ void Scheduler::cleanupAuxiliaryResources(BlockingT Blocking) {
569569

570570
void Scheduler::startFusion(QueueImplPtr Queue) {
571571
WriteLockT Lock = acquireWriteLock();
572+
WriteLockT FusionMapLock = acquireFusionWriteLock();
572573
MGraphBuilder.startFusion(Queue);
573574
}
574575

576+
void Scheduler::cleanUpCmdFusion(sycl::detail::queue_impl *Queue) {
577+
// No graph lock, we might be called because the graph builder is releasing
578+
// resources.
579+
WriteLockT FusionMapLock = acquireFusionWriteLock();
580+
MGraphBuilder.cleanUpCmdFusion(Queue);
581+
}
582+
575583
void Scheduler::cancelFusion(QueueImplPtr Queue) {
576584
std::vector<Command *> ToEnqueue;
577585
{
578586
WriteLockT Lock = acquireWriteLock();
587+
WriteLockT FusionMapLock = acquireFusionWriteLock();
579588
MGraphBuilder.cancelFusion(Queue, ToEnqueue);
580589
}
581590
enqueueCommandForCG(nullptr, ToEnqueue);
@@ -587,6 +596,7 @@ EventImplPtr Scheduler::completeFusion(QueueImplPtr Queue,
587596
EventImplPtr FusedEvent;
588597
{
589598
WriteLockT Lock = acquireWriteLock();
599+
WriteLockT FusionMapLock = acquireFusionWriteLock();
590600
FusedEvent = MGraphBuilder.completeFusion(Queue, ToEnqueue, PropList);
591601
}
592602
enqueueCommandForCG(nullptr, ToEnqueue);
@@ -595,7 +605,7 @@ EventImplPtr Scheduler::completeFusion(QueueImplPtr Queue,
595605
}
596606

597607
bool Scheduler::isInFusionMode(QueueIdT queue) {
598-
ReadLockT Lock = acquireReadLock();
608+
ReadLockT Lock = acquireFusionReadLock();
599609
return MGraphBuilder.isInFusionMode(queue);
600610
}
601611

sycl/source/detail/scheduler/scheduler.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,8 @@ class Scheduler {
449449

450450
void startFusion(QueueImplPtr Queue);
451451

452+
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue);
453+
452454
void cancelFusion(QueueImplPtr Queue);
453455

454456
EventImplPtr completeFusion(QueueImplPtr Queue, const property_list &);
@@ -488,10 +490,33 @@ class Scheduler {
488490
return Lock;
489491
}
490492

493+
/// Provides exclusive access to std::shared_timed_mutex object with deadlock
494+
/// avoidance to the Fusion map
495+
WriteLockT acquireFusionWriteLock() {
496+
#ifdef _WIN32
497+
WriteLockT Lock(MFusionMapLock, std::defer_lock);
498+
while (!Lock.try_lock_for(std::chrono::milliseconds(10))) {
499+
// Without yield while loop acts like endless while loop and occupies the
500+
// whole CPU when multiple command groups are created in multiple host
501+
// threads
502+
std::this_thread::yield();
503+
}
504+
#else
505+
WriteLockT Lock(MFusionMapLock);
506+
// It is a deadlock on UNIX in implementation of lock and lock_shared, if
507+
// try_lock in the loop above will be executed, so using a single lock here
508+
#endif // _WIN32
509+
return Lock;
510+
}
511+
491512
/// Provides shared access to std::shared_timed_mutex object with deadlock
492513
/// avoidance
493514
ReadLockT acquireReadLock() { return ReadLockT{MGraphLock}; }
494515

516+
/// Provides shared access to std::shared_timed_mutex object with deadlock
517+
/// avoidance to the Fusion map
518+
ReadLockT acquireFusionReadLock() { return ReadLockT{MFusionMapLock}; }
519+
495520
void cleanupCommands(const std::vector<Command *> &Cmds);
496521

497522
void NotifyHostTaskCompletion(Command *Cmd);
@@ -627,6 +652,10 @@ class Scheduler {
627652

628653
void startFusion(QueueImplPtr Queue);
629654

655+
/// Clean up the internal fusion commands held for the given queue.
656+
/// @param Queue the queue for which to remove the fusion commands.
657+
void cleanUpCmdFusion(sycl::detail::queue_impl *Queue);
658+
630659
void cancelFusion(QueueImplPtr Queue, std::vector<Command *> &ToEnqueue);
631660

632661
EventImplPtr completeFusion(QueueImplPtr Queue,
@@ -870,6 +899,7 @@ class Scheduler {
870899

871900
GraphBuilder MGraphBuilder;
872901
RWLockT MGraphLock;
902+
RWLockT MFusionMapLock;
873903

874904
std::vector<Command *> MDeferredCleanupCommands;
875905
std::mutex MDeferredCleanupMutex;

0 commit comments

Comments
 (0)