Skip to content

Commit 7fe4ad5

Browse files
[SYCL][L0] move waiting from piEventRelease to Event->cleanup() (#4290)
1 parent eef0760 commit 7fe4ad5

File tree

2 files changed

+61
-44
lines changed

2 files changed

+61
-44
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 58 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828

2929
#include "usm_allocator.hpp"
3030

31+
extern "C" {
32+
// Forward declarartions.
33+
static pi_result EventRelease(pi_event Event, pi_queue LockedQueue);
34+
static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue);
35+
}
36+
3137
namespace {
3238

3339
// Controls Level Zero calls serialization to w/a Level Zero driver being not MT
@@ -533,6 +539,7 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
533539
if (CommandList != Queue->CommandListMap.end()) {
534540
(*Event)->ZeCommandList = CommandList->first;
535541
CommandList->second.append(*Event);
542+
PI_CALL(piEventRetain(*Event));
536543
} else {
537544
(*Event)->ZeCommandList = nullptr;
538545
}
@@ -548,7 +555,7 @@ createEventAndAssociateQueue(pi_queue Queue, pi_event *Event,
548555
// release a PI event as soon as that's not being waited in the app.
549556
// But we have to ensure that the event is not destroyed before
550557
// it is really signalled, so retain it explicitly here and
551-
// release in cleanupAfterEvent.
558+
// release in Event->cleanup().
552559
//
553560
PI_CALL(piEventRetain(*Event));
554561

@@ -706,7 +713,17 @@ pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
706713
ZE_CALL(zeFenceReset, (CommandList->second.ZeFence));
707714
ZE_CALL(zeCommandListReset, (CommandList->first));
708715
CommandList->second.InUse = false;
709-
CommandList->second.EventList.clear();
716+
717+
// Finally release/cleanup all the events in this command list.
718+
auto &EventList = CommandList->second.EventList;
719+
for (auto &Event : EventList) {
720+
if (!Event->CleanedUp) {
721+
ZE_CALL(zeHostSynchronize, (Event->ZeEvent));
722+
Event->cleanup(this);
723+
}
724+
PI_CALL(EventRelease(Event, this));
725+
}
726+
EventList.clear();
710727

711728
if (MakeAvailable) {
712729
std::lock_guard<std::mutex> lock(this->Context->ZeCommandListCacheMutex);
@@ -2634,13 +2651,20 @@ pi_result piQueueRetain(pi_queue Queue) {
26342651
}
26352652

26362653
pi_result piQueueRelease(pi_queue Queue) {
2654+
return QueueRelease(Queue, nullptr);
2655+
}
2656+
2657+
static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue) {
26372658
PI_ASSERT(Queue, PI_INVALID_QUEUE);
26382659
// We need to use a bool variable here to check the condition that
26392660
// RefCount becomes zero atomically with PiQueueMutex lock.
26402661
// Then, we can release the lock before we remove the Queue below.
26412662
bool RefCountZero = false;
26422663
{
2643-
std::lock_guard<std::mutex> Lock(Queue->PiQueueMutex);
2664+
auto Lock = ((Queue == LockedQueue)
2665+
? std::unique_lock<std::mutex>()
2666+
: std::unique_lock<std::mutex>(Queue->PiQueueMutex));
2667+
26442668
Queue->RefCount--;
26452669
if (Queue->RefCount == 0)
26462670
RefCountZero = true;
@@ -4049,7 +4073,7 @@ pi_result piKernelRelease(pi_kernel Kernel) {
40494073
PI_ASSERT(Kernel, PI_INVALID_KERNEL);
40504074

40514075
if (IndirectAccessTrackingEnabled) {
4052-
// piKernelRelease is called by cleanupAfterEvent as soon as kernel
4076+
// piKernelRelease is called by Event->cleanup() as soon as kernel
40534077
// execution has finished. This is the place where we need to release memory
40544078
// allocations. If kernel is not in use (not submitted by some other thread)
40554079
// then release referenced memory allocations. As a result, memory can be
@@ -4199,7 +4223,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
41994223

42004224
// Use piKernelRetain to increment the reference count and indicate
42014225
// that the Kernel is in use. Once the event has been signalled, the
4202-
// code in cleanupAfterEvent will do a piReleaseKernel to update
4226+
// code in Event.cleanup() will do a piReleaseKernel to update
42034227
// the reference count on the kernel, using the kernel saved
42044228
// in CommandData.
42054229
PI_CALL(piKernelRetain(Kernel));
@@ -4391,7 +4415,7 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
43914415
// Perform any necessary cleanup after an event has been signalled.
43924416
// This currently recycles the associate command list, and also makes
43934417
// sure to release any kernel that may have been used by the event.
4394-
static pi_result cleanupAfterEvent(pi_event Event) {
4418+
pi_result _pi_event::cleanup(pi_queue LockedQueue) {
43954419
// The implementation of this is slightly tricky. The same event
43964420
// can be referred to by multiple threads, so it is possible to
43974421
// have a race condition between the read of fields of the event,
@@ -4401,21 +4425,18 @@ static pi_result cleanupAfterEvent(pi_event Event) {
44014425
// queue to also serve as the thread safety mechanism for the
44024426
// any of the Event's data members that need to be read/reset as
44034427
// part of the cleanup operations.
4404-
auto Queue = Event->Queue;
44054428
if (Queue) {
44064429
// Lock automatically releases when this goes out of scope.
4407-
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
4430+
auto Lock = ((Queue == LockedQueue)
4431+
? std::unique_lock<std::mutex>()
4432+
: std::unique_lock<std::mutex>(Queue->PiQueueMutex));
44084433

4409-
// Cleanup the command list associated with the event if it hasn't
4410-
// been cleaned up already.
4411-
auto EventCommandList = Event->ZeCommandList;
4412-
4413-
if (EventCommandList) {
4434+
if (ZeCommandList) {
44144435
// Event has been signalled: If the fence for the associated command list
44154436
// is signalled, then reset the fence and command list and add them to the
44164437
// available list for reuse in PI calls.
44174438
if (Queue->RefCount > 0) {
4418-
auto it = Queue->CommandListMap.find(EventCommandList);
4439+
auto it = Queue->CommandListMap.find(ZeCommandList);
44194440
if (it == Queue->CommandListMap.end()) {
44204441
die("Missing command-list completition fence");
44214442
}
@@ -4436,42 +4457,41 @@ static pi_result cleanupAfterEvent(pi_event Event) {
44364457
// too, so we might need to add a new command type to differentiate.
44374458
//
44384459
ze_result_t ZeResult =
4439-
(Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_COPY)
4460+
(CommandType == PI_COMMAND_TYPE_MEM_BUFFER_COPY)
44404461
? ZE_CALL_NOCHECK(zeHostSynchronize, (it->second.ZeFence))
44414462
: ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
44424463

44434464
if (ZeResult == ZE_RESULT_SUCCESS) {
44444465
Queue->resetCommandList(it, true);
4445-
Event->ZeCommandList = nullptr;
4466+
ZeCommandList = nullptr;
44464467
}
44474468
}
44484469
}
44494470
}
44504471

44514472
// Release the kernel associated with this event if there is one.
4452-
if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
4453-
Event->CommandData) {
4454-
PI_CALL(piKernelRelease(pi_cast<pi_kernel>(Event->CommandData)));
4455-
Event->CommandData = nullptr;
4473+
if (CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && CommandData) {
4474+
PI_CALL(piKernelRelease(pi_cast<pi_kernel>(CommandData)));
4475+
CommandData = nullptr;
44564476
}
44574477

44584478
// If this event was the LastCommandEvent in the queue, being used
44594479
// to make sure that commands were executed in-order, remove this.
44604480
// If we don't do this, the event can get released and freed leaving
44614481
// a dangling pointer to this event. It could also cause unneeded
44624482
// already finished events to show up in the wait list.
4463-
if (Queue->LastCommandEvent == Event) {
4483+
if (Queue->LastCommandEvent == this) {
44644484
Queue->LastCommandEvent = nullptr;
44654485
}
44664486
}
44674487

4468-
if (!Event->CleanedUp) {
4469-
Event->CleanedUp = true;
4488+
if (!CleanedUp) {
4489+
CleanedUp = true;
44704490
// Release this event since we explicitly retained it on creation.
44714491
// NOTE: that this needs to be done only once for an event so
44724492
// this is guarded with the CleanedUp flag.
44734493
//
4474-
PI_CALL(piEventRelease(Event));
4494+
PI_CALL(EventRelease(this, LockedQueue));
44754495
}
44764496

44774497
// Make a list of all the dependent events that must have signalled
@@ -4484,8 +4504,7 @@ static pi_result cleanupAfterEvent(pi_event Event) {
44844504

44854505
std::list<pi_event> EventsToBeReleased;
44864506

4487-
Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList(
4488-
EventsToBeReleased);
4507+
WaitList.collectEventsForReleaseAndDestroyPiZeEventList(EventsToBeReleased);
44894508

44904509
while (!EventsToBeReleased.empty()) {
44914510
pi_event DepEvent = EventsToBeReleased.front();
@@ -4499,14 +4518,18 @@ static pi_result cleanupAfterEvent(pi_event Event) {
44994518
// twice, so it is safe. Lock automatically releases when this goes out of
45004519
// scope.
45014520
// TODO: this code needs to be moved out of the guard.
4502-
std::lock_guard<std::mutex> lock(DepEvent->Queue->PiQueueMutex);
4521+
auto Lock =
4522+
((DepEvent->Queue == LockedQueue)
4523+
? std::unique_lock<std::mutex>()
4524+
: std::unique_lock<std::mutex>(DepEvent->Queue->PiQueueMutex));
4525+
45034526
if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL &&
45044527
DepEvent->CommandData) {
45054528
PI_CALL(piKernelRelease(pi_cast<pi_kernel>(DepEvent->CommandData)));
45064529
DepEvent->CommandData = nullptr;
45074530
}
45084531
}
4509-
PI_CALL(piEventRelease(DepEvent));
4532+
PI_CALL(EventRelease(DepEvent, LockedQueue));
45104533
}
45114534

45124535
return PI_SUCCESS;
@@ -4539,7 +4562,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
45394562

45404563
// NOTE: we are cleaning up after the event here to free resources
45414564
// sooner in case run-time is not calling piEventRelease soon enough.
4542-
cleanupAfterEvent(EventList[I]);
4565+
EventList[I]->cleanup();
45434566
}
45444567

45454568
return PI_SUCCESS;
@@ -4571,26 +4594,18 @@ pi_result piEventRetain(pi_event Event) {
45714594
}
45724595

45734596
pi_result piEventRelease(pi_event Event) {
4597+
return EventRelease(Event, nullptr);
4598+
}
4599+
4600+
static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
45744601
PI_ASSERT(Event, PI_INVALID_EVENT);
45754602
if (!Event->RefCount) {
45764603
die("piEventRelease: called on a destroyed event");
45774604
}
45784605

4579-
// The event is no longer needed upstream, but we have to wait for its
4580-
// completion in order to do proper cleanup. Otherwise refcount may still be
4581-
// non-zero in the check below and we will get event leak.
4582-
//
4583-
// TODO: in case this potentially "early" wait causes performance problems,
4584-
// e.g. due to closing a batch too early, or blocking the host for no good
4585-
// reason, then we should look into moving the wait down to queue release
4586-
// (will need to remember all events in the queue for that).
4587-
//
4588-
if (!Event->CleanedUp)
4589-
PI_CALL(piEventsWait(1, &Event));
4590-
45914606
if (--(Event->RefCount) == 0) {
45924607
if (!Event->CleanedUp)
4593-
cleanupAfterEvent(Event);
4608+
Event->cleanup(LockedQueue);
45944609

45954610
if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
45964611
Event->CommandData) {
@@ -4617,7 +4632,7 @@ pi_result piEventRelease(pi_event Event) {
46174632
// pi_event is released. Here we have to decrement it so pi_queue
46184633
// can be released successfully.
46194634
if (Event->Queue) {
4620-
PI_CALL(piQueueRelease(Event->Queue));
4635+
PI_CALL(QueueRelease(Event->Queue, LockedQueue));
46214636
}
46224637
delete Event;
46234638
}

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -868,7 +868,9 @@ struct _pi_event : _pi_object {
868868
// This list must be destroyed once the event has signalled.
869869
_pi_ze_event_list_t WaitList;
870870

871-
// Tracks if the needed cleanupAfterEvent was already performed for
871+
// Performs the cleanup of a completed event.
872+
pi_result cleanup(pi_queue LockedQueue = nullptr);
873+
// Tracks if the needed cleanup was already performed for
872874
// a completed event. This allows to control that some cleanup
873875
// actions are performed only once.
874876
//

0 commit comments

Comments
 (0)