Skip to content

Commit 46e3c64

Browse files
[SYCL] Keep track of signaled fences to properly recycle associated command-lists. (#3215)
This fixes the leak of "command-list"s that had their associated fences not yet signaled at the time all events in that list were already completed (sporadic timing issue). In this case the fence remained non-reset and it's command-list is not returned to the command-list cache. The solution is to track pending fences and reset those still in-use by the time of the queue destruction. Signed-off-by: Sergey V Maslov <[email protected]>
1 parent bdfad9e commit 46e3c64

File tree

2 files changed

+58
-27
lines changed

2 files changed

+58
-27
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -521,17 +521,19 @@ pi_result _pi_context::finalize() {
521521
return PI_SUCCESS;
522522
}
523523

524-
pi_result
525-
_pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
526-
bool MakeAvailable) {
527-
// Event has been signalled: If the fence for the associated command list
528-
// is signalled, then reset the fence and command list and add them to the
529-
// available list for ruse in PI calls.
530-
ZE_CALL(zeFenceReset(this->ZeCommandListFenceMap[ZeCommandList]));
531-
ZE_CALL(zeCommandListReset(ZeCommandList));
524+
pi_result _pi_queue::resetCommandListFenceEntry(
525+
_pi_queue::command_list_fence_map_t::value_type &MapEntry,
526+
bool MakeAvailable) {
527+
// Fence had been signalled meaning the associated command-list completed.
528+
// Reset the fence and put the command list into a cache for reuse in PI
529+
// calls.
530+
ZE_CALL(zeFenceReset(MapEntry.second.ZeFence));
531+
ZE_CALL(zeCommandListReset(MapEntry.first));
532+
MapEntry.second.InUse = false;
533+
532534
if (MakeAvailable) {
533535
std::lock_guard<std::mutex> lock(this->Context->ZeCommandListCacheMutex);
534-
this->Context->ZeCommandListCache.push_back(ZeCommandList);
536+
this->Context->ZeCommandListCache.push_back(MapEntry.first);
535537
}
536538

537539
return PI_SUCCESS;
@@ -597,14 +599,15 @@ pi_result _pi_context::getAvailableCommandList(
597599
*ZeCommandList = Queue->Context->ZeCommandListCache.front();
598600
auto it = Queue->ZeCommandListFenceMap.find(*ZeCommandList);
599601
if (it != Queue->ZeCommandListFenceMap.end()) {
600-
*ZeFence = it->second;
602+
*ZeFence = it->second.ZeFence;
603+
it->second.InUse = true;
601604
} else {
602605
// If there is a command list available on this device, but no
603606
// fence yet associated, then we must create a fence/list
604607
// reference for this Queue. This can happen if two Queues reuse
605608
// a device which did not have the resources freed.
606609
ZE_CALL(zeFenceCreate(Queue->ZeCommandQueue, &ZeFenceDesc, ZeFence));
607-
Queue->ZeCommandListFenceMap[*ZeCommandList] = *ZeFence;
610+
Queue->ZeCommandListFenceMap[*ZeCommandList] = {*ZeFence, true};
608611
}
609612
Queue->Context->ZeCommandListCache.pop_front();
610613
return PI_SUCCESS;
@@ -617,12 +620,14 @@ pi_result _pi_context::getAvailableCommandList(
617620
// if a command list has completed dispatch of its commands and is ready for
618621
// reuse. If a command list is found to have been signalled, then the
619622
// command list & fence are reset and we return.
620-
for (const auto &MapEntry : Queue->ZeCommandListFenceMap) {
621-
ze_result_t ZeResult = ZE_CALL_NOCHECK(zeFenceQueryStatus(MapEntry.second));
623+
for (auto &MapEntry : Queue->ZeCommandListFenceMap) {
624+
ze_result_t ZeResult =
625+
ZE_CALL_NOCHECK(zeFenceQueryStatus(MapEntry.second.ZeFence));
622626
if (ZeResult == ZE_RESULT_SUCCESS) {
623-
Queue->resetCommandListFenceEntry(MapEntry.first, false);
627+
Queue->resetCommandListFenceEntry(MapEntry, false);
624628
*ZeCommandList = MapEntry.first;
625-
*ZeFence = MapEntry.second;
629+
*ZeFence = MapEntry.second.ZeFence;
630+
MapEntry.second.InUse = true;
626631
return PI_SUCCESS;
627632
}
628633
}
@@ -642,8 +647,8 @@ pi_result _pi_context::getAvailableCommandList(
642647
Queue->Device->Platform->ZeGlobalCommandListCount++;
643648
ZE_CALL(zeFenceCreate(Queue->ZeCommandQueue, &ZeFenceDesc, ZeFence));
644649
Queue->ZeCommandListFenceMap.insert(
645-
std::pair<ze_command_list_handle_t, ze_fence_handle_t>(*ZeCommandList,
646-
*ZeFence));
650+
std::pair<ze_command_list_handle_t, _pi_queue::command_list_fence_t>(
651+
*ZeCommandList, {*ZeFence, false}));
647652
pi_result = PI_SUCCESS;
648653
}
649654

@@ -2161,14 +2166,21 @@ pi_result piQueueRelease(pi_queue Queue) {
21612166

21622167
if (RefCountZero) {
21632168
// It is possible to get to here and still have an open command list
2164-
// if no wait or finish ever occurred for this queue. But still need
2165-
// // TODO: o make sure commands get executed.
2169+
// if no wait or finish ever occurred for this queue.
21662170
if (auto Res = Queue->executeOpenCommandList())
21672171
return Res;
21682172

2173+
// Make sure all commands get executed.
2174+
zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT64_MAX);
2175+
21692176
// Destroy all the fences created associated with this queue.
2170-
for (const auto &MapEntry : Queue->ZeCommandListFenceMap) {
2171-
ZE_CALL(zeFenceDestroy(MapEntry.second));
2177+
for (auto &MapEntry : Queue->ZeCommandListFenceMap) {
2178+
// This fence wasn't yet signalled when we polled it for recycling
2179+
// the command-list, so need to release the command-list too.
2180+
if (MapEntry.second.InUse) {
2181+
Queue->resetCommandListFenceEntry(MapEntry, true);
2182+
}
2183+
ZE_CALL(zeFenceDestroy(MapEntry.second.ZeFence));
21722184
}
21732185
Queue->ZeCommandListFenceMap.clear();
21742186
ZE_CALL(zeCommandQueueDestroy(Queue->ZeCommandQueue));
@@ -3793,10 +3805,14 @@ static pi_result cleanupAfterEvent(pi_event Event) {
37933805
// is signalled, then reset the fence and command list and add them to the
37943806
// available list for reuse in PI calls.
37953807
if (Queue->RefCount > 0) {
3796-
ze_result_t ZeResult = ZE_CALL_NOCHECK(
3797-
zeFenceQueryStatus(Queue->ZeCommandListFenceMap[EventCommandList]));
3808+
auto it = Queue->ZeCommandListFenceMap.find(EventCommandList);
3809+
if (it == Queue->ZeCommandListFenceMap.end()) {
3810+
die("Missing command-list completition fence");
3811+
}
3812+
ze_result_t ZeResult =
3813+
ZE_CALL_NOCHECK(zeFenceQueryStatus(it->second.ZeFence));
37983814
if (ZeResult == ZE_RESULT_SUCCESS) {
3799-
Queue->resetCommandListFenceEntry(EventCommandList, true);
3815+
Queue->resetCommandListFenceEntry(*it, true);
38003816
Event->ZeCommandList = nullptr;
38013817
}
38023818
}

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,9 +335,24 @@ struct _pi_queue : _pi_object {
335335
pi_uint32 NumTimesClosedEarly = {0};
336336
pi_uint32 NumTimesClosedFull = {0};
337337

338+
// Structure describing the fence used to track command-list completion.
339+
typedef struct {
340+
// The Level-Zero fence that will be signalled at completion.
341+
ze_fence_handle_t ZeFence;
342+
// Record if the fence is in use by any command-list.
343+
// This is needed to avoid leak of the tracked command-list if the fence
344+
// was not yet signaled at the time all events in that list were already
345+
// completed (we are polling the fence at events completion). The fence
346+
// may be still "in-use" due to sporadic delay in HW.
347+
//
348+
bool InUse;
349+
} command_list_fence_t;
350+
338351
// Map of all Command lists created with their associated Fence used for
339352
// tracking when the command list is available for use again.
340-
std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
353+
typedef std::map<ze_command_list_handle_t, command_list_fence_t>
354+
command_list_fence_map_t;
355+
command_list_fence_map_t ZeCommandListFenceMap;
341356

342357
// Returns true if any commands for this queue are allowed to
343358
// be batched together.
@@ -356,8 +371,8 @@ struct _pi_queue : _pi_object {
356371
// If the reset command list should be made available, then MakeAvailable
357372
// needs to be set to true. The caller must verify that this command list and
358373
// fence have been signalled.
359-
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
360-
bool MakeAvailable);
374+
pi_result resetCommandListFenceEntry(
375+
command_list_fence_map_t::value_type &ZeCommandList, bool MakeAvailable);
361376

362377
// Attach a command list to this queue, close, and execute it.
363378
// Note that this command list cannot be appended to after this.

0 commit comments

Comments
 (0)