Skip to content

Commit 12b7c2b

Browse files
nrspruit KornevNikita
authored and committed
[UR][L0] Fix L0 teardown checks for stability (#17818)
- Address the race conditions in L0 loader teardown timing so that L0 teardown is verified before handle destruction in all cases, using an L0 loader API to verify stability. --------- Signed-off-by: Neil R. Spruit <[email protected]>
1 parent 4fdd7e8 commit 12b7c2b

24 files changed

+143
-168
lines changed

unified-runtime/cmake/FetchLevelZero.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
4343
set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
4444
endif()
4545
if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "")
46-
set(UR_LEVEL_ZERO_LOADER_TAG v1.21.1)
46+
set(UR_LEVEL_ZERO_LOADER_TAG ecfe375b30cc04265b20ac1b7996a85d0910f3ed)
4747
endif()
4848

4949
# Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104

unified-runtime/source/adapters/level_zero/command_buffer.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -445,16 +445,16 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
445445

446446
// Release the memory allocated to the CommandList stored in the
447447
// command_buffer
448-
if (ZeComputeCommandList) {
448+
if (ZeComputeCommandList && checkL0LoaderTeardown()) {
449449
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList));
450450
}
451-
if (useCopyEngine() && ZeCopyCommandList) {
451+
if (useCopyEngine() && ZeCopyCommandList && checkL0LoaderTeardown()) {
452452
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList));
453453
}
454454

455455
// Release the memory allocated to the CommandListResetEvents stored in the
456456
// command_buffer
457-
if (ZeCommandListResetEvents) {
457+
if (ZeCommandListResetEvents && checkL0LoaderTeardown()) {
458458
ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents));
459459
}
460460

@@ -502,7 +502,9 @@ void ur_exp_command_buffer_handle_t_::cleanupCommandBufferResources() {
502502
// Release fences allocated to command-buffer
503503
for (auto &ZeFencePair : ZeFencesMap) {
504504
auto &ZeFence = ZeFencePair.second;
505-
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
505+
if (checkL0LoaderTeardown()) {
506+
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
507+
}
506508
}
507509

508510
auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) {

unified-runtime/source/adapters/level_zero/common.hpp

Lines changed: 8 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <unistd.h>
2626
#endif
2727

28+
#include <loader/ze_loader.h>
2829
#include <ur/ur.hpp>
2930
#include <ur_ddi.h>
3031
#include <ze_api.h>
@@ -38,65 +39,15 @@
3839
struct _ur_platform_handle_t;
3940

4041
[[maybe_unused]] static bool checkL0LoaderTeardown() {
41-
bool loaderStable = true;
42-
#ifdef _WIN32
43-
uint32_t ZeDriverCount = 0;
44-
HMODULE zeLoader = LoadLibrary("ze_loader.dll");
45-
if (zeLoader) {
46-
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
47-
zeDriverGet_t zeDriverGetLoader =
48-
(zeDriverGet_t)GetProcAddress(zeLoader, "zeDriverGet");
49-
if (zeDriverGetLoader) {
50-
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
51-
logger::debug(
52-
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
53-
ZeDriverCount);
54-
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
55-
loaderStable = false;
56-
}
57-
} else {
58-
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
59-
"zeDriverGet");
60-
loaderStable = false;
61-
}
62-
FreeLibrary(zeLoader);
63-
} else {
64-
logger::debug(
65-
"ZE ---> checkL0LoaderTeardown: Failed to load ze_loader.dll");
66-
loaderStable = false;
67-
}
68-
#else
69-
uint32_t ZeDriverCount = 0;
70-
void *zeLoader = dlopen("libze_loader.so.1", RTLD_LAZY);
71-
if (zeLoader) {
72-
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
73-
zeDriverGet_t zeDriverGetLoader =
74-
(zeDriverGet_t)dlsym(zeLoader, "zeDriverGet");
75-
if (zeDriverGetLoader) {
76-
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
77-
logger::debug(
78-
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
79-
ZeDriverCount);
80-
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
81-
loaderStable = false;
82-
}
83-
} else {
84-
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
85-
"zeDriverGet");
86-
loaderStable = false;
42+
try {
43+
if (!zelCheckIsLoaderInTearDown()) {
44+
return true;
8745
}
88-
dlclose(zeLoader);
89-
} else {
90-
logger::debug(
91-
"ZE ---> checkL0LoaderTeardown: Failed to load libze_loader.so.1");
92-
loaderStable = false;
46+
} catch (...) {
9347
}
94-
#endif
95-
if (!loaderStable) {
96-
logger::debug(
97-
"ZE ---> checkL0LoaderTeardown: Loader is not stable, returning false");
98-
}
99-
return loaderStable;
48+
logger::debug(
49+
"ZE ---> checkL0LoaderTeardown: Loader is in teardown or is unstable");
50+
return false;
10051
}
10152

10253
static auto getUrResultString = [](ur_result_t Result) {
@@ -504,9 +455,6 @@ struct _ur_object {
504455
// Indicates if we own the native handle or it came from interop that
505456
// asked to not transfer the ownership to SYCL RT.
506457
bool OwnNativeHandle = false;
507-
508-
// Indicates if this object is an interop handle.
509-
bool IsInteropNativeHandle = false;
510458
};
511459

512460
// Record for a memory allocation. This structure is used to keep information

unified-runtime/source/adapters/level_zero/context.cpp

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@ ur_result_t urContextCreateWithNativeHandle(
151151
ur_context_handle_t_ *UrContext = new ur_context_handle_t_(
152152
ZeContext, NumDevices, Devices, OwnNativeHandle);
153153
UrContext->initialize();
154-
UrContext->IsInteropNativeHandle = true;
155154
*Context = reinterpret_cast<ur_context_handle_t>(UrContext);
156155
} catch (const std::bad_alloc &) {
157156
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
@@ -263,11 +262,8 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
263262
Contexts.erase(It);
264263
}
265264
ze_context_handle_t DestroyZeContext =
266-
((Context->OwnNativeHandle && !Context->IsInteropNativeHandle) ||
267-
(Context->OwnNativeHandle && Context->IsInteropNativeHandle &&
268-
checkL0LoaderTeardown()))
269-
? Context->ZeContext
270-
: nullptr;
265+
(Context->OwnNativeHandle && checkL0LoaderTeardown()) ? Context->ZeContext
266+
: nullptr;
271267

272268
// Clean up any live memory associated with Context
273269
ur_result_t Result = Context->finalize();
@@ -284,8 +280,12 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
284280
if (DestroyZeContext) {
285281
auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestroyZeContext));
286282
// Gracefully handle the case that L0 was already unloaded.
287-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
283+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
284+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
288285
return ze2urResult(ZeResult);
286+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
287+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
288+
}
289289
}
290290

291291
return Result;
@@ -304,12 +304,15 @@ ur_result_t ur_context_handle_t_::finalize() {
304304
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
305305
for (auto &EventCache : EventCaches) {
306306
for (auto &Event : EventCache) {
307-
if (!Event->IsInteropNativeHandle ||
308-
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
307+
if (checkL0LoaderTeardown()) {
309308
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
310309
// Gracefully handle the case that L0 was already unloaded.
311-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
310+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
311+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
312312
return ze2urResult(ZeResult);
313+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
314+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
315+
}
313316
}
314317
Event->ZeEvent = nullptr;
315318
delete Event;
@@ -321,41 +324,61 @@ ur_result_t ur_context_handle_t_::finalize() {
321324
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);
322325
for (auto &ZePoolCache : ZeEventPoolCache) {
323326
for (auto &ZePool : ZePoolCache) {
324-
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
325-
// Gracefully handle the case that L0 was already unloaded.
326-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
327-
return ze2urResult(ZeResult);
327+
if (checkL0LoaderTeardown()) {
328+
auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool));
329+
// Gracefully handle the case that L0 was already unloaded.
330+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
331+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
332+
return ze2urResult(ZeResult);
333+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
334+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
335+
}
336+
}
328337
}
329338
ZePoolCache.clear();
330339
}
331340
}
332341

333-
// Destroy the command list used for initializations
334-
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
335-
// Gracefully handle the case that L0 was already unloaded.
336-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
337-
return ze2urResult(ZeResult);
342+
if (checkL0LoaderTeardown()) {
343+
// Destroy the command list used for initializations
344+
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit));
345+
// Gracefully handle the case that L0 was already unloaded.
346+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
347+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
348+
return ze2urResult(ZeResult);
349+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
350+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
351+
}
352+
}
338353

339354
std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
340355
for (auto &List : ZeComputeCommandListCache) {
341356
for (auto &Item : List.second) {
342357
ze_command_list_handle_t ZeCommandList = Item.first;
343-
if (ZeCommandList) {
358+
if (ZeCommandList && checkL0LoaderTeardown()) {
344359
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
345360
// Gracefully handle the case that L0 was already unloaded.
346-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
361+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
362+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
347363
return ze2urResult(ZeResult);
364+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
365+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
366+
}
348367
}
349368
}
350369
}
351370
for (auto &List : ZeCopyCommandListCache) {
352371
for (auto &Item : List.second) {
353372
ze_command_list_handle_t ZeCommandList = Item.first;
354-
if (ZeCommandList) {
373+
if (ZeCommandList && checkL0LoaderTeardown()) {
355374
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
356375
// Gracefully handle the case that L0 was already unloaded.
357-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
376+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
377+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
358378
return ze2urResult(ZeResult);
379+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
380+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
381+
}
359382
}
360383
}
361384
}

unified-runtime/source/adapters/level_zero/device.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1416,7 +1416,6 @@ ur_result_t urDeviceCreateWithNativeHandle(
14161416
if (Dev == nullptr)
14171417
return UR_RESULT_ERROR_INVALID_VALUE;
14181418

1419-
Dev->IsInteropNativeHandle = true;
14201419
*Device = Dev;
14211420
return UR_RESULT_SUCCESS;
14221421
}

unified-runtime/source/adapters/level_zero/event.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,7 +1013,6 @@ ur_result_t urEventCreateWithNativeHandle(
10131013
UREvent->CleanedUp = true;
10141014

10151015
*Event = reinterpret_cast<ur_event_handle_t>(UREvent);
1016-
UREvent->IsInteropNativeHandle = true;
10171016

10181017
return UR_RESULT_SUCCESS;
10191018
}
@@ -1102,7 +1101,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
11021101
* leaks or resource mismanagement.
11031102
*/
11041103
ur_event_handle_t_::~ur_event_handle_t_() {
1105-
if (this->ZeEvent && this->Completed) {
1104+
if (this->ZeEvent && this->Completed && checkL0LoaderTeardown()) {
11061105
if (this->UrQueue && !this->UrQueue->isDiscardEvents())
11071106
ZE_CALL_NOCHECK(zeEventDestroy, (this->ZeEvent));
11081107
}
@@ -1129,12 +1128,15 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
11291128
}
11301129
if (Event->OwnNativeHandle) {
11311130
if (DisableEventsCaching) {
1132-
if (!Event->IsInteropNativeHandle ||
1133-
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1131+
if (checkL0LoaderTeardown()) {
11341132
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
11351133
// Gracefully handle the case that L0 was already unloaded.
1136-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1134+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
1135+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
11371136
return ze2urResult(ZeResult);
1137+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1138+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
1139+
}
11381140
}
11391141
Event->ZeEvent = nullptr;
11401142
auto Context = Event->Context;

unified-runtime/source/adapters/level_zero/image.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,9 @@ ur_result_t urBindlessImagesUnsampledImageHandleDestroyExp(
313313
auto item = hDevice->ZeOffsetToImageHandleMap.find(hImage);
314314

315315
if (item != hDevice->ZeOffsetToImageHandleMap.end()) {
316-
ZE2UR_CALL(zeImageDestroy, (item->second));
316+
if (checkL0LoaderTeardown()) {
317+
ZE2UR_CALL(zeImageDestroy, (item->second));
318+
}
317319
hDevice->ZeOffsetToImageHandleMap.erase(item);
318320
} else {
319321
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;

unified-runtime/source/adapters/level_zero/kernel.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -940,12 +940,15 @@ ur_result_t urKernelRelease(
940940
auto KernelProgram = Kernel->Program;
941941
if (Kernel->OwnNativeHandle) {
942942
for (auto &ZeKernel : Kernel->ZeKernels) {
943-
if (!Kernel->IsInteropNativeHandle ||
944-
(Kernel->IsInteropNativeHandle && checkL0LoaderTeardown())) {
943+
if (checkL0LoaderTeardown()) {
945944
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
946945
// Gracefully handle the case that L0 was already unloaded.
947-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
946+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
947+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
948948
return ze2urResult(ZeResult);
949+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
950+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
951+
}
949952
}
950953
}
951954
}
@@ -1157,7 +1160,6 @@ ur_result_t urKernelCreateWithNativeHandle(
11571160
}
11581161

11591162
Kernel->Program = Program;
1160-
Kernel->IsInteropNativeHandle = true;
11611163

11621164
UR_CALL(Kernel->initialize());
11631165

unified-runtime/source/adapters/level_zero/memory.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,7 +1563,6 @@ ur_result_t urMemImageCreateWithNativeHandle(
15631563
auto OwnNativeHandle = Properties ? Properties->isNativeHandleOwned : false;
15641564
UR_CALL(createUrMemFromZeImage(Context, ZeHImage, OwnNativeHandle,
15651565
ZeImageDesc, Mem));
1566-
(*Mem)->IsInteropNativeHandle = true;
15671566

15681567
return UR_RESULT_SUCCESS;
15691568
}
@@ -1663,13 +1662,16 @@ ur_result_t urMemRelease(
16631662
if (Image->OwnNativeHandle) {
16641663
UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
16651664
nullptr, nullptr, 0u));
1666-
if (!Image->IsInteropNativeHandle ||
1667-
(Image->IsInteropNativeHandle && checkL0LoaderTeardown())) {
1665+
if (checkL0LoaderTeardown()) {
16681666
auto ZeResult = ZE_CALL_NOCHECK(
16691667
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
16701668
// Gracefully handle the case that L0 was already unloaded.
1671-
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
1669+
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED ||
1670+
ZeResult != ZE_RESULT_ERROR_UNKNOWN))
16721671
return ze2urResult(ZeResult);
1672+
if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) {
1673+
ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
1674+
}
16731675
}
16741676
}
16751677
delete Image;
@@ -1776,7 +1778,6 @@ ur_result_t urMemBufferCreateWithNativeHandle(
17761778
Buffer = new _ur_buffer(Context, Size, Device, ur_cast<char *>(NativeMem),
17771779
OwnNativeHandle);
17781780
*Mem = reinterpret_cast<ur_mem_handle_t>(Buffer);
1779-
(*Mem)->IsInteropNativeHandle = true;
17801781
} catch (const std::bad_alloc &) {
17811782
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
17821783
} catch (...) {

unified-runtime/source/adapters/level_zero/physical_mem.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ ur_result_t urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) {
5050
if (!hPhysicalMem->RefCount.decrementAndTest())
5151
return UR_RESULT_SUCCESS;
5252

53-
ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
54-
hPhysicalMem->ZePhysicalMem));
53+
if (checkL0LoaderTeardown()) {
54+
ZE2UR_CALL(zePhysicalMemDestroy, (hPhysicalMem->Context->getZeHandle(),
55+
hPhysicalMem->ZePhysicalMem));
56+
}
5557
delete hPhysicalMem;
5658

5759
return UR_RESULT_SUCCESS;

0 commit comments

Comments
 (0)