Skip to content

Commit c4a7290

Browse files
[SYCL] Switch to using blocking USM free for OpenCL GPU (#4928)
Whenever a kernel is enqueued on a GPU, the GPU driver records the state of all USM pointers that might be accessed indirectly. Because of this, these pointers cannot be freed until the kernel has finished executing. This change addresses the problem for OpenCL by using a blocking version of free, while Level Zero already handles it by deferring USM release. The change is temporarily limited to OpenCL GPU devices until a bug in the OpenCL CPU runtime is resolved.
1 parent db7ff53 commit c4a7290

File tree

2 files changed

+52
-4
lines changed

2 files changed

+52
-4
lines changed

sycl/include/CL/sycl/detail/pi.h

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1635,7 +1635,9 @@ __SYCL_EXPORT pi_result piextUSMSharedAlloc(void **result_ptr,
16351635
pi_usm_mem_properties *properties,
16361636
size_t size, pi_uint32 alignment);
16371637

1638-
/// Frees allocated USM memory
1638+
/// Indicates that the allocated USM memory is no longer needed on the runtime
1639+
/// side. The actual freeing of the memory may be done in a blocking or deferred
1640+
/// manner, e.g. to avoid issues with indirect memory access from kernels.
16391641
///
16401642
/// \param context is the pi_context of the allocation
16411643
/// \param ptr is the memory to be freed

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 49 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ CONSTFIX char clHostMemAllocName[] = "clHostMemAllocINTEL";
5757
CONSTFIX char clDeviceMemAllocName[] = "clDeviceMemAllocINTEL";
5858
CONSTFIX char clSharedMemAllocName[] = "clSharedMemAllocINTEL";
5959
CONSTFIX char clMemFreeName[] = "clMemFreeINTEL";
60+
CONSTFIX char clMemBlockingFreeName[] = "clMemBlockingFreeINTEL";
6061
CONSTFIX char clCreateBufferWithPropertiesName[] =
6162
"clCreateBufferWithPropertiesINTEL";
6263
CONSTFIX char clSetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL";
@@ -969,11 +970,56 @@ pi_result piextUSMSharedAlloc(void **result_ptr, pi_context context,
969970
/// \param context is the pi_context of the allocation
970971
/// \param ptr is the memory to be freed
971972
pi_result piextUSMFree(pi_context context, void *ptr) {
973+
// Use a blocking free to avoid issues with indirect access from kernels that
974+
// might be still running.
975+
clMemBlockingFreeINTEL_fn FuncPtr = nullptr;
976+
977+
// We need to use clMemBlockingFreeINTEL here, however, due to a bug in OpenCL
978+
// CPU runtime this call fails with CL_INVALID_EVENT on CPU devices in certain
979+
// cases. As a temporary workaround, this function replicates caching of
980+
// extension function pointers in getExtFuncFromContext, while choosing
981+
// clMemBlockingFreeINTEL for GPU and clMemFreeINTEL for other device types.
982+
// TODO remove this workaround when the new OpenCL CPU runtime version is
983+
// uplifted in CI.
984+
static_assert(
985+
std::is_same<clMemBlockingFreeINTEL_fn, clMemFreeINTEL_fn>::value);
986+
cl_uint deviceCount;
987+
cl_int ret_err =
988+
clGetContextInfo(cast<cl_context>(context), CL_CONTEXT_NUM_DEVICES,
989+
sizeof(cl_uint), &deviceCount, nullptr);
990+
991+
if (ret_err != CL_SUCCESS || deviceCount < 1) {
992+
return PI_INVALID_CONTEXT;
993+
}
994+
995+
std::vector<cl_device_id> devicesInCtx(deviceCount);
996+
ret_err = clGetContextInfo(cast<cl_context>(context), CL_CONTEXT_DEVICES,
997+
deviceCount * sizeof(cl_device_id),
998+
devicesInCtx.data(), nullptr);
999+
1000+
if (ret_err != CL_SUCCESS) {
1001+
return PI_INVALID_CONTEXT;
1002+
}
1003+
1004+
bool useBlockingFree = true;
1005+
for (const cl_device_id &dev : devicesInCtx) {
1006+
cl_device_type devType = CL_DEVICE_TYPE_DEFAULT;
1007+
ret_err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(cl_device_type),
1008+
&devType, nullptr);
1009+
if (ret_err != CL_SUCCESS) {
1010+
return PI_INVALID_DEVICE;
1011+
}
1012+
useBlockingFree &= devType == CL_DEVICE_TYPE_GPU;
1013+
}
9721014

973-
clMemFreeINTEL_fn FuncPtr = nullptr;
9741015
pi_result RetVal = PI_INVALID_OPERATION;
975-
RetVal = getExtFuncFromContext<clMemFreeName, clMemFreeINTEL_fn>(context,
976-
&FuncPtr);
1016+
if (useBlockingFree)
1017+
RetVal =
1018+
getExtFuncFromContext<clMemBlockingFreeName, clMemBlockingFreeINTEL_fn>(
1019+
context, &FuncPtr);
1020+
else
1021+
RetVal = getExtFuncFromContext<clMemFreeName, clMemFreeINTEL_fn>(context,
1022+
&FuncPtr);
9771023

9781024
if (FuncPtr) {
9791025
RetVal = cast<pi_result>(FuncPtr(cast<cl_context>(context), ptr));

0 commit comments

Comments
 (0)