Commit fb32977
[Libomptarget] Fix RPC-based malloc on NVPTX (llvm#72440)
Summary: The device allocator on NVPTX architectures is enqueued to a stream that the kernel is potentially executing on. This can lead to deadlocks, as the kernel will not proceed until the allocation is complete and the allocation will not proceed until the kernel is complete. CUDA 11.2 introduced async allocations that we can manually place on separate streams to combat this. This patch adds a new allocation type that is guaranteed to be non-blocking so it will actually make progress; only NVIDIA needs to care about this, as the other plugins do not block in this way by default.

I had originally tried to make the `alloc` and `free` methods take a `__tgt_async_info`. However, I observed that with the large volume of streams being created by a parallel test it quickly locked up the system, as presumably too many streams were being created. This implementation now just creates a new stream and immediately destroys it. This obviously isn't very fast, but it at least gets these cases to stop deadlocking for now.
1 parent 41a07e6 commit fb32977
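As an illustration of the pattern the patch adopts (this sketch is not part of the commit), a standalone CUDA driver API program that routes an allocation and a free through a temporary non-blocking stream could look roughly like the following. Error handling is reduced to a single check macro, the interaction with a concurrently running kernel is omitted, and cuMemAllocAsync/cuMemFreeAsync require CUDA 11.2 or newer.

// Sketch only: stream-ordered allocation on a temporary non-blocking stream,
// mirroring the TARGET_ALLOC_DEVICE_NON_BLOCKING path added in this commit.
#include <cuda.h>
#include <cstdio>

#define CHECK(X)                                                               \
  do {                                                                         \
    CUresult Res = (X);                                                        \
    if (Res != CUDA_SUCCESS) {                                                 \
      fprintf(stderr, "CUDA error %d at %s\n", Res, #X);                       \
      return 1;                                                                \
    }                                                                          \
  } while (0)

int main() {
  CHECK(cuInit(0));
  CUdevice Device;
  CHECK(cuDeviceGet(&Device, 0));
  CUcontext Context;
  CHECK(cuCtxCreate(&Context, 0, Device));

  // Allocate on a private non-blocking stream so the request cannot be
  // ordered behind work on the stream a kernel may be running on.
  CUstream Stream;
  CHECK(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING));
  CUdeviceptr Ptr;
  CHECK(cuMemAllocAsync(&Ptr, 1024, Stream));
  CHECK(cuStreamSynchronize(Stream));
  CHECK(cuStreamDestroy(Stream));

  // Free through the same pattern: a fresh non-blocking stream per request.
  CHECK(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING));
  CHECK(cuMemFreeAsync(Ptr, Stream));
  CHECK(cuStreamSynchronize(Stream));
  CHECK(cuStreamDestroy(Stream));

  CHECK(cuCtxDestroy(Context));
  printf("non-blocking alloc/free round trip succeeded\n");
  return 0;
}

Creating and destroying a stream per request is slow, but, as the summary notes, it avoids both the deadlock against the kernel's stream and the stream exhaustion seen when every allocation kept its own long-lived stream.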

8 files changed: 50 additions, 6 deletions

openmp/libomptarget/include/omptarget.h
Lines changed: 3 additions & 1 deletion

@@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_DEVICE = 0,
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
-  TARGET_ALLOC_DEFAULT
+  TARGET_ALLOC_DEFAULT,
+  /// The allocation will not block on other streams.
+  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };

 inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Lines changed: 2 additions & 0 deletions

@@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:

openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
Lines changed: 3 additions & 4 deletions

@@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);

   // Register a custom opcode handler to perform plugin specific allocation.
-  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
   auto MallocHandler = [](rpc_port_t Port, void *Data) {
     rpc_recv_and_send(
         Port,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(
-              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
         },
         Data);
   };
@@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
           Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE);
+                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
         },
         Data);
   };
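To make the data flow of the patched handler concrete, here is a simplified, self-contained sketch (not from the commit): rpc_port_t, rpc_buffer_t, and rpc_recv_and_send are stand-ins for the LLVM libc RPC interface, and GenericDeviceTy is a stand-in for the plugin device whose allocate/free the real handlers call with the new TARGET_ALLOC_DEVICE_NON_BLOCKING kind. The key point is that data[0] carries the requested size in and the resulting device pointer out.

// Simplified stand-ins, for illustration only; these are not the real
// LLVM libc RPC types or the real plugin interface.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct rpc_buffer_t {
  uint64_t data[8]; // data[0] carries the size in and the pointer out.
};
struct rpc_port_t {
  rpc_buffer_t Buffer; // the real port wraps a shared device/host buffer
};

// Stand-in for rpc_recv_and_send: receive the request buffer on the port,
// run the callback on it, then send the modified buffer back to the device.
template <typename CallbackTy>
void rpc_recv_and_send(rpc_port_t &Port, CallbackTy Work, void *Data) {
  Work(&Port.Buffer, Data);
}

// Stand-in for the plugin device; the real one dispatches on the allocation
// kind (TARGET_ALLOC_DEVICE_NON_BLOCKING after this patch).
struct GenericDeviceTy {
  void *allocate(uint64_t Size, void *, int /*Kind*/) {
    return std::malloc(Size);
  }
  void free(void *Ptr, int /*Kind*/) { std::free(Ptr); }
};

int main() {
  GenericDeviceTy Device;
  rpc_port_t Port;
  Port.Buffer.data[0] = 64; // a device thread asked malloc for 64 bytes

  // Shape of the patched MallocHandler: read the size from data[0], allocate
  // through the plugin, and write the pointer back into data[0].
  rpc_recv_and_send(
      Port,
      [](rpc_buffer_t *Buffer, void *Data) {
        GenericDeviceTy &Device = *reinterpret_cast<GenericDeviceTy *>(Data);
        Buffer->data[0] = reinterpret_cast<uintptr_t>(
            Device.allocate(Buffer->data[0], nullptr, /*Kind=*/0));
      },
      &Device);

  void *Ptr = reinterpret_cast<void *>(Port.Buffer.data[0]);
  std::printf("device malloc request served: %p\n", Ptr);
  Device.free(Ptr, /*Kind=*/0);
  return 0;
}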

openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
Lines changed: 3 additions & 0 deletions

@@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11)
 DLWRAP(cuMemAlloc, 2)
 DLWRAP(cuMemAllocHost, 2)
 DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)

 DLWRAP(cuMemcpyDtoDAsync, 4)
 DLWRAP(cuMemcpyDtoH, 3)
@@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)

 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
 DLWRAP(cuModuleGetFunction, 3)
 DLWRAP(cuModuleGetGlobal, 4)

openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Lines changed: 2 additions & 0 deletions

@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
 CUresult cuMemAlloc(CUdeviceptr *, size_t);
 CUresult cuMemAllocHost(void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);

 CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);

 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
 CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);

openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Lines changed: 27 additions & 0 deletions

@@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
                               CUmemAllocationGranularity_flags option) {}
 #endif

+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
 /// Class implementing the CUDA device images properties.
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
@@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
      MemAlloc = (void *)DevicePtr;
      break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+        break;
+      cuStreamSynchronize(Stream);
+      Res = cuStreamDestroy(Stream);
+      MemAlloc = (void *)DevicePtr;
+    }
    }

    if (auto Err =
@@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
    case TARGET_ALLOC_HOST:
      Res = cuMemFreeHost(TgtPtr);
      break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+        break;
+      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+      cuStreamSynchronize(Stream);
+      if ((Res = cuStreamDestroy(Stream)))
+        break;
+    }
    }

    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {

openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
Lines changed: 1 addition & 0 deletions

@@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
      MemAlloc = std::malloc(Size);
      break;
    }

openmp/libomptarget/test/libc/malloc.c
Lines changed: 9 additions & 1 deletion

@@ -13,7 +13,7 @@ int main() {
   unsigned *d_x;
 #pragma omp target map(from : d_x)
   {
-    d_x = malloc(sizeof(unsigned));
+    d_x = (unsigned *)malloc(sizeof(unsigned));
     *d_x = 1;
   }

@@ -23,6 +23,14 @@ int main() {
 #pragma omp target is_device_ptr(d_x)
   { free(d_x); }

+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+  {
+    int *ptr = (int *)malloc(sizeof(int));
+    *ptr = 42;
+    free(ptr);
+  }
+
   // CHECK: PASS
   if (h_x == 1)
     fputs("PASS\n", stdout);
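The added region stresses the RPC path: 64 teams of 32 threads each call malloc and free on the device, and every request is served by the host-side handlers shown in RPC.cpp, now using the non-blocking allocation kind. For reference, an approximate manual invocation (an assumption; the in-tree test uses lit RUN lines, requires the GPU libc to be built, and may need extra flags to link it) might look like:

  clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda malloc.c -o malloc.test
  ./malloc.test   # expected output: PASS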
