[Libomptarget] Fix RPC-based malloc on NVPTX #72440
Conversation
@llvm/pr-subscribers-backend-amdgpu
Author: Joseph Huber (jhuber6)
Changes
Full diff: https://github.com/llvm/llvm-project/pull/72440.diff
8 Files Affected:
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 818967c88904ec0..745f6d928476990 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -116,7 +116,9 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
- TARGET_ALLOC_DEFAULT
+ TARGET_ALLOC_DEFAULT,
+ /// The allocation will not block on other streams.
+ TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
/// This struct contains all of the arguments to a target kernel region launch.
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..d7141726bffa5d6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2064,6 +2064,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
@@ -3247,6 +3248,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
index 72bba012fcf93c6..27a5a6324ceb429 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -67,15 +67,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to initialize RPC server for device %d: %d", DeviceId, Err);
// Register a custom opcode handler to perform plugin specific allocation.
- // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
auto MallocHandler = [](rpc_port_t Port, void *Data) {
rpc_recv_and_send(
Port,
[](rpc_buffer_t *Buffer, void *Data) {
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Buffer->data[0] = reinterpret_cast<uintptr_t>(
- Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+ Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
},
Data);
};
@@ -93,7 +92,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE);
+ TARGET_ALLOC_DEVICE_NON_BLOCKING);
},
Data);
};
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 3d0de0d5b2caff6..e968ec712921641 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -42,6 +42,7 @@ DLWRAP(cuLaunchKernel, 11)
DLWRAP(cuMemAlloc, 2)
DLWRAP(cuMemAllocHost, 2)
DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
DLWRAP(cuMemcpyDtoDAsync, 4)
DLWRAP(cuMemcpyDtoH, 3)
@@ -51,6 +52,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
DLWRAP(cuModuleGetFunction, 3)
DLWRAP(cuModuleGetGlobal, 4)
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e0307759924b21..32031c28f8797ed 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index a6e28574a7f08e3..0ee46cd64a64ebf 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -61,6 +61,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option) {}
#endif
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
/// Create the CUDA image with the id and the target image pointer.
@@ -486,6 +494,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+ break;
+ if (Res = cuMemAllocAsync(&DevicePtr, Size, Stream))
+ break;
+ MemAlloc = (void *)DevicePtr;
+ Res = cuStreamDestroy(Stream);
+ }
}
if (auto Err =
@@ -516,6 +533,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+ break;
+ cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ if (Res = cuStreamDestroy(Stream))
+ break;
+ }
}
if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 85cf9bef1543b2a..66937aa3e10bb4d 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -214,6 +214,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
index c18a724930f41e5..b587b618472e430 100644
--- a/openmp/libomptarget/test/libc/malloc.c
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@ int main() {
unsigned *d_x;
#pragma omp target map(from : d_x)
{
- d_x = malloc(sizeof(unsigned));
+ d_x = (unsigned *)malloc(sizeof(unsigned));
*d_x = 1;
}
@@ -23,6 +23,14 @@ int main() {
#pragma omp target is_device_ptr(d_x)
{ free(d_x); }
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+ {
+ int *ptr = (int *)malloc(sizeof(int));
+ *ptr = 42;
+ free(ptr);
+ }
+
// CHECK: PASS
if (h_x == 1)
fputs("PASS\n", stdout);
@@ -2064,6 +2064,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
Probably more of a general question here: Why is this always the 0th entry? Can this ever be problematic?
I think we just assume they're all equivalent right now, and there's an assert somewhere to ensure that there's at least one IIRC.
CUstream Stream;
if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
You mentioned that using the getStream(AsyncInfoTy ...) method seems to lock up the system. I wonder why that is; there shouldn't be much of a difference, at least from what it is supposed to do.
So, there is no AsyncInfoTy object here, but I am also confused why we can't simply call getStream and later return it again. Stream creation and destruction are not free, hence our resource pools.
I'm not 100% sure, but I think it's because the resource pools rarely ever delete the created streams. This led to some problems. I tried making a single stream that was shared, but this led to weird errors as well. Hence, this is the easiest way to get it to stop deadlocking in the short term.
So, there is no AsyncInfoTy object here, but I am also confused why we can't simply call getStream and later return it again. Stream creation and destruction is not free, hence our resource pools.
I mean, getStream requires an AsyncInfoWrapperTy reference (at least the one I found), no? Obviously nothing would stop you from having a local object that you can use to synchronize once your malloc is done. Or am I missing something fundamental here?
Okay, so I think one problem is that the logic states that a waiting thread can check the RPC server. However, this logic isn't really prepared for the case where the thread handling the RPC server itself comes in and waits, so we end up with some kind of infinite recursion. I'll need to think of some logic to prevent this.
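As a rough illustration (not libomptarget code; every name below is hypothetical), the hazard described above boils down to a blocking wait that is allowed to service the RPC server being reached from inside an RPC opcode handler:
#include <cstdio>

static bool InServer = false; // guard that exists only to make the recursion visible

void checkRPCServer(); // forward declaration

// Stand-in for a blocking wait (e.g. waiting on a stream) that is allowed to
// service the RPC server while it waits.
void blockingWait() { checkRPCServer(); }

// Stand-in for handling one RPC opcode; the malloc handler itself ends up waiting.
void handleMallocOpcode() { blockingWait(); }

void checkRPCServer() {
  if (InServer) {
    std::puts("RPC server re-entered from inside an opcode handler");
    return; // without a guard like this, the wait path keeps recursing into the server
  }
  InServer = true;
  handleMallocOpcode();
  InServer = false;
}

int main() { checkRPCServer(); }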
Further, if I use a single stream I get a single successful allocation from it, then the rest return nullptr for unknown reasons.
Since this is only for RPC, it should be fine. I still believe creating/deleting the stream is bad, but we can look at it later.
Summary:
The device allocator on NVPTX architectures is enqueued to a stream that the kernel is potentially executing on. This can lead to deadlocks, as the kernel will not proceed until the allocation is complete and the allocation will not proceed until the kernel is complete. CUDA 11.2 introduced async allocations that we can manually place on separate streams to combat this. This patch adds a new allocation type that is guaranteed to be non-blocking so it will actually make progress. Only Nvidia needs to care about this, as the other plugins are not blocking in this way by default.
I had originally tried to make the alloc and free methods take a __tgt_async_info. However, I observed that with the large volume of streams being created by a parallel test, it quickly locked up the system, presumably because too many streams were being created. This implementation instead just creates a new stream and immediately destroys it. This obviously isn't very fast, but it at least gets the cases to stop deadlocking for now.
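For reference, here is a minimal standalone sketch of the pattern the patch uses, assuming CUDA 11.2+ with the driver API available; the helper names and the explicit cuStreamSynchronize calls are additions for this example and are not part of the patch, which destroys the throwaway stream without synchronizing.
#include <cuda.h>
#include <cstdio>

// Allocate device memory on a short-lived non-blocking stream so the request
// cannot be serialized behind a kernel that is still running.
static void *allocNonBlocking(size_t Size) {
  CUstream Stream;
  CUdeviceptr Ptr = 0;
  if (cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING) != CUDA_SUCCESS)
    return nullptr;
  if (cuMemAllocAsync(&Ptr, Size, Stream) != CUDA_SUCCESS) {
    cuStreamDestroy(Stream);
    return nullptr;
  }
  cuStreamSynchronize(Stream); // make sure the allocation has completed
  cuStreamDestroy(Stream);
  return reinterpret_cast<void *>(Ptr);
}

// Free the memory the same way, on its own throwaway non-blocking stream.
static void freeNonBlocking(void *Ptr) {
  CUstream Stream;
  if (cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING) != CUDA_SUCCESS)
    return;
  cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(Ptr), Stream);
  cuStreamSynchronize(Stream);
  cuStreamDestroy(Stream);
}

int main() {
  cuInit(0);
  CUdevice Device;
  CUcontext Context;
  cuDeviceGet(&Device, 0);
  cuCtxCreate(&Context, 0, Device);
  void *Ptr = allocNonBlocking(1024);
  std::printf("allocated %p\n", Ptr);
  freeNonBlocking(Ptr);
  cuCtxDestroy(Context);
  return 0;
}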