-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Libomptarget] Handle dynamic stack sizes for AMD COV5 #72606
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/72606.diff 3 Files Affected:
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 573a2ef8fc2005a..64a1d3308aed0bb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -288,6 +288,7 @@ typedef enum {
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
} hsa_executable_symbol_info_t;
typedef struct hsa_code_object_s {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..73c7986eb728d0f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
for (auto &Info : RequiredInfos) {
@@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// @return 56 for cov4 and 256 for cov5
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+ /// Indicates whether or not we need to set up our own private segment size.
+ bool usesDynamicStack() const { return DynamicStack; }
+
private:
/// The kernel object to execute.
uint64_t KernelObject;
@@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
+ bool DynamicStack;
/// The size of implicit kernel arguments.
uint32_t ImplicitArgsSize;
@@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
+ uint32_t GroupSize, uint64_t StackSize,
+ AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
- Packet->private_segment_size = Kernel.getPrivateSize();
+ Packet->private_segment_size =
+ Kernel.usesDynamicStack() ? Kernel.getPrivateSize() : StackSize;
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
@@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize,
+ uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
@@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {
// Push the kernel with the output signal and an input signal (optional)
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
- GroupSize, OutputSignal, InputSignal);
+ GroupSize, StackSize, OutputSignal,
+ InputSignal);
}
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
- Value = 0;
+ Value = StackSize;
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
+ StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
+
+ /// The current size of the stack that will be used in cases where it could
+ /// not be statically determined.
+ uint64_t StackSize = 1024 /* 1 KB */;
};
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3100,6 +3113,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GroupSize += MaxDynCGroupMem;
}
+ uint64_t StackSize;
+ GenericDevice.getDeviceStackSize(StackSize);
+
// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3138,7 +3154,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, ArgsMemoryManager);
+ GroupSize, StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index f09ae24163dfc2b..9174ecaab08ca00 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return 0;
}
+ virtual Error getDeviceStackSize(uint64_t &V) = 0;
+
private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
- virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
|
@@ -658,7 +664,8 @@ struct AMDGPUQueueTy { | |||
Packet->grid_size_x = NumBlocks * NumThreads; | |||
Packet->grid_size_y = 1; | |||
Packet->grid_size_z = 1; | |||
Packet->private_segment_size = Kernel.getPrivateSize(); | |||
Packet->private_segment_size = | |||
Kernel.usesDynamicStack() ? Kernel.getPrivateSize() : StackSize; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Naming here is kind of confusing. For the most part "private size" and "stack size" are the same thing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can rename it to all be private size
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand the control flow here.
Before, we called getPrivateSize() which retrieved N from somewhere, presumably kernel metadata.
Now, we only do that if useDynamicStack is true, otherwise it's taken from a new argument.
Condition the wrong way around?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ack, you're right it's backwards. I think it's correct in the libc
version of this patch, but for some reason I deleted the !
and didn't reverse it.
What about the compile error that we received a couple of weeks ago regarding too much stack use? |
4179982
to
4f505dc
Compare
I don't think this will be related. This is a change to the kernel call ABI where in the case that it could not be statically determined the runtime is expected to provide one. The old default was 16k, I could probably increase the default to something larger. |
That's unrelated. The GPU tries to statically determine stack size. In that case it was statically determined to be larger than the maximum possible amount, so it's a hard failure. |
4f505dc
to
e2bfef3
Compare
It's really difficult to work out what's going on here from the diff. I think you've rolled a change in default from 16k to 1k in along with the rest of it and got a branch inverted but it's hard to tell. What we used to have is the compiler emitted a guess at the stack size needed. Now it just punts with "no idea, good luck runtime", in which case we assume 1k and fall over if that's no good? This behaviour is pretty weird - if the compiler can't tell how much stack to allocate, the chances that the single right number for all the difficult kernels is in an environment variable seems pretty remote. I think we should go with more sensible semantics or be much clearer with the naming of of things here. Something like:
For more sensible semantics, we know the upper bound on how much stack size we could give a kernel, and we know that the compiler failing to determine a stack size means the application is doing something exciting with control flow, so I think we should (optionally warn and) default to the maximum. That is, your complicated kernel doing stuff we failed to analyse gets to run slowly, unless you annotate it somehow. Further, this single environment variable to set all stack sizes for all unknown things is just a dreadful construct and I recommend should ignore it entirely. |
I talked a bit with Matt and reduced it, but I can keep it at 16K if needed.
This more or less mimics the handling of the Nvidia stack. I believe the Nvidia stack defaults to 1024 or something in all cases if it wasn't statically determined. If we want to rework that I'd suggest it's in a follow-up patch that handles both cases.
Somewhat orthogonal to this as a bug-fix, right now it will be zero, which is pretty much guaranteed failure. I'd prefer to fix the stack handling in a more unified way later. |
Summary: One of the changes in the AMD code-object version five was that kernels that use an unknown amount of private stack memory now no longer default to 16 KBs. Instead it emits a flag that indicates the runtime must provide a value. This patch checks if we must provide such a stack, and uses the existing handling of the stack environment variable to configure it.
e2bfef3
to
ec43330
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fixing long standing failure of test/offloading/thread_limit.c for cov5.
LGTM!
Add documentation w.r.t. changes by #72606, which allows to set the dynamic callstack size.
Reinstated several minor changes in behavior w.r.t. conformity with HIP. 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen. 3. Make sure we only allow 32bit values for stack size. Added calculation of maximum dyn callstack size per thread * If a value provided via LIBOMPTARGET_STACK_SIZE exceeds MaxThreadScratchSize, it will be capped See: * gerrit review 942931 / 968158 * llvm#72606 * llvm#74080 Change-Id: Ib0ef997b567f5f55097456c56d3f0bc2e287f848
Summary:
One of the changes in the AMD code-object version five was that kernels
that use an unknown amount of private stack memory now no longer default
to 16 KBs. Instead it emits a flag that indicates the runtime must
provide a value. This patch checks if we must provide such a stack, and
uses the existing handling of the stack environment variable to
configure it.