[Libomptarget] Handle dynamic stack sizes for AMD COV5 #72606


Merged — 1 commit merged into llvm:main on Nov 20, 2023

Conversation

@jhuber6 (Contributor) commented Nov 17, 2023

Summary:
One of the changes in AMD code object version five (COV5) is that kernels
that use an unknown amount of private stack memory no longer default to
16 KiB. Instead, the compiler emits a flag indicating that the runtime must
provide a value. This patch checks whether we must provide such a stack and
uses the existing handling of the stack-size environment variable to
configure it.

@llvmbot (Member) commented Nov 17, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: Joseph Huber (jhuber6)

Changes

Summary:
One of the changes in AMD code object version five (COV5) is that kernels
that use an unknown amount of private stack memory no longer default to
16 KiB. Instead, the compiler emits a flag indicating that the runtime must
provide a value. This patch checks whether we must provide such a stack and
uses the existing handling of the stack-size environment variable to
configure it.


Full diff: https://github.com/llvm/llvm-project/pull/72606.diff

3 Files Affected:

  • (modified) openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h (+1)
  • (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+22-6)
  • (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h (+2-1)
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 573a2ef8fc2005a..64a1d3308aed0bb 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -288,6 +288,7 @@ typedef enum {
   HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
   HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
   HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
 } hsa_executable_symbol_info_t;
 
 typedef struct hsa_code_object_s {
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..73c7986eb728d0f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
+        {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
         {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
 
     for (auto &Info : RequiredInfos) {
@@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   /// @return 56 for cov4 and 256 for cov5
   uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
 
+  /// Indicates whether or not we need to set up our own private segment size.
+  bool usesDynamicStack() const { return DynamicStack; }
+
 private:
   /// The kernel object to execute.
   uint64_t KernelObject;
@@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   uint32_t ArgsSize;
   uint32_t GroupSize;
   uint32_t PrivateSize;
+  bool DynamicStack;
 
   /// The size of implicit kernel arguments.
   uint32_t ImplicitArgsSize;
@@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
   /// signal and can define an optional input signal (nullptr if none).
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
+                         uint32_t GroupSize, uint64_t StackSize,
+                         AMDGPUSignalTy *OutputSignal,
                          AMDGPUSignalTy *InputSignal) {
     assert(OutputSignal && "Invalid kernel output signal");
 
@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
     Packet->grid_size_x = NumBlocks * NumThreads;
     Packet->grid_size_y = 1;
     Packet->grid_size_z = 1;
-    Packet->private_segment_size = Kernel.getPrivateSize();
+    Packet->private_segment_size =
+        Kernel.usesDynamicStack() ? Kernel.getPrivateSize() : StackSize;
     Packet->group_segment_size = GroupSize;
     Packet->kernel_object = Kernel.getKernelObject();
     Packet->kernarg_address = KernelArgs;
@@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
   /// the kernel args buffer to the specified memory manager.
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize,
+                         uint32_t GroupSize, uint64_t StackSize,
                          AMDGPUMemoryManagerTy &MemoryManager) {
     if (Queue == nullptr)
       return Plugin::error("Target queue was nullptr");
@@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {
 
     // Push the kernel with the output signal and an input signal (optional)
     return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
-                                   GroupSize, OutputSignal, InputSignal);
+                                   GroupSize, StackSize, OutputSignal,
+                                   InputSignal);
   }
 
   /// Push an asynchronous memory copy between pinned memory buffers.
@@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
-    Value = 0;
+    Value = StackSize;
     return Plugin::success();
   }
   Error setDeviceStackSize(uint64_t Value) override {
+    StackSize = Value;
     return Plugin::success();
   }
   Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// The current size of the global device memory pool (managed by us).
   uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
+
+  /// The current size of the stack that will be used in cases where it could
+  /// not be statically determined.
+  uint64_t StackSize = 1024 /* 1 KB */;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3100,6 +3113,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
     GroupSize += MaxDynCGroupMem;
   }
 
+  uint64_t StackSize;
+  GenericDevice.getDeviceStackSize(StackSize);
+
   // Initialize implicit arguments.
   utils::AMDGPUImplicitArgsTy *ImplArgs =
       reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3138,7 +3154,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, ArgsMemoryManager);
+                                  GroupSize, StackSize, ArgsMemoryManager);
 }
 
 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index f09ae24163dfc2b..9174ecaab08ca00 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return 0;
   }
 
+  virtual Error getDeviceStackSize(uint64_t &V) = 0;
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
-  virtual Error getDeviceStackSize(uint64_t &V) = 0;
   virtual Error setDeviceStackSize(uint64_t V) = 0;
   virtual Error getDeviceHeapSize(uint64_t &V) = 0;
   virtual Error setDeviceHeapSize(uint64_t V) = 0;

@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
     Packet->grid_size_x = NumBlocks * NumThreads;
     Packet->grid_size_y = 1;
     Packet->grid_size_z = 1;
-    Packet->private_segment_size = Kernel.getPrivateSize();
+    Packet->private_segment_size =
+        Kernel.usesDynamicStack() ? Kernel.getPrivateSize() : StackSize;
Contributor commented:

Naming here is kind of confusing. For the most part "private size" and "stack size" are the same thing

@jhuber6 (Contributor, Author) commented:

I can rename it to all be private size.

Collaborator commented:

I don't understand the control flow here.

Before, we called getPrivateSize() which retrieved N from somewhere, presumably kernel metadata.

Now, we only do that if useDynamicStack is true, otherwise it's taken from a new argument.

Condition the wrong way around?

@jhuber6 (Contributor, Author) commented:

Ack, you're right it's backwards. I think it's correct in the libc version of this patch, but for some reason I deleted the ! and didn't reverse it.

@shiltian (Contributor) commented:

What about the compile error that we received a couple of weeks ago regarding too much stack use?

@jhuber6 (Contributor, Author) commented Nov 17, 2023

What about the compile error that we received a couple of weeks ago regarding too much stack use?

I don't think this is related. This is a change to the kernel calling ABI: in the case that the stack size could not be statically determined, the runtime is expected to provide one. The old default was 16 KiB; I could probably increase the default to something larger.

@jhuber6 (Contributor, Author) commented Nov 20, 2023

What about the compile error that we received a couple of weeks ago regarding too much stack use?

That's unrelated. The compiler tries to statically determine the stack size. In that case it was statically determined to be larger than the maximum possible amount, so it's a hard failure.

@JonChesterfield (Collaborator) commented:

It's really difficult to work out what's going on here from the diff.

I think you've rolled a change in default from 16k to 1k in along with the rest of it and got a branch inverted but it's hard to tell.

What we used to have is the compiler emitted a guess at the stack size needed. Now it just punts with "no idea, good luck runtime", in which case we assume 1k and fall over if that's no good? This behaviour is pretty weird - if the compiler can't tell how much stack to allocate, the chances that the single right number for all the difficult kernels is in an environment variable seems pretty remote.

I think we should go with more sensible semantics or be much clearer with the naming of things here. Something like:

bool kernel_metadata_missing_stack_size()
uint64_t guess_stack_size_for_missing_metadata_from_environment_variable()

For more sensible semantics, we know the upper bound on how much stack size we could give a kernel, and we know that the compiler failing to determine a stack size means the application is doing something exciting with control flow, so I think we should (optionally warn and) default to the maximum. That is, your complicated kernel doing stuff we failed to analyse gets to run slowly, unless you annotate it somehow.

Further, this single environment variable to set all stack sizes for all unknown things is just a dreadful construct and I recommend should ignore it entirely.

@jhuber6 (Contributor, Author) commented Nov 20, 2023

It's really difficult to work out what's going on here from the diff.

I think you've rolled a change in default from 16k to 1k in along with the rest of it and got a branch inverted but it's hard to tell.

What we used to have is the compiler emitted a guess at the stack size needed. Now it just punts with "no idea, good luck runtime", in which case we assume 1k and fall over if that's no good? This behaviour is pretty weird - if the compiler can't tell how much stack to allocate, the chances that the single right number for all the difficult kernels is in an environment variable seems pretty remote.

I talked a bit with Matt and reduced it, but I can keep it at 16K if needed.

I think we should go with more sensible semantics or be much clearer with the naming of of things here. Something like:

bool kernel_metadata_missing_stack_size()
uint64_t guess_stack_size_for_missing_metadata_from_environment_variable()

This more or less mimics the handling of the Nvidia stack. I believe the Nvidia stack defaults to 1024 or something in all cases if it wasn't statically determined. If we want to rework that I'd suggest it's in a follow-up patch that handles both cases.

For more sensible semantics, we know the upper bound on how much stack size we could give a kernel, and we know that the compiler failing to determine a stack size means the application is doing something exciting with control flow, so I think we should (optionally warn and) default to the maximum. That is, your complicated kernel doing stuff we failed to analyse gets to run slowly, unless you annotate it somehow.

Further, this single environment variable to set all stack sizes for all unknown things is just a dreadful construct and I recommend should ignore it entirely.

Somewhat orthogonal to this as a bug-fix, right now it will be zero, which is pretty much guaranteed failure. I'd prefer to fix the stack handling in a more unified way later.

@saiislam (Contributor) left a comment

This fixes the long-standing failure of test/offloading/thread_limit.c for COV5.

LGTM!

@jhuber6 jhuber6 merged commit 47a3ad5 into llvm:main Nov 20, 2023
mhalk added a commit that referenced this pull request Nov 28, 2023
Add documentation w.r.t. changes by #72606, which allows setting the dynamic
callstack size.
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Dec 12, 2023
Reinstated several minor changes in behavior w.r.t. conformity with HIP.

  1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
  2. During AQL packet generation, in case of a dynamic callstack the
     maximum of the user-provided and compiler-default values is chosen.
  3. Make sure we only allow 32-bit values for the stack size.

Added calculation of maximum dyn callstack size per thread
  * If a value provided via LIBOMPTARGET_STACK_SIZE exceeds
    MaxThreadScratchSize, it will be capped

See:
  * gerrit review 942931 / 968158
  * llvm#72606
  * llvm#74080

Change-Id: Ib0ef997b567f5f55097456c56d3f0bc2e287f848