[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior

mhalk · mhalk · commit 21c4c52d809f · 2024-03-05T08:24:02.000-06:00
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a
GFX-dependent value.

Changed several minor properties to be in sync with HIP:
  1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
  2. During AQL packet generation in case of a dyn callstack the maximum
     between user-provided and compiler-default is chosen.
  3. Make sure we only allow 32bit values for stack size.

Added testcase where a dynamic stack is required due to recursion.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1043,7 +1043,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
   FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
   FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
   FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
-  FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+  FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+  FeatureMaxWaveScratchSize13x256
   ]
 >;
 
@@ -1054,7 +1055,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
   FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
   FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
   FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
-  FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+  FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+  FeatureMaxWaveScratchSize13x256
   ]
 >;
 
@@ -1070,7 +1072,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
    FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
    FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
    FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
-   FeatureDefaultComponentZero
+   FeatureDefaultComponentZero, FeatureMaxWaveScratchSize13x256
   ]
 >;
 
@@ -1088,7 +1090,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
+   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
+   FeatureMaxWaveScratchSize13x256
   ]
 >;
 
@@ -1109,7 +1112,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
    FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength63
+   FeatureMaxHardClauseLength63, FeatureMaxWaveScratchSize13x256
   ]
 >;
 
@@ -1130,7 +1133,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
    FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength32
+   FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize15x64
   ]
 >;
 
@@ -1151,7 +1154,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
    FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
-   FeatureMaxHardClauseLength32
+   FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize18x64
   ]
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -51,3 +51,18 @@ def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
   "Enable promote alloca pass"
 >;
 
+class SubtargetFeatureMaxWaveScratchSize <int Value, int FieldSize,
+                                          int Elements> :
+        SubtargetFeature<
+          "maxwavescratchsize"#FieldSize#"x"#Elements,
+          "MaxWaveScratchSize",
+          !cast<string>(Value),
+          "The dynamic callstack size in bytes"
+>;
+
+def FeatureMaxWaveScratchSize13x256 :
+  SubtargetFeatureMaxWaveScratchSize<8387584, 13, 256>;
+def FeatureMaxWaveScratchSize15x64 :
+  SubtargetFeatureMaxWaveScratchSize<8388352, 15, 64>;
+def FeatureMaxWaveScratchSize18x64 :
+  SubtargetFeatureMaxWaveScratchSize<67108608, 18, 64>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -68,6 +68,7 @@ class AMDGPUSubtarget {
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
   unsigned AddressableLocalMemorySize = 0;
+  unsigned MaxWaveScratchSize = 0;
   char WavefrontSizeLog2 = 0;
 
 public:
@@ -234,6 +235,8 @@ class AMDGPUSubtarget {
     return AddressableLocalMemorySize;
   }
 
+  unsigned getMaxWaveScratchSize() const { return MaxWaveScratchSize; }
+
   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
   /// CU mode into account.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -302,20 +302,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return (Generation)Gen;
   }
 
-  unsigned getMaxWaveScratchSize() const {
-    // See COMPUTE_TMPRING_SIZE.WAVESIZE.
-    if (getGeneration() >= GFX12) {
-      // 18-bit field in units of 64-dword.
-      return (64 * 4) * ((1 << 18) - 1);
-    }
-    if (getGeneration() == GFX11) {
-      // 15-bit field in units of 64-dword.
-      return (64 * 4) * ((1 << 15) - 1);
-    }
-    // 13-bit field in units of 256-dword.
-    return (256 * 4) * ((1 << 13) - 1);
-  }
-
   /// Return the number of high bits known to be zero for a frame index.
   unsigned getKnownHighZeroBitsForFrameIndex() const {
     return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -900,6 +900,22 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
   return 0;
 }
 
+unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI) {
+  // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+  if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize18x64)) {
+    // 18-bit field in units of 64-dword.
+    return (64 * 4) * ((1 << 18) - 1);
+  }
+
+  if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize15x64)) {
+    // 15-bit field in units of 64-dword.
+    return (64 * 4) * ((1 << 15) - 1);
+  }
+
+  // 13-bit field in units of 256-dword.
+  return (256 * 4) * ((1 << 13) - 1);
+}
+
 unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
   // "Per CU" really means "per whatever functional block the waves of a
   // workgroup must share". For gfx10 in CU mode this is the CU, which contains
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -206,6 +206,10 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
 /// \p STI.
 unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
 
+/// \returns Maximum dynamic callstack size in bytes for given subtarget
+/// \p STI.
+unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI);
+
 /// \returns Number of execution units per compute unit for given subtarget \p
 /// STI.
 unsigned getEUsPerCU(const MCSubtargetInfo *STI);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -704,7 +704,7 @@ struct AMDGPUQueueTy {
   /// signal and can define an optional input signal (nullptr if none).
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUSignalTy *OutputSignal,
                          AMDGPUSignalTy *InputSignal) {
     assert(OutputSignal && "Invalid kernel output signal");
@@ -743,7 +743,8 @@ struct AMDGPUQueueTy {
     Packet->grid_size_y = 1;
     Packet->grid_size_z = 1;
     Packet->private_segment_size =
-        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
+        Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
+                                  : Kernel.getPrivateSize();
     Packet->group_segment_size = GroupSize;
     Packet->kernel_object = Kernel.getKernelObject();
     Packet->kernarg_address = KernelArgs;
@@ -1212,7 +1213,7 @@ struct AMDGPUStreamTy {
   /// the kernel args buffer to the specified memory manager.
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUMemoryManagerTy &MemoryManager) {
     if (Queue == nullptr)
       return Plugin::error("Target queue was nullptr");
@@ -1975,6 +1976,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
             .contains("xnack+"))
       IsXnackEnabled = true;
 
+    // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
+    // See: e.g. 'FeatureMaxWaveScratchSize13x256' in
+    //           'llvm/lib/Target/AMDGPU/AMDGPUFeatures.td'
+    // ToDo: Relay MaxWaveScratchSize value here
+    // MaxThreadScratchSize = GCNSubtarget.getMaxWaveScratchSize() /
+    // WavefrontSize;
+
     // detect if device is an APU.
     if (auto Err = checkIfAPU())
       return Err;
@@ -2708,7 +2716,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
   Error setDeviceStackSize(uint64_t Value) override {
-    StackSize = Value;
+    if (Value > MaxThreadScratchSize) {
+      // Cap device scratch size.
+      MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
+              "%ld would exceed available resources.",
+              MaxThreadScratchSize, Value);
+      StackSize = MaxThreadScratchSize;
+    } else {
+      // Apply device scratch size, since it is within limits.
+      StackSize = Value;
+    }
+
     return Plugin::success();
   }
   Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2896,9 +2914,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The current size of the global device memory pool (managed by us).
   uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
 
-  /// The current size of the stack that will be used in cases where it could
-  /// not be statically determined.
-  uint64_t StackSize = 16 * 1024 /* 16 KB */;
+  /// Default: 1024, in conformity to hipLimitStackSize.
+  uint64_t StackSize = 1024 /* 1 KiB */;
+
+  // The maximum scratch memory size per thread.
+  // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
+  uint32_t MaxThreadScratchSize;
 
   /// Is the plugin associated with an APU?
   bool IsAPU = false;
@@ -3314,7 +3335,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, StackSize, ArgsMemoryManager);
+                                  GroupSize, static_cast<uint32_t>(StackSize),
+                                  ArgsMemoryManager);
 }
 
 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/dynamic_callstack.c b/openmp/libomptarget/test/offloading/dynamic_callstack.c
@@ -0,0 +1,80 @@
+#include <omp.h>
+#include <stdio.h>
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
+
+// TODO: Realize the following run in an acceptable manner.
+//       Unfortunately with insufficient scratch mem size programs will hang.
+//       Therefore, a timeout mechanism would help tremendously.
+//       Additionally, we need to allow empty output / unsuccessful execution.
+
+// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
+// RUN?:   timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN?:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
+// RUN?:   --allow-empty
+
+// REQUIRES: amdgcn-amd-amdhsa
+
+// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
+// That is: stack requirement for main's target region may not be calculated.
+
+// This recursive function will eventually return 0.
+int recursiveFunc(const int Recursions) {
+  if (Recursions < 1)
+    return 0;
+
+  int j[Recursions];
+#pragma omp target private(j)
+  { ; }
+
+  return recursiveFunc(Recursions - 1);
+}
+
+int main() {
+  int N = 256;
+  int a[N];
+  int b[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    a[i] = 0;
+
+  for (i = 0; i < N; i++)
+    b[i] = i;
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j] + recursiveFunc(j);
+  }
+
+  int rc = 0;
+  for (i = 0; i < N; i++)
+    if (a[i] != b[i]) {
+      rc++;
+      printf("Wrong value: a[%d]=%d\n", i, a[i]);
+    }
+
+  if (!rc)
+    printf("Success\n");
+
+  return rc;
+}
+
+/// CHECK: Success
+
+/// LIMIT_EXCEEDED: Scratch memory size will be set to
+/// LIMIT_EXCEEDED: Success
+
+/// LIMIT_INSUFFICIENT-NOT: Success