Skip to content

Commit 9ca5835

Browse files
committed
[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value. Changed several minor properties to be in sync with HIP: 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen. 3. Make sure we only allow 32bit values for stack size. Added testcase where a dynamic stack is required due to recursion.
1 parent d222fa4 commit 9ca5835

File tree

2 files changed

+122
-6
lines changed

2 files changed

+122
-6
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
666666
/// signal and can define an optional input signal (nullptr if none).
667667
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
668668
uint32_t NumThreads, uint64_t NumBlocks,
669-
uint32_t GroupSize, uint64_t StackSize,
669+
uint32_t GroupSize, uint32_t StackSize,
670670
AMDGPUSignalTy *OutputSignal,
671671
AMDGPUSignalTy *InputSignal) {
672672
assert(OutputSignal && "Invalid kernel output signal");
@@ -705,7 +705,8 @@ struct AMDGPUQueueTy {
705705
Packet->grid_size_y = 1;
706706
Packet->grid_size_z = 1;
707707
Packet->private_segment_size =
708-
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
708+
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
709+
: Kernel.getPrivateSize();
709710
Packet->group_segment_size = GroupSize;
710711
Packet->kernel_object = Kernel.getKernelObject();
711712
Packet->kernarg_address = KernelArgs;
@@ -1174,7 +1175,7 @@ struct AMDGPUStreamTy {
11741175
/// the kernel args buffer to the specified memory manager.
11751176
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
11761177
uint32_t NumThreads, uint64_t NumBlocks,
1177-
uint32_t GroupSize, uint64_t StackSize,
1178+
uint32_t GroupSize, uint32_t StackSize,
11781179
AMDGPUMemoryManagerTy &MemoryManager) {
11791180
if (Queue == nullptr)
11801181
return Plugin::error("Target queue was nullptr");
@@ -1872,6 +1873,25 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18721873
else
18731874
return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
18741875

1876+
// To determine the correct scratch memory size per thread, we need to check
1877+
// the device architecure generation. According to AOT_OFFLOADARCHS we may
1878+
// assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
1879+
// with a two char arch specialization. In-between is the 1-2 char
1880+
// generation number we want to extract.
1881+
StringRef Arch(ComputeUnitKind);
1882+
unsigned GfxGen = 0u;
1883+
if (!llvm::to_integer(Arch.slice(sizeof("gfx") - 1, Arch.size() - 2),
1884+
GfxGen))
1885+
return Plugin::error("Invalid GFX architecture string");
1886+
1887+
// See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
1888+
// But we need to divide by WavefrontSize.
1889+
// For generations pre-gfx11: use 13-bit field in units of 256-dword,
1890+
// otherwise: 15-bit field in units of 64-dword.
1891+
MaxThreadScratchSize = (GfxGen < 11)
1892+
? ((256 * 4) / WavefrontSize) * ((1 << 13) - 1)
1893+
: ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
1894+
18751895
// Get maximum number of workitems per workgroup.
18761896
uint16_t WorkgroupMaxDim[3];
18771897
if (auto Err =
@@ -2623,7 +2643,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26232643
return Plugin::success();
26242644
}
26252645
Error setDeviceStackSize(uint64_t Value) override {
2626-
StackSize = Value;
2646+
if (Value > MaxThreadScratchSize) {
2647+
// Cap device scratch size.
2648+
MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
2649+
"%ld would exceed available resources.",
2650+
MaxThreadScratchSize, Value);
2651+
StackSize = MaxThreadScratchSize;
2652+
} else {
2653+
// Apply device scratch size, since it is within limits.
2654+
StackSize = Value;
2655+
}
2656+
26272657
return Plugin::success();
26282658
}
26292659
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2782,7 +2812,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27822812

27832813
/// The current size of the stack that will be used in cases where it could
27842814
/// not be statically determined.
2785-
uint64_t StackSize = 16 * 1024 /* 16 KB */;
2815+
/// Default: 1024, in conformity to hipLimitStackSize.
2816+
uint64_t StackSize = 1024 /* 1 KiB */;
2817+
2818+
// The maximum scratch memory size per thread.
2819+
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
2820+
uint32_t MaxThreadScratchSize;
27862821
};
27872822

27882823
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3198,7 +3233,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
31983233

31993234
// Push the kernel launch into the stream.
32003235
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3201-
GroupSize, StackSize, ArgsMemoryManager);
3236+
GroupSize, static_cast<uint32_t>(StackSize),
3237+
ArgsMemoryManager);
32023238
}
32033239

32043240
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#include <omp.h>
2+
#include <stdio.h>
3+
4+
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
5+
6+
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
7+
// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
8+
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
9+
// RUN: | %fcheck-amdgcn-amd-amdhsa
10+
11+
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
12+
// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
13+
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
14+
// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
15+
16+
// TODO: Realize the following run in an acceptable manner.
17+
// Unfortunately with insufficient scratch mem size programs will hang.
18+
// Therefore, a timeout mechanism would help tremendously.
19+
// Additionally, we need to allow empty output / unsuccessful execution.
20+
21+
// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
22+
// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
23+
// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
24+
// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
25+
// RUN?: --allow-empty
26+
27+
// REQUIRES: amdgcn-amd-amdhsa
28+
29+
// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
30+
// That is: stack requirement for main's target region may not be calculated.
31+
32+
// This recursive function will eventually return 0.
33+
int recursiveFunc(const int Recursions) {
34+
if (Recursions < 1)
35+
return 0;
36+
37+
int j[Recursions];
38+
#pragma omp target private(j)
39+
{ ; }
40+
41+
return recursiveFunc(Recursions - 1);
42+
}
43+
44+
int main() {
45+
int N = 256;
46+
int a[N];
47+
int b[N];
48+
int i;
49+
50+
for (i = 0; i < N; i++)
51+
a[i] = 0;
52+
53+
for (i = 0; i < N; i++)
54+
b[i] = i;
55+
56+
#pragma omp target parallel for
57+
{
58+
for (int j = 0; j < N; j++)
59+
a[j] = b[j] + recursiveFunc(j);
60+
}
61+
62+
int rc = 0;
63+
for (i = 0; i < N; i++)
64+
if (a[i] != b[i]) {
65+
rc++;
66+
printf("Wrong value: a[%d]=%d\n", i, a[i]);
67+
}
68+
69+
if (!rc)
70+
printf("Success\n");
71+
72+
return rc;
73+
}
74+
75+
/// CHECK: Success
76+
77+
/// LIMIT_EXCEEDED: Scratch memory size will be set to
78+
/// LIMIT_EXCEEDED: Success
79+
80+
/// LIMIT_INSUFFICIENT-NOT: Success

0 commit comments

Comments
 (0)