Skip to content

Commit 21c4c52

Browse files
committed
[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value. Changed several minor properties to be in sync with HIP: 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen. 3. Make sure we only allow 32bit values for stack size. Added testcase where a dynamic stack is required due to recursion.
1 parent 4ce737b commit 21c4c52

File tree

8 files changed

+158
-29
lines changed

8 files changed

+158
-29
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,7 +1043,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
10431043
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
10441044
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
10451045
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
1046-
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
1046+
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1047+
FeatureMaxWaveScratchSize13x256
10471048
]
10481049
>;
10491050

@@ -1054,7 +1055,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
10541055
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
10551056
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
10561057
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
1057-
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
1058+
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1059+
FeatureMaxWaveScratchSize13x256
10581060
]
10591061
>;
10601062

@@ -1070,7 +1072,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
10701072
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
10711073
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
10721074
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
1073-
FeatureDefaultComponentZero
1075+
FeatureDefaultComponentZero, FeatureMaxWaveScratchSize13x256
10741076
]
10751077
>;
10761078

@@ -1088,7 +1090,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
10881090
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
10891091
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
10901092
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
1091-
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
1093+
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
1094+
FeatureMaxWaveScratchSize13x256
10921095
]
10931096
>;
10941097

@@ -1109,7 +1112,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
11091112
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
11101113
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
11111114
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
1112-
FeatureMaxHardClauseLength63
1115+
FeatureMaxHardClauseLength63, FeatureMaxWaveScratchSize13x256
11131116
]
11141117
>;
11151118

@@ -1130,7 +1133,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
11301133
FeatureA16, FeatureFastDenormalF32, FeatureG16,
11311134
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
11321135
FeatureGWS, FeatureDefaultComponentZero,
1133-
FeatureMaxHardClauseLength32
1136+
FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize15x64
11341137
]
11351138
>;
11361139

@@ -1151,7 +1154,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
11511154
FeatureA16, FeatureFastDenormalF32, FeatureG16,
11521155
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
11531156
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
1154-
FeatureMaxHardClauseLength32
1157+
FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize18x64
11551158
]
11561159
>;
11571160

llvm/lib/Target/AMDGPU/AMDGPUFeatures.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,18 @@ def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
5151
"Enable promote alloca pass"
5252
>;
5353

54+
class SubtargetFeatureMaxWaveScratchSize <int Value, int FieldSize,
55+
int Elements> :
56+
SubtargetFeature<
57+
"maxwavescratchsize"#FieldSize#"x"#Elements,
58+
"MaxWaveScratchSize",
59+
!cast<string>(Value),
60+
"The dynamic callstack size in bytes"
61+
>;
62+
63+
def FeatureMaxWaveScratchSize13x256 :
64+
SubtargetFeatureMaxWaveScratchSize<8387584, 13, 256>;
65+
def FeatureMaxWaveScratchSize15x64 :
66+
SubtargetFeatureMaxWaveScratchSize<8388352, 15, 64>;
67+
def FeatureMaxWaveScratchSize18x64 :
68+
SubtargetFeatureMaxWaveScratchSize<67108608, 18, 64>;

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ class AMDGPUSubtarget {
6868
unsigned MaxWavesPerEU = 10;
6969
unsigned LocalMemorySize = 0;
7070
unsigned AddressableLocalMemorySize = 0;
71+
unsigned MaxWaveScratchSize = 0;
7172
char WavefrontSizeLog2 = 0;
7273

7374
public:
@@ -234,6 +235,8 @@ class AMDGPUSubtarget {
234235
return AddressableLocalMemorySize;
235236
}
236237

238+
unsigned getMaxWaveScratchSize() const { return MaxWaveScratchSize; }
239+
237240
/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
238241
/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
239242
/// CU mode into account.

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -302,20 +302,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
302302
return (Generation)Gen;
303303
}
304304

305-
unsigned getMaxWaveScratchSize() const {
306-
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
307-
if (getGeneration() >= GFX12) {
308-
// 18-bit field in units of 64-dword.
309-
return (64 * 4) * ((1 << 18) - 1);
310-
}
311-
if (getGeneration() == GFX11) {
312-
// 15-bit field in units of 64-dword.
313-
return (64 * 4) * ((1 << 15) - 1);
314-
}
315-
// 13-bit field in units of 256-dword.
316-
return (256 * 4) * ((1 << 13) - 1);
317-
}
318-
319305
/// Return the number of high bits known to be zero for a frame index.
320306
unsigned getKnownHighZeroBitsForFrameIndex() const {
321307
return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,22 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
900900
return 0;
901901
}
902902

903+
unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI) {
904+
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
905+
if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize18x64)) {
906+
// 18-bit field in units of 64-dword.
907+
return (64 * 4) * ((1 << 18) - 1);
908+
}
909+
910+
if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize15x64)) {
911+
// 15-bit field in units of 64-dword.
912+
return (64 * 4) * ((1 << 15) - 1);
913+
}
914+
915+
// 13-bit field in units of 256-dword.
916+
return (256 * 4) * ((1 << 13) - 1);
917+
}
918+
903919
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
904920
// "Per CU" really means "per whatever functional block the waves of a
905921
// workgroup must share". For gfx10 in CU mode this is the CU, which contains

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,10 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
206206
/// \p STI.
207207
unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
208208

209+
/// \returns Maximum dynamic callstack size in bytes for given subtarget
210+
/// \p STI.
211+
unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI);
212+
209213
/// \returns Number of execution units per compute unit for given subtarget \p
210214
/// STI.
211215
unsigned getEUsPerCU(const MCSubtargetInfo *STI);

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ struct AMDGPUQueueTy {
704704
/// signal and can define an optional input signal (nullptr if none).
705705
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
706706
uint32_t NumThreads, uint64_t NumBlocks,
707-
uint32_t GroupSize, uint64_t StackSize,
707+
uint32_t GroupSize, uint32_t StackSize,
708708
AMDGPUSignalTy *OutputSignal,
709709
AMDGPUSignalTy *InputSignal) {
710710
assert(OutputSignal && "Invalid kernel output signal");
@@ -743,7 +743,8 @@ struct AMDGPUQueueTy {
743743
Packet->grid_size_y = 1;
744744
Packet->grid_size_z = 1;
745745
Packet->private_segment_size =
746-
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
746+
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
747+
: Kernel.getPrivateSize();
747748
Packet->group_segment_size = GroupSize;
748749
Packet->kernel_object = Kernel.getKernelObject();
749750
Packet->kernarg_address = KernelArgs;
@@ -1212,7 +1213,7 @@ struct AMDGPUStreamTy {
12121213
/// the kernel args buffer to the specified memory manager.
12131214
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
12141215
uint32_t NumThreads, uint64_t NumBlocks,
1215-
uint32_t GroupSize, uint64_t StackSize,
1216+
uint32_t GroupSize, uint32_t StackSize,
12161217
AMDGPUMemoryManagerTy &MemoryManager) {
12171218
if (Queue == nullptr)
12181219
return Plugin::error("Target queue was nullptr");
@@ -1975,6 +1976,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
19751976
.contains("xnack+"))
19761977
IsXnackEnabled = true;
19771978

1979+
// See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
1980+
// See: e.g. 'FeatureMaxWaveScratchSize13x256' in
1981+
// 'llvm/lib/Target/AMDGPU/AMDGPUFeatures.td'
1982+
// ToDo: Relay MaxWaveScratchSize value here
1983+
// MaxThreadScratchSize = GCNSubtarget.getMaxWaveScratchSize() /
1984+
// WavefrontSize;
1985+
19781986
// detect if device is an APU.
19791987
if (auto Err = checkIfAPU())
19801988
return Err;
@@ -2708,7 +2716,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27082716
return Plugin::success();
27092717
}
27102718
Error setDeviceStackSize(uint64_t Value) override {
2711-
StackSize = Value;
2719+
if (Value > MaxThreadScratchSize) {
2720+
// Cap device scratch size.
2721+
MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
2722+
"%ld would exceed available resources.",
2723+
MaxThreadScratchSize, Value);
2724+
StackSize = MaxThreadScratchSize;
2725+
} else {
2726+
// Apply device scratch size, since it is within limits.
2727+
StackSize = Value;
2728+
}
2729+
27122730
return Plugin::success();
27132731
}
27142732
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2896,9 +2914,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
28962914
/// The current size of the global device memory pool (managed by us).
28972915
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
28982916

2899-
/// The current size of the stack that will be used in cases where it could
2900-
/// not be statically determined.
2901-
uint64_t StackSize = 16 * 1024 /* 16 KB */;
2917+
/// Default: 1024, in conformity to hipLimitStackSize.
2918+
uint64_t StackSize = 1024 /* 1 KiB */;
2919+
2920+
// The maximum scratch memory size per thread.
2921+
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
2922+
uint32_t MaxThreadScratchSize;
29022923

29032924
/// Is the plugin associated with an APU?
29042925
bool IsAPU = false;
@@ -3314,7 +3335,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33143335

33153336
// Push the kernel launch into the stream.
33163337
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3317-
GroupSize, StackSize, ArgsMemoryManager);
3338+
GroupSize, static_cast<uint32_t>(StackSize),
3339+
ArgsMemoryManager);
33183340
}
33193341

33203342
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#include <omp.h>
2+
#include <stdio.h>
3+
4+
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
5+
6+
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
7+
// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
8+
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
9+
// RUN: | %fcheck-amdgcn-amd-amdhsa
10+
11+
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
12+
// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
13+
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
14+
// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
15+
16+
// TODO: Realize the following run in an acceptable manner.
17+
// Unfortunately with insufficient scratch mem size programs will hang.
18+
// Therefore, a timeout mechanism would help tremendously.
19+
// Additionally, we need to allow empty output / unsuccessful execution.
20+
21+
// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
22+
// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
23+
// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
24+
// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
25+
// RUN?: --allow-empty
26+
27+
// REQUIRES: amdgcn-amd-amdhsa
28+
29+
// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
30+
// That is: stack requirement for main's target region may not be calculated.
31+
32+
// This recursive function will eventually return 0.
33+
int recursiveFunc(const int Recursions) {
34+
if (Recursions < 1)
35+
return 0;
36+
37+
int j[Recursions];
38+
#pragma omp target private(j)
39+
{ ; }
40+
41+
return recursiveFunc(Recursions - 1);
42+
}
43+
44+
int main() {
45+
int N = 256;
46+
int a[N];
47+
int b[N];
48+
int i;
49+
50+
for (i = 0; i < N; i++)
51+
a[i] = 0;
52+
53+
for (i = 0; i < N; i++)
54+
b[i] = i;
55+
56+
#pragma omp target parallel for
57+
{
58+
for (int j = 0; j < N; j++)
59+
a[j] = b[j] + recursiveFunc(j);
60+
}
61+
62+
int rc = 0;
63+
for (i = 0; i < N; i++)
64+
if (a[i] != b[i]) {
65+
rc++;
66+
printf("Wrong value: a[%d]=%d\n", i, a[i]);
67+
}
68+
69+
if (!rc)
70+
printf("Success\n");
71+
72+
return rc;
73+
}
74+
75+
/// CHECK: Success
76+
77+
/// LIMIT_EXCEEDED: Scratch memory size will be set to
78+
/// LIMIT_EXCEEDED: Success
79+
80+
/// LIMIT_INSUFFICIENT-NOT: Success

0 commit comments

Comments
 (0)