Skip to content

Commit 828e8dc

Browse files
mhalkronlieb
authored and committed
[OpenMP][AMDGPU] Adapt dynamic callstack behavior to upstream merge
Reinstated several minor changes in behavior w.r.t. conformity with HIP: 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation, in case of a dynamic callstack, the maximum of the user-provided and the compiler-default size is chosen. 3. Make sure we only allow 32-bit values for the stack size. Added calculation of the maximum dynamic callstack size per thread: * If a value provided via LIBOMPTARGET_STACK_SIZE exceeds MaxThreadScratchSize, it will be capped. See: * gerrit review 942931 / 968158 * llvm#72606 * llvm#74080. Change-Id: Ib0ef997b567f5f55097456c56d3f0bc2e287f848
1 parent c7ff25b commit 828e8dc

File tree

2 files changed

+41
-25
lines changed

2 files changed

+41
-25
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -632,12 +632,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
632632
return Err;
633633
}
634634

635-
// Get the currently set dynamic stack size from the device as 32bit value.
636-
assert(Device.getTargetStackSize() <=
637-
std::numeric_limits<uint32_t>::max() &&
638-
"AMDGPU Private Address Space may not exceed 32bit range");
639-
TargetStackSize = static_cast<uint32_t>(Device.getTargetStackSize());
640-
641635
// Make sure it is a kernel symbol.
642636
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
643637
return Plugin::error("Symbol %s is not a kernel function");
@@ -714,10 +708,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
714708
uint32_t getPrivateSize() const { return PrivateSize; }
715709
uint16_t getConstWGSize() const { return ConstWGSize; }
716710

717-
// Get dynamic callstack information.
718-
bool hasDynamicCallstack() const { return DynCallstack; }
719-
uint32_t getTargetStackSize() const { return TargetStackSize; }
720-
721711
/// Get the HSA kernel object representing the kernel function.
722712
uint64_t getKernelObject() const { return KernelObject; }
723713

@@ -738,14 +728,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
738728
uint32_t PrivateSize;
739729
bool DynamicStack;
740730

741-
// The kernel meta data if a dynamic callstack is being used.
742-
bool DynCallstack;
743-
744-
// The dynamic / target callstack size (from environmental variable).
745-
// Note: While the EnVar is provided as uint64_t, the amdgpu private address
746-
// space uses 32 bit.
747-
uint32_t TargetStackSize;
748-
749731
/// The size of implicit kernel arguments.
750732
uint32_t ImplicitArgsSize;
751733

@@ -1161,7 +1143,7 @@ struct AMDGPUQueueTy {
11611143
/// signal and can define an optional input signal (nullptr if none).
11621144
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
11631145
uint32_t NumThreads, uint64_t NumBlocks,
1164-
uint32_t GroupSize, uint64_t StackSize,
1146+
uint32_t GroupSize, uint32_t StackSize,
11651147
AMDGPUSignalTy *OutputSignal,
11661148
AMDGPUSignalTy *InputSignal) {
11671149
assert(OutputSignal && "Invalid kernel output signal");
@@ -1200,7 +1182,8 @@ struct AMDGPUQueueTy {
12001182
Packet->grid_size_y = 1;
12011183
Packet->grid_size_z = 1;
12021184
Packet->private_segment_size =
1203-
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
1185+
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
1186+
: Kernel.getPrivateSize();
12041187
Packet->group_segment_size = GroupSize;
12051188
Packet->kernel_object = Kernel.getKernelObject();
12061189
Packet->kernarg_address = KernelArgs;
@@ -1735,7 +1718,7 @@ struct AMDGPUStreamTy {
17351718
/// the kernel args buffer to the specified memory manager.
17361719
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
17371720
uint32_t NumThreads, uint64_t NumBlocks,
1738-
uint32_t GroupSize, uint64_t StackSize,
1721+
uint32_t GroupSize, uint32_t StackSize,
17391722
AMDGPUMemoryManagerTy &MemoryManager) {
17401723
if (Queue == nullptr)
17411724
return Plugin::error("Target queue was nullptr");
@@ -2566,6 +2549,24 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25662549
else
25672550
return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
25682551

2552+
// To determine the correct scratch memory size per thread, we need to check
2553+
// the device architecture generation. Hence, we slice the major GFX version
2554+
// from the agent info (e.g. 'gfx90a' -> 9).
2555+
StringRef Arch(ComputeUnitKind);
2556+
unsigned GfxGen = 0u;
2557+
if (!llvm::to_integer(Arch.slice(sizeof("gfx") - 1, Arch.size() - 2),
2558+
GfxGen))
2559+
return Plugin::error("Invalid GFX architecture string");
2560+
2561+
// TODO: Will try to eliminate this calculation, since it's duplicated.
2562+
// See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
2563+
// But we need to divide by WavefrontSize.
2564+
// For generations pre-gfx11: use 13-bit field in units of 256-dword,
2565+
// otherwise: 15-bit field in units of 64-dword.
2566+
MaxThreadScratchSize = (GfxGen < 11)
2567+
? ((256 * 4) / WavefrontSize) * ((1 << 13) - 1)
2568+
: ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
2569+
25692570
// Get maximum number of workitems per workgroup.
25702571
uint16_t WorkgroupMaxDim[3];
25712572
if (auto Err =
@@ -3416,7 +3417,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
34163417
return Plugin::success();
34173418
}
34183419
Error setDeviceStackSize(uint64_t Value) override {
3419-
StackSize = Value;
3420+
if (Value > MaxThreadScratchSize) {
3421+
// Cap device scratch size.
3422+
MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
3423+
"%ld would exceed available resources.",
3424+
MaxThreadScratchSize, Value);
3425+
StackSize = MaxThreadScratchSize;
3426+
} else {
3427+
// Apply device scratch size, since it is within limits.
3428+
StackSize = Value;
3429+
}
3430+
34203431
return Plugin::success();
34213432
}
34223433
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -3663,7 +3674,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
36633674

36643675
/// The current size of the stack that will be used in cases where it could
36653676
/// not be statically determined.
3666-
uint64_t StackSize = 16 * 1024 /* 16 KB */;
3677+
/// Default: 1024, in conformity to hipLimitStackSize.
3678+
uint32_t StackSize = 1024 /* 1 KB */;
3679+
3680+
// The maximum scratch memory size per thread.
3681+
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
3682+
uint32_t MaxThreadScratchSize;
36673683
};
36683684

36693685
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -4375,7 +4391,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
43754391

43764392
// Push the kernel launch into the stream.
43774393
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
4378-
GroupSize, StackSize, ArgsMemoryManager);
4394+
GroupSize, static_cast<uint32_t>(StackSize),
4395+
ArgsMemoryManager);
43794396
}
43804397

43814398
void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,

openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
884884
int32_t getOMPTeamsThreadLimit() const { return OMP_TeamsThreadLimit; }
885885

886886
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
887-
uint64_t getTargetStackSize() const { return OMPX_TargetStackSize; }
888887
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
889888

890889
virtual uint32_t getOMPXLowTripCount() const {

0 commit comments

Comments
 (0)