Skip to content

Commit ec43330

Browse files
committed
[Libomptarget] Handle dynamic stack sizes for AMD COV5
Summary: One of the changes in AMD code-object version 5 (COV5) is that kernels using an unknown amount of private stack memory no longer default to 16 KB. Instead, a flag is emitted to indicate that the runtime must provide a value. This patch checks whether we must provide such a stack size and, if so, uses the existing handling of the stack environment variable to configure it.
1 parent cc9e19e commit ec43330

File tree

3 files changed

+26
-7
lines changed

3 files changed

+26
-7
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ typedef enum {
288288
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
289289
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
290290
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
291+
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
291292
} hsa_executable_symbol_info_t;
292293

293294
typedef struct hsa_code_object_s {

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
436436
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
437437
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
438438
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
439+
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
439440
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
440441

441442
for (auto &Info : RequiredInfos) {
@@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
485486
/// @return 56 for cov4 and 256 for cov5
486487
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
487488

489+
/// Indicates whether or not we need to set up our own private segment size.
490+
bool usesDynamicStack() const { return DynamicStack; }
491+
488492
private:
489493
/// The kernel object to execute.
490494
uint64_t KernelObject;
@@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
493497
uint32_t ArgsSize;
494498
uint32_t GroupSize;
495499
uint32_t PrivateSize;
500+
bool DynamicStack;
496501

497502
/// The size of implicit kernel arguments.
498503
uint32_t ImplicitArgsSize;
@@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
621626
/// signal and can define an optional input signal (nullptr if none).
622627
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
623628
uint32_t NumThreads, uint64_t NumBlocks,
624-
uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
629+
uint32_t GroupSize, uint64_t StackSize,
630+
AMDGPUSignalTy *OutputSignal,
625631
AMDGPUSignalTy *InputSignal) {
626632
assert(OutputSignal && "Invalid kernel output signal");
627633

@@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
658664
Packet->grid_size_x = NumBlocks * NumThreads;
659665
Packet->grid_size_y = 1;
660666
Packet->grid_size_z = 1;
661-
Packet->private_segment_size = Kernel.getPrivateSize();
667+
Packet->private_segment_size =
668+
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
662669
Packet->group_segment_size = GroupSize;
663670
Packet->kernel_object = Kernel.getKernelObject();
664671
Packet->kernarg_address = KernelArgs;
@@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
11241131
/// the kernel args buffer to the specified memory manager.
11251132
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
11261133
uint32_t NumThreads, uint64_t NumBlocks,
1127-
uint32_t GroupSize,
1134+
uint32_t GroupSize, uint64_t StackSize,
11281135
AMDGPUMemoryManagerTy &MemoryManager) {
11291136
if (Queue == nullptr)
11301137
return Plugin::error("Target queue was nullptr");
@@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {
11471154

11481155
// Push the kernel with the output signal and an input signal (optional)
11491156
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
1150-
GroupSize, OutputSignal, InputSignal);
1157+
GroupSize, StackSize, OutputSignal,
1158+
InputSignal);
11511159
}
11521160

11531161
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25742582

25752583
/// Getters and setters for stack and heap sizes.
25762584
Error getDeviceStackSize(uint64_t &Value) override {
2577-
Value = 0;
2585+
Value = StackSize;
25782586
return Plugin::success();
25792587
}
25802588
Error setDeviceStackSize(uint64_t Value) override {
2589+
StackSize = Value;
25812590
return Plugin::success();
25822591
}
25832592
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27282737

27292738
/// The current size of the global device memory pool (managed by us).
27302739
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
2740+
2741+
/// The current size of the stack that will be used in cases where it could
2742+
/// not be statically determined.
2743+
uint64_t StackSize = 16 * 1024 /* 16 KB */;
27312744
};
27322745

27332746
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3100,6 +3113,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
31003113
GroupSize += MaxDynCGroupMem;
31013114
}
31023115

3116+
uint64_t StackSize;
3117+
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3118+
return Err;
3119+
31033120
// Initialize implicit arguments.
31043121
utils::AMDGPUImplicitArgsTy *ImplArgs =
31053122
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3138,7 +3155,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
31383155

31393156
// Push the kernel launch into the stream.
31403157
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3141-
GroupSize, ArgsMemoryManager);
3158+
GroupSize, StackSize, ArgsMemoryManager);
31423159
}
31433160

31443161
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
864864
return 0;
865865
}
866866

867+
virtual Error getDeviceStackSize(uint64_t &V) = 0;
868+
867869
private:
868870
/// Register offload entry for global variable.
869871
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
882884
/// Get and set the stack size and heap size for the device. If not used, the
883885
/// plugin can implement the setters as no-op and setting the output
884886
/// value to zero for the getters.
885-
virtual Error getDeviceStackSize(uint64_t &V) = 0;
886887
virtual Error setDeviceStackSize(uint64_t V) = 0;
887888
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
888889
virtual Error setDeviceHeapSize(uint64_t V) = 0;

0 commit comments

Comments
 (0)