Skip to content

Commit 47a3ad5

Browse files
authored
[Libomptarget] Handle dynamic stack sizes for AMD COV5 (#72606)
Summary: One of the changes in AMD code object version 5 (COV5) is that kernels using an unknown amount of private stack memory no longer default to 16 KB. Instead, a flag is emitted to indicate that the runtime must provide a value. This patch checks whether the runtime must provide such a stack size and, if so, uses the existing handling of the stack environment variable to configure it.
1 parent 9824040 commit 47a3ad5

File tree

3 files changed

+26
-7
lines changed

3 files changed

+26
-7
lines changed

openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ typedef enum {
288288
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
289289
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
290290
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
291+
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
291292
} hsa_executable_symbol_info_t;
292293

293294
typedef struct hsa_code_object_s {

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
475475
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
476476
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
477477
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
478+
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
478479
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};
479480

480481
for (auto &Info : RequiredInfos) {
@@ -524,6 +525,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
524525
/// @return 56 for cov4 and 256 for cov5
525526
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
526527

528+
/// Indicates whether or not we need to set up our own private segment size.
529+
bool usesDynamicStack() const { return DynamicStack; }
530+
527531
private:
528532
/// The kernel object to execute.
529533
uint64_t KernelObject;
@@ -532,6 +536,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
532536
uint32_t ArgsSize;
533537
uint32_t GroupSize;
534538
uint32_t PrivateSize;
539+
bool DynamicStack;
535540

536541
/// The size of implicit kernel arguments.
537542
uint32_t ImplicitArgsSize;
@@ -660,7 +665,8 @@ struct AMDGPUQueueTy {
660665
/// signal and can define an optional input signal (nullptr if none).
661666
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
662667
uint32_t NumThreads, uint64_t NumBlocks,
663-
uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
668+
uint32_t GroupSize, uint64_t StackSize,
669+
AMDGPUSignalTy *OutputSignal,
664670
AMDGPUSignalTy *InputSignal) {
665671
assert(OutputSignal && "Invalid kernel output signal");
666672

@@ -697,7 +703,8 @@ struct AMDGPUQueueTy {
697703
Packet->grid_size_x = NumBlocks * NumThreads;
698704
Packet->grid_size_y = 1;
699705
Packet->grid_size_z = 1;
700-
Packet->private_segment_size = Kernel.getPrivateSize();
706+
Packet->private_segment_size =
707+
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
701708
Packet->group_segment_size = GroupSize;
702709
Packet->kernel_object = Kernel.getKernelObject();
703710
Packet->kernarg_address = KernelArgs;
@@ -1166,7 +1173,7 @@ struct AMDGPUStreamTy {
11661173
/// the kernel args buffer to the specified memory manager.
11671174
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
11681175
uint32_t NumThreads, uint64_t NumBlocks,
1169-
uint32_t GroupSize,
1176+
uint32_t GroupSize, uint64_t StackSize,
11701177
AMDGPUMemoryManagerTy &MemoryManager) {
11711178
if (Queue == nullptr)
11721179
return Plugin::error("Target queue was nullptr");
@@ -1189,7 +1196,8 @@ struct AMDGPUStreamTy {
11891196

11901197
// Push the kernel with the output signal and an input signal (optional)
11911198
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
1192-
GroupSize, OutputSignal, InputSignal);
1199+
GroupSize, StackSize, OutputSignal,
1200+
InputSignal);
11931201
}
11941202

11951203
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -2610,10 +2618,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26102618

26112619
/// Getters and setters for stack and heap sizes.
26122620
Error getDeviceStackSize(uint64_t &Value) override {
2613-
Value = 0;
2621+
Value = StackSize;
26142622
return Plugin::success();
26152623
}
26162624
Error setDeviceStackSize(uint64_t Value) override {
2625+
StackSize = Value;
26172626
return Plugin::success();
26182627
}
26192628
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2769,6 +2778,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27692778

27702779
/// The current size of the global device memory pool (managed by us).
27712780
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
2781+
2782+
/// The current size of the stack that will be used in cases where it could
2783+
/// not be statically determined.
2784+
uint64_t StackSize = 16 * 1024 /* 16 KB */;
27722785
};
27732786

27742787
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3142,6 +3155,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
31423155
GroupSize += MaxDynCGroupMem;
31433156
}
31443157

3158+
uint64_t StackSize;
3159+
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
3160+
return Err;
3161+
31453162
// Initialize implicit arguments.
31463163
utils::AMDGPUImplicitArgsTy *ImplArgs =
31473164
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
@@ -3180,7 +3197,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
31803197

31813198
// Push the kernel launch into the stream.
31823199
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
3183-
GroupSize, ArgsMemoryManager);
3200+
GroupSize, StackSize, ArgsMemoryManager);
31843201
}
31853202

31863203
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,

openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
864864
return 0;
865865
}
866866

867+
virtual Error getDeviceStackSize(uint64_t &V) = 0;
868+
867869
private:
868870
/// Register offload entry for global variable.
869871
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
882884
/// Get and set the stack size and heap size for the device. If not used, the
883885
/// plugin can implement the setters as no-op and setting the output
884886
/// value to zero for the getters.
885-
virtual Error getDeviceStackSize(uint64_t &V) = 0;
886887
virtual Error setDeviceStackSize(uint64_t V) = 0;
887888
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
888889
virtual Error setDeviceHeapSize(uint64_t V) = 0;

0 commit comments

Comments
 (0)