Skip to content

[Libomptarget] Handle dynamic stack sizes for AMD COV5 #72606

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ typedef enum {
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
} hsa_executable_symbol_info_t;

typedef struct hsa_code_object_s {
Expand Down
29 changes: 23 additions & 6 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}};

for (auto &Info : RequiredInfos) {
Expand Down Expand Up @@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// @return 56 for cov4 and 256 for cov5
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }

/// Indicates whether or not we need to set up our own private segment size.
bool usesDynamicStack() const { return DynamicStack; }

private:
/// The kernel object to execute.
uint64_t KernelObject;
Expand All @@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint32_t ArgsSize;
uint32_t GroupSize;
uint32_t PrivateSize;
bool DynamicStack;

/// The size of implicit kernel arguments.
uint32_t ImplicitArgsSize;
Expand Down Expand Up @@ -621,7 +626,8 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize, AMDGPUSignalTy *OutputSignal,
uint32_t GroupSize, uint64_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");

Expand Down Expand Up @@ -658,7 +664,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_x = NumBlocks * NumThreads;
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size = Kernel.getPrivateSize();
Packet->private_segment_size =
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
Expand Down Expand Up @@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize,
uint32_t GroupSize, uint64_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
Expand All @@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy {

// Push the kernel with the output signal and an input signal (optional)
return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
GroupSize, OutputSignal, InputSignal);
GroupSize, StackSize, OutputSignal,
InputSignal);
}

/// Push an asynchronous memory copy between pinned memory buffers.
Expand Down Expand Up @@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
Value = 0;
Value = StackSize;
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Expand Down Expand Up @@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;

/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;
};

Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
Expand Down Expand Up @@ -3100,6 +3113,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GroupSize += MaxDynCGroupMem;
}

uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;

// Initialize implicit arguments.
utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
Expand Down Expand Up @@ -3138,7 +3155,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, ArgsMemoryManager);
GroupSize, StackSize, ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return 0;
}

virtual Error getDeviceStackSize(uint64_t &V) = 0;

private:
/// Register offload entry for global variable.
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
Expand All @@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
Expand Down