Skip to content

[Offload][AMDGPU] Correctly handle variable implicit argument sizes #142199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 35 additions & 19 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3365,9 +3365,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
if (ArgsSize != LaunchParams.Size &&
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
ArgsSize > LaunchParams.Size + getImplicitArgsSize())
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"mismatch of kernel arguments size");
"invalid kernel arguments size");

AMDGPUPluginTy &AMDGPUPlugin =
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
Expand Down Expand Up @@ -3401,23 +3401,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
return Err;

hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
utils::advancePtr(AllArgs, LaunchParams.Size));

// Set the COV5+ implicit arguments to the appropriate values.
std::memset(ImplArgs, 0, getImplicitArgsSize());
ImplArgs->BlockCountX = NumBlocks[0];
ImplArgs->BlockCountY = NumBlocks[1];
ImplArgs->BlockCountZ = NumBlocks[2];
ImplArgs->GroupSizeX = NumThreads[0];
ImplArgs->GroupSizeY = NumThreads[1];
ImplArgs->GroupSizeZ = NumThreads[2];
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
uint64_t ImplArgsOffset = utils::roundUp(
LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
if (ArgsSize > ImplArgsOffset) {
hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
utils::advancePtr(AllArgs, ImplArgsOffset));

// Set the COV5+ implicit arguments to the appropriate values if present.
uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
std::memset(ImplArgs, 0, ImplArgsSize);

using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
NumBlocks[0]);
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
NumBlocks[1]);
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
NumBlocks[2]);

hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
NumThreads[0]);
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
NumThreads[1]);
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
NumThreads[2]);

hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
NumBlocks[2] * NumThreads[2] > 1
? 3
: 1 + (NumBlocks[1] * NumThreads[1] != 1));

hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
KernelArgs.DynCGroupMem);
}

// Push the kernel launch into the stream.
Expand Down
15 changes: 14 additions & 1 deletion offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <cstdint>

#include "Shared/Debug.h"
#include "Shared/Utils.h"
#include "Utils/ELF.h"

#include "omptarget.h"
Expand All @@ -26,7 +27,7 @@ namespace plugin {
namespace hsa_utils {

// The implicit arguments of COV5 AMDGPU kernels.
struct AMDGPUImplicitArgsTy {
struct alignas(alignof(void *)) AMDGPUImplicitArgsTy {
uint32_t BlockCountX;
uint32_t BlockCountY;
uint32_t BlockCountZ;
Expand Down Expand Up @@ -60,6 +61,18 @@ inline Error readAMDGPUMetaDataFromImage(
return Err;
}

/// Conditionally writes one HSA implicit kernel argument.
///
/// The implicit argument block handed to a kernel may be shorter than the full
/// AMDGPUImplicitArgsTy layout, because optimizations can shrink the struct
/// when portions of it are unused. The field selected by \p Member is therefore
/// only assigned when it lies entirely within the first \p AvailableSize bytes
/// starting at \p Base; otherwise this function does nothing.
///
/// \param Base          Start of the implicit argument block.
/// \param Member        Pointer-to-member identifying the field to set.
/// \param AvailableSize Number of bytes of the block that are actually present.
/// \param Value         Value to store, converted to the field's type.
template <typename MemberTy, typename T>
void initImplArg(AMDGPUImplicitArgsTy *Base,
                 MemberTy AMDGPUImplicitArgsTy::*Member, size_t AvailableSize,
                 T Value) {
  // Byte distance of the field from the start of the block, and the position
  // one past its last byte.
  uint64_t FieldBegin = utils::getPtrDiff(&(Base->*Member), Base);
  uint64_t FieldEnd = FieldBegin + sizeof(MemberTy);

  // The field extends past the available bytes: leave the block untouched.
  if (FieldEnd > AvailableSize)
    return;

  Base->*Member = static_cast<MemberTy>(Value);
}

} // namespace hsa_utils
} // namespace plugin
} // namespace target
Expand Down
Loading