Skip to content

Commit 5b8031a

Browse files
authored
[Offload][AMDGPU] Correctly handle variable implicit argument sizes (llvm#142199)
Summary: The size of the implicit argument struct can vary depending on optimizations, it is not always the size as listed by the full struct. Additionally, the implicit arguments are always aligned on a pointer boundary. This patch updates the handling to use the correctly aligned offset and only initialize the members if they are contained in the reported size. Additionally, we modify the `alloc` and `free` routines to allow `alloc(0)` and `free(nullptr)` as these are mandated by the C standard and allow us to easily handle cases where the user calls a kernel with no arguments.
1 parent 41e22aa commit 5b8031a

File tree

2 files changed

+49
-20
lines changed

2 files changed

+49
-20
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3362,9 +3362,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33623362
KernelLaunchParamsTy LaunchParams,
33633363
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
33643364
if (ArgsSize != LaunchParams.Size &&
3365-
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
3365+
ArgsSize > LaunchParams.Size + getImplicitArgsSize())
33663366
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
3367-
"mismatch of kernel arguments size");
3367+
"invalid kernel arguments size");
33683368

33693369
AMDGPUPluginTy &AMDGPUPlugin =
33703370
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
@@ -3398,23 +3398,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33983398
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
33993399
return Err;
34003400

3401-
hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
3402-
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
3403-
ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3404-
utils::advancePtr(AllArgs, LaunchParams.Size));
3405-
3406-
// Set the COV5+ implicit arguments to the appropriate values.
3407-
std::memset(ImplArgs, 0, getImplicitArgsSize());
3408-
ImplArgs->BlockCountX = NumBlocks[0];
3409-
ImplArgs->BlockCountY = NumBlocks[1];
3410-
ImplArgs->BlockCountZ = NumBlocks[2];
3411-
ImplArgs->GroupSizeX = NumThreads[0];
3412-
ImplArgs->GroupSizeY = NumThreads[1];
3413-
ImplArgs->GroupSizeZ = NumThreads[2];
3414-
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
3415-
? 3
3416-
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
3417-
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
3401+
uint64_t ImplArgsOffset = utils::roundUp(
3402+
LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
3403+
if (ArgsSize > ImplArgsOffset) {
3404+
hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3405+
reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3406+
utils::advancePtr(AllArgs, ImplArgsOffset));
3407+
3408+
// Set the COV5+ implicit arguments to the appropriate values if present.
3409+
uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3410+
std::memset(ImplArgs, 0, ImplArgsSize);
3411+
3412+
using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3413+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3414+
NumBlocks[0]);
3415+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3416+
NumBlocks[1]);
3417+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3418+
NumBlocks[2]);
3419+
3420+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3421+
NumThreads[0]);
3422+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3423+
NumThreads[1]);
3424+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3425+
NumThreads[2]);
3426+
3427+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3428+
NumBlocks[2] * NumThreads[2] > 1
3429+
? 3
3430+
: 1 + (NumBlocks[1] * NumThreads[1] != 1));
3431+
3432+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3433+
KernelArgs.DynCGroupMem);
34183434
}
34193435

34203436
// Push the kernel launch into the stream.

offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <cstdint>
1414

1515
#include "Shared/Debug.h"
16+
#include "Shared/Utils.h"
1617
#include "Utils/ELF.h"
1718

1819
#include "omptarget.h"
@@ -26,7 +27,7 @@ namespace plugin {
2627
namespace hsa_utils {
2728

2829
// The implicit arguments of COV5 AMDGPU kernels.
29-
struct AMDGPUImplicitArgsTy {
30+
struct alignas(alignof(void *)) AMDGPUImplicitArgsTy {
3031
uint32_t BlockCountX;
3132
uint32_t BlockCountY;
3233
uint32_t BlockCountZ;
@@ -60,6 +61,18 @@ inline Error readAMDGPUMetaDataFromImage(
6061
return Err;
6162
}
6263

64+
/// Initializes the HSA implicit argument if the struct size permits it. This is
65+
/// necessary because optimizations can modify the size of the struct if
66+
/// portions of it are unused.
67+
template <typename MemberTy, typename T>
68+
void initImplArg(AMDGPUImplicitArgsTy *Base,
69+
MemberTy AMDGPUImplicitArgsTy::*Member, size_t AvailableSize,
70+
T Value) {
71+
uint64_t Offset = utils::getPtrDiff(&(Base->*Member), Base);
72+
if (Offset + sizeof(MemberTy) <= AvailableSize)
73+
Base->*Member = static_cast<MemberTy>(Value);
74+
}
75+
6376
} // namespace hsa_utils
6477
} // namespace plugin
6578
} // namespace target

0 commit comments

Comments
 (0)