Skip to content

Commit 1e7d08c

Browse files
shiltianronlieb
authored andcommitted
[Offload][OMPX] Add the runtime support for multi-dim grid and block (llvm#118042)
Change-Id: Iab77154e209eec3a902e4d079a4a233e52a32a8e
1 parent 5c59637 commit 1e7d08c

File tree

13 files changed

+220
-121
lines changed

13 files changed

+220
-121
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 64 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -836,21 +836,23 @@ struct AMDGPUKernelTy : public GenericKernelTy {
836836
}
837837

838838
/// Launch the AMDGPU kernel function.
839-
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
840-
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
839+
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
840+
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
841841
KernelLaunchParamsTy LaunchParams,
842842
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
843843

844844
/// Print more elaborate kernel launch info for AMDGPU
845845
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
846-
KernelArgsTy &KernelArgs, uint32_t NumThreads,
847-
uint64_t NumBlocks, int64_t MultiDeviceLB,
846+
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
847+
uint32_t NumBlocks[3], int64_t MultiDeviceLB,
848848
int64_t MultiDeviceUB) const override;
849849
/// Print the "old" AMD KernelTrace single-line format
850850
void printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
851-
KernelArgsTy &KernelArgs, uint32_t NumThreads,
852-
uint64_t NumBlocks, int64_t MultiDeviceLB,
851+
KernelArgsTy &KernelArgs,
852+
uint32_t NumThreads[3], uint32_t NumBlocks[3],
853+
int64_t MultiDeviceLB,
853854
int64_t MultiDeviceUB) const;
855+
854856
/// Get group and private segment kernel size.
855857
uint32_t getGroupSize() const { return GroupSize; }
856858
uint32_t getPrivateSize() const { return PrivateSize; }
@@ -976,7 +978,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
976978
/// user-defined threads and block clauses.
977979
uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
978980
uint32_t ThreadLimitClause[3]) const override {
979-
assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
981+
assert(ThreadLimitClause[1] == 1 && ThreadLimitClause[2] == 1 &&
980982
"Multi dimensional launch not supported yet.");
981983

982984
// Honor OMP_TEAMS_THREAD_LIMIT environment variable and
@@ -997,7 +999,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
997999
TeamsThreadLimitEnvVar <= static_cast<int32_t>(ConstWGSize))
9981000
return llvm::omp::getBlockSizeAsPowerOfTwo(TeamsThreadLimitEnvVar);
9991001
if (ThreadLimitClause[0] > 0 && ThreadLimitClause[0] != (uint32_t)-1 &&
1000-
ThreadLimitClause[0] <= static_cast<int32_t>(ConstWGSize))
1002+
ThreadLimitClause[0] <= static_cast<uint32_t>(ConstWGSize))
10011003
return llvm::omp::getBlockSizeAsPowerOfTwo(ThreadLimitClause[0]);
10021004
assert(((ConstWGSize & (ConstWGSize - 1)) == 0) &&
10031005
"XTeam Reduction blocksize must be a power of two");
@@ -1022,11 +1024,11 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10221024
? ThreadLimitClause[0]
10231025
: PreferredNumThreads);
10241026
}
1025-
uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
1027+
uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
10261028
uint32_t NumTeamsClause[3], uint64_t LoopTripCount,
10271029
uint32_t &NumThreads,
10281030
bool IsNumThreadsFromUser) const override {
1029-
assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
1031+
assert(NumTeamsClause[1] == 1 && NumTeamsClause[2] == 1 &&
10301032
"Multi dimensional launch not supported yet.");
10311033

10321034
const auto getNumGroupsFromThreadsAndTripCount =
@@ -1062,7 +1064,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
10621064
getNumGroupsFromThreadsAndTripCount(LoopTripCount, NumThreads);
10631065

10641066
// Honor OMP_NUM_TEAMS environment variable for BigJumpLoop kernel type.
1065-
if (NumTeamsEnvVar > 0 && NumTeamsEnvVar <= GenericDevice.getBlockLimit())
1067+
if (NumTeamsEnvVar > 0 && static_cast<uint32_t>(NumTeamsEnvVar) <=
1068+
GenericDevice.getBlockLimit())
10661069
NumGroups = std::min(static_cast<uint64_t>(NumTeamsEnvVar), NumGroups);
10671070
// Honor num_teams clause but lower it if tripcount dictates.
10681071
else if (NumTeamsClause[0] > 0 &&
@@ -1145,8 +1148,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
11451148
NumTeamsClause[0] <= GenericDevice.getBlockLimit()) {
11461149
NumGroups =
11471150
std::min(static_cast<uint64_t>(NumTeamsClause[0]), MaxNumGroups);
1148-
} else if (NumTeamsEnvVar > 0 &&
1149-
NumTeamsEnvVar <= GenericDevice.getBlockLimit()) {
1151+
} else if (NumTeamsEnvVar > 0 && static_cast<uint32_t>(NumTeamsEnvVar) <=
1152+
GenericDevice.getBlockLimit()) {
11501153
NumGroups =
11511154
std::min(static_cast<uint64_t>(NumTeamsEnvVar), MaxNumGroups);
11521155
} else {
@@ -1462,8 +1465,8 @@ struct AMDGPUQueueTy {
14621465
/// Push a kernel launch to the queue. The kernel launch requires an output
14631466
/// signal and can define an optional input signal (nullptr if none).
14641467
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
1465-
uint32_t NumThreads, uint64_t NumBlocks,
1466-
uint32_t GroupSize, uint32_t StackSize,
1468+
uint32_t NumThreads[3], uint32_t NumBlocks[3],
1469+
uint32_t GroupSize, uint64_t StackSize,
14671470
AMDGPUSignalTy *OutputSignal,
14681471
AMDGPUSignalTy *InputSignal) {
14691472
assert(OutputSignal && "Invalid kernel output signal");
@@ -1489,17 +1492,23 @@ struct AMDGPUQueueTy {
14891492
assert(Packet && "Invalid packet");
14901493

14911494
// The first 32 bits of the packet are written after the other fields
1492-
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1493-
Packet->workgroup_size_x = NumThreads;
1494-
Packet->workgroup_size_y = 1;
1495-
Packet->workgroup_size_z = 1;
1495+
uint16_t Dims = NumBlocks[2] * NumThreads[2] > 1
1496+
? 3
1497+
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
1498+
uint16_t Setup = UINT16_C(Dims)
1499+
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1500+
Packet->workgroup_size_x = NumThreads[0];
1501+
Packet->workgroup_size_y = NumThreads[1];
1502+
Packet->workgroup_size_z = NumThreads[2];
14961503
Packet->reserved0 = 0;
1497-
Packet->grid_size_x = NumBlocks * NumThreads;
1498-
Packet->grid_size_y = 1;
1499-
Packet->grid_size_z = 1;
1504+
Packet->grid_size_x = NumBlocks[0] * NumThreads[0];
1505+
Packet->grid_size_y = NumBlocks[1] * NumThreads[1];
1506+
Packet->grid_size_z = NumBlocks[2] * NumThreads[2];
15001507
Packet->private_segment_size =
1501-
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
1502-
: Kernel.getPrivateSize();
1508+
Kernel.usesDynamicStack()
1509+
? std::max(static_cast<uint64_t>(Kernel.getPrivateSize()),
1510+
StackSize)
1511+
: Kernel.getPrivateSize();
15031512
Packet->group_segment_size = GroupSize;
15041513
Packet->kernel_object = Kernel.getKernelObject();
15051514
Packet->kernarg_address = KernelArgs;
@@ -2117,8 +2126,9 @@ struct AMDGPUStreamTy {
21172126
/// the kernel args buffer to the specified memory manager.
21182127
Error
21192128
pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
2120-
uint32_t NumThreads, uint64_t NumBlocks, uint32_t GroupSize,
2121-
uint32_t StackSize, AMDGPUMemoryManagerTy &MemoryManager,
2129+
uint32_t NumThreads[3], uint32_t NumBlocks[3],
2130+
uint32_t GroupSize, uint32_t StackSize,
2131+
AMDGPUMemoryManagerTy &MemoryManager,
21222132
std::unique_ptr<ompt::OmptEventInfoTy> OmptInfo = nullptr) {
21232133
if (Queue == nullptr)
21242134
return Plugin::error("Target queue was nullptr");
@@ -4222,10 +4232,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
42224232
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
42234233

42244234
KernelArgsTy KernelArgs = {};
4225-
if (auto Err =
4226-
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
4227-
/*NumBlocks=*/1ul, KernelArgs,
4228-
KernelLaunchParamsTy{}, AsyncInfoWrapper))
4235+
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
4236+
if (auto Err = AMDGPUKernel.launchImpl(
4237+
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
4238+
KernelLaunchParamsTy{}, AsyncInfoWrapper))
42294239
return Err;
42304240

42314241
Error Err = Plugin::success();
@@ -4960,7 +4970,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
49604970
};
49614971

49624972
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
4963-
uint32_t NumThreads, uint64_t NumBlocks,
4973+
uint32_t NumThreads[3], uint32_t NumBlocks[3],
49644974
KernelArgsTy &KernelArgs,
49654975
KernelLaunchParamsTy LaunchParams,
49664976
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -5041,13 +5051,15 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50415051
if (ImplArgs &&
50425052
getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
50435053
DP("Setting fields of ImplicitArgs for COV5\n");
5044-
ImplArgs->BlockCountX = NumBlocks;
5045-
ImplArgs->BlockCountY = 1;
5046-
ImplArgs->BlockCountZ = 1;
5047-
ImplArgs->GroupSizeX = NumThreads;
5048-
ImplArgs->GroupSizeY = 1;
5049-
ImplArgs->GroupSizeZ = 1;
5050-
ImplArgs->GridDims = 1;
5054+
ImplArgs->BlockCountX = NumBlocks[0];
5055+
ImplArgs->BlockCountY = NumBlocks[1];
5056+
ImplArgs->BlockCountZ = NumBlocks[2];
5057+
ImplArgs->GroupSizeX = NumThreads[0];
5058+
ImplArgs->GroupSizeY = NumThreads[1];
5059+
ImplArgs->GroupSizeZ = NumThreads[2];
5060+
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
5061+
? 3
5062+
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
50515063
ImplArgs->HeapV1Ptr =
50525064
(uint64_t)AMDGPUDevice.getPreAllocatedDeviceMemoryPool();
50535065
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
@@ -5065,8 +5077,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
50655077

50665078
void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
50675079
KernelArgsTy &KernelArgs,
5068-
uint32_t NumThreads,
5069-
uint64_t NumBlocks,
5080+
uint32_t NumThreads[3],
5081+
uint32_t NumBlocks[3],
50705082
int64_t MultiDeviceLB,
50715083
int64_t MultiDeviceUB) const {
50725084
auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
@@ -5084,17 +5096,17 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
50845096
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
50855097
"%d%% n:%s\n",
50865098
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
5087-
KernelArgs.NumArgs, NumBlocks, NumThreads, 0, 0, GroupSegmentSize,
5088-
SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
5089-
KernelArgs.Tripcount, NeedsHostServices, isMultiDeviceKernel(),
5090-
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5091-
getName());
5099+
KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0,
5100+
GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount,
5101+
VGPRSpillCount, KernelArgs.Tripcount, NeedsHostServices,
5102+
isMultiDeviceKernel(), MultiDeviceLB, MultiDeviceUB, MaxOccupancy,
5103+
AchievedOccupancy, getName());
50925104
}
50935105

50945106
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
50955107
KernelArgsTy &KernelArgs,
5096-
uint32_t NumThreads,
5097-
uint64_t NumBlocks,
5108+
uint32_t NumThreads[3],
5109+
uint32_t NumBlocks[3],
50985110
int64_t MultiDeviceLB,
50995111
int64_t MultiDeviceUB) const {
51005112
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
@@ -5140,12 +5152,13 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
51405152
// S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
51415153
// Tripcount: loop tripcount for the kernel
51425154
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
5143-
"#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
5155+
"#Args: %d Teams x Thrds: %4ux%4u (MaxFlatWorkGroupSize: %u) LDS "
51445156
"Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
51455157
"%lu\n",
5146-
ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
5147-
GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
5148-
LoopTripCount);
5158+
ArgNum, NumGroups[0] * NumGroups[1] * NumGroups[2],
5159+
ThreadsPerGroup[0] * ThreadsPerGroup[1] * ThreadsPerGroup[2],
5160+
MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
5161+
SGPRSpillCount, VGPRSpillCount, LoopTripCount);
51495162

51505163
return Plugin::success();
51515164
}

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,9 @@ struct GenericKernelTy {
270270
Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
271271
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
272272
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
273-
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
274-
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
273+
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
274+
uint32_t NumThreads[3], uint32_t NumBlocks[3],
275+
KernelArgsTy &KernelArgs,
275276
KernelLaunchParamsTy LaunchParams,
276277
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
277278

@@ -311,6 +312,15 @@ struct GenericKernelTy {
311312
llvm_unreachable("Unknown execution mode!");
312313
}
313314

315+
/// Indicate whether it is a specialized kernel.
316+
bool isSpecializedKernel() const {
317+
if (ExecutionMode == OMP_TGT_EXEC_MODE_SPMD_NO_LOOP ||
318+
ExecutionMode == OMP_TGT_EXEC_MODE_SPMD_BIG_JUMP_LOOP ||
319+
ExecutionMode == OMP_TGT_EXEC_MODE_XTEAM_RED)
320+
return true;
321+
return false;
322+
}
323+
314324
/// Check if kernel is a multi-device kernel.
315325
bool isMultiDeviceKernel() const { return IsMultiDeviceKernel; }
316326

@@ -359,15 +369,16 @@ struct GenericKernelTy {
359369

360370
/// Prints generic kernel launch information.
361371
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
362-
KernelArgsTy &KernelArgs, uint32_t NumThreads,
363-
uint64_t NumBlocks, int64_t MultiDeviceLB,
372+
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
373+
uint32_t NumBlocks[3], int64_t MultiDeviceLB,
364374
int64_t MultiDeviceUB) const;
365375

366376
/// Prints plugin-specific kernel launch information after generic kernel
367377
/// launch information
368378
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
369379
KernelArgsTy &KernelArgs,
370-
uint32_t NumThreads, uint64_t NumBlocks,
380+
uint32_t NumThreads[3],
381+
uint32_t NumBlocks[3],
371382
int64_t MultiDeviceLB,
372383
int64_t MultiDeviceUB) const;
373384

@@ -396,7 +407,7 @@ struct GenericKernelTy {
396407
/// The number of threads \p NumThreads can be adjusted by this method.
397408
/// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
398409
/// thread_limit clause.
399-
virtual uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
410+
virtual uint32_t getNumBlocks(GenericDeviceTy &GenericDevice,
400411
uint32_t BlockLimitClause[3],
401412
uint64_t LoopTripCount, uint32_t &NumThreads,
402413
bool IsNumThreadsFromUser) const;

0 commit comments

Comments
 (0)