@@ -836,21 +836,23 @@ struct AMDGPUKernelTy : public GenericKernelTy {
836
836
}
837
837
838
838
/// Launch the AMDGPU kernel function.
///
/// This change replaces the scalar launch geometry (uint32_t NumThreads,
/// uint64_t NumBlocks) with two three-element uint32_t arrays. The
/// out-of-line definition later in this diff copies NumBlocks[0..2] into
/// ImplArgs->BlockCountX/Y/Z and NumThreads[0..2] into
/// ImplArgs->GroupSizeX/Y/Z, i.e. index 0/1/2 is the X/Y/Z dimension.
839
- Error launchImpl (GenericDeviceTy &GenericDevice, uint32_t NumThreads,
840
- uint64_t NumBlocks, KernelArgsTy &KernelArgs,
839
+ Error launchImpl (GenericDeviceTy &GenericDevice, uint32_t NumThreads[ 3 ] ,
840
+ uint32_t NumBlocks[ 3 ] , KernelArgsTy &KernelArgs,
841
841
KernelLaunchParamsTy LaunchParams,
842
842
AsyncInfoWrapperTy &AsyncInfoWrapper) const override ;
843
843
844
844
/// Print more elaborate kernel launch info for AMDGPU.
///
/// NumThreads and NumBlocks are three-element uint32_t arrays after this
/// change (previously a uint32_t and a uint64_t scalar). The definition
/// later in this diff ends by emitting an INFO line whose "Teams x Thrds"
/// fields are two three-way products formatted with %4u, and then returns
/// Plugin::success().
845
845
Error printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
846
- KernelArgsTy &KernelArgs, uint32_t NumThreads,
847
- uint64_t NumBlocks, int64_t MultiDeviceLB,
846
+ KernelArgsTy &KernelArgs, uint32_t NumThreads[ 3 ] ,
847
+ uint32_t NumBlocks[ 3 ] , int64_t MultiDeviceLB,
848
848
int64_t MultiDeviceUB) const override ;
849
849
/// Print the "old" AMD KernelTrace single-line format.
///
/// NumThreads and NumBlocks are three-element uint32_t arrays after this
/// change (previously a uint32_t and a uint64_t scalar). The definition
/// later in this diff forwards only NumBlocks[0] and NumThreads[0] to the
/// trace output; the entries at index 1 and 2 are not printed.
850
850
void printAMDOneLineKernelTrace (GenericDeviceTy &GenericDevice,
851
- KernelArgsTy &KernelArgs, uint32_t NumThreads,
852
- uint64_t NumBlocks, int64_t MultiDeviceLB,
851
+ KernelArgsTy &KernelArgs,
852
+ uint32_t NumThreads[3 ], uint32_t NumBlocks[3 ],
853
+ int64_t MultiDeviceLB,
853
854
int64_t MultiDeviceUB) const ;
855
+
854
856
/// Get group and private segment kernel size.
///
/// Both accessors return the corresponding member (GroupSize, PrivateSize)
/// unchanged as uint32_t. AMDGPUQueueTy::pushKernelLaunch widens the result
/// of getPrivateSize() to uint64_t before comparing it with StackSize.
855
857
uint32_t getGroupSize () const { return GroupSize; }
856
858
uint32_t getPrivateSize () const { return PrivateSize; }
@@ -976,7 +978,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
976
978
// / user-defined threads and block clauses.
977
979
uint32_t getNumThreads (GenericDeviceTy &GenericDevice,
978
980
uint32_t ThreadLimitClause[3 ]) const override {
979
- assert (ThreadLimitClause[1 ] == 0 && ThreadLimitClause[2 ] == 0 &&
981
+ assert (ThreadLimitClause[1 ] == 1 && ThreadLimitClause[2 ] == 1 &&
980
982
" Multi dimensional launch not supported yet." );
981
983
982
984
// Honor OMP_TEAMS_THREAD_LIMIT environment variable and
@@ -997,7 +999,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
997
999
TeamsThreadLimitEnvVar <= static_cast <int32_t >(ConstWGSize))
998
1000
return llvm::omp::getBlockSizeAsPowerOfTwo (TeamsThreadLimitEnvVar);
999
1001
if (ThreadLimitClause[0 ] > 0 && ThreadLimitClause[0 ] != (uint32_t )-1 &&
1000
- ThreadLimitClause[0 ] <= static_cast <int32_t >(ConstWGSize))
1002
+ ThreadLimitClause[0 ] <= static_cast <uint32_t >(ConstWGSize))
1001
1003
return llvm::omp::getBlockSizeAsPowerOfTwo (ThreadLimitClause[0 ]);
1002
1004
assert (((ConstWGSize & (ConstWGSize - 1 )) == 0 ) &&
1003
1005
" XTeam Reduction blocksize must be a power of two" );
@@ -1022,11 +1024,11 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1022
1024
? ThreadLimitClause[0 ]
1023
1025
: PreferredNumThreads);
1024
1026
}
1025
- uint64_t getNumBlocks (GenericDeviceTy &GenericDevice,
1027
+ uint32_t getNumBlocks (GenericDeviceTy &GenericDevice,
1026
1028
uint32_t NumTeamsClause[3 ], uint64_t LoopTripCount,
1027
1029
uint32_t &NumThreads,
1028
1030
bool IsNumThreadsFromUser) const override {
1029
- assert (NumTeamsClause[1 ] == 0 && NumTeamsClause[2 ] == 0 &&
1031
+ assert (NumTeamsClause[1 ] == 1 && NumTeamsClause[2 ] == 1 &&
1030
1032
" Multi dimensional launch not supported yet." );
1031
1033
1032
1034
const auto getNumGroupsFromThreadsAndTripCount =
@@ -1062,7 +1064,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1062
1064
getNumGroupsFromThreadsAndTripCount (LoopTripCount, NumThreads);
1063
1065
1064
1066
// Honor OMP_NUM_TEAMS environment variable for BigJumpLoop kernel type.
1065
- if (NumTeamsEnvVar > 0 && NumTeamsEnvVar <= GenericDevice.getBlockLimit ())
1067
+ if (NumTeamsEnvVar > 0 && static_cast <uint32_t >(NumTeamsEnvVar) <=
1068
+ GenericDevice.getBlockLimit ())
1066
1069
NumGroups = std::min (static_cast <uint64_t >(NumTeamsEnvVar), NumGroups);
1067
1070
// Honor num_teams clause but lower it if tripcount dictates.
1068
1071
else if (NumTeamsClause[0 ] > 0 &&
@@ -1145,8 +1148,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1145
1148
NumTeamsClause[0 ] <= GenericDevice.getBlockLimit ()) {
1146
1149
NumGroups =
1147
1150
std::min (static_cast <uint64_t >(NumTeamsClause[0 ]), MaxNumGroups);
1148
- } else if (NumTeamsEnvVar > 0 &&
1149
- NumTeamsEnvVar <= GenericDevice.getBlockLimit ()) {
1151
+ } else if (NumTeamsEnvVar > 0 && static_cast < uint32_t >(NumTeamsEnvVar) <=
1152
+ GenericDevice.getBlockLimit ()) {
1150
1153
NumGroups =
1151
1154
std::min (static_cast <uint64_t >(NumTeamsEnvVar), MaxNumGroups);
1152
1155
} else {
@@ -1462,8 +1465,8 @@ struct AMDGPUQueueTy {
1462
1465
/// Push a kernel launch to the queue. The kernel launch requires an output
1463
1466
/// signal and can define an optional input signal (nullptr if none).
///
/// NumThreads[0..2] are written to the packet's workgroup_size_x/y/z and
/// NumBlocks[i] * NumThreads[i] to grid_size_x/y/z (see the packet setup in
/// the body of this function).
///
/// NOTE(review): StackSize is widened to uint64_t here, while
/// AMDGPUStreamTy::pushKernelLaunch later in this diff still declares
/// uint32_t StackSize. If launches are routed through that overload, values
/// above 32 bits cannot reach this function - confirm the intended width.
1464
1467
Error pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
1465
- uint32_t NumThreads, uint64_t NumBlocks,
1466
- uint32_t GroupSize, uint32_t StackSize,
1468
+ uint32_t NumThreads[ 3 ], uint32_t NumBlocks[ 3 ] ,
1469
+ uint32_t GroupSize, uint64_t StackSize,
1467
1470
AMDGPUSignalTy *OutputSignal,
1468
1471
AMDGPUSignalTy *InputSignal) {
1469
1472
assert (OutputSignal && " Invalid kernel output signal" );
@@ -1489,17 +1492,23 @@ struct AMDGPUQueueTy {
1489
1492
assert (Packet && " Invalid packet" );
1490
1493
1491
1494
// The first 32 bits of the packet are written after the other fields.
//
// Dims is the number of dispatch dimensions in use: 3 when the Z extent
// (NumBlocks[2] * NumThreads[2]) exceeds 1, otherwise 2 when the Y extent
// (NumBlocks[1] * NumThreads[1]) differs from 1, otherwise 1.
//
// Dims is a runtime value, so it is shifted directly and the result is
// narrowed with a cast. UINT16_C is only specified for unsuffixed integer
// constants and may expand to a token-pasted suffix, so it must not be
// applied to a variable.
1492
- uint16_t Setup = UINT16_C (1 ) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1493
- Packet->workgroup_size_x = NumThreads;
1494
- Packet->workgroup_size_y = 1 ;
1495
- Packet->workgroup_size_z = 1 ;
1495
+ uint16_t Dims = NumBlocks[2 ] * NumThreads[2 ] > 1
1496
+ ? 3
1497
+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
1498
+ uint16_t Setup = static_cast <uint16_t >(
1499
+ Dims << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
1500
+ Packet->workgroup_size_x = NumThreads[0 ];
1501
+ Packet->workgroup_size_y = NumThreads[1 ];
1502
+ Packet->workgroup_size_z = NumThreads[2 ];
1496
1503
Packet->reserved0 = 0 ;
1497
- Packet->grid_size_x = NumBlocks * NumThreads;
1498
- Packet->grid_size_y = 1 ;
1499
- Packet->grid_size_z = 1 ;
1504
+ Packet->grid_size_x = NumBlocks[ 0 ] * NumThreads[ 0 ] ;
1505
+ Packet->grid_size_y = NumBlocks[ 1 ] * NumThreads[ 1 ] ;
1506
+ Packet->grid_size_z = NumBlocks[ 2 ] * NumThreads[ 2 ] ;
1500
1507
// Private segment size: for kernels that use a dynamic stack, take the larger
// of the kernel's own private size and the requested StackSize; otherwise use
// the kernel's private size as is. getPrivateSize() is cast to uint64_t so
// both std::max arguments have the same type as the (now 64-bit) StackSize.
// NOTE(review): the conditional therefore yields a 64-bit value that is
// assigned to Packet->private_segment_size. The field's declaration is not
// visible here; if it is narrower than 64 bits, a large StackSize is silently
// truncated - confirm, and clamp or reject oversized values if so.
Packet->private_segment_size =
1501
- Kernel.usesDynamicStack () ? std::max (Kernel.getPrivateSize (), StackSize)
1502
- : Kernel.getPrivateSize ();
1508
+ Kernel.usesDynamicStack ()
1509
+ ? std::max (static_cast <uint64_t >(Kernel.getPrivateSize ()),
1510
+ StackSize)
1511
+ : Kernel.getPrivateSize ();
1503
1512
Packet->group_segment_size = GroupSize;
1504
1513
Packet->kernel_object = Kernel.getKernelObject ();
1505
1514
Packet->kernarg_address = KernelArgs;
@@ -2117,8 +2126,9 @@ struct AMDGPUStreamTy {
2117
2126
// / the kernel args buffer to the specified memory manager.
2118
2127
Error
2119
2128
pushKernelLaunch (const AMDGPUKernelTy &Kernel, void *KernelArgs,
2120
- uint32_t NumThreads, uint64_t NumBlocks, uint32_t GroupSize,
2121
- uint32_t StackSize, AMDGPUMemoryManagerTy &MemoryManager,
2129
+ uint32_t NumThreads[3 ], uint32_t NumBlocks[3 ],
2130
+ uint32_t GroupSize, uint32_t StackSize,
2131
+ AMDGPUMemoryManagerTy &MemoryManager,
2122
2132
std::unique_ptr<ompt::OmptEventInfoTy> OmptInfo = nullptr ) {
2123
2133
if (Queue == nullptr )
2124
2134
return Plugin::error (" Target queue was nullptr" );
@@ -4222,10 +4232,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
4222
4232
AsyncInfoWrapperTy AsyncInfoWrapper (*this , nullptr );
4223
4233
4224
4234
KernelArgsTy KernelArgs = {};
4225
- if ( auto Err =
4226
- AMDGPUKernel.launchImpl (* this , /* NumThread= */ 1u ,
4227
- /* NumBlocks= */ 1ul , KernelArgs,
4228
- KernelLaunchParamsTy{}, AsyncInfoWrapper))
4235
+ uint32_t NumBlocksAndThreads[ 3 ] = { 1u , 1u , 1u };
4236
+ if ( auto Err = AMDGPUKernel.launchImpl (
4237
+ * this , NumBlocksAndThreads, NumBlocksAndThreads , KernelArgs,
4238
+ KernelLaunchParamsTy{}, AsyncInfoWrapper))
4229
4239
return Err;
4230
4240
4231
4241
Error Err = Plugin::success ();
@@ -4960,7 +4970,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
4960
4970
};
4961
4971
4962
4972
Error AMDGPUKernelTy::launchImpl (GenericDeviceTy &GenericDevice,
4963
- uint32_t NumThreads, uint64_t NumBlocks,
4973
+ uint32_t NumThreads[ 3 ], uint32_t NumBlocks[ 3 ] ,
4964
4974
KernelArgsTy &KernelArgs,
4965
4975
KernelLaunchParamsTy LaunchParams,
4966
4976
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -5041,13 +5051,15 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
5041
5051
if (ImplArgs &&
5042
5052
getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
5043
5053
DP (" Setting fields of ImplicitArgs for COV5\n " );
5044
- ImplArgs->BlockCountX = NumBlocks;
5045
- ImplArgs->BlockCountY = 1 ;
5046
- ImplArgs->BlockCountZ = 1 ;
5047
- ImplArgs->GroupSizeX = NumThreads;
5048
- ImplArgs->GroupSizeY = 1 ;
5049
- ImplArgs->GroupSizeZ = 1 ;
5050
- ImplArgs->GridDims = 1 ;
5054
// Publish the launch geometry to the implicit kernel arguments: index 0/1/2
// of NumBlocks and NumThreads map to the X/Y/Z block counts and group sizes.
// GridDims is 3 when the Z extent (NumBlocks[2] * NumThreads[2]) exceeds 1,
// otherwise 2 when the Y extent differs from 1, otherwise 1. This is the same
// expression used for Dims in AMDGPUQueueTy::pushKernelLaunch; the two copies
// must be kept in sync.
// NOTE(review): the Z test is `> 1` but the Y test is `!= 1`, so a zero Y
// extent yields GridDims == 2 - confirm that asymmetry is intended.
+ ImplArgs->BlockCountX = NumBlocks[0 ];
5055
+ ImplArgs->BlockCountY = NumBlocks[1 ];
5056
+ ImplArgs->BlockCountZ = NumBlocks[2 ];
5057
+ ImplArgs->GroupSizeX = NumThreads[0 ];
5058
+ ImplArgs->GroupSizeY = NumThreads[1 ];
5059
+ ImplArgs->GroupSizeZ = NumThreads[2 ];
5060
+ ImplArgs->GridDims = NumBlocks[2 ] * NumThreads[2 ] > 1
5061
+ ? 3
5062
+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
5051
5063
ImplArgs->HeapV1Ptr =
5052
5064
(uint64_t )AMDGPUDevice.getPreAllocatedDeviceMemoryPool ();
5053
5065
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem ;
@@ -5065,8 +5077,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
5065
5077
5066
5078
void AMDGPUKernelTy::printAMDOneLineKernelTrace (GenericDeviceTy &GenericDevice,
5067
5079
KernelArgsTy &KernelArgs,
5068
- uint32_t NumThreads,
5069
- uint64_t NumBlocks,
5080
+ uint32_t NumThreads[ 3 ] ,
5081
+ uint32_t NumBlocks[ 3 ] ,
5070
5082
int64_t MultiDeviceLB,
5071
5083
int64_t MultiDeviceUB) const {
5072
5084
auto GroupSegmentSize = (*KernelInfo).GroupSegmentList ;
@@ -5084,17 +5096,17 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
5084
5096
" md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5085
5097
" %d%% n:%s\n " ,
5086
5098
GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5087
- KernelArgs.NumArgs , NumBlocks, NumThreads, 0 , 0 , GroupSegmentSize ,
5088
- SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount ,
5089
- KernelArgs.Tripcount , NeedsHostServices, isMultiDeviceKernel () ,
5090
- MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy ,
5091
- getName ());
5099
// Only the first (index 0) entries of NumBlocks and NumThreads are traced.
// NOTE(review): NumBlocks was a uint64_t scalar before this change and
// NumBlocks[0] is uint32_t now. The start of the format string lies outside
// this hunk; confirm the conversion specifier for the team count was updated
// to match the 32-bit argument.
+ KernelArgs.NumArgs , NumBlocks[ 0 ] , NumThreads[ 0 ] , 0 , 0 ,
5100
+ GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount,
5101
+ VGPRSpillCount, KernelArgs.Tripcount , NeedsHostServices,
5102
+ isMultiDeviceKernel (), MultiDeviceLB, MultiDeviceUB, MaxOccupancy,
5103
+ AchievedOccupancy, getName ());
5092
5104
}
5093
5105
5094
5106
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
5095
5107
KernelArgsTy &KernelArgs,
5096
- uint32_t NumThreads,
5097
- uint64_t NumBlocks,
5108
+ uint32_t NumThreads[ 3 ] ,
5109
+ uint32_t NumBlocks[ 3 ] ,
5098
5110
int64_t MultiDeviceLB,
5099
5111
int64_t MultiDeviceUB) const {
5100
5112
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
@@ -5140,12 +5152,13 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
5140
5152
// S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
5141
5153
// Tripcount: loop tripcount for the kernel
5142
5154
INFO (OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId (),
5143
- " #Args: %d Teams x Thrds: %4lux %4u (MaxFlatWorkGroupSize: %u) LDS "
5155
+ " #Args: %d Teams x Thrds: %4ux %4u (MaxFlatWorkGroupSize: %u) LDS "
5144
5156
" Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
5145
5157
" %lu\n " ,
5146
- ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
5147
- GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
5148
- LoopTripCount);
5158
+ ArgNum, NumGroups[0 ] * NumGroups[1 ] * NumGroups[2 ],
5159
+ ThreadsPerGroup[0 ] * ThreadsPerGroup[1 ] * ThreadsPerGroup[2 ],
5160
+ MaxFlatWorkgroupSize, GroupSegmentSize, SGPRCount, VGPRCount,
5161
+ SGPRSpillCount, VGPRSpillCount, LoopTripCount);
5149
5162
5150
5163
return Plugin::success ();
5151
5164
}
0 commit comments