@@ -1100,7 +1100,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1100
1100
if (GenericDevice.isFastReductionEnabled ()) {
1101
1101
// When fast reduction is enabled, the number of teams is capped by
1102
1102
// the MaxCUMultiplier constant.
1103
- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1103
+ // If the envar is set (> 0), use it to compute MaxNumGroups instead.
1104
+ if (EnvarCUMultiplier > 0 )
1105
+ MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
1106
+ else
1107
+ MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1108
+
1104
1109
} else {
1105
1110
// When fast reduction is not enabled, the number of teams is capped
1106
1111
// by the metadata that clang CodeGen created. The number of teams
@@ -1111,7 +1116,13 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1111
1116
// ConstWGSize is the block size that CodeGen used.
1112
1117
uint32_t CUMultiplier =
1113
1118
llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1114
- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1119
+
1120
+ if (EnvarCUMultiplier > 0 ) {
1121
+ MaxNumGroups =
1122
+ DeviceNumCUs * std::min (CUMultiplier, EnvarCUMultiplier);
1123
+ } else {
1124
+ MaxNumGroups = DeviceNumCUs * CUMultiplier;
1125
+ }
1115
1126
}
1116
1127
1117
1128
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -1166,12 +1177,6 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1166
1177
}
1167
1178
NumGroups = DesiredNumGroups;
1168
1179
}
1169
-
1170
- // Prefer OMPX_AdjustNumTeamsForXteamRedSmallBlockSize over
1171
- // OMPX_XTeamRedTeamsPerCU.
1172
- if (AdjustFactor == 0 && EnvarCUMultiplier > 0 )
1173
- NumGroups = DeviceNumCUs * EnvarCUMultiplier;
1174
-
1175
1180
NumGroups = std::min (NumGroups, MaxNumGroups);
1176
1181
NumGroups = std::min (NumGroups, NumGroupsFromTripCount);
1177
1182
0 commit comments