@@ -1100,12 +1100,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1100
1100
if (GenericDevice.isFastReductionEnabled ()) {
1101
1101
// When fast reduction is enabled, the number of teams is capped by
1102
1102
// DeviceNumCUs times the MaxCUMultiplier constant.
1103
- // When envar is enabled, use it for computing MaxNumGroup.
1104
- if (EnvarCUMultiplier > 0 )
1105
- MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
1106
- else
1107
- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1108
-
1103
+ MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1109
1104
} else {
1110
1105
// When fast reduction is not enabled, the number of teams is capped
1111
1106
// by the metadata that clang CodeGen created. The number of teams
@@ -1116,13 +1111,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1116
1111
// ConstWGSize is the block size that CodeGen used.
1117
1112
uint32_t CUMultiplier =
1118
1113
llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1119
-
1120
- if (EnvarCUMultiplier > 0 ) {
1121
- MaxNumGroups =
1122
- DeviceNumCUs * std::min (CUMultiplier, EnvarCUMultiplier);
1123
- } else {
1124
- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1125
- }
1114
+ MaxNumGroups = DeviceNumCUs * CUMultiplier;
1126
1115
}
1127
1116
1128
1117
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -1177,6 +1166,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1177
1166
}
1178
1167
NumGroups = DesiredNumGroups;
1179
1168
}
1169
+
1170
+ // OMPX_AdjustNumTeamsForXteamRedSmallBlockSize takes precedence: apply
1171
+ // OMPX_XTeamRedTeamsPerCU only when AdjustFactor is 0.
1172
+ if (AdjustFactor == 0 && EnvarCUMultiplier > 0 )
1173
+ NumGroups = DeviceNumCUs * EnvarCUMultiplier;
1174
+
1180
1175
NumGroups = std::min (NumGroups, MaxNumGroups);
1181
1176
NumGroups = std::min (NumGroups, NumGroupsFromTripCount);
1182
1177
0 commit comments