@@ -1101,12 +1101,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1101
1101
if (GenericDevice.isFastReductionEnabled ()) {
1102
1102
// When fast reduction is enabled, the number of teams is capped by
1103
1103
// the MaxCUMultiplier constant.
1104
- // When envar is enabled, use it for computing MaxNumGroup.
1105
- if (EnvarCUMultiplier > 0 )
1106
- MaxNumGroups = DeviceNumCUs * EnvarCUMultiplier;
1107
- else
1108
- MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1109
-
1104
+ MaxNumGroups = DeviceNumCUs * llvm::omp::xteam_red::MaxCUMultiplier;
1110
1105
} else {
1111
1106
// When fast reduction is not enabled, the number of teams is capped
1112
1107
// by the metadata that clang CodeGen created. The number of teams
@@ -1117,13 +1112,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1117
1112
// ConstWGSize is the block size that CodeGen used.
1118
1113
uint32_t CUMultiplier =
1119
1114
llvm::omp::xteam_red::getXteamRedCUMultiplier (ConstWGSize);
1120
-
1121
- if (EnvarCUMultiplier > 0 ) {
1122
- MaxNumGroups =
1123
- DeviceNumCUs * std::min (CUMultiplier, EnvarCUMultiplier);
1124
- } else {
1125
- MaxNumGroups = DeviceNumCUs * CUMultiplier;
1126
- }
1115
+ MaxNumGroups = DeviceNumCUs * CUMultiplier;
1127
1116
}
1128
1117
1129
1118
// If envar OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT is set and no
@@ -1178,6 +1167,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1178
1167
}
1179
1168
NumGroups = DesiredNumGroups;
1180
1169
}
1170
+
1171
+ // Prefer OMPX_AdjustNumTeamsForXteamRedSmallBlockSize over
1172
+ // OMPX_XTeamRedTeamsPerCU: the latter applies only when AdjustFactor is 0.
1173
+ if (AdjustFactor == 0 && EnvarCUMultiplier > 0 )
1174
+ NumGroups = DeviceNumCUs * EnvarCUMultiplier;
1175
+
1181
1176
NumGroups = std::min (NumGroups, MaxNumGroups);
1182
1177
NumGroups = std::min (NumGroups, NumGroupsFromTripCount);
1183
1178
0 commit comments