@@ -1117,47 +1117,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1117
1117
Function *F = getAssociatedFunction ();
1118
1118
auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1119
1119
1120
- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1121
- auto [Min, Max] = R;
1122
- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1123
- IntegerRangeState RangeState (Range);
1124
- clampStateAndIndicateChange (this ->getState (), RangeState);
1125
- indicateOptimisticFixpoint ();
1126
- };
1127
-
1128
- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1129
- 1U , InfoCache.getMaxWavesPerEU (*F)};
1130
-
1131
1120
// If the attribute exists, we will honor it if it is not the default.
1132
1121
if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1122
+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1123
+ 1U , InfoCache.getMaxWavesPerEU (*F)};
1133
1124
if (*Attr != MaxWavesPerEURange) {
1134
- TakeRange (*Attr);
1125
+ auto [Min, Max] = *Attr;
1126
+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1127
+ IntegerRangeState RangeState (Range);
1128
+ this ->getState () = RangeState;
1129
+ indicateOptimisticFixpoint ();
1135
1130
return ;
1136
1131
}
1137
1132
}
1138
1133
1139
- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1140
- // calculation of waves per EU involves flat work group size, we can't
1141
- // simply use an assumed flat work group size as a start point, because the
1142
- // update of flat work group size is in an inverse direction of waves per
1143
- // EU. However, we can still do something if it is an entry function. Since
1144
- // an entry function is a terminal node, and flat work group size either
1145
- // from attribute or default will be used anyway, we can take that value and
1146
- // calculate the waves per EU based on it. This result can't be updated by
1147
- // no means, but that could still allow us to propagate it.
1148
- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1149
- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1150
- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1151
- FlatWorkGroupSize = *Attr;
1152
- else
1153
- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1154
- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1155
- FlatWorkGroupSize));
1156
- }
1134
+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1135
+ indicatePessimisticFixpoint ();
1157
1136
}
1158
1137
1159
1138
ChangeStatus updateImpl (Attributor &A) override {
1160
- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1161
1139
ChangeStatus Change = ChangeStatus::UNCHANGED;
1162
1140
1163
1141
auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1166,24 +1144,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1166
1144
LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
1167
1145
<< " ->" << Func->getName () << ' \n ' );
1168
1146
1169
- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1147
+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
1170
1148
*this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1171
- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1172
- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1173
- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1174
- !AssumedGroupSize->isValidState ())
1149
+ if (!CallerAA || !CallerAA->isValidState ())
1175
1150
return false ;
1176
1151
1177
- unsigned Min, Max ;
1178
- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1179
- *Caller,
1180
- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1181
- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1182
- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1183
- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1184
- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1185
- IntegerRangeState CallerRangeState (CallerRange);
1186
- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1152
+ auto Assumed = this -> getAssumed () ;
1153
+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1154
+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1155
+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1156
+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1157
+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1158
+ IntegerRangeState RangeState (Range );
1159
+ this -> getState () = RangeState ;
1160
+ Change |= this -> getState () == Assumed ? ChangeStatus::UNCHANGED
1161
+ : ChangeStatus::CHANGED ;
1187
1162
1188
1163
return true ;
1189
1164
};
@@ -1342,6 +1317,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
1342
1317
}
1343
1318
}
1344
1319
1320
+ static void checkWavesPerEU (Module &M, TargetMachine &TM) {
1321
+ for (Function &F : M) {
1322
+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1323
+
1324
+ auto FlatWgrpSizeAttr =
1325
+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1326
+ auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1327
+ F, " amdgpu-waves-per-eu" , /* OnlyFirstRequired=*/ true );
1328
+
1329
+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1330
+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1331
+
1332
+ unsigned MinFlatWgrpSize = 1U ;
1333
+ unsigned MaxFlatWgrpSize = 1024U ;
1334
+ if (FlatWgrpSizeAttr.has_value ()) {
1335
+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1336
+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1337
+ }
1338
+
1339
+ // Start with the max range.
1340
+ unsigned Min = MinWavesPerEU;
1341
+ unsigned Max = MaxWavesPerEU;
1342
+
1343
+ // If the attribute exists, set them to the value from the attribute.
1344
+ if (WavesPerEUAttr.has_value ()) {
1345
+ Min = WavesPerEUAttr->first ;
1346
+ if (WavesPerEUAttr->second .has_value ())
1347
+ Max = *(WavesPerEUAttr->second );
1348
+ }
1349
+
1350
+ // Compute the range from flat workgroup size.
1351
+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1352
+ ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1353
+
1354
+ // For the lower bound, we have to "tighten" it.
1355
+ Min = std::max (Min, MinFromFlatWgrpSize);
1356
+ // For the upper bound, we have to "extend" it.
1357
+ Max = std::max (Max, MaxFromFlatWgrpSize);
1358
+
1359
+ // Clamp the range to the max range.
1360
+ Min = std::max (Min, MinWavesPerEU);
1361
+ Max = std::min (Max, MaxWavesPerEU);
1362
+
1363
+ // Update the attribute if it is not the max.
1364
+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1365
+ SmallString<10 > Buffer;
1366
+ raw_svector_ostream OS (Buffer);
1367
+ OS << Min << ' ,' << Max;
1368
+ F.addFnAttr (" amdgpu-waves-per-eu" , OS.str ());
1369
+ }
1370
+ }
1371
+ }
1372
+
1345
1373
static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
1346
1374
AMDGPUAttributorOptions Options,
1347
1375
ThinOrFullLTOPhase LTOPhase) {
@@ -1417,8 +1445,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1417
1445
}
1418
1446
}
1419
1447
1420
- ChangeStatus Change = A.run ();
1421
- return Change == ChangeStatus::CHANGED;
1448
+ bool Changed = A.run () == ChangeStatus::CHANGED;
1449
+
1450
+ if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1451
+ LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1452
+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1453
+ checkWavesPerEU (M, TM);
1454
+
1455
+ return Changed;
1422
1456
}
1423
1457
1424
1458
class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments