14
14
#include " GCNSubtarget.h"
15
15
#include " Utils/AMDGPUBaseInfo.h"
16
16
#include " llvm/Analysis/CycleAnalysis.h"
17
+ #include " llvm/Analysis/TargetTransformInfo.h"
18
+ #include " llvm/Analysis/UniformityAnalysis.h"
17
19
#include " llvm/CodeGen/TargetPassConfig.h"
20
+ #include " llvm/IR/IRBuilder.h"
18
21
#include " llvm/IR/IntrinsicsAMDGPU.h"
19
22
#include " llvm/IR/IntrinsicsR600.h"
20
23
#include " llvm/InitializePasses.h"
@@ -1295,6 +1298,130 @@ struct AAAMDGPUNoAGPR
1295
1298
1296
1299
// Unique ID for AAAMDGPUNoAGPR; the address of this variable identifies the
// attribute kind (see AbstractAttribute::getIdAddr / classof).
const char AAAMDGPUNoAGPR::ID = 0;
1297
1300
1301
+ struct AAAMDGPUUniform : public StateWrapper <BooleanState, AbstractAttribute> {
1302
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
1303
+ AAAMDGPUUniform (const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1304
+
1305
+ // / Create an abstract attribute view for the position \p IRP.
1306
+ static AAAMDGPUUniform &createForPosition (const IRPosition &IRP,
1307
+ Attributor &A);
1308
+
1309
+ // / See AbstractAttribute::getName()
1310
+ const std::string getName () const override { return " AAAMDGPUUniform" ; }
1311
+
1312
+ const std::string getAsStr (Attributor *A) const override {
1313
+ return getAssumed () ? " inreg" : " non-inreg" ;
1314
+ }
1315
+
1316
+ void trackStatistics () const override {}
1317
+
1318
+ // / See AbstractAttribute::getIdAddr()
1319
+ const char *getIdAddr () const override { return &ID; }
1320
+
1321
+ // / This function should return true if the type of the \p AA is
1322
+ // / AAAMDGPUUniform
1323
+ static bool classof (const AbstractAttribute *AA) {
1324
+ return (AA->getIdAddr () == &ID);
1325
+ }
1326
+
1327
+ // / Unique ID (due to the unique address)
1328
+ static const char ID;
1329
+ };
1330
+
1331
+ const char AAAMDGPUUniform::ID = 0 ;
1332
+
1333
/// Argument-position implementation of AAAMDGPUUniform: the state holds iff
/// every value passed for this argument at every (known) call site is
/// wave-uniform.
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
  AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
      : AAAMDGPUUniform(IRP, A) {}

  void initialize(Attributor &A) override {
    Argument *Arg = getAssociatedArgument();
    CallingConv::ID CC = Arg->getParent()->getCallingConv();
    // An existing inreg attribute already guarantees SGPR placement; nothing
    // further to deduce.
    if (Arg->hasAttribute(Attribute::InReg)) {
      indicateOptimisticFixpoint();
      return;
    }
    // For kernel entry points the ABI fixes argument placement up front, so
    // the answer is decided here rather than via the fixpoint iteration.
    if (AMDGPU::isEntryFunctionCC(CC)) {
      // We only use isArgPassedInSGPR on kernel entry function argument, so the
      // potential i1 argument change will not affect this.
      if (AMDGPU::isArgPassedInSGPR(Arg))
        indicateOptimisticFixpoint();
      else
        indicatePessimisticFixpoint();
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    unsigned ArgNo = getAssociatedArgument()->getArgNo();

    // A call-site operand counts as uniform if it is a constant, an argument
    // of the caller that is itself deduced uniform (recursive AA query), or a
    // value UniformityAnalysis proves uniform within the caller.
    auto isUniform = [&](AbstractCallSite ACS) -> bool {
      CallBase *CB = ACS.getInstruction();
      Value *V = CB->getArgOperandUse(ArgNo);
      if (isa<Constant>(V))
        return true;
      Function *F = nullptr;
      if (auto *Arg = dyn_cast<Argument>(V)) {
        auto *AA =
            A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
        if (AA)
          return AA->isValidState();
        // No AA available for the caller's argument; fall back to
        // UniformityAnalysis on the caller below.
        F = Arg->getParent();
      } else if (auto *I = dyn_cast<Instruction>(V)) {
        F = I->getFunction();
      }

      if (F) {
        auto *UA =
            A.getInfoCache()
                .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
        return UA && UA->isUniform(V);
      }

      // What else can it be here?
      return false;
    };

    bool UsedAssumedInformation = true;
    // All call sites must be visible and pass the predicate; otherwise give up.
    if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
                                UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    // No assumed (still-in-flux) information was used, so the result is final.
    if (!UsedAssumedInformation)
      return indicateOptimisticFixpoint();

    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    Argument *Arg = getAssociatedArgument();
    // Entry-function arguments keep their ABI-determined placement; do not
    // annotate them.
    if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
      return ChangeStatus::UNCHANGED;
    // If the argument already has inreg attribute, we will not do anything
    // about it.
    if (Arg->hasAttribute(Attribute::InReg))
      return ChangeStatus::UNCHANGED;
    // Add both inreg and "uniform" attribute to the argument. We will emit a
    // readfirstlane at each call site for inreg uniform argument, and the
    // "uniform" attribute will be removed later.
    LLVMContext &Ctx = Arg->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, Attribute::InReg),
                            Attribute::get(Ctx, "uniform")});
  }
};
1412
+
1413
+ AAAMDGPUUniform &AAAMDGPUUniform::createForPosition (const IRPosition &IRP,
1414
+ Attributor &A) {
1415
+ switch (IRP.getPositionKind ()) {
1416
+ case IRPosition::IRP_ARGUMENT:
1417
+ return *new (A.Allocator ) AAAMDGPUUniformArgument (IRP, A);
1418
+ // TODO: Since inreg is also allowed for return value, maybe we need to add
1419
+ // AAAMDGPUUniformCallSiteReturned?
1420
+ default :
1421
+ llvm_unreachable (" not a valid position for AAAMDGPUUniform" );
1422
+ }
1423
+ }
1424
+
1298
1425
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1490,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1363
1490
return Changed;
1364
1491
}
1365
1492
1493
+ // / Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1494
+ // / each call site. The inreg uniform attribute combination is set by
1495
+ // / AAAMDGPUUniform. This function provides a workaround for a downstream issue
1496
+ // / where failing to emit a waterfall loop for 'inreg' arguments may result in
1497
+ // / an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1498
+ // / loop for inreg uniform arguments here, because the 'inreg' attribute set by
1499
+ // / AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1500
+ // / appropriate.
1501
+ static bool emitReadFirstLaneForInregUniformArgs (Module &M) {
1502
+ std::vector<std::pair<CallBase *, unsigned >> WorkList;
1503
+
1504
+ for (Function &F : M) {
1505
+ if (F.isDeclaration ())
1506
+ continue ;
1507
+ for (Argument &Arg : F.args ()) {
1508
+ if (!Arg.hasAttribute (Attribute::InReg) || !Arg.hasAttribute (" uniform" ))
1509
+ continue ;
1510
+ unsigned ArgNo = Arg.getArgNo ();
1511
+ for (Use &U : F.uses ()) {
1512
+ auto *CB = dyn_cast<CallBase>(U.getUser ());
1513
+ if (!CB)
1514
+ continue ;
1515
+ // We will skip the call site argument when itself is an inreg argument.
1516
+ // In this case, it will already be in SGPR.
1517
+ if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand (ArgNo))) {
1518
+ if (CSArg->hasAttribute (Attribute::InReg))
1519
+ continue ;
1520
+ }
1521
+ WorkList.emplace_back (CB, ArgNo);
1522
+ }
1523
+ // We don't count this as changed since it just stays within this pass.
1524
+ Arg.removeAttr (" uniform" );
1525
+ }
1526
+ }
1527
+
1528
+ if (WorkList.empty ())
1529
+ return false ;
1530
+
1531
+ for (auto &[CB, ArgNo] : WorkList) {
1532
+ Value *V = CB->getArgOperand (ArgNo);
1533
+ IRBuilder<> Builder (CB);
1534
+ Value *NewV = Builder.CreateIntrinsic (V->getType (),
1535
+ Intrinsic::amdgcn_readfirstlane, {V});
1536
+ CB->setArgOperand (ArgNo, NewV);
1537
+ if (auto *I = dyn_cast<Instruction>(V)) {
1538
+ if (I->use_empty ())
1539
+ I->eraseFromParent ();
1540
+ }
1541
+ }
1542
+
1543
+ return true ;
1544
+ }
1545
+
1366
1546
static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
1367
1547
AMDGPUAttributorOptions Options,
1368
1548
ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1561,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1381
1561
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1382
1562
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1383
1563
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384
- &AAInstanceInfo::ID});
1564
+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID });
1385
1565
1386
1566
AttributorConfig AC (CGUpdater);
1387
1567
AC.IsClosedWorldModule = Options.IsClosedWorld ;
@@ -1434,11 +1614,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1434
1614
IRPosition::value (*CmpX->getPointerOperand ()));
1435
1615
}
1436
1616
}
1617
+
1618
+ if (!AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1619
+ for (auto &Arg : F->args ())
1620
+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (Arg));
1621
+ }
1437
1622
}
1438
1623
1439
1624
bool Changed = A.run () == ChangeStatus::CHANGED;
1440
1625
1441
1626
Changed |= updateWavesPerEU (M, TM);
1627
+ Changed |= emitReadFirstLaneForInregUniformArgs (M);
1442
1628
1443
1629
return Changed;
1444
1630
}
@@ -1466,6 +1652,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
1466
1652
1467
1653
void getAnalysisUsage (AnalysisUsage &AU) const override {
1468
1654
AU.addRequired <CycleInfoWrapperPass>();
1655
+ AU.addRequired <UniformityInfoWrapperPass>();
1469
1656
}
1470
1657
1471
1658
StringRef getPassName () const override { return " AMDGPU Attributor" ; }
0 commit comments