@@ -14,7 +14,10 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/InitializePasses.h"
@@ -1299,6 +1302,130 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
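+/// An abstract attribute that deduces whether a function argument is uniform
+/// across all call sites, so that it can be marked 'inreg' and passed in an
+/// SGPR.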
+struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
+                                            Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  const std::string getName() const override { return "AAAMDGPUUniform"; }
+
+  const std::string getAsStr(Attributor *A) const override {
+    return getAssumed() ? "inreg" : "non-inreg";
+  }
+
+  void trackStatistics() const override {}
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDGPUUniform.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDGPUUniform::ID = 0;
+
+struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
+  AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
+      : AAAMDGPUUniform(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
+    CallingConv::ID CC = Arg->getParent()->getCallingConv();
+    if (Arg->hasAttribute(Attribute::InReg)) {
+      indicateOptimisticFixpoint();
+      return;
+    }
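+    // For entry points the calling convention fully determines whether an
+    // argument is passed in an SGPR, so the result is known immediately.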
+    if (AMDGPU::isEntryFunctionCC(CC)) {
+      // We only use isArgPassedInSGPR on kernel entry function arguments, so
+      // the potential i1 argument change will not affect this.
+      if (AMDGPU::isArgPassedInSGPR(Arg))
+        indicateOptimisticFixpoint();
+      else
+        indicatePessimisticFixpoint();
+    }
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
+    auto isUniform = [&](AbstractCallSite ACS) -> bool {
+      CallBase *CB = ACS.getInstruction();
+      Value *V = CB->getArgOperandUse(ArgNo);
+      if (isa<Constant>(V))
+        return true;
+      Function *F = nullptr;
+      if (auto *Arg = dyn_cast<Argument>(V)) {
+        auto *AA =
+            A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
+        if (AA)
+          return AA->isValidState();
+        F = Arg->getParent();
+      } else if (auto *I = dyn_cast<Instruction>(V)) {
+        F = I->getFunction();
+      }
+
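+      // For a value defined in a function, fall back to that function's
+      // uniformity analysis.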
+      if (F) {
+        auto *UA =
+            A.getInfoCache()
+                .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
+        return UA && UA->isUniform(V);
+      }
+
+      // Anything else is conservatively treated as non-uniform.
+      return false;
+    };
+
+    bool UsedAssumedInformation = true;
+    if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+                                UsedAssumedInformation))
+      return indicatePessimisticFixpoint();
+
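+    // If no assumed information was used, all call sites are already known to
+    // pass uniform values, so the optimistic assumption is proven.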
+    if (!UsedAssumedInformation)
+      return indicateOptimisticFixpoint();
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
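+    // Kernel arguments are never rewritten here; their placement is fixed by
+    // the calling convention.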
+    if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
+      return ChangeStatus::UNCHANGED;
+    // If the argument already has the inreg attribute, we will not do
+    // anything about it.
+    if (Arg->hasAttribute(Attribute::InReg))
+      return ChangeStatus::UNCHANGED;
+    // Add both the inreg and "uniform" attributes to the argument. We will
+    // emit a readfirstlane at each call site for inreg uniform arguments, and
+    // the "uniform" attribute will be removed later.
+    LLVMContext &Ctx = Arg->getContext();
+    return A.manifestAttrs(getIRPosition(),
+                           {Attribute::get(Ctx, Attribute::InReg),
+                            Attribute::get(Ctx, "uniform")});
+  }
+};
+
+AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
+                                                    Attributor &A) {
+  switch (IRP.getPositionKind()) {
+  case IRPosition::IRP_ARGUMENT:
+    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
+  // TODO: Since inreg is also allowed for return values, maybe we need to add
+  // AAAMDGPUUniformCallSiteReturned?
+  default:
+    llvm_unreachable("not a valid position for AAAMDGPUUniform");
+  }
+}
+
 /// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
 /// based on the finalized 'amdgpu-flat-work-group-size' attribute.
 /// Both attributes start with narrow ranges that expand during iteration.
@@ -1367,6 +1494,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
   return Changed;
 }
 
+/// Emit the readfirstlane intrinsic for all inreg uniform function arguments
+/// at each call site. The inreg uniform attribute combination is set by
+/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
+/// where failing to emit a waterfall loop for 'inreg' arguments may result in
+/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
+/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
+/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
+/// appropriate.
+static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
+  std::vector<std::pair<CallBase *, unsigned>> WorkList;
+
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (Argument &Arg : F.args()) {
+      if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
+        continue;
+      unsigned ArgNo = Arg.getArgNo();
+      for (Use &U : F.uses()) {
+        auto *CB = dyn_cast<CallBase>(U.getUser());
+        if (!CB)
+          continue;
+        // Skip the call site argument if it is itself an inreg argument; in
+        // that case it is already in an SGPR.
+        if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand(ArgNo))) {
+          if (CSArg->hasAttribute(Attribute::InReg))
+            continue;
+        }
+        WorkList.emplace_back(CB, ArgNo);
+      }
+      // We don't count this as changed since it just stays within this pass.
+      Arg.removeAttr("uniform");
+    }
+  }
+
+  if (WorkList.empty())
+    return false;
+
+  for (auto &[CB, ArgNo] : WorkList) {
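+    // Route the operand through llvm.amdgcn.readfirstlane right before the
+    // call so that the callee receives a value known to be in an SGPR.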
+    Value *V = CB->getArgOperand(ArgNo);
+    IRBuilder<> Builder(CB);
+    Value *NewV = Builder.CreateIntrinsic(V->getType(),
+                                          Intrinsic::amdgcn_readfirstlane, {V});
+    CB->setArgOperand(ArgNo, NewV);
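+    // Drop the original instruction if rerouting the operand left it dead.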
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (I->use_empty())
+        I->eraseFromParent();
+    }
+  }
+
+  return true;
+}
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
@@ -1385,7 +1565,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
        &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
        &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
        &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
-       &AAInstanceInfo::ID});
+       &AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
 
   AttributorConfig AC(CGUpdater);
   AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1438,11 +1618,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
               IRPosition::value(*CmpX->getPointerOperand()));
         }
       }
+
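+    // Seed the uniformity AA for every argument of non-entry functions;
+    // entry-point arguments are fixed by the calling convention.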
+    if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+      for (auto &Arg : F->args())
+        A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
+    }
   }
 
   bool Changed = A.run() == ChangeStatus::CHANGED;
 
   Changed |= updateWavesPerEU(M, TM);
+  Changed |= emitReadFirstLaneForInregUniformArgs(M);
 
   return Changed;
 }
@@ -1470,6 +1656,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<CycleInfoWrapperPass>();
+    AU.addRequired<UniformityInfoWrapperPass>();
   }
 
   StringRef getPassName() const override { return "AMDGPU Attributor"; }