14
14
#include " GCNSubtarget.h"
15
15
#include " Utils/AMDGPUBaseInfo.h"
16
16
#include " llvm/Analysis/CycleAnalysis.h"
17
+ #include " llvm/Analysis/TargetTransformInfo.h"
18
+ #include " llvm/Analysis/UniformityAnalysis.h"
17
19
#include " llvm/CodeGen/TargetPassConfig.h"
20
+ #include " llvm/IR/IRBuilder.h"
18
21
#include " llvm/IR/IntrinsicsAMDGPU.h"
19
22
#include " llvm/IR/IntrinsicsR600.h"
20
23
#include " llvm/InitializePasses.h"
@@ -1295,6 +1298,134 @@ struct AAAMDGPUNoAGPR
1295
1298
1296
1299
const char AAAMDGPUNoAGPR::ID = 0 ;
1297
1300
1301
+ struct AAAMDGPUUniform : public StateWrapper <BooleanState, AbstractAttribute> {
1302
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
1303
+ AAAMDGPUUniform (const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1304
+
1305
+ // / Create an abstract attribute view for the position \p IRP.
1306
+ static AAAMDGPUUniform &createForPosition (const IRPosition &IRP,
1307
+ Attributor &A);
1308
+
1309
+ // / See AbstractAttribute::getName()
1310
+ StringRef getName () const override { return " AAAMDGPUUniform" ; }
1311
+
1312
+ const std::string getAsStr (Attributor *A) const override {
1313
+ return getAssumed () ? " uniform" : " divergent" ;
1314
+ }
1315
+
1316
+ void trackStatistics () const override {}
1317
+
1318
+ // / See AbstractAttribute::getIdAddr()
1319
+ const char *getIdAddr () const override { return &ID; }
1320
+
1321
+ // / This function should return true if the type of the \p AA is
1322
+ // / AAAMDGPUUniform
1323
+ static bool classof (const AbstractAttribute *AA) {
1324
+ return (AA->getIdAddr () == &ID);
1325
+ }
1326
+
1327
+ // / Unique ID (due to the unique address)
1328
+ static const char ID;
1329
+ };
1330
+
1331
+ const char AAAMDGPUUniform::ID = 0 ;
1332
+
1333
+ // / This AA is to infer the inreg attribute for a function argument.
1334
+ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1335
+ AAAMDGPUUniformArgument (const IRPosition &IRP, Attributor &A)
1336
+ : AAAMDGPUUniform(IRP, A) {}
1337
+
1338
+ void initialize (Attributor &A) override {
1339
+ Argument *Arg = getAssociatedArgument ();
1340
+ CallingConv::ID CC = Arg->getParent ()->getCallingConv ();
1341
+ if (Arg->hasAttribute (Attribute::InReg)) {
1342
+ indicateOptimisticFixpoint ();
1343
+ return ;
1344
+ }
1345
+ if (AMDGPU::isEntryFunctionCC (CC)) {
1346
+ // We only use isArgPassedInSGPR on kernel entry function argument, so
1347
+ // even if we will use VPGR for inreg i1 argument passing, it will not
1348
+ // affect this.
1349
+ if (AMDGPU::isArgPassedInSGPR (Arg))
1350
+ indicateOptimisticFixpoint ();
1351
+ else
1352
+ indicatePessimisticFixpoint ();
1353
+ }
1354
+ }
1355
+
1356
+ ChangeStatus updateImpl (Attributor &A) override {
1357
+ unsigned ArgNo = getAssociatedArgument ()->getArgNo ();
1358
+
1359
+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
1360
+ CallBase *CB = ACS.getInstruction ();
1361
+ Value *V = CB->getArgOperandUse (ArgNo);
1362
+ if (isa<Constant>(V))
1363
+ return true ;
1364
+ Function *F = nullptr ;
1365
+ if (auto *Arg = dyn_cast<Argument>(V)) {
1366
+ auto *AA =
1367
+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (*Arg));
1368
+ if (AA)
1369
+ return AA->isValidState ();
1370
+ F = Arg->getParent ();
1371
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
1372
+ F = I->getFunction ();
1373
+ }
1374
+
1375
+ if (F) {
1376
+ auto *UA =
1377
+ A.getInfoCache ()
1378
+ .getAnalysisResultForFunction <UniformityInfoAnalysis>(*F);
1379
+ return UA && UA->isUniform (V);
1380
+ }
1381
+
1382
+ return false ;
1383
+ };
1384
+
1385
+ bool UsedAssumedInformation = true ;
1386
+ if (!A.checkForAllCallSites (isUniform, *this , /* RequireAllCallSites=*/ true ,
1387
+ UsedAssumedInformation))
1388
+ return indicatePessimisticFixpoint ();
1389
+
1390
+ if (!UsedAssumedInformation)
1391
+ return indicateOptimisticFixpoint ();
1392
+
1393
+ return ChangeStatus::UNCHANGED;
1394
+ }
1395
+
1396
+ ChangeStatus manifest (Attributor &A) override {
1397
+ Argument *Arg = getAssociatedArgument ();
1398
+ // If the argument already has inreg attribute, we will not do anything
1399
+ // about it.
1400
+ if (Arg->hasAttribute (Attribute::InReg))
1401
+ return ChangeStatus::UNCHANGED;
1402
+ if (AMDGPU::isEntryFunctionCC (Arg->getParent ()->getCallingConv ()))
1403
+ return ChangeStatus::UNCHANGED;
1404
+ // We don't directly emit readfirstlane here because it will cause multiple
1405
+ // replacements of a single use in the manifest map, which is not supported
1406
+ // at this moment.
1407
+ // Add both inreg and "uniform" attribute to the argument. We will emit a
1408
+ // readfirstlane at each call site for inreg uniform argument, and the
1409
+ // "uniform" attribute will be removed later.
1410
+ LLVMContext &Ctx = Arg->getContext ();
1411
+ return A.manifestAttrs (getIRPosition (),
1412
+ {Attribute::get (Ctx, Attribute::InReg),
1413
+ Attribute::get (Ctx, " uniform" )});
1414
+ }
1415
+ };
1416
+
1417
+ AAAMDGPUUniform &AAAMDGPUUniform::createForPosition (const IRPosition &IRP,
1418
+ Attributor &A) {
1419
+ switch (IRP.getPositionKind ()) {
1420
+ case IRPosition::IRP_ARGUMENT:
1421
+ return *new (A.Allocator ) AAAMDGPUUniformArgument (IRP, A);
1422
+ // TODO: Since inreg is also allowed for return value, maybe we need to add
1423
+ // AAAMDGPUUniformCallSiteReturned?
1424
+ default :
1425
+ llvm_unreachable (" not a valid position for AAAMDGPUUniform" );
1426
+ }
1427
+ }
1428
+
1298
1429
// / Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
1299
1430
// / based on the finalized 'amdgpu-flat-work-group-size' attribute.
1300
1431
// / Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1494,64 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
1363
1494
return Changed;
1364
1495
}
1365
1496
1497
+ // / Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1498
+ // / each call site. The inreg uniform attribute combination is set by
1499
+ // / AAAMDGPUUniform. This function provides a workaround for a downstream issue
1500
+ // / where failing to emit a waterfall loop for 'inreg' arguments may result in
1501
+ // / an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1502
+ // / loop for inreg uniform arguments here, because the 'inreg' attribute set by
1503
+ // / AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1504
+ // / appropriate.
1505
+ static bool emitReadFirstLaneForInregUniformArgs (Module &M) {
1506
+ bool Changed = false ;
1507
+ std::vector<std::pair<CallBase *, unsigned >> WorkList;
1508
+
1509
+ for (Function &F : M) {
1510
+ if (F.isDeclaration ())
1511
+ continue ;
1512
+ for (Argument &Arg : F.args ()) {
1513
+ if (!Arg.hasAttribute (Attribute::InReg) || !Arg.hasAttribute (" uniform" ))
1514
+ continue ;
1515
+ unsigned ArgNo = Arg.getArgNo ();
1516
+ for (Use &U : F.uses ()) {
1517
+ auto *CB = dyn_cast<CallBase>(U.getUser ());
1518
+ if (!CB)
1519
+ continue ;
1520
+ Value *CSArg = CB->getArgOperand (ArgNo);
1521
+ // We don't need readfirstvalue for a global value.
1522
+ if (isa<GlobalValue>(CSArg))
1523
+ continue ;
1524
+ // We will skip the call site argument when itself is an inreg argument.
1525
+ // In this case, it will already be in SGPR.
1526
+ if (auto *CSArgArg = dyn_cast<Argument>(CSArg)) {
1527
+ if (CSArgArg->hasAttribute (Attribute::InReg))
1528
+ continue ;
1529
+ }
1530
+ WorkList.emplace_back (CB, ArgNo);
1531
+ }
1532
+ Arg.removeAttr (" uniform" );
1533
+ Changed = true ;
1534
+ }
1535
+ }
1536
+
1537
+ if (WorkList.empty ())
1538
+ return Changed;
1539
+
1540
+ for (auto &[CB, ArgNo] : WorkList) {
1541
+ Value *V = CB->getArgOperand (ArgNo);
1542
+ IRBuilder<> Builder (CB);
1543
+ Value *NewV = Builder.CreateIntrinsic (V->getType (),
1544
+ Intrinsic::amdgcn_readfirstlane, {V});
1545
+ CB->setArgOperand (ArgNo, NewV);
1546
+ if (auto *I = dyn_cast<Instruction>(V)) {
1547
+ if (I->use_empty ())
1548
+ I->eraseFromParent ();
1549
+ }
1550
+ }
1551
+
1552
+ return true ;
1553
+ }
1554
+
1366
1555
static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
1367
1556
AMDGPUAttributorOptions Options,
1368
1557
ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1570,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1381
1570
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
1382
1571
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1383
1572
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384
- &AAInstanceInfo::ID});
1573
+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID });
1385
1574
1386
1575
AttributorConfig AC (CGUpdater);
1387
1576
AC.IsClosedWorldModule = Options.IsClosedWorld ;
@@ -1434,11 +1623,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1434
1623
IRPosition::value (*CmpX->getPointerOperand ()));
1435
1624
}
1436
1625
}
1626
+
1627
+ if (!AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1628
+ for (auto &Arg : F->args ())
1629
+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (Arg));
1630
+ }
1437
1631
}
1438
1632
1439
1633
bool Changed = A.run () == ChangeStatus::CHANGED;
1440
1634
1441
1635
Changed |= updateWavesPerEU (M, TM);
1636
+ Changed |= emitReadFirstLaneForInregUniformArgs (M);
1442
1637
1443
1638
return Changed;
1444
1639
}
0 commit comments