Skip to content

Commit 1c64e7c

Browse files
committed
[WIP][AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor
1 parent 569b6f6 commit 1c64e7c

File tree

7 files changed

+473
-15
lines changed

7 files changed

+473
-15
lines changed

llvm/include/llvm/IR/Argument.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ class Argument final : public Value {
173173
/// Remove attributes from an argument.
174174
void removeAttr(Attribute::AttrKind Kind);
175175

176+
/// Remove the string attribute named \p Kind from this argument.
void removeAttr(StringRef Kind);
177+
176178
void removeAttrs(const AttributeMask &AM);
177179

178180
/// Check if an argument has a given attribute.

llvm/lib/IR/Function.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
340340
getParent()->removeParamAttr(getArgNo(), Kind);
341341
}
342342

343+
/// Remove the string attribute named \p Kind from this argument by forwarding
/// the request, together with this argument's index, to the parent function.
void Argument::removeAttr(StringRef Kind) {
  const unsigned ArgIdx = getArgNo();
  getParent()->removeParamAttr(ArgIdx, Kind);
}
346+
343347
void Argument::removeAttrs(const AttributeMask &AM) {
344348
AttributeList AL = getParent()->getAttributes();
345349
AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 188 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
18+
#include "llvm/Analysis/UniformityAnalysis.h"
1719
#include "llvm/CodeGen/TargetPassConfig.h"
20+
#include "llvm/IR/IRBuilder.h"
1821
#include "llvm/IR/IntrinsicsAMDGPU.h"
1922
#include "llvm/IR/IntrinsicsR600.h"
2023
#include "llvm/InitializePasses.h"
@@ -1299,6 +1302,130 @@ struct AAAMDGPUNoAGPR
12991302

13001303
const char AAAMDGPUNoAGPR::ID = 0;
13011304

1305+
struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
1306+
using Base = StateWrapper<BooleanState, AbstractAttribute>;
1307+
AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1308+
1309+
/// Create an abstract attribute view for the position \p IRP.
1310+
static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
1311+
Attributor &A);
1312+
1313+
/// See AbstractAttribute::getName()
1314+
const std::string getName() const override { return "AAAMDGPUUniform"; }
1315+
1316+
const std::string getAsStr(Attributor *A) const override {
1317+
return getAssumed() ? "inreg" : "non-inreg";
1318+
}
1319+
1320+
void trackStatistics() const override {}
1321+
1322+
/// See AbstractAttribute::getIdAddr()
1323+
const char *getIdAddr() const override { return &ID; }
1324+
1325+
/// This function should return true if the type of the \p AA is
1326+
/// AAAMDGPUUniform
1327+
static bool classof(const AbstractAttribute *AA) {
1328+
return (AA->getIdAddr() == &ID);
1329+
}
1330+
1331+
/// Unique ID (due to the unique address)
1332+
static const char ID;
1333+
};
1334+
1335+
const char AAAMDGPUUniform::ID = 0;
1336+
1337+
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1338+
AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
1339+
: AAAMDGPUUniform(IRP, A) {}
1340+
1341+
void initialize(Attributor &A) override {
1342+
Argument *Arg = getAssociatedArgument();
1343+
CallingConv::ID CC = Arg->getParent()->getCallingConv();
1344+
if (Arg->hasAttribute(Attribute::InReg)) {
1345+
indicateOptimisticFixpoint();
1346+
return;
1347+
}
1348+
if (AMDGPU::isEntryFunctionCC(CC)) {
1349+
// We only use isArgPassedInSGPR on kernel entry function argument, so the
1350+
// potential i1 argument change will not affect this.
1351+
if (AMDGPU::isArgPassedInSGPR(Arg))
1352+
indicateOptimisticFixpoint();
1353+
else
1354+
indicatePessimisticFixpoint();
1355+
}
1356+
}
1357+
1358+
ChangeStatus updateImpl(Attributor &A) override {
1359+
unsigned ArgNo = getAssociatedArgument()->getArgNo();
1360+
1361+
auto isUniform = [&](AbstractCallSite ACS) -> bool {
1362+
CallBase *CB = ACS.getInstruction();
1363+
Value *V = CB->getArgOperandUse(ArgNo);
1364+
if (isa<Constant>(V))
1365+
return true;
1366+
Function *F = nullptr;
1367+
if (auto *Arg = dyn_cast<Argument>(V)) {
1368+
auto *AA =
1369+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
1370+
if (AA)
1371+
return AA->isValidState();
1372+
F = Arg->getParent();
1373+
} else if (auto *I = dyn_cast<Instruction>(V)) {
1374+
F = I->getFunction();
1375+
}
1376+
1377+
if (F) {
1378+
auto *UA =
1379+
A.getInfoCache()
1380+
.getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
1381+
return UA && UA->isUniform(V);
1382+
}
1383+
1384+
// What else can it be here?
1385+
return false;
1386+
};
1387+
1388+
bool UsedAssumedInformation = true;
1389+
if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
1390+
UsedAssumedInformation))
1391+
return indicatePessimisticFixpoint();
1392+
1393+
if (!UsedAssumedInformation)
1394+
return indicateOptimisticFixpoint();
1395+
1396+
return ChangeStatus::UNCHANGED;
1397+
}
1398+
1399+
ChangeStatus manifest(Attributor &A) override {
1400+
Argument *Arg = getAssociatedArgument();
1401+
if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
1402+
return ChangeStatus::UNCHANGED;
1403+
// If the argument already has inreg attribute, we will not do anything
1404+
// about it.
1405+
if (Arg->hasAttribute(Attribute::InReg))
1406+
return ChangeStatus::UNCHANGED;
1407+
// Add both inreg and "uniform" attribute to the argument. We will emit a
1408+
// readfirstlane at each call site for inreg uniform argument, and the
1409+
// "uniform" attribute will be removed later.
1410+
LLVMContext &Ctx = Arg->getContext();
1411+
return A.manifestAttrs(getIRPosition(),
1412+
{Attribute::get(Ctx, Attribute::InReg),
1413+
Attribute::get(Ctx, "uniform")});
1414+
}
1415+
};
1416+
1417+
/// Allocate the concrete AAAMDGPUUniform implementation for \p IRP in the
/// Attributor's allocator. Only argument positions are supported; any other
/// position kind is treated as unreachable.
AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_ARGUMENT)
    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);

  // TODO: Since inreg is also allowed for return value, maybe we need to add
  // AAAMDGPUUniformCallSiteReturned?
  llvm_unreachable("not a valid position for AAAMDGPUUniform");
}
1428+
13021429
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
13031430
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
13041431
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1367,6 +1494,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13671494
return Changed;
13681495
}
13691496

1497+
/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1498+
/// each call site. The inreg uniform attribute combination is set by
1499+
/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
1500+
/// where failing to emit a waterfall loop for 'inreg' arguments may result in
1501+
/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1502+
/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
1503+
/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1504+
/// appropriate.
1505+
static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
1506+
std::vector<std::pair<CallBase *, unsigned>> WorkList;
1507+
1508+
for (Function &F : M) {
1509+
if (F.isDeclaration())
1510+
continue;
1511+
for (Argument &Arg : F.args()) {
1512+
if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
1513+
continue;
1514+
unsigned ArgNo = Arg.getArgNo();
1515+
for (Use &U : F.uses()) {
1516+
auto *CB = dyn_cast<CallBase>(U.getUser());
1517+
if (!CB)
1518+
continue;
1519+
// We will skip the call site argument when itself is an inreg argument.
1520+
// In this case, it will already be in SGPR.
1521+
if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand(ArgNo))) {
1522+
if (CSArg->hasAttribute(Attribute::InReg))
1523+
continue;
1524+
}
1525+
WorkList.emplace_back(CB, ArgNo);
1526+
}
1527+
// We don't count this as changed since it just stays within this pass.
1528+
Arg.removeAttr("uniform");
1529+
}
1530+
}
1531+
1532+
if (WorkList.empty())
1533+
return false;
1534+
1535+
for (auto &[CB, ArgNo] : WorkList) {
1536+
Value *V = CB->getArgOperand(ArgNo);
1537+
IRBuilder<> Builder(CB);
1538+
Value *NewV = Builder.CreateIntrinsic(V->getType(),
1539+
Intrinsic::amdgcn_readfirstlane, {V});
1540+
CB->setArgOperand(ArgNo, NewV);
1541+
if (auto *I = dyn_cast<Instruction>(V)) {
1542+
if (I->use_empty())
1543+
I->eraseFromParent();
1544+
}
1545+
}
1546+
1547+
return true;
1548+
}
1549+
13701550
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13711551
AMDGPUAttributorOptions Options,
13721552
ThinOrFullLTOPhase LTOPhase) {
@@ -1385,7 +1565,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13851565
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13861566
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13871567
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1388-
&AAInstanceInfo::ID});
1568+
&AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
13891569

13901570
AttributorConfig AC(CGUpdater);
13911571
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1438,11 +1618,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14381618
IRPosition::value(*CmpX->getPointerOperand()));
14391619
}
14401620
}
1621+
1622+
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1623+
for (auto &Arg : F->args())
1624+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
1625+
}
14411626
}
14421627

14431628
bool Changed = A.run() == ChangeStatus::CHANGED;
14441629

14451630
Changed |= updateWavesPerEU(M, TM);
1631+
Changed |= emitReadFirstLaneForInregUniformArgs(M);
14461632

14471633
return Changed;
14481634
}
@@ -1470,6 +1656,7 @@ class AMDGPUAttributorLegacy : public ModulePass {
14701656

14711657
/// Register the analyses the legacy pass requires: cycle info and uniformity
/// info.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  // Requested in the same order as before; both are required analyses.
  AU.addRequired<CycleInfoWrapperPass>();
  AU.addRequired<UniformityInfoWrapperPass>();
}
14741661

14751662
StringRef getPassName() const override { return "AMDGPU Attributor"; }

llvm/test/CodeGen/AMDGPU/aa-as-infer.ll

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4
9090

9191
define internal void @can_infer_cmpxchg(ptr %word) {
9292
; CHECK-LABEL: define internal void @can_infer_cmpxchg(
93-
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
93+
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
9494
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
9595
; CHECK-NEXT: [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
9696
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {
144144

145145
define internal void @can_infer_atomicrmw(ptr %word) {
146146
; CHECK-LABEL: define internal void @can_infer_atomicrmw(
147-
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
147+
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
148148
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
149149
; CHECK-NEXT: [[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4
150150
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) {
215215
; CHECK-LABEL: define void @foo(
216216
; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
217217
; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr
218-
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
219-
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
218+
; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
219+
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]])
220+
; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
221+
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]])
220222
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
221223
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
222224
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]])
223-
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
224-
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
225+
; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
226+
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]])
227+
; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
228+
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]])
225229
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
226230
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
227231
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]])

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
define internal fastcc void @foo(ptr %kg) {
1010
; CHECK-LABEL: define internal fastcc void @foo(
11-
; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
11+
; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
1212
; CHECK-NEXT: [[ENTRY:.*:]]
1313
; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
1414
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
@@ -80,7 +80,8 @@ define amdgpu_kernel void @kernel() #0 {
8080
; CHECK-NEXT: [[KGLOBALS_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[SD]] to ptr
8181
; CHECK-NEXT: [[NUM_CLOSURE_I_I:%.*]] = getelementptr i8, ptr addrspace(5) [[SD]], i32 276
8282
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[NUM_CLOSURE_I_I]], align 4
83-
; CHECK-NEXT: call fastcc void @foo(ptr [[KGLOBALS_ASCAST1]])
83+
; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[KGLOBALS_ASCAST1]])
84+
; CHECK-NEXT: call fastcc void @foo(ptr [[TMP0]])
8485
; CHECK-NEXT: ret void
8586
;
8687
entry:

0 commit comments

Comments
 (0)