
Commit 357582f

[WIP][AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor
1 parent 8d83d04 commit 357582f

7 files changed: +472 -15 lines changed

llvm/include/llvm/IR/Argument.h

Lines changed: 2 additions & 0 deletions

@@ -176,6 +176,8 @@ class Argument final : public Value {
 
   LLVM_ABI void removeAttrs(const AttributeMask &AM);
 
+  LLVM_ABI void removeAttr(StringRef Kind);
+
   /// Check if an argument has a given attribute.
   LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const;
 
llvm/lib/IR/Function.cpp

Lines changed: 4 additions & 0 deletions

@@ -323,6 +323,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
   getParent()->removeParamAttr(getArgNo(), Kind);
 }
 
+void Argument::removeAttr(StringRef Kind) {
+  getParent()->removeParamAttr(getArgNo(), Kind);
+}
+
 void Argument::removeAttrs(const AttributeMask &AM) {
   AttributeList AL = getParent()->getAttributes();
   AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);
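
For context, the new StringRef overload mirrors the existing AttrKind-based Argument::removeAttr and is what lets the AMDGPU attributor (below) strip its temporary string marker once call sites have been rewritten. A minimal illustrative IR sketch of such a marker on an argument (not taken from this commit; the callee name is made up):

; Intermediate state: the argument carries inreg plus a temporary "uniform"
; string attribute.
define internal void @callee(ptr inreg "uniform" %p) {
  ret void
}
; Arg.removeAttr("uniform") then leaves only:
;   define internal void @callee(ptr inreg %p)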

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 187 additions & 1 deletion

@@ -14,7 +14,10 @@
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/InitializePasses.h"
@@ -1295,6 +1298,130 @@ struct AAAMDGPUNoAGPR
 
 const char AAAMDGPUNoAGPR::ID = 0;
 
+struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
+                                            Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  StringRef getName() const override { return "AAAMDGPUUniform"; }
+
+  const std::string getAsStr(Attributor *A) const override {
+    return getAssumed() ? "inreg" : "non-inreg";
+  }
+
+  void trackStatistics() const override {}
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDGPUUniform.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDGPUUniform::ID = 0;
+
+struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
+  AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
+      : AAAMDGPUUniform(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
+    CallingConv::ID CC = Arg->getParent()->getCallingConv();
+    if (Arg->hasAttribute(Attribute::InReg)) {
+      indicateOptimisticFixpoint();
+      return;
+    }
+    if (AMDGPU::isEntryFunctionCC(CC)) {
+      // We only use isArgPassedInSGPR on kernel entry function arguments, so
+      // the potential i1 argument change will not affect this.
+      if (AMDGPU::isArgPassedInSGPR(Arg))
+        indicateOptimisticFixpoint();
+      else
+        indicatePessimisticFixpoint();
+    }
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    unsigned ArgNo = getAssociatedArgument()->getArgNo();
+
+    auto isUniform = [&](AbstractCallSite ACS) -> bool {
+      CallBase *CB = ACS.getInstruction();
+      Value *V = CB->getArgOperandUse(ArgNo);
+      if (isa<Constant>(V))
+        return true;
+      Function *F = nullptr;
+      if (auto *Arg = dyn_cast<Argument>(V)) {
+        auto *AA =
+            A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
+        if (AA)
+          return AA->isValidState();
+        F = Arg->getParent();
+      } else if (auto *I = dyn_cast<Instruction>(V)) {
+        F = I->getFunction();
+      }
+
+      if (F) {
+        auto *UA =
+            A.getInfoCache()
+                .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
+        return UA && UA->isUniform(V);
+      }
+
+      // Anything else is conservatively treated as non-uniform.
+      return false;
+    };
+
+    bool UsedAssumedInformation = true;
+    if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
+                                UsedAssumedInformation))
+      return indicatePessimisticFixpoint();
+
+    if (!UsedAssumedInformation)
+      return indicateOptimisticFixpoint();
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    Argument *Arg = getAssociatedArgument();
+    if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
+      return ChangeStatus::UNCHANGED;
+    // If the argument already has the inreg attribute, we will not do
+    // anything about it.
+    if (Arg->hasAttribute(Attribute::InReg))
+      return ChangeStatus::UNCHANGED;
+    // Add both the inreg and "uniform" attributes to the argument. We will
+    // emit a readfirstlane at each call site for an inreg uniform argument,
+    // and the "uniform" attribute will be removed later.
+    LLVMContext &Ctx = Arg->getContext();
+    return A.manifestAttrs(getIRPosition(),
+                           {Attribute::get(Ctx, Attribute::InReg),
+                            Attribute::get(Ctx, "uniform")});
+  }
+};
+
+AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
+                                                    Attributor &A) {
+  switch (IRP.getPositionKind()) {
+  case IRPosition::IRP_ARGUMENT:
+    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
+  // TODO: Since inreg is also allowed for return values, maybe we need to
+  // add AAAMDGPUUniformCallSiteReturned?
+  default:
+    llvm_unreachable("not a valid position for AAAMDGPUUniform");
+  }
+}
+
 /// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
 /// based on the finalized 'amdgpu-flat-work-group-size' attribute.
 /// Both attributes start with narrow ranges that expand during iteration.
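
To make the intended end state concrete, here is a minimal illustrative sketch (not part of the diff; the body is elided to a trivial one) of a non-entry callee before and after AAAMDGPUUniform manifests, consistent with the updated aa-as-infer.ll checks further down:

; Before the attributor runs, the pointer argument is an ordinary argument:
;   define internal void @can_infer_cmpxchg(ptr %word)
; After inreg is inferred (and the temporary "uniform" marker is stripped),
; the argument is passed in an SGPR:
define internal void @can_infer_cmpxchg(ptr inreg %word) {
  ret void
}
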
@@ -1363,6 +1490,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
   return Changed;
 }
 
+/// Emit the readfirstlane intrinsic for all inreg uniform function arguments
+/// at each call site. The inreg uniform attribute combination is set by
+/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
+/// where failing to emit a waterfall loop for 'inreg' arguments may result in
+/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
+/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
+/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
+/// appropriate.
+static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
+  std::vector<std::pair<CallBase *, unsigned>> WorkList;
+
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    for (Argument &Arg : F.args()) {
+      if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
+        continue;
+      unsigned ArgNo = Arg.getArgNo();
+      for (Use &U : F.uses()) {
+        auto *CB = dyn_cast<CallBase>(U.getUser());
+        if (!CB)
+          continue;
+        // Skip the call site argument when it is itself an inreg argument;
+        // in that case it will already be in an SGPR.
+        if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand(ArgNo))) {
+          if (CSArg->hasAttribute(Attribute::InReg))
+            continue;
+        }
+        WorkList.emplace_back(CB, ArgNo);
+      }
+      // We don't count this as changed since it just stays within this pass.
+      Arg.removeAttr("uniform");
+    }
+  }
+
+  if (WorkList.empty())
+    return false;
+
+  for (auto &[CB, ArgNo] : WorkList) {
+    Value *V = CB->getArgOperand(ArgNo);
+    IRBuilder<> Builder(CB);
+    Value *NewV = Builder.CreateIntrinsic(V->getType(),
+                                          Intrinsic::amdgcn_readfirstlane, {V});
+    CB->setArgOperand(ArgNo, NewV);
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (I->use_empty())
+        I->eraseFromParent();
+    }
+  }
+
+  return true;
+}
+
 static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                     AMDGPUAttributorOptions Options,
                     ThinOrFullLTOPhase LTOPhase) {
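
The caller-side rewrite this function performs then looks roughly as follows (illustrative IR sketch; @g, @callee, and @caller are made-up names, but the readfirstlane shape matches the updated CHECK lines in the tests below):

@g = addrspace(1) global i32 0

declare void @callee(ptr inreg)

define void @caller() {
  ; Before this patch the call site passed the pointer directly:
  ;   call void @callee(ptr addrspacecast (ptr addrspace(1) @g to ptr))
  ; After emitReadFirstLaneForInregUniformArgs the operand is routed through
  ; llvm.amdgcn.readfirstlane so the inreg (SGPR) parameter is fed a scalar
  ; value.
  %rfl = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g to ptr))
  call void @callee(ptr %rfl)
  ret void
}
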
@@ -1381,7 +1561,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
        &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
        &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
        &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
-       &AAInstanceInfo::ID});
+       &AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
 
   AttributorConfig AC(CGUpdater);
   AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1434,11 +1614,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
             IRPosition::value(*CmpX->getPointerOperand()));
       }
     }
+
+    if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+      for (auto &Arg : F->args())
+        A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
+    }
   }
 
   bool Changed = A.run() == ChangeStatus::CHANGED;
 
   Changed |= updateWavesPerEU(M, TM);
+  Changed |= emitReadFirstLaneForInregUniformArgs(M);
 
   return Changed;
 }
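
As a usage note, the new inference is exercised through the amdgpu-attributor module pass; a RUN line of the shape used by existing AMDGPU attributor tests (illustrative, assuming the usual pass name) would be:

; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s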

llvm/test/CodeGen/AMDGPU/aa-as-infer.ll

Lines changed: 10 additions & 6 deletions

@@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4
 
 define internal void @can_infer_cmpxchg(ptr %word) {
 ; CHECK-LABEL: define internal void @can_infer_cmpxchg(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
 ; CHECK-NEXT: [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {
 
 define internal void @can_infer_atomicrmw(ptr %word) {
 ; CHECK-LABEL: define internal void @can_infer_atomicrmw(
-; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
 ; CHECK-NEXT: [[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) {
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr
-; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
-; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
+; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]])
 ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
 ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
 ; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]])
-; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
-; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
+; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
+; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]])
 ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
 ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
 ; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]])

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll

Lines changed: 3 additions & 2 deletions

@@ -8,7 +8,7 @@
 
 define internal fastcc void @foo(ptr %kg) {
 ; CHECK-LABEL: define internal fastcc void @foo(
-; CHECK-SAME: ptr [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr inreg [[KG:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[CLOSURE_I25_I:%.*]] = getelementptr i8, ptr [[KG]], i64 336
 ; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
@@ -80,7 +80,8 @@ define amdgpu_kernel void @kernel() #0 {
 ; CHECK-NEXT: [[KGLOBALS_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[SD]] to ptr
 ; CHECK-NEXT: [[NUM_CLOSURE_I_I:%.*]] = getelementptr i8, ptr addrspace(5) [[SD]], i32 276
 ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[NUM_CLOSURE_I_I]], align 4
-; CHECK-NEXT: call fastcc void @foo(ptr [[KGLOBALS_ASCAST1]])
+; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr [[KGLOBALS_ASCAST1]])
+; CHECK-NEXT: call fastcc void @foo(ptr [[TMP0]])
 ; CHECK-NEXT: ret void
 ;
 entry:
