Skip to content

Commit d0efab1

Browse files
committed
[WIP][AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor
1 parent 5a194c1 commit d0efab1

File tree

7 files changed

+520
-15
lines changed

7 files changed

+520
-15
lines changed

llvm/include/llvm/IR/Argument.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ class Argument final : public Value {
176176

177177
LLVM_ABI void removeAttrs(const AttributeMask &AM);
178178

179+
LLVM_ABI void removeAttr(StringRef Kind);
180+
179181
/// Check if an argument has a given attribute.
180182
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const;
181183

llvm/lib/IR/Function.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
323323
getParent()->removeParamAttr(getArgNo(), Kind);
324324
}
325325

326+
/// Remove the string attribute named \p Kind from this argument by forwarding
/// to the parent function's parameter-attribute removal for this argument's
/// index.
void Argument::removeAttr(StringRef Kind) {
  Function *Owner = getParent();
  Owner->removeParamAttr(getArgNo(), Kind);
}
329+
326330
void Argument::removeAttrs(const AttributeMask &AM) {
327331
AttributeList AL = getParent()->getAttributes();
328332
AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 196 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
18+
#include "llvm/Analysis/UniformityAnalysis.h"
1719
#include "llvm/CodeGen/TargetPassConfig.h"
20+
#include "llvm/IR/IRBuilder.h"
1821
#include "llvm/IR/IntrinsicsAMDGPU.h"
1922
#include "llvm/IR/IntrinsicsR600.h"
2023
#include "llvm/InitializePasses.h"
@@ -1295,6 +1298,134 @@ struct AAAMDGPUNoAGPR
12951298

12961299
const char AAAMDGPUNoAGPR::ID = 0;
12971300

1301+
struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
1302+
using Base = StateWrapper<BooleanState, AbstractAttribute>;
1303+
AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1304+
1305+
/// Create an abstract attribute view for the position \p IRP.
1306+
static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
1307+
Attributor &A);
1308+
1309+
/// See AbstractAttribute::getName()
1310+
StringRef getName() const override { return "AAAMDGPUUniform"; }
1311+
1312+
const std::string getAsStr(Attributor *A) const override {
1313+
return getAssumed() ? "uniform" : "divergent";
1314+
}
1315+
1316+
void trackStatistics() const override {}
1317+
1318+
/// See AbstractAttribute::getIdAddr()
1319+
const char *getIdAddr() const override { return &ID; }
1320+
1321+
/// This function should return true if the type of the \p AA is
1322+
/// AAAMDGPUUniform
1323+
static bool classof(const AbstractAttribute *AA) {
1324+
return (AA->getIdAddr() == &ID);
1325+
}
1326+
1327+
/// Unique ID (due to the unique address)
1328+
static const char ID;
1329+
};
1330+
1331+
const char AAAMDGPUUniform::ID = 0;
1332+
1333+
/// This AA is to infer the inreg attribute for a function argument.
1334+
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1335+
AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
1336+
: AAAMDGPUUniform(IRP, A) {}
1337+
1338+
void initialize(Attributor &A) override {
1339+
Argument *Arg = getAssociatedArgument();
1340+
CallingConv::ID CC = Arg->getParent()->getCallingConv();
1341+
if (Arg->hasAttribute(Attribute::InReg)) {
1342+
indicateOptimisticFixpoint();
1343+
return;
1344+
}
1345+
if (AMDGPU::isEntryFunctionCC(CC)) {
1346+
// We only use isArgPassedInSGPR on kernel entry function argument, so
1347+
// even if we will use VPGR for inreg i1 argument passing, it will not
1348+
// affect this.
1349+
if (AMDGPU::isArgPassedInSGPR(Arg))
1350+
indicateOptimisticFixpoint();
1351+
else
1352+
indicatePessimisticFixpoint();
1353+
}
1354+
}
1355+
1356+
ChangeStatus updateImpl(Attributor &A) override {
1357+
unsigned ArgNo = getAssociatedArgument()->getArgNo();
1358+
1359+
auto isUniform = [&](AbstractCallSite ACS) -> bool {
1360+
CallBase *CB = ACS.getInstruction();
1361+
Value *V = CB->getArgOperandUse(ArgNo);
1362+
if (isa<Constant>(V))
1363+
return true;
1364+
Function *F = nullptr;
1365+
if (auto *Arg = dyn_cast<Argument>(V)) {
1366+
auto *AA =
1367+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
1368+
if (AA)
1369+
return AA->isValidState();
1370+
F = Arg->getParent();
1371+
} else if (auto *I = dyn_cast<Instruction>(V)) {
1372+
F = I->getFunction();
1373+
}
1374+
1375+
if (F) {
1376+
auto *UA =
1377+
A.getInfoCache()
1378+
.getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
1379+
return UA && UA->isUniform(V);
1380+
}
1381+
1382+
return false;
1383+
};
1384+
1385+
bool UsedAssumedInformation = true;
1386+
if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
1387+
UsedAssumedInformation))
1388+
return indicatePessimisticFixpoint();
1389+
1390+
if (!UsedAssumedInformation)
1391+
return indicateOptimisticFixpoint();
1392+
1393+
return ChangeStatus::UNCHANGED;
1394+
}
1395+
1396+
ChangeStatus manifest(Attributor &A) override {
1397+
Argument *Arg = getAssociatedArgument();
1398+
// If the argument already has inreg attribute, we will not do anything
1399+
// about it.
1400+
if (Arg->hasAttribute(Attribute::InReg))
1401+
return ChangeStatus::UNCHANGED;
1402+
if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
1403+
return ChangeStatus::UNCHANGED;
1404+
// We don't directly emit readfirstlane here because it will cause multiple
1405+
// replacements of a single use in the manifest map, which is not supported
1406+
// at this moment.
1407+
// Add both inreg and "uniform" attribute to the argument. We will emit a
1408+
// readfirstlane at each call site for inreg uniform argument, and the
1409+
// "uniform" attribute will be removed later.
1410+
LLVMContext &Ctx = Arg->getContext();
1411+
return A.manifestAttrs(getIRPosition(),
1412+
{Attribute::get(Ctx, Attribute::InReg),
1413+
Attribute::get(Ctx, "uniform")});
1414+
}
1415+
};
1416+
1417+
/// Allocate the concrete AAAMDGPUUniform implementation for \p IRP in the
/// Attributor's allocator. Only argument positions are supported; any other
/// position kind is unreachable.
AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  // TODO: Since inreg is also allowed for return value, maybe we need to add
  // AAAMDGPUUniformCallSiteReturned?
  if (IRP.getPositionKind() != IRPosition::IRP_ARGUMENT)
    llvm_unreachable("not a valid position for AAAMDGPUUniform");

  return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
}
1428+
12981429
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
12991430
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
13001431
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1494,64 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13631494
return Changed;
13641495
}
13651496

1497+
/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1498+
/// each call site. The inreg uniform attribute combination is set by
1499+
/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
1500+
/// where failing to emit a waterfall loop for 'inreg' arguments may result in
1501+
/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1502+
/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
1503+
/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1504+
/// appropriate.
1505+
static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
1506+
bool Changed = false;
1507+
std::vector<std::pair<CallBase *, unsigned>> WorkList;
1508+
1509+
for (Function &F : M) {
1510+
if (F.isDeclaration())
1511+
continue;
1512+
for (Argument &Arg : F.args()) {
1513+
if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
1514+
continue;
1515+
unsigned ArgNo = Arg.getArgNo();
1516+
for (Use &U : F.uses()) {
1517+
auto *CB = dyn_cast<CallBase>(U.getUser());
1518+
if (!CB)
1519+
continue;
1520+
Value *CSArg = CB->getArgOperand(ArgNo);
1521+
// We don't need readfirstvalue for a global value.
1522+
if (isa<GlobalValue>(CSArg))
1523+
continue;
1524+
// We will skip the call site argument when itself is an inreg argument.
1525+
// In this case, it will already be in SGPR.
1526+
if (auto *CSArgArg = dyn_cast<Argument>(CSArg)) {
1527+
if (CSArgArg->hasAttribute(Attribute::InReg))
1528+
continue;
1529+
}
1530+
WorkList.emplace_back(CB, ArgNo);
1531+
}
1532+
Arg.removeAttr("uniform");
1533+
Changed = true;
1534+
}
1535+
}
1536+
1537+
if (WorkList.empty())
1538+
return Changed;
1539+
1540+
for (auto &[CB, ArgNo] : WorkList) {
1541+
Value *V = CB->getArgOperand(ArgNo);
1542+
IRBuilder<> Builder(CB);
1543+
Value *NewV = Builder.CreateIntrinsic(V->getType(),
1544+
Intrinsic::amdgcn_readfirstlane, {V});
1545+
CB->setArgOperand(ArgNo, NewV);
1546+
if (auto *I = dyn_cast<Instruction>(V)) {
1547+
if (I->use_empty())
1548+
I->eraseFromParent();
1549+
}
1550+
}
1551+
1552+
return true;
1553+
}
1554+
13661555
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13671556
AMDGPUAttributorOptions Options,
13681557
ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1570,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13811570
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13821571
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13831572
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384-
&AAInstanceInfo::ID});
1573+
&AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
13851574

13861575
AttributorConfig AC(CGUpdater);
13871576
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1434,11 +1623,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14341623
IRPosition::value(*CmpX->getPointerOperand()));
14351624
}
14361625
}
1626+
1627+
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1628+
for (auto &Arg : F->args())
1629+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
1630+
}
14371631
}
14381632

14391633
bool Changed = A.run() == ChangeStatus::CHANGED;
14401634

14411635
Changed |= updateWavesPerEU(M, TM);
1636+
Changed |= emitReadFirstLaneForInregUniformArgs(M);
14421637

14431638
return Changed;
14441639
}

llvm/test/CodeGen/AMDGPU/aa-as-infer.ll

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4
9090

9191
define internal void @can_infer_cmpxchg(ptr %word) {
9292
; CHECK-LABEL: define internal void @can_infer_cmpxchg(
93-
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
93+
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
9494
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
9595
; CHECK-NEXT: [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
9696
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {
144144

145145
define internal void @can_infer_atomicrmw(ptr %word) {
146146
; CHECK-LABEL: define internal void @can_infer_atomicrmw(
147-
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
147+
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
148148
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
149149
; CHECK-NEXT: [[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4
150150
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
@@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) {
215215
; CHECK-LABEL: define void @foo(
216216
; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
217217
; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr
218-
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
219-
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
218+
; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
219+
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]])
220+
; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
221+
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]])
220222
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
221223
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
222224
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]])
223-
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
224-
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
225+
; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
226+
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]])
227+
; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
228+
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]])
225229
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
226230
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
227231
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]])

0 commit comments

Comments
 (0)