Skip to content

[AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor #101609

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/Argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ class Argument final : public Value {

LLVM_ABI void removeAttrs(const AttributeMask &AM);

LLVM_ABI void removeAttr(StringRef Kind);

/// Check if an argument has a given attribute.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const;

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/IR/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,10 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
getParent()->removeParamAttr(getArgNo(), Kind);
}

/// Remove a string ("target-dependent") attribute \p Kind from this argument.
/// Delegates to the parent function's attribute list, keyed by this
/// argument's index.
void Argument::removeAttr(StringRef Kind) {
  getParent()->removeParamAttr(getArgNo(), Kind);
}

void Argument::removeAttrs(const AttributeMask &AM) {
AttributeList AL = getParent()->getAttributes();
AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);
Expand Down
197 changes: 196 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/InitializePasses.h"
Expand Down Expand Up @@ -1295,6 +1298,134 @@ struct AAAMDGPUNoAGPR

const char AAAMDGPUNoAGPR::ID = 0;

/// Boolean abstract attribute tracking whether the associated IR position is
/// assumed uniform (same value in every lane). Assumed state "uniform" decays
/// to known "divergent" at a pessimistic fixpoint.
struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDGPUUniform"; }

  /// Human-readable summary of the assumed state, for debug output.
  const std::string getAsStr(Attributor *A) const override {
    return getAssumed() ? "uniform" : "divergent";
  }

  /// No statistics are tracked for this attribute.
  void trackStatistics() const override {}

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUUniform
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUUniform::ID = 0;

/// This AA is to infer the inreg attribute for a function argument.
/// This AA is to infer the inreg attribute for a function argument.
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
  AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
      : AAAMDGPUUniform(IRP, A) {}

  void initialize(Attributor &A) override {
    Argument *Arg = getAssociatedArgument();
    CallingConv::ID CC = Arg->getParent()->getCallingConv();
    // An argument that already carries inreg is trivially settled.
    if (Arg->hasAttribute(Attribute::InReg)) {
      indicateOptimisticFixpoint();
      return;
    }
    if (AMDGPU::isEntryFunctionCC(CC)) {
      // We only use isArgPassedInSGPR on kernel entry function argument, so
      // even if we will use VGPR for inreg i1 argument passing, it will not
      // affect this.
      if (AMDGPU::isArgPassedInSGPR(Arg))
        indicateOptimisticFixpoint();
      else
        indicatePessimisticFixpoint();
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    unsigned ArgNo = getAssociatedArgument()->getArgNo();

    // The argument is uniform only if the corresponding operand is uniform
    // (proven or assumed) at every call site of its parent function.
    auto isUniform = [&](AbstractCallSite ACS) -> bool {
      CallBase *CB = ACS.getInstruction();
      Value *V = CB->getArgOperandUse(ArgNo);
      // Constants are uniform by construction.
      if (isa<Constant>(V))
        return true;
      Function *F = nullptr;
      if (auto *Arg = dyn_cast<Argument>(V)) {
        // Recurse into the caller's own argument via its AAAMDGPUUniform.
        auto *AA =
            A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(*Arg));
        if (AA)
          return AA->isValidState();
        F = Arg->getParent();
      } else if (auto *I = dyn_cast<Instruction>(V)) {
        F = I->getFunction();
      }

      if (F) {
        // Fall back to the caller's uniformity analysis for instructions or
        // arguments without an AA.
        auto *UA =
            A.getInfoCache()
                .getAnalysisResultForFunction<UniformityInfoAnalysis>(*F);
        return UA && UA->isUniform(V);
      }

      return false;
    };

    bool UsedAssumedInformation = true;
    if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
                                UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    // Only lock in the optimistic state once the result no longer depends on
    // assumed (still-converging) information.
    if (!UsedAssumedInformation)
      return indicateOptimisticFixpoint();

    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    Argument *Arg = getAssociatedArgument();
    // If the argument already has inreg attribute, we will not do anything
    // about it.
    if (Arg->hasAttribute(Attribute::InReg))
      return ChangeStatus::UNCHANGED;
    // Entry-function arguments are handled entirely in initialize(); never
    // rewrite their attributes here.
    if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
      return ChangeStatus::UNCHANGED;
    // We don't directly emit readfirstlane here because it will cause multiple
    // replacements of a single use in the manifest map, which is not supported
    // at this moment.
    // Add both inreg and "uniform" attribute to the argument. We will emit a
    // readfirstlane at each call site for inreg uniform argument, and the
    // "uniform" attribute will be removed later.
    LLVMContext &Ctx = Arg->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, Attribute::InReg),
                            Attribute::get(Ctx, "uniform")});
  }
};

/// Factory for AAAMDGPUUniform: only the argument position is supported.
AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_ARGUMENT)
    return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
  // TODO: Since inreg is also allowed for return value, maybe we need to add
  // AAAMDGPUUniformCallSiteReturned?
  llvm_unreachable("not a valid position for AAAMDGPUUniform");
}

/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
/// Both attributes start with narrow ranges that expand during iteration.
Expand Down Expand Up @@ -1363,6 +1494,64 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
return Changed;
}

/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
/// each call site. The inreg uniform attribute combination is set by
/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
/// where failing to emit a waterfall loop for 'inreg' arguments may result in
/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
/// appropriate.
/// Emit the readfirstlane intrinsic for all inreg uniform function arguments at
/// each call site. The inreg uniform attribute combination is set by
/// AAAMDGPUUniform. This function provides a workaround for a downstream issue
/// where failing to emit a waterfall loop for 'inreg' arguments may result in
/// an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
/// loop for inreg uniform arguments here, because the 'inreg' attribute set by
/// AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
/// appropriate.
///
/// \returns true if any "uniform" attribute was stripped or any call site was
/// rewritten.
static bool emitReadFirstLaneForInregUniformArgs(Module &M) {
  bool Changed = false;
  std::vector<std::pair<CallBase *, unsigned>> WorkList;

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;
    for (Argument &Arg : F.args()) {
      if (!Arg.hasAttribute(Attribute::InReg) || !Arg.hasAttribute("uniform"))
        continue;
      unsigned ArgNo = Arg.getArgNo();
      for (Use &U : F.uses()) {
        auto *CB = dyn_cast<CallBase>(U.getUser());
        // Only direct calls where F is the callee: F may also appear as a
        // plain call operand (e.g. passed as a function pointer), in which
        // case ArgNo would index an unrelated operand of that call.
        if (!CB || !CB->isCallee(&U))
          continue;
        // Guard against signature-mismatched calls with fewer operands.
        if (ArgNo >= CB->arg_size())
          continue;
        Value *CSArg = CB->getArgOperand(ArgNo);
        // We don't need readfirstlane for a global value.
        if (isa<GlobalValue>(CSArg))
          continue;
        // We will skip the call site argument when itself is an inreg argument.
        // In this case, it will already be in SGPR.
        if (auto *CSArgArg = dyn_cast<Argument>(CSArg)) {
          if (CSArgArg->hasAttribute(Attribute::InReg))
            continue;
        }
        WorkList.emplace_back(CB, ArgNo);
      }
      // The temporary "uniform" marker has served its purpose; drop it so it
      // never survives into the final IR.
      Arg.removeAttr("uniform");
      Changed = true;
    }
  }

  // Copy the pair by value: both elements are cheap scalars.
  for (auto [CB, ArgNo] : WorkList) {
    Value *V = CB->getArgOperand(ArgNo);
    IRBuilder<> Builder(CB);
    Value *NewV = Builder.CreateIntrinsic(V->getType(),
                                          Intrinsic::amdgcn_readfirstlane, {V});
    CB->setArgOperand(ArgNo, NewV);
    // If the readfirstlane consumed the last use of an instruction-defined
    // operand, clean it up.
    if (auto *I = dyn_cast<Instruction>(V)) {
      if (I->use_empty())
        I->eraseFromParent();
    }
  }

  // A non-empty WorkList implies at least one argument matched above, so
  // Changed already reflects the rewrites.
  return Changed;
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options,
ThinOrFullLTOPhase LTOPhase) {
Expand All @@ -1381,7 +1570,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
&AAInstanceInfo::ID});
&AAInstanceInfo::ID, &AAAMDGPUUniform::ID});

AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
Expand Down Expand Up @@ -1434,11 +1623,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
IRPosition::value(*CmpX->getPointerOperand()));
}
}

if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
for (auto &Arg : F->args())
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
}
}

bool Changed = A.run() == ChangeStatus::CHANGED;

Changed |= updateWavesPerEU(M, TM);
Changed |= emitReadFirstLaneForInregUniformArgs(M);

return Changed;
}
Expand Down
16 changes: 10 additions & 6 deletions llvm/test/CodeGen/AMDGPU/aa-as-infer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4

define internal void @can_infer_cmpxchg(ptr %word) {
; CHECK-LABEL: define internal void @can_infer_cmpxchg(
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
; CHECK-NEXT: [[CMPXCHG_0:%.*]] = cmpxchg ptr addrspace(1) [[TMP1]], i32 0, i32 4 monotonic monotonic, align 4
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
Expand Down Expand Up @@ -144,7 +144,7 @@ define internal void @can_not_infer_cmpxchg(ptr %word) {

define internal void @can_infer_atomicrmw(ptr %word) {
; CHECK-LABEL: define internal void @can_infer_atomicrmw(
; CHECK-SAME: ptr [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-SAME: ptr inreg [[WORD:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
; CHECK-NEXT: [[ATOMICRMW_XCHG:%.*]] = atomicrmw xchg ptr addrspace(1) [[TMP1]], i32 12 monotonic, align 4
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[WORD]] to ptr addrspace(1)
Expand Down Expand Up @@ -215,13 +215,17 @@ define void @foo(ptr addrspace(3) %val) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr addrspace(3) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[VAL_CAST:%.*]] = addrspacecast ptr addrspace(3) [[VAL]] to ptr
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_infer_cmpxchg(ptr [[TMP2]])
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_not_infer_cmpxchg(ptr [[VAL_CAST]])
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP3]])
; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.amdgcn.readfirstlane.p0(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_infer_atomicrmw(ptr [[TMP4]])
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g1 to ptr))
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr addrspacecast (ptr addrspace(1) @g2 to ptr))
; CHECK-NEXT: call void @can_not_infer_atomicrmw(ptr [[VAL_CAST]])
Expand Down
Loading
Loading