Skip to content

Commit 772d36d

Browse files
committed
[AMDGPU] Improve detection of non-null addrspacecast operands
Use IR analysis to infer when an addrspacecast operand is non-null, then lower the cast to an intrinsic that tells the DAG it may skip the null-pointer check. Solves SWDEV-316445.
1 parent 0074e6b commit 772d36d

File tree

6 files changed

+218
-149
lines changed

6 files changed

+218
-149
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3196,4 +3196,11 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
31963196
[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
31973197
[IntrNoMem, IntrSpeculatable]
31983198
>;
3199+
3200+
/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's
/// src: the operand must be proven non-null, since the lowering skips the
/// usual "null maps to null" compare-and-select sequence entirely.
/// Overloaded on both result and operand pointer types; the address spaces
/// are recovered from the IR types when the call is lowered.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
  [llvm_anyptr_ty], [llvm_anyptr_ty],
  [IntrNoMem, IntrSpeculatable]
>;
31993206
}

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ static cl::opt<bool> Widen16BitOps(
5151
cl::ReallyHidden,
5252
cl::init(true));
5353

54+
// Escape hatch for the early addrspacecast lowering in AMDGPUCodeGenPrepare:
// when enabled (default), casts with provably non-null sources are rewritten
// to @llvm.amdgcn.addrspacecast.nonnull so codegen can omit the null check.
static cl::opt<bool> LowerAddrSpaceCast(
    "amdgpu-codegenprepare-addrspacecast",
    cl::desc("Detect non-null addrspacecast sources and lower them early to "
             "avoid the null pointer check"),
    cl::ReallyHidden, cl::init(true));
59+
5460
static cl::opt<bool>
5561
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
5662
cl::desc("Break large PHI nodes for DAGISel"),
@@ -99,6 +105,7 @@ class AMDGPUCodeGenPrepareImpl
99105
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100106
public:
101107
const GCNSubtarget *ST = nullptr;
108+
const AMDGPUTargetMachine *TM = nullptr;
102109
const TargetLibraryInfo *TLInfo = nullptr;
103110
AssumptionCache *AC = nullptr;
104111
DominatorTree *DT = nullptr;
@@ -310,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
310317
bool visitICmpInst(ICmpInst &I);
311318
bool visitSelectInst(SelectInst &I);
312319
bool visitPHINode(PHINode &I);
320+
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
313321

314322
bool visitIntrinsicInst(IntrinsicInst &I);
315323
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -2013,6 +2021,78 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
20132021
return true;
20142022
}
20152023

2024+
// Rewrite an addrspacecast whose source is provably non-null into
// @llvm.amdgcn.addrspacecast.nonnull, so later lowering (DAG or GlobalISel)
// can skip the null-pointer compare-and-select.
//
// \param I the cast being visited; erased and replaced on success.
// \returns true if the cast was replaced with the intrinsic.
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!LowerAddrSpaceCast)
    return false;

  // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
  // This is only worthwhile for casts from/to priv/local to flat, since those
  // are the only ones that lower to a null check.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  // Check the Src operand, and look through Phis. Every leaf reachable from
  // the source must be provably non-null for the rewrite to be valid.
  SmallVector<Value *, 4> WorkList;
  DenseSet<const PHINode *> SeenPHIs;
  WorkList.push_back(I.getOperand(0));
  while (!WorkList.empty()) {
    Value *Cur = getUnderlyingObject(WorkList.pop_back_val());

    // Pointer cannot be null if it's a block address, GV or alloca.
    // NOTE: We don't support extern_weak, but if we did, we'd need to check for
    // it as the symbol could be null in such cases.
    if (isa<BlockAddress>(Cur) || isa<GlobalValue>(Cur) || isa<AllocaInst>(Cur))
      continue;

    // Check nonnull arguments.
    if (const auto *Arg = dyn_cast<Argument>(Cur); Arg && Arg->hasNonNullAttr())
      continue;

    // TODO: Calls that return nonnull?

    // Look through PHIs - add all incoming values to the queue.
    if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
      auto [It, Inserted] = SeenPHIs.insert(Phi);
      if (!Inserted)
        continue; // Already visited: its incoming values are queued, and a
                  // PHI cycle by itself introduces no new null source.

      for (auto &Inc : Phi->incoming_values())
        WorkList.push_back(Inc.get());
      continue;
    }

    // For all other things, use KnownBits.
    // We either use 0 or all bits set to indicate null, so check whether the
    // value can be zero or all ones.
    auto SrcPtrKB =
        computeKnownBits(Cur, *DL).trunc(DL->getPointerSizeInBits(SrcAS));
    const auto NullVal = TM->getNullPointerValue(SrcAS);
    assert((NullVal == 0 || NullVal == -1) &&
           "don't know how to check for this null value!");
    if (NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero())
      continue;

    // Value is unknown so we can't lower.
    return false;
  }

  // All leaves proven non-null: replace the cast with the intrinsic.
  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
  return true;
}
2095+
20162096
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
20172097
switch (I.getIntrinsicID()) {
20182098
case Intrinsic::bitreverse:
@@ -2196,6 +2276,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
21962276
return false;
21972277

21982278
const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2279+
Impl.TM = &TM;
21992280
Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
22002281
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
22012282
Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -2214,6 +2295,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
22142295
AMDGPUCodeGenPrepareImpl Impl;
22152296
Impl.Mod = F.getParent();
22162297
Impl.DL = &Impl.Mod->getDataLayout();
2298+
Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
22172299
Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
22182300
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
22192301
Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2247,10 +2247,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22472247
MachineIRBuilder &B) const {
22482248
MachineFunction &MF = B.getMF();
22492249

2250+
// MI can either be a G_ADDRSPACE_CAST or a
2251+
// G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2252+
assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2253+
(isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2254+
Intrinsic::amdgcn_addrspacecast_nonnull));
2255+
22502256
const LLT S32 = LLT::scalar(32);
22512257
Register Dst = MI.getOperand(0).getReg();
2252-
Register Src = MI.getOperand(1).getReg();
2253-
2258+
Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2259+
: MI.getOperand(1).getReg();
22542260
LLT DstTy = MRI.getType(Dst);
22552261
LLT SrcTy = MRI.getType(Src);
22562262
unsigned DestAS = DstTy.getAddressSpace();
@@ -2263,6 +2269,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22632269
const AMDGPUTargetMachine &TM
22642270
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
22652271

2272+
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2273+
// G_ADDRSPACE_CAST we need to guess.
2274+
const bool IsKnownNonNull =
2275+
isa<GIntrinsic>(MI) ? true : isKnownNonNull(Src, MRI, TM, SrcAS);
2276+
22662277
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
22672278
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
22682279
return true;
@@ -2271,7 +2282,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22712282
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
22722283
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
22732284
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2274-
if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2285+
if (IsKnownNonNull) {
22752286
// Extract low 32-bits of the pointer.
22762287
B.buildExtract(Dst, Src, 0);
22772288
MI.eraseFromParent();
@@ -2308,7 +2319,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
23082319
// avoid the ptrtoint?
23092320
auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
23102321

2311-
if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2322+
if (IsKnownNonNull) {
23122323
B.buildCopy(Dst, BuildPtr);
23132324
MI.eraseFromParent();
23142325
return true;
@@ -7020,6 +7031,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
70207031

70217032
return false;
70227033
}
7034+
case Intrinsic::amdgcn_addrspacecast_nonnull:
7035+
return legalizeAddrSpaceCast(MI, MRI, B);
70237036
case Intrinsic::amdgcn_make_buffer_rsrc:
70247037
return legalizePointerAsRsrcIntrin(MI, MRI, B);
70257038
case Intrinsic::amdgcn_kernarg_segment_ptr:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1415,6 +1415,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14151415
}
14161416
}
14171417

1418+
void SITargetLowering::CollectTargetIntrinsicOperands(
1419+
const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1420+
switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1421+
case Intrinsic::amdgcn_addrspacecast_nonnull: {
1422+
// The DAG's ValueType loses the addrspaces.
1423+
// Add them as 2 extra Constant operands "from" and "to".
1424+
unsigned SrcAS =
1425+
I.getOperand(0)->getType()->getScalarType()->getPointerAddressSpace();
1426+
unsigned DstAS = I.getType()->getScalarType()->getPointerAddressSpace();
1427+
Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1428+
Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1429+
break;
1430+
}
1431+
default:
1432+
break;
1433+
}
1434+
}
1435+
14181436
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
14191437
SmallVectorImpl<Value*> &Ops,
14201438
Type *&AccessTy) const {
@@ -6635,24 +6653,37 @@ static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
66356653
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
66366654
SelectionDAG &DAG) const {
66376655
SDLoc SL(Op);
6638-
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
6639-
6640-
SDValue Src = ASC->getOperand(0);
6641-
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6642-
unsigned SrcAS = ASC->getSrcAddressSpace();
66436656

66446657
const AMDGPUTargetMachine &TM =
66456658
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
66466659

6660+
unsigned DestAS, SrcAS;
6661+
SDValue Src;
6662+
bool KnownNonNull;
6663+
if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6664+
SrcAS = ASC->getSrcAddressSpace();
6665+
Src = ASC->getOperand(0);
6666+
DestAS = ASC->getDestAddressSpace();
6667+
KnownNonNull = isKnownNonNull(Op, DAG, TM, SrcAS);
6668+
} else {
6669+
assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6670+
Op.getConstantOperandVal(0) ==
6671+
Intrinsic::amdgcn_addrspacecast_nonnull);
6672+
Src = Op->getOperand(1);
6673+
SrcAS = Op->getConstantOperandVal(2);
6674+
DestAS = Op->getConstantOperandVal(3);
6675+
KnownNonNull = true;
6676+
}
6677+
6678+
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6679+
66476680
// flat -> local/private
66486681
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6649-
unsigned DestAS = ASC->getDestAddressSpace();
6650-
66516682
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
66526683
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
66536684
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
66546685

6655-
if (isKnownNonNull(Src, DAG, TM, SrcAS))
6686+
if (KnownNonNull)
66566687
return Ptr;
66576688

66586689
unsigned NullVal = TM.getNullPointerValue(DestAS);
@@ -6665,16 +6696,16 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
66656696
}
66666697

66676698
// local/private -> flat
6668-
if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
6699+
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
66696700
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
66706701
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
66716702

6672-
SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
6703+
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
66736704
SDValue CvtPtr =
66746705
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
66756706
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
66766707

6677-
if (isKnownNonNull(Src, DAG, TM, SrcAS))
6708+
if (KnownNonNull)
66786709
return CvtPtr;
66796710

66806711
unsigned NullVal = TM.getNullPointerValue(SrcAS);
@@ -6697,7 +6728,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
66976728
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
66986729
}
66996730

6700-
if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6731+
if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
67016732
Src.getValueType() == MVT::i64)
67026733
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
67036734

@@ -6708,7 +6739,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
67086739
MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
67096740
DAG.getContext()->diagnose(InvalidAddrSpaceCast);
67106741

6711-
return DAG.getUNDEF(ASC->getValueType(0));
6742+
return DAG.getUNDEF(Op->getValueType(0));
67126743
}
67136744

67146745
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
@@ -8325,6 +8356,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
83258356
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
83268357
IndexKeyi32, Op.getOperand(7)});
83278358
}
8359+
case Intrinsic::amdgcn_addrspacecast_nonnull:
8360+
return lowerADDRSPACECAST(Op, DAG);
83288361
default:
83298362
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
83308363
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
305305
MachineFunction &MF,
306306
unsigned IntrinsicID) const override;
307307

308+
void CollectTargetIntrinsicOperands(const CallInst &I,
309+
SmallVectorImpl<SDValue> &Ops,
310+
SelectionDAG &DAG) const override;
311+
308312
bool getAddrModeArguments(IntrinsicInst * /*I*/,
309313
SmallVectorImpl<Value*> &/*Ops*/,
310314
Type *&/*AccessTy*/) const override;

0 commit comments

Comments
 (0)