Skip to content

Commit a11bf9a

Browse files
committed
[AMDGPU][Inliner] Remove amdgpu-inline and add a new TTI inline hook
Having a custom inliner doesn't really fit in with the new pass manager's (PM's) pipeline. It's also extra technical debt.

amdgpu-inline only does a couple of custom things compared to the normal inliner:
1) It disables inlining if the number of basic blocks (BBs) in a function would exceed some limit
2) It increases the threshold if there are pointers to private arrays(?)

These can all be handled as TTI inliner hooks. There already exists a hook for backends to multiply the inlining threshold.

This way we can remove the custom amdgpu-inline pass.

This caused inline-hint.ll to fail, and after some investigation, it looks like getInliningThresholdMultiplier() was previously getting applied twice in amdgpu-inline (https://reviews.llvm.org/D62707 fixed it not applying at all, so some later inliner change must have fixed something), so I had to change the threshold in the test.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D94153
1 parent ba9b4ea commit a11bf9a

File tree

18 files changed

+92
-212
lines changed

18 files changed

+92
-212
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,9 @@ class TargetTransformInfo {
289289
/// individual classes of instructions would be better.
290290
unsigned getInliningThresholdMultiplier() const;
291291

292+
/// \returns A value to be added to the inlining threshold.
293+
unsigned adjustInliningThreshold(const CallBase *CB) const;
294+
292295
/// \returns Vector bonus in percent.
293296
///
294297
/// Vector bonuses: We want to more aggressively inline vector-dense kernels
@@ -1395,6 +1398,7 @@ class TargetTransformInfo::Concept {
13951398
ArrayRef<const Value *> Operands,
13961399
TTI::TargetCostKind CostKind) = 0;
13971400
virtual unsigned getInliningThresholdMultiplier() = 0;
1401+
virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
13981402
virtual int getInlinerVectorBonusPercent() = 0;
13991403
virtual int getMemcpyCost(const Instruction *I) = 0;
14001404
virtual unsigned
@@ -1679,6 +1683,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
16791683
unsigned getInliningThresholdMultiplier() override {
16801684
return Impl.getInliningThresholdMultiplier();
16811685
}
1686+
unsigned adjustInliningThreshold(const CallBase *CB) override {
1687+
return Impl.adjustInliningThreshold(CB);
1688+
}
16821689
int getInlinerVectorBonusPercent() override {
16831690
return Impl.getInlinerVectorBonusPercent();
16841691
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class TargetTransformInfoImplBase {
6767
}
6868

6969
unsigned getInliningThresholdMultiplier() const { return 1; }
70+
unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
7071

7172
int getInlinerVectorBonusPercent() const { return 150; }
7273

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
401401
}
402402

403403
unsigned getInliningThresholdMultiplier() { return 1; }
404+
unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
404405

405406
int getInlinerVectorBonusPercent() { return 150; }
406407

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,6 +1580,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
15801580
// Finally, take the target-specific inlining threshold multiplier into
15811581
// account.
15821582
Threshold *= TTI.getInliningThresholdMultiplier();
1583+
Threshold += TTI.adjustInliningThreshold(&Call);
15831584

15841585
SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
15851586
VectorBonus = Threshold * VectorBonusPercent / 100;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
247247
return TTIImpl->getInliningThresholdMultiplier();
248248
}
249249

250+
unsigned
251+
TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {
252+
return TTIImpl->adjustInliningThreshold(CB);
253+
}
254+
250255
int TargetTransformInfo::getInlinerVectorBonusPercent() const {
251256
return TTIImpl->getInlinerVectorBonusPercent();
252257
}

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,9 +327,6 @@ void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
327327

328328
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
329329

330-
Pass *createAMDGPUFunctionInliningPass();
331-
void initializeAMDGPUInlinerPass(PassRegistry&);
332-
333330
ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
334331
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
335332
extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;

llvm/lib/Target/AMDGPU/AMDGPUInline.cpp

Lines changed: 0 additions & 195 deletions
This file was deleted.

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
255255
initializeAMDGPUExternalAAWrapperPass(*PR);
256256
initializeAMDGPUUseNativeCallsPass(*PR);
257257
initializeAMDGPUSimplifyLibCallsPass(*PR);
258-
initializeAMDGPUInlinerPass(*PR);
259258
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
260259
initializeGCNRegBankReassignPass(*PR);
261260
initializeGCNNSAReassignPass(*PR);
@@ -423,7 +422,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
423422

424423
if (EnableFunctionCalls) {
425424
delete Builder.Inliner;
426-
Builder.Inliner = createAMDGPUFunctionInliningPass();
425+
Builder.Inliner = createFunctionInliningPass();
427426
}
428427

429428
Builder.addExtension(

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,24 @@ static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
5656
cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
5757
cl::init(32), cl::Hidden);
5858

59+
static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
60+
cl::Hidden, cl::init(4000),
61+
cl::desc("Cost of alloca argument"));
62+
63+
// If the amount of scratch memory to eliminate exceeds our ability to allocate
64+
// it into registers we gain nothing by aggressively inlining functions for that
65+
// heuristic.
66+
static cl::opt<unsigned>
67+
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
68+
cl::init(256),
69+
cl::desc("Maximum alloca size to use for inline cost"));
70+
71+
// Inliner constraint to achieve reasonable compilation time.
72+
static cl::opt<size_t> InlineMaxBB(
73+
"amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
74+
cl::desc("Maximum number of BBs allowed in a function after inlining"
75+
" (compile time constraint)"));
76+
5977
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
6078
unsigned Depth = 0) {
6179
const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -1120,7 +1138,47 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
11201138
// no way to support merge for backend defined attributes.
11211139
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
11221140
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1123-
return CallerMode.isInlineCompatible(CalleeMode);
1141+
if (!CallerMode.isInlineCompatible(CalleeMode))
1142+
return false;
1143+
1144+
// Hack to make compile times reasonable.
1145+
if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
1146+
// Single BB does not increase total BB amount, thus subtract 1.
1147+
size_t BBSize = Caller->size() + Callee->size() - 1;
1148+
return BBSize <= InlineMaxBB;
1149+
}
1150+
1151+
return true;
1152+
}
1153+
1154+
unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1155+
// If we have a pointer to private array passed into a function
1156+
// it will not be optimized out, leaving scratch usage.
1157+
// Increase the inline threshold to allow inlining in this case.
1158+
uint64_t AllocaSize = 0;
1159+
SmallPtrSet<const AllocaInst *, 8> AIVisited;
1160+
for (Value *PtrArg : CB->args()) {
1161+
PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1162+
if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1163+
Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1164+
continue;
1165+
1166+
PtrArg = getUnderlyingObject(PtrArg);
1167+
if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1168+
if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1169+
continue;
1170+
AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1171+
// If the amount of stack memory is excessive we will not be able
1172+
// to get rid of the scratch anyway, bail out.
1173+
if (AllocaSize > ArgAllocaCutoff) {
1174+
AllocaSize = 0;
1175+
break;
1176+
}
1177+
}
1178+
}
1179+
if (AllocaSize)
1180+
return ArgAllocaCost;
1181+
return 0;
11241182
}
11251183

11261184
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
203203
const Function *Callee) const;
204204

205205
unsigned getInliningThresholdMultiplier() { return 11; }
206+
unsigned adjustInliningThreshold(const CallBase *CB) const;
206207

207208
int getInlinerVectorBonusPercent() { return 0; }
208209

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ add_llvm_target(AMDGPUCodeGen
8686
AMDGPUTargetTransformInfo.cpp
8787
AMDGPUUnifyDivergentExitNodes.cpp
8888
AMDGPUUnifyMetadata.cpp
89-
AMDGPUInline.cpp
9089
AMDGPUPerfHintAnalysis.cpp
9190
AMDILCFGStructurizer.cpp
9291
AMDGPUPrintfRuntimeBinding.cpp

llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
22
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
3+
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
4+
; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -passes='default<O3>' -S < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
35

46
define coldcc float @foo(float %x, float %y) {
57
entry:

llvm/test/CodeGen/AMDGPU/inline-maxbb.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
2-
; RUN: opt -mtriple=amdgcn-- --amdgpu-inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
1+
; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
2+
; RUN: opt -mtriple=amdgcn-- -inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
3+
; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=2 %s | FileCheck %s --check-prefix=NOINL
4+
; RUN: opt -mtriple=amdgcn-- -passes=inline -S -amdgpu-inline-max-bb=3 %s | FileCheck %s --check-prefix=INL
35

46
define i32 @callee(i32 %x) {
57
entry:

0 commit comments

Comments
 (0)