Skip to content

Commit d912a9b

Browse files
committed
[AMDGPU] Tune inlining parameters for AMDGPU target
Summary: Since the target gains no significant advantage from vectorization, the vector-instruction threshold bonus should be optional. The amdgpu-inline-arg-alloca-cost parameter default value and the target InliningThresholdMultiplier value are tuned accordingly. Reviewers: arsenm, rampitec Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, eraman, hiraditya, haicheng, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64642 llvm-svn: 366348
1 parent 3fce6b5 commit d912a9b

File tree

9 files changed

+60
-19
lines changed

9 files changed

+60
-19
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,18 @@ class TargetTransformInfo {
263263
/// individual classes of instructions would be better.
264264
unsigned getInliningThresholdMultiplier() const;
265265

266+
/// \returns Vector bonus in percent.
267+
///
268+
/// Vector bonuses: We want to more aggressively inline vector-dense kernels
269+
/// and apply this bonus based on the percentage of vector instructions. A
270+
/// bonus is applied if the vector instructions exceed 50% and half that amount
271+
/// is applied if it exceeds 10%. Note that these bonuses are somewhat
272+
/// arbitrary and evolved over time by accident as much as because they are
273+
/// principled bonuses.
274+
/// FIXME: It would be nice to base the bonus values on something more
275+
/// scientific. A target may have no bonus on vector instructions.
276+
int getInlinerVectorBonusPercent() const;
277+
266278
/// Estimate the cost of an intrinsic when lowered.
267279
///
268280
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
@@ -1128,6 +1140,7 @@ class TargetTransformInfo::Concept {
11281140
virtual int getCallCost(const Function *F,
11291141
ArrayRef<const Value *> Arguments, const User *U) = 0;
11301142
virtual unsigned getInliningThresholdMultiplier() = 0;
1143+
virtual int getInlinerVectorBonusPercent() = 0;
11311144
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
11321145
ArrayRef<Type *> ParamTys, const User *U) = 0;
11331146
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
@@ -1351,6 +1364,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
13511364
unsigned getInliningThresholdMultiplier() override {
13521365
return Impl.getInliningThresholdMultiplier();
13531366
}
1367+
int getInlinerVectorBonusPercent() override {
1368+
return Impl.getInlinerVectorBonusPercent();
1369+
}
13541370
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
13551371
ArrayRef<Type *> ParamTys, const User *U = nullptr) override {
13561372
return Impl.getIntrinsicCost(IID, RetTy, ParamTys, U);

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ class TargetTransformInfoImplBase {
140140

141141
unsigned getInliningThresholdMultiplier() { return 1; }
142142

143+
int getInlinerVectorBonusPercent() { return 150; }
144+
143145
unsigned getMemcpyCost(const Instruction *I) {
144146
return TTI::TCC_Expensive;
145147
}

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
427427

428428
unsigned getInliningThresholdMultiplier() { return 1; }
429429

430+
int getInlinerVectorBonusPercent() { return 150; }
431+
430432
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
431433
TTI::UnrollingPreferences &UP) {
432434
// This unrolling functionality is target independent, but to provide some

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -880,23 +880,14 @@ void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
880880
// basic block at the given callsite context. This is speculatively applied
881881
// and withdrawn if more than one basic block is seen.
882882
//
883-
// Vector bonuses: We want to more aggressively inline vector-dense kernels
884-
// and apply this bonus based on the percentage of vector instructions. A
885-
// bonus is applied if the vector instructions exceed 50% and half that amount
886-
// is applied if it exceeds 10%. Note that these bonuses are some what
887-
// arbitrary and evolved over time by accident as much as because they are
888-
// principled bonuses.
889-
// FIXME: It would be nice to base the bonus values on something more
890-
// scientific.
891-
//
892883
// LastCallToStaticBonus: This large bonus is applied to ensure the inlining
893884
// of the last call to a static function as inlining such functions is
894885
// guaranteed to reduce code size.
895886
//
896887
// These bonus percentages may be set to 0 based on properties of the caller
897888
// and the callsite.
898889
int SingleBBBonusPercent = 50;
899-
int VectorBonusPercent = 150;
890+
int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
900891
int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
901892

902893
// Lambda to set all the above bonus and bonus percentages to 0.

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
176176
return TTIImpl->getInliningThresholdMultiplier();
177177
}
178178

179+
int TargetTransformInfo::getInlinerVectorBonusPercent() const {
180+
return TTIImpl->getInlinerVectorBonusPercent();
181+
}
182+
179183
int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
180184
ArrayRef<const Value *> Operands) const {
181185
return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);

llvm/lib/Target/AMDGPU/AMDGPUInline.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ using namespace llvm;
3939
#define DEBUG_TYPE "inline"
4040

4141
static cl::opt<int>
42-
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
42+
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
4343
cl::desc("Cost of alloca argument"));
4444

4545
// If the amount of scratch memory to eliminate exceeds our ability to allocate

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
191191
bool areInlineCompatible(const Function *Caller,
192192
const Function *Callee) const;
193193

194-
unsigned getInliningThresholdMultiplier() { return 9; }
194+
unsigned getInliningThresholdMultiplier() { return 7; }
195+
196+
int getInlinerVectorBonusPercent() { return 0; }
195197

196198
int getArithmeticReductionCost(unsigned Opcode,
197199
Type *Ty,

llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,8 @@ if.end: ; preds = %if.then, %entry
2828
define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
2929
entry:
3030
%tmp1 = load float, float addrspace(5)* %p1, align 4
31-
%cmp = fcmp ogt float %tmp1, 1.000000e+00
32-
br i1 %cmp, label %if.then, label %if.end
33-
34-
if.then: ; preds = %entry
3531
%div = fdiv float 2.000000e+00, %tmp1
3632
store float %div, float addrspace(5)* %p2, align 4
37-
br label %if.end
38-
39-
if.end: ; preds = %if.then, %entry
4033
ret void
4134
}
4235

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-inline --inline-threshold=1 < %s | FileCheck %s
2+
3+
define hidden <16 x i32> @div_vecbonus(<16 x i32> %x, <16 x i32> %y) {
4+
entry:
5+
%div.1 = udiv <16 x i32> %x, %y
6+
%div.2 = udiv <16 x i32> %div.1, %y
7+
%div.3 = udiv <16 x i32> %div.2, %y
8+
%div.4 = udiv <16 x i32> %div.3, %y
9+
%div.5 = udiv <16 x i32> %div.4, %y
10+
%div.6 = udiv <16 x i32> %div.5, %y
11+
%div.7 = udiv <16 x i32> %div.6, %y
12+
%div.8 = udiv <16 x i32> %div.7, %y
13+
%div.9 = udiv <16 x i32> %div.8, %y
14+
%div.10 = udiv <16 x i32> %div.9, %y
15+
%div.11 = udiv <16 x i32> %div.10, %y
16+
%div.12 = udiv <16 x i32> %div.11, %y
17+
ret <16 x i32> %div.12
18+
}
19+
20+
; CHECK-LABEL: define amdgpu_kernel void @caller_vecbonus
21+
; CHECK-NOT: udiv
22+
; CHECK: tail call <16 x i32> @div_vecbonus
23+
; CHECK: ret void
24+
define amdgpu_kernel void @caller_vecbonus(<16 x i32> addrspace(1)* nocapture %x, <16 x i32> addrspace(1)* nocapture readonly %y) {
25+
entry:
26+
%tmp = load <16 x i32>, <16 x i32> addrspace(1)* %x
27+
%tmp1 = load <16 x i32>, <16 x i32> addrspace(1)* %y
28+
%div.i = tail call <16 x i32> @div_vecbonus(<16 x i32> %tmp, <16 x i32> %tmp1)
29+
store <16 x i32> %div.i, <16 x i32> addrspace(1)* %x
30+
ret void
31+
}

0 commit comments

Comments
 (0)