Skip to content

Commit fe7c50a

Browse files
committed
[SimplifyCFG] Increase budget for FoldTwoEntryPHINode() if the branch
is unpredictable. The `!unpredictable` metadata has been present for a long time, but its usage in optimizations is still limited. This patch teaches `FoldTwoEntryPHINode()` to be more aggressive with an unpredictable branch to reduce mispredictions. A TTI interface `getBranchMispredictPenalty()` is added to distinguish between different hardware, ensuring we don't go too far for simpler cores. For simplicity, only a naive x86 implementation is included for the time being.
1 parent 9d2f81e commit fe7c50a

File tree

7 files changed

+124
-3
lines changed

7 files changed

+124
-3
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,11 @@ class TargetTransformInfo {
419419
/// this factor, it is very likely to be predicted correctly.
420420
BranchProbability getPredictableBranchThreshold() const;
421421

422+
// Returns an integer indicating how aggressively the target wants
423+
// unpredictable branches to be eliminated. A zero return value means extra
424+
// optimization applied to them should be minimal.
425+
unsigned getBranchMispredictPenalty() const;
426+
422427
/// Return true if branch divergence exists.
423428
///
424429
/// Branch divergence has a significantly negative impact on GPU performance
@@ -1832,6 +1837,7 @@ class TargetTransformInfo::Concept {
18321837
ArrayRef<const Value *> Operands,
18331838
TargetCostKind CostKind) = 0;
18341839
virtual BranchProbability getPredictableBranchThreshold() = 0;
1840+
virtual unsigned getBranchMispredictPenalty() = 0;
18351841
virtual bool hasBranchDivergence(const Function *F = nullptr) = 0;
18361842
virtual bool isSourceOfDivergence(const Value *V) = 0;
18371843
virtual bool isAlwaysUniform(const Value *V) = 0;
@@ -2243,6 +2249,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
22432249
BranchProbability getPredictableBranchThreshold() override {
22442250
return Impl.getPredictableBranchThreshold();
22452251
}
2252+
unsigned getBranchMispredictPenalty() override {
2253+
return Impl.getBranchMispredictPenalty();
2254+
}
22462255
bool hasBranchDivergence(const Function *F = nullptr) override {
22472256
return Impl.hasBranchDivergence(F);
22482257
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class TargetTransformInfoImplBase {
9999
return BranchProbability(99, 100);
100100
}
101101

102+
unsigned getBranchMispredictPenalty() const { return 0; }
103+
102104
bool hasBranchDivergence(const Function *F = nullptr) const { return false; }
103105

104106
bool isSourceOfDivergence(const Value *V) const { return false; }

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,10 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const {
279279
: TTIImpl->getPredictableBranchThreshold();
280280
}
281281

282+
unsigned TargetTransformInfo::getBranchMispredictPenalty() const {
283+
return TTIImpl->getBranchMispredictPenalty();
284+
}
285+
282286
bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
283287
return TTIImpl->hasBranchDivergence(F);
284288
}

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6756,3 +6756,8 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
67566756
return AM.Scale != 0;
67576757
return -1;
67586758
}
6759+
6760+
unsigned X86TTIImpl::getBranchMispredictPenalty() const {
6761+
// TODO: Hook MispredictPenalty of SchedMachineModel into this.
6762+
return 14;
6763+
}

llvm/lib/Target/X86/X86TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
294294
bool supportsEfficientVectorElementLoadStore() const;
295295
bool enableInterleavedAccessVectorization();
296296

297+
unsigned getBranchMispredictPenalty() const;
298+
297299
private:
298300
bool supportsGather() const;
299301
InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3508,7 +3508,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
35083508
// jump to one specific 'then' block (if we have two of them).
35093509
// It isn't beneficial to speculatively execute the code
35103510
// from the block that we know is predictably not entered.
3511-
if (!DomBI->getMetadata(LLVMContext::MD_unpredictable)) {
3511+
bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable);
3512+
if (!IsUnpredictable) {
35123513
uint64_t TWeight, FWeight;
35133514
if (extractBranchWeights(*DomBI, TWeight, FWeight) &&
35143515
(TWeight + FWeight) != 0) {
@@ -3549,8 +3550,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
35493550
// that need to be moved to the dominating block.
35503551
SmallPtrSet<Instruction *, 4> AggressiveInsts;
35513552
InstructionCost Cost = 0;
3552-
InstructionCost Budget =
3553-
TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
3553+
unsigned Threshold = TwoEntryPHINodeFoldingThreshold;
3554+
if (IsUnpredictable)
3555+
Threshold += TTI.getBranchMispredictPenalty();
3556+
InstructionCost Budget = Threshold * TargetTransformInfo::TCC_Basic;
35543557

35553558
bool Changed = false;
35563559
for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
2+
; Two-entry phi nodes with unpredictable conditions may get increased budget for folding.
3+
; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
4+
; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s
5+
6+
define { <2 x float>, <2 x float> } @foo(float %speed, <2 x float> %velocity.coerce0, <2 x float> %velocity.coerce1) {
7+
; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
8+
; CHECK-NOFOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) {
9+
; CHECK-NOFOLD-NEXT: [[ENTRY:.*]]:
10+
; CHECK-NOFOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
11+
; CHECK-NOFOLD-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]], !unpredictable [[META0:![0-9]+]]
12+
; CHECK-NOFOLD: [[IF_THEN]]:
13+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
14+
; CHECK-NOFOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
15+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
16+
; CHECK-NOFOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
17+
; CHECK-NOFOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
18+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
19+
; CHECK-NOFOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
20+
; CHECK-NOFOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
21+
; CHECK-NOFOLD-NEXT: [[TMP0:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
22+
; CHECK-NOFOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
23+
; CHECK-NOFOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
24+
; CHECK-NOFOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
25+
; CHECK-NOFOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
26+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
27+
; CHECK-NOFOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
28+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
29+
; CHECK-NOFOLD-NEXT: br label %[[IF_END]]
30+
; CHECK-NOFOLD: [[IF_END]]:
31+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_0_4_VEC_INSERT25]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
32+
; CHECK-NOFOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_14_8_VEC_INSERT35]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
33+
; CHECK-NOFOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
34+
; CHECK-NOFOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
35+
; CHECK-NOFOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
36+
;
37+
; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
38+
; CHECK-FOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
39+
; CHECK-FOLD-NEXT: [[ENTRY:.*:]]
40+
; CHECK-FOLD-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
41+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
42+
; CHECK-FOLD-NEXT: [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
43+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
44+
; CHECK-FOLD-NEXT: [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
45+
; CHECK-FOLD-NEXT: [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
46+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
47+
; CHECK-FOLD-NEXT: [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
48+
; CHECK-FOLD-NEXT: [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
49+
; CHECK-FOLD-NEXT: [[TMP0:%.*]] = tail call fast float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
50+
; CHECK-FOLD-NEXT: [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
51+
; CHECK-FOLD-NEXT: [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
52+
; CHECK-FOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
53+
; CHECK-FOLD-NEXT: [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
54+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
55+
; CHECK-FOLD-NEXT: [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
56+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
57+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_0_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_0_4_VEC_INSERT25]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]]
58+
; CHECK-FOLD-NEXT: [[VELOCITY_SROA_14_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_14_8_VEC_INSERT35]], <2 x float> zeroinitializer, !unpredictable [[META0]]
59+
; CHECK-FOLD-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
60+
; CHECK-FOLD-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
61+
; CHECK-FOLD-NEXT: ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
62+
;
63+
entry:
64+
%cmp = fcmp fast ogt float %speed, 0x3F747AE140000000
65+
br i1 %cmp, label %if.then, label %if.end, !unpredictable !0
66+
67+
if.then:
68+
%velocity.sroa.0.0.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 0
69+
%mul.i.i.i.i = fmul fast float %velocity.sroa.0.0.vec.extract, %velocity.sroa.0.0.vec.extract
70+
%velocity.sroa.0.4.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 1
71+
%mul8.i.i.i.i = fmul fast float %velocity.sroa.0.4.vec.extract, %velocity.sroa.0.4.vec.extract
72+
%add.i.i.i.i = fadd fast float %mul8.i.i.i.i, %mul.i.i.i.i
73+
%velocity.sroa.14.8.vec.extract = extractelement <2 x float> %velocity.coerce1, i64 0
74+
%mul13.i.i.i.i = fmul fast float %velocity.sroa.14.8.vec.extract, %velocity.sroa.14.8.vec.extract
75+
%add14.i.i.i.i = fadd fast float %add.i.i.i.i, %mul13.i.i.i.i
76+
%0 = tail call fast noundef float @llvm.sqrt.f32(float %add14.i.i.i.i)
77+
%mul.i.i.i = fdiv fast float 0x3FEFD70A40000000, %0
78+
%sub.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.0.vec.extract
79+
%1 = insertelement <2 x float> poison, float %sub.i, i64 0
80+
%sub8.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.4.vec.extract
81+
%velocity.sroa.0.4.vec.insert25 = insertelement <2 x float> %1, float %sub8.i, i64 1
82+
%sub13.i = fmul fast float %mul.i.i.i, %velocity.sroa.14.8.vec.extract
83+
%velocity.sroa.14.8.vec.insert35 = insertelement <2 x float> %velocity.coerce1, float %sub13.i, i64 0
84+
br label %if.end
85+
86+
if.end:
87+
%velocity.sroa.0.0 = phi nsz <2 x float> [ %velocity.sroa.0.4.vec.insert25, %if.then ], [ zeroinitializer, %entry ]
88+
%velocity.sroa.14.0 = phi nsz <2 x float> [ %velocity.sroa.14.8.vec.insert35, %if.then ], [ zeroinitializer, %entry ]
89+
%.fca.0.insert = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %velocity.sroa.0.0, 0
90+
%.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x float> %velocity.sroa.14.0, 1
91+
ret { <2 x float>, <2 x float> } %.fca.1.insert
92+
}
93+
94+
declare float @llvm.sqrt.f32(float)
95+
96+
!0 = !{}

0 commit comments

Comments
 (0)