Skip to content

Commit 475e0fb

Browse files
committed
[X86] Don't always seperate conditions in (br (and/or cond0, cond1)) into seperate branches
It makes sense to split if the cost of computing `cond1` is high (proportionally to how likely `cond0` is), but it doesn't really make sense to introduce a second branch if its only a few instructions. Splitting can also get in the way of potentially folding patterns. This patch introduces some logic to try to check if the cost of computing `cond1` is relatively low, and if so don't split the branches. Modest improvement on clang bootstrap build: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=cycles Average stage2-O3: 0.59% Improvement (cycles) Average stage2-O0-g: 1.20% Improvement (cycles) Likewise on llvm-test-suite on SKX saw a net 0.84% improvement (cycles) There is also a modest compile time improvement with this patch: https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=instructions%3Au Note that the stage2 instruction count increases is expected, this patch trades instructions for decreasing branch-misses (which is proportionately lower): https://llvm-compile-time-tracker.com/compare.php?from=79ce933114e46c891a5632f7ad4a004b93a5b808&to=978278eabc0bafe2f390ca8fcdad24154f954020&stat=branch-misses NB: This will also likely help for APX targets with the new `CCMP` and `CTEST` instructions.
1 parent f6c200a commit 475e0fb

28 files changed

+917
-740
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,39 @@ class TargetLoweringBase {
596596
/// avoided.
597597
bool isJumpExpensive() const { return JumpIsExpensive; }
598598

599+
// Costs paramateres used by
600+
// SelectionDAGBuilder::shouldKeepJumpConditionsTogether.
601+
// shouldKeepJumpConditionsTogether will use these parameter value to
602+
// determine if two conditions in the form `br (and/or cond1, cond2)` should
603+
// be split into two branches or left as one.
604+
//
605+
// BaseCost is the cost threshold (in latency). If the estimated latency of
606+
// computing both `cond1` and `cond2` is below the cost of just computing
607+
// `cond1` + BaseCost, the two conditions will be kept together. Otherwise
608+
// they will be split.
609+
//
610+
// LikelyBias increases BaseCost if branch probability info indicates that it
611+
// is likely that both `cond1` and `cond2` will be computed.
612+
//
613+
// UnlikelyBias decreases BaseCost if branch probability info indicates that
614+
// it is likely that both `cond1` and `cond2` will be computed.
615+
//
616+
// Set any field to -1 to make it ignored (setting BaseCost to -1 results in
617+
// `shouldKeepJumpConditionsTogether` always returning false).
618+
struct CondMergingParams {
619+
int BaseCost;
620+
int LikelyBias;
621+
int UnlikelyBias;
622+
};
623+
// Return params for deciding if we should keep two branch conditions merged
624+
// or split them into two seperate branches.
625+
virtual CondMergingParams
626+
getJumpConditionMergingParams(Instruction::BinaryOps, const Value *,
627+
const Value *) const {
628+
// -1 will always result in splitting.
629+
return {-1, -1, -1};
630+
}
631+
599632
/// Return true if selects are only cheaper than branches if the branch is
600633
/// unlikely to be predicted right.
601634
bool isPredictableSelectExpensive() const {

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "llvm/Analysis/Loads.h"
2727
#include "llvm/Analysis/MemoryLocation.h"
2828
#include "llvm/Analysis/TargetLibraryInfo.h"
29+
#include "llvm/Analysis/TargetTransformInfo.h"
2930
#include "llvm/Analysis/ValueTracking.h"
3031
#include "llvm/Analysis/VectorUtils.h"
3132
#include "llvm/CodeGen/Analysis.h"
@@ -93,6 +94,7 @@
9394
#include "llvm/Support/CommandLine.h"
9495
#include "llvm/Support/Compiler.h"
9596
#include "llvm/Support/Debug.h"
97+
#include "llvm/Support/InstructionCost.h"
9698
#include "llvm/Support/MathExtras.h"
9799
#include "llvm/Support/raw_ostream.h"
98100
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -2433,6 +2435,147 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
24332435
SL->SwitchCases.push_back(CB);
24342436
}
24352437

2438+
// Collect dependencies on V recursively. This is used for the cost analysis in
2439+
// `shouldKeepJumpConditionsTogether`.
2440+
static bool
2441+
collectInstructionDeps(SmallPtrSet<const Instruction *, 8> *Deps,
2442+
const Value *V,
2443+
SmallPtrSet<const Instruction *, 8> *Necessary = nullptr,
2444+
unsigned Depth = 0) {
2445+
// Return false if we have an incomplete count.
2446+
if (Depth >= SelectionDAG::MaxRecursionDepth)
2447+
return false;
2448+
2449+
auto *I = dyn_cast<Instruction>(V);
2450+
if (I == nullptr)
2451+
return true;
2452+
2453+
if (Necessary != nullptr) {
2454+
// This instruction is necessary for the other side of the condition so
2455+
// don't count it.
2456+
if (Necessary->contains(I))
2457+
return true;
2458+
}
2459+
2460+
// Already added this dep.
2461+
if (!Deps->insert(I).second)
2462+
return true;
2463+
2464+
for (unsigned OpIdx = 0, E = I->getNumOperands(); OpIdx < E; ++OpIdx)
2465+
if (!collectInstructionDeps(Deps, I->getOperand(OpIdx), Necessary,
2466+
Depth + 1))
2467+
return false;
2468+
return true;
2469+
}
2470+
2471+
bool SelectionDAGBuilder::shouldKeepJumpConditionsTogether(
2472+
const FunctionLoweringInfo &FuncInfo, const BranchInst &I,
2473+
Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs,
2474+
TargetLoweringBase::CondMergingParams Params) const {
2475+
if (I.getNumSuccessors() != 2)
2476+
return false;
2477+
2478+
if (Params.BaseCost < 0)
2479+
return false;
2480+
2481+
// Baseline cost.
2482+
InstructionCost CostThresh = Params.BaseCost;
2483+
2484+
BranchProbabilityInfo *BPI = nullptr;
2485+
if (Params.LikelyBias || Params.UnlikelyBias)
2486+
BPI = FuncInfo.BPI;
2487+
if (BPI != nullptr) {
2488+
// See if we are either likely to get an early out or compute both lhs/rhs
2489+
// of the condition.
2490+
BasicBlock *IfFalse = I.getSuccessor(0);
2491+
BasicBlock *IfTrue = I.getSuccessor(1);
2492+
2493+
std::optional<bool> Likely;
2494+
if (BPI->isEdgeHot(I.getParent(), IfTrue))
2495+
Likely = true;
2496+
else if (BPI->isEdgeHot(I.getParent(), IfFalse))
2497+
Likely = false;
2498+
2499+
if (Likely) {
2500+
if (Opc == (*Likely ? Instruction::And : Instruction::Or))
2501+
// Its likely we will have to compute both lhs and rhs of condition
2502+
CostThresh += Params.LikelyBias;
2503+
else {
2504+
if (Params.UnlikelyBias < 0)
2505+
return false;
2506+
// Its likely we will get an early out.
2507+
CostThresh -= Params.UnlikelyBias;
2508+
}
2509+
}
2510+
}
2511+
2512+
if (CostThresh <= 0)
2513+
return false;
2514+
2515+
// Collect "all" instructions that lhs condition is dependent on.
2516+
SmallPtrSet<const Instruction *, 8> LhsDeps, RhsDeps;
2517+
collectInstructionDeps(&LhsDeps, Lhs);
2518+
// Collect "all" instructions that rhs condition is dependent on AND are
2519+
// dependencies of lhs. This gives us an estimate on which instructions we
2520+
// stand to save by splitting the condition.
2521+
if (!collectInstructionDeps(&RhsDeps, Rhs, &LhsDeps))
2522+
return false;
2523+
// Add the compare instruction itself unless its a dependency on the LHS.
2524+
if (const auto *RhsI = dyn_cast<Instruction>(Rhs))
2525+
if (!LhsDeps.contains(RhsI))
2526+
RhsDeps.insert(RhsI);
2527+
2528+
const auto &TLI = DAG.getTargetLoweringInfo();
2529+
const auto &TTI =
2530+
TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
2531+
2532+
InstructionCost CostOfIncluding = 0;
2533+
// See if this instruction will need to computed independently of whether RHS
2534+
// is.
2535+
auto ShouldCountInsn = [&RhsDeps](const Instruction *Ins) {
2536+
for (const auto *U : Ins->users()) {
2537+
// If user is independent of RHS calculation we don't need to count it.
2538+
if (auto *UIns = dyn_cast<Instruction>(U))
2539+
if (!RhsDeps.contains(UIns))
2540+
return false;
2541+
}
2542+
return true;
2543+
};
2544+
2545+
// Prune instructions from RHS Deps that are dependencies of unrelated
2546+
// instructions. The value (SelectionDAG::MaxRecursionDepth) is fairly
2547+
// arbitrary and just meant to cap the how much time we spend in the pruning
2548+
// loop. Its highly unlikely to come into affect.
2549+
const unsigned MaxPruneIters = SelectionDAG::MaxRecursionDepth;
2550+
// Stop after a certain point. No incorrectness from including too many
2551+
// instructions.
2552+
for (unsigned PruneIters = 0; PruneIters < MaxPruneIters; ++PruneIters) {
2553+
const Instruction *ToDrop = nullptr;
2554+
for (const auto *Ins : RhsDeps) {
2555+
if (!ShouldCountInsn(Ins)) {
2556+
ToDrop = Ins;
2557+
break;
2558+
}
2559+
}
2560+
if (ToDrop == nullptr)
2561+
break;
2562+
RhsDeps.erase(ToDrop);
2563+
}
2564+
2565+
for (const auto *Ins : RhsDeps) {
2566+
// Finally accumulate latency that we can only attribute to computing the
2567+
// RHS condition. Use latency because we are essentially trying to calculate
2568+
// the cost of the dependency chain.
2569+
// Possible TODO: We could try to estimate ILP and make this more precise.
2570+
CostOfIncluding +=
2571+
TTI.getInstructionCost(Ins, TargetTransformInfo::TCK_Latency);
2572+
2573+
if (CostOfIncluding > CostThresh)
2574+
return false;
2575+
}
2576+
return true;
2577+
}
2578+
24362579
void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
24372580
MachineBasicBlock *TBB,
24382581
MachineBasicBlock *FBB,
@@ -2647,8 +2790,13 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
26472790
else if (match(BOp, m_LogicalOr(m_Value(BOp0), m_Value(BOp1))))
26482791
Opcode = Instruction::Or;
26492792

2650-
if (Opcode && !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2651-
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
2793+
if (Opcode &&
2794+
!(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
2795+
match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value()))) &&
2796+
!shouldKeepJumpConditionsTogether(
2797+
FuncInfo, I, Opcode, BOp0, BOp1,
2798+
DAG.getTargetLoweringInfo().getJumpConditionMergingParams(
2799+
Opcode, BOp0, BOp1))) {
26522800
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode,
26532801
getEdgeProbability(BrMBB, Succ0MBB),
26542802
getEdgeProbability(BrMBB, Succ1MBB),

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ class SelectionDAGBuilder {
385385
N = NewN;
386386
}
387387

388+
bool shouldKeepJumpConditionsTogether(
389+
const FunctionLoweringInfo &FuncInfo, const BranchInst &I,
390+
Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs,
391+
TargetLoweringBase::CondMergingParams Params) const;
392+
388393
void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
389394
MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
390395
MachineBasicBlock *SwitchBB,

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,37 @@ static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
7777
"alignment set by x86-experimental-pref-loop-alignment."),
7878
cl::Hidden);
7979

80+
static cl::opt<int> BrMergingBaseCostThresh(
81+
"x86-br-merging-base-cost", cl::init(1),
82+
cl::desc(
83+
"Sets the cost threshold for when multiple conditionals will be merged "
84+
"into one branch versus be split in multiple branches. Merging "
85+
"conditionals saves branches at the cost of additional instructions. "
86+
"This value sets the instruction cost limit, below which conditionals "
87+
"will be merged, and above which conditionals will be split."),
88+
cl::Hidden);
89+
90+
static cl::opt<int> BrMergingLikelyBias(
91+
"x86-br-merging-likely-bias", cl::init(0),
92+
cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
93+
"that all conditionals will be executed. For example for merging "
94+
"the conditionals (a == b && c > d), if its known that a == b is "
95+
"likely, then it is likely that if the conditionals are split "
96+
"both sides will be executed, so it may be desirable to increase "
97+
"the instruction cost threshold."),
98+
cl::Hidden);
99+
100+
static cl::opt<int> BrMergingUnlikelyBias(
101+
"x86-br-merging-unlikely-bias", cl::init(1),
102+
cl::desc(
103+
"Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
104+
"that all conditionals will be executed. For example for merging "
105+
"the conditionals (a == b && c > d), if its known that a == b is "
106+
"unlikely, then it is unlikely that if the conditionals are split "
107+
"both sides will be executed, so it may be desirable to decrease "
108+
"the instruction cost threshold."),
109+
cl::Hidden);
110+
80111
static cl::opt<bool> MulConstantOptimization(
81112
"mul-constant-optimization", cl::init(true),
82113
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -3339,6 +3370,24 @@ unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
33393370
return ISD::SRL;
33403371
}
33413372

3373+
TargetLoweringBase::CondMergingParams
3374+
X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
3375+
const Value *Lhs,
3376+
const Value *Rhs) const {
3377+
using namespace llvm::PatternMatch;
3378+
int BaseCost = BrMergingBaseCostThresh.getValue();
3379+
// a == b && a == c is a fast pattern on x86.
3380+
ICmpInst::Predicate Pred;
3381+
if (BaseCost >= 0 && Opc == Instruction::And &&
3382+
match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3383+
Pred == ICmpInst::ICMP_EQ &&
3384+
match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3385+
Pred == ICmpInst::ICMP_EQ)
3386+
BaseCost += 1;
3387+
return {BaseCost, BrMergingLikelyBias.getValue(),
3388+
BrMergingUnlikelyBias.getValue()};
3389+
}
3390+
33423391
bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
33433392
return N->getOpcode() != ISD::FP_EXTEND;
33443393
}

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,10 @@ namespace llvm {
11501150

11511151
bool preferScalarizeSplat(SDNode *N) const override;
11521152

1153+
CondMergingParams
1154+
getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
1155+
const Value *Rhs) const override;
1156+
11531157
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
11541158
CombineLevel Level) const override;
11551159

llvm/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@ define i1 @loadAndRLEsource_no_exit_2E_1_label_2E_0(i32 %tmp.21.reload, i32 %tmp
1818
; CHECK-NEXT: movl _block, %esi
1919
; CHECK-NEXT: movb %al, 1(%esi,%edx)
2020
; CHECK-NEXT: cmpl %ecx, _last
21-
; CHECK-NEXT: jge LBB0_3
22-
; CHECK-NEXT: ## %bb.1: ## %label.0
21+
; CHECK-NEXT: setl %cl
2322
; CHECK-NEXT: cmpl $257, %eax ## imm = 0x101
24-
; CHECK-NEXT: je LBB0_3
25-
; CHECK-NEXT: ## %bb.2: ## %label.0.no_exit.1_crit_edge.exitStub
23+
; CHECK-NEXT: setne %al
24+
; CHECK-NEXT: testb %al, %cl
25+
; CHECK-NEXT: je LBB0_2
26+
; CHECK-NEXT: ## %bb.1: ## %label.0.no_exit.1_crit_edge.exitStub
2627
; CHECK-NEXT: movb $1, %al
2728
; CHECK-NEXT: popl %esi
2829
; CHECK-NEXT: retl
29-
; CHECK-NEXT: LBB0_3: ## %codeRepl5.exitStub
30+
; CHECK-NEXT: LBB0_2: ## %codeRepl5.exitStub
3031
; CHECK-NEXT: xorl %eax, %eax
3132
; CHECK-NEXT: popl %esi
3233
; CHECK-NEXT: retl

0 commit comments

Comments
 (0)