Skip to content

Commit 67be40d

Browse files
committed
Recommit "[SelectOpti][5/5] Optimize select-to-branch transformation"
Use container::size_type directly to avoid a type mismatch that caused build failures on Windows. Original commit message: This patch optimizes the transformation of selects to a branch when the heuristics deem it profitable. It aggressively sinks eligible instructions to the newly created true/false blocks to prevent their execution on the common path, and interleaves dependence slices to maximize ILP. Depends on D120232. Reviewed By: davidxl. Differential Revision: https://reviews.llvm.org/D120233
1 parent 03095bd commit 67be40d

File tree

2 files changed

+155
-43
lines changed

2 files changed

+155
-43
lines changed

llvm/lib/CodeGen/SelectOptimize.cpp

Lines changed: 129 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,8 @@ class SelectOptimize : public FunctionPass {
179179
// For a given source instruction, collect its backwards dependence slice
180180
// consisting of instructions exclusively computed for producing the operands
181181
// of the source instruction.
182-
void getExclBackwardsSlice(Instruction *I,
183-
SmallVector<Instruction *, 2> &Slice);
182+
void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
183+
bool ForSinking = false);
184184

185185
// Returns true if the condition of the select is highly predictable.
186186
bool isSelectHighlyPredictable(const SelectInst *SI);
@@ -329,6 +329,10 @@ getTrueOrFalseValue(SelectInst *SI, bool isTrue,
329329

330330
void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
331331
for (SelectGroup &ASI : ProfSIGroups) {
332+
// The code transformation here is a modified version of the sinking
333+
// transformation in CodeGenPrepare::optimizeSelectInst with a more
334+
// aggressive strategy of which instructions to sink.
335+
//
332336
// TODO: eliminate the redundancy of logic transforming selects to branches
333337
// by removing CodeGenPrepare::optimizeSelectInst and optimizing here
334338
// selects for all cases (with and without profile information).
@@ -342,13 +346,73 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
342346
// start:
343347
// %cmp = cmp uge i32 %a, %b
344348
// %cmp.frozen = freeze %cmp
345-
// br i1 %cmp.frozen, label %select.end, label %select.false
349+
// br i1 %cmp.frozen, label %select.true, label %select.false
350+
// select.true:
351+
// br label %select.end
346352
// select.false:
347353
// br label %select.end
348354
// select.end:
349-
// %sel = phi i32 [ %c, %start ], [ %d, %select.false ]
355+
// %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
350356
//
351357
// %cmp should be frozen, otherwise it may introduce undefined behavior.
358+
// In addition, we may sink instructions that produce %c or %d into the
359+
// destination(s) of the new branch.
360+
// If the true or false blocks do not contain a sunken instruction, that
361+
// block and its branch may be optimized away. In that case, one side of the
362+
// first branch will point directly to select.end, and the corresponding PHI
363+
// predecessor block will be the start block.
364+
365+
// Find all the instructions that can be soundly sunk to the true/false
366+
// blocks. These are instructions that are computed solely for producing the
367+
// operands of the select instructions in the group and can be sunk without
368+
// breaking the semantics of the LLVM IR (e.g., cannot sink instructions
369+
// with side effects).
370+
SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
371+
typedef std::stack<Instruction *>::size_type StackSizeType;
372+
StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
373+
for (SelectInst *SI : ASI) {
374+
// For each select, compute the sinkable dependence chains of the true and
375+
// false operands.
376+
if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
377+
std::stack<Instruction *> TrueSlice;
378+
getExclBackwardsSlice(TI, TrueSlice, true);
379+
maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
380+
TrueSlices.push_back(TrueSlice);
381+
}
382+
if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
383+
std::stack<Instruction *> FalseSlice;
384+
getExclBackwardsSlice(FI, FalseSlice, true);
385+
maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
386+
FalseSlices.push_back(FalseSlice);
387+
}
388+
}
389+
// In the case of multiple select instructions in the same group, the order
390+
// of non-dependent instructions (instructions of different dependence
391+
// slices) in the true/false blocks appears to affect performance.
392+
// Interleaving the slices seems to experimentally be the optimal approach.
393+
// This interleaving scheduling allows for more ILP (with a natural downside
394+
// of slightly increasing register pressure) compared to a simple ordering of
395+
// one whole chain after another. One would expect that this ordering would
396+
// not matter since the scheduling in the backend of the compiler would
397+
// take care of it, but apparently the scheduler fails to deliver optimal
398+
// ILP with a naive ordering here.
399+
SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
400+
for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
401+
for (auto &S : TrueSlices) {
402+
if (!S.empty()) {
403+
TrueSlicesInterleaved.push_back(S.top());
404+
S.pop();
405+
}
406+
}
407+
}
408+
for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
409+
for (auto &S : FalseSlices) {
410+
if (!S.empty()) {
411+
FalseSlicesInterleaved.push_back(S.top());
412+
S.pop();
413+
}
414+
}
415+
}
352416

353417
// We split the block containing the select(s) into two blocks.
354418
SelectInst *SI = ASI.front();
@@ -374,24 +438,55 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
374438
}
375439

376440
// These are the new basic blocks for the conditional branch.
377-
// For now, no instruction sinking to the true/false blocks.
378-
// Thus both True and False blocks will be empty.
441+
// At least one will become an actual new basic block.
379442
BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
380-
381-
// Use the 'false' side for a new input value to the PHI.
382-
FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
383-
EndBlock->getParent(), EndBlock);
384-
auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
385-
FalseBranch->setDebugLoc(SI->getDebugLoc());
386-
387-
// For the 'true' side the path originates from the start block from the
388-
// point view of the new PHI.
389-
TrueBlock = StartBlock;
443+
BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
444+
if (!TrueSlicesInterleaved.empty()) {
445+
TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
446+
EndBlock->getParent(), EndBlock);
447+
TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
448+
TrueBranch->setDebugLoc(LastSI->getDebugLoc());
449+
for (Instruction *TrueInst : TrueSlicesInterleaved)
450+
TrueInst->moveBefore(TrueBranch);
451+
}
452+
if (!FalseSlicesInterleaved.empty()) {
453+
FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
454+
EndBlock->getParent(), EndBlock);
455+
FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
456+
FalseBranch->setDebugLoc(LastSI->getDebugLoc());
457+
for (Instruction *FalseInst : FalseSlicesInterleaved)
458+
FalseInst->moveBefore(FalseBranch);
459+
}
460+
// If there was nothing to sink, then arbitrarily choose the 'false' side
461+
// for a new input value to the PHI.
462+
if (TrueBlock == FalseBlock) {
463+
assert(TrueBlock == nullptr &&
464+
"Unexpected basic block transform while optimizing select");
465+
466+
FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
467+
EndBlock->getParent(), EndBlock);
468+
auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
469+
FalseBranch->setDebugLoc(SI->getDebugLoc());
470+
}
390471

391472
// Insert the real conditional branch based on the original condition.
473+
// If we did not create a new block for one of the 'true' or 'false' paths
474+
// of the condition, it means that side of the branch goes to the end block
475+
// directly and the path originates from the start block from the point of
476+
// view of the new PHI.
392477
BasicBlock *TT, *FT;
393-
TT = EndBlock;
394-
FT = FalseBlock;
478+
if (TrueBlock == nullptr) {
479+
TT = EndBlock;
480+
FT = FalseBlock;
481+
TrueBlock = StartBlock;
482+
} else if (FalseBlock == nullptr) {
483+
TT = TrueBlock;
484+
FT = EndBlock;
485+
FalseBlock = StartBlock;
486+
} else {
487+
TT = TrueBlock;
488+
FT = FalseBlock;
489+
}
395490
IRBuilder<> IB(SI);
396491
auto *CondFr =
397492
IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
@@ -586,12 +681,13 @@ bool SelectOptimize::hasExpensiveColdOperand(
586681
HotWeight = TrueWeight;
587682
}
588683
if (ColdI) {
589-
SmallVector<Instruction *, 2> ColdSlice;
684+
std::stack<Instruction *> ColdSlice;
590685
getExclBackwardsSlice(ColdI, ColdSlice);
591686
InstructionCost SliceCost = 0;
592-
for (auto *ColdII : ColdSlice) {
593-
SliceCost +=
594-
TTI->getInstructionCost(ColdII, TargetTransformInfo::TCK_Latency);
687+
while (!ColdSlice.empty()) {
688+
SliceCost += TTI->getInstructionCost(ColdSlice.top(),
689+
TargetTransformInfo::TCK_Latency);
690+
ColdSlice.pop();
595691
}
596692
// The colder the cold value operand of the select is the more expensive
597693
// the cmov becomes for computing the cold value operand every time. Thus,
@@ -613,8 +709,9 @@ bool SelectOptimize::hasExpensiveColdOperand(
613709
// (sufficiently-accurate in practice), we populate this set with the
614710
// instructions of the backwards dependence slice that only have one-use and
615711
// form a one-use chain that leads to the source instruction.
616-
void SelectOptimize::getExclBackwardsSlice(
617-
Instruction *I, SmallVector<Instruction *, 2> &Slice) {
712+
void SelectOptimize::getExclBackwardsSlice(Instruction *I,
713+
std::stack<Instruction *> &Slice,
714+
bool ForSinking) {
618715
SmallPtrSet<Instruction *, 2> Visited;
619716
std::queue<Instruction *> Worklist;
620717
Worklist.push(I);
@@ -630,13 +727,20 @@ void SelectOptimize::getExclBackwardsSlice(
630727
if (!II->hasOneUse())
631728
continue;
632729

730+
// Cannot soundly sink instructions with side-effects.
731+
// Terminator or phi instructions cannot be sunk.
732+
// Avoid sinking other select instructions (should be handled separately).
733+
if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
734+
isa<SelectInst>(II) || isa<PHINode>(II)))
735+
continue;
736+
633737
// Avoid considering instructions with less frequency than the source
634738
// instruction (i.e., avoid colder code regions of the dependence slice).
635739
if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
636740
continue;
637741

638742
// Eligible one-use instruction added to the dependence slice.
639-
Slice.push_back(II);
743+
Slice.push(II);
640744

641745
// Explore all the operands of the current instruction to expand the slice.
642746
for (unsigned k = 0; k < II->getNumOperands(); ++k)

llvm/test/CodeGen/X86/select-optimize.ll

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -105,20 +105,28 @@ define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 {
105105
; If select group predictable, turn it into a branch.
106106
define i32 @weighted_select_group(i32 %a, i32 %b, i32 %c, i1 %cmp) !prof !19 {
107107
; CHECK-LABEL: @weighted_select_group(
108+
; CHECK-NEXT: [[A1:%.*]] = add i32 [[A:%.*]], 1
108109
; CHECK-NEXT: [[SEL1_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
109-
; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF16]]
110-
; CHECK: select.false:
110+
; CHECK-NEXT: br i1 [[SEL1_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF16]]
111+
; CHECK: select.true.sink:
112+
; CHECK-NEXT: [[C1:%.*]] = add i32 [[C:%.*]], 1
113+
; CHECK-NEXT: br label [[SELECT_END:%.*]]
114+
; CHECK: select.false.sink:
115+
; CHECK-NEXT: [[B1:%.*]] = add i32 [[B:%.*]], 1
111116
; CHECK-NEXT: br label [[SELECT_END]]
112117
; CHECK: select.end:
113-
; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A:%.*]], [[TMP0:%.*]] ], [ [[B:%.*]], [[SELECT_FALSE]] ]
114-
; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C:%.*]], [[TMP0]] ], [ [[A]], [[SELECT_FALSE]] ]
118+
; CHECK-NEXT: [[SEL1:%.*]] = phi i32 [ [[A1]], [[SELECT_TRUE_SINK]] ], [ [[B1]], [[SELECT_FALSE_SINK]] ]
119+
; CHECK-NEXT: [[SEL2:%.*]] = phi i32 [ [[C1]], [[SELECT_TRUE_SINK]] ], [ [[A1]], [[SELECT_FALSE_SINK]] ]
115120
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[SEL1]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26:![0-9]+]]
116121
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL1]], [[SEL2]]
117122
; CHECK-NEXT: ret i32 [[ADD]]
118123
;
119-
%sel1 = select i1 %cmp, i32 %a, i32 %b, !prof !15
124+
%a1 = add i32 %a, 1
125+
%b1 = add i32 %b, 1
126+
%c1 = add i32 %c, 1
127+
%sel1 = select i1 %cmp, i32 %a1, i32 %b1, !prof !15
120128
call void @llvm.dbg.value(metadata i32 %sel1, metadata !24, metadata !DIExpression()), !dbg !DILocation(scope: !23)
121-
%sel2 = select i1 %cmp, i32 %c, i32 %a, !prof !15
129+
%sel2 = select i1 %cmp, i32 %c1, i32 %a1, !prof !15
122130
%add = add i32 %sel1, %sel2
123131
ret i32 %add
124132
}
@@ -151,13 +159,13 @@ define i32 @select_group_intra_group(i32 %a, i32 %b, i32 %c, i1 %cmp) {
151159
; sink load
152160
define i32 @expensive_val_operand1(i32* nocapture %a, i32 %y, i1 %cmp) {
153161
; CHECK-LABEL: @expensive_val_operand1(
154-
; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
155162
; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
156-
; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
157-
; CHECK: select.false:
163+
; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
164+
; CHECK: select.true.sink:
165+
; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
158166
; CHECK-NEXT: br label [[SELECT_END]]
159167
; CHECK: select.end:
160-
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
168+
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[LOAD]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
161169
; CHECK-NEXT: ret i32 [[SEL]]
162170
;
163171
%load = load i32, i32* %a, align 8
@@ -181,14 +189,14 @@ define i32 @expensive_val_operand2(i32* nocapture %a, i32 %x, i1 %cmp) {
181189
; into a branch with a sunk dependence slice.
182190
define i32 @expensive_val_operand3(i32* nocapture %a, i32 %b, i32 %y, i1 %cmp) {
183191
; CHECK-LABEL: @expensive_val_operand3(
192+
; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
193+
; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
194+
; CHECK: select.true.sink:
184195
; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A:%.*]], align 8
185196
; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
186-
; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
187-
; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]], !prof [[PROF18]]
188-
; CHECK: select.false:
189197
; CHECK-NEXT: br label [[SELECT_END]]
190198
; CHECK: select.end:
191-
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[TMP0:%.*]] ], [ [[Y:%.*]], [[SELECT_FALSE]] ]
199+
; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
192200
; CHECK-NEXT: ret i32 [[SEL]]
193201
;
194202
%load = load i32, i32* %a, align 8
@@ -242,14 +250,14 @@ define double @cmov_on_critical_path(i32 %n, double %x, double* nocapture %a) {
242250
; CHECK-NEXT: [[X1:%.*]] = phi double [ [[X2:%.*]], [[SELECT_END]] ], [ [[X]], [[FOR_BODY_PREHEADER]] ]
243251
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
244252
; CHECK-NEXT: [[R:%.*]] = load double, double* [[ARRAYIDX]], align 8
245-
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
246253
; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt double [[X1]], [[R]]
247254
; CHECK-NEXT: [[X2_FROZEN:%.*]] = freeze i1 [[CMP2]]
248-
; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]], !prof [[PROF27:![0-9]+]]
249-
; CHECK: select.false:
255+
; CHECK-NEXT: br i1 [[X2_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END]], !prof [[PROF27:![0-9]+]]
256+
; CHECK: select.true.sink:
257+
; CHECK-NEXT: [[SUB:%.*]] = fsub double [[X1]], [[R]]
250258
; CHECK-NEXT: br label [[SELECT_END]]
251259
; CHECK: select.end:
252-
; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[FOR_BODY]] ], [ [[X1]], [[SELECT_FALSE]] ]
260+
; CHECK-NEXT: [[X2]] = phi double [ [[SUB]], [[SELECT_TRUE_SINK]] ], [ [[X1]], [[FOR_BODY]] ]
253261
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
254262
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
255263
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]

0 commit comments

Comments
 (0)