@@ -179,8 +179,8 @@ class SelectOptimize : public FunctionPass {
179
179
// For a given source instruction, collect its backwards dependence slice
180
180
// consisting of instructions exclusively computed for producing the operands
181
181
// of the source instruction.
182
- void getExclBackwardsSlice (Instruction *I,
183
- SmallVector<Instruction *, 2 > &Slice );
182
+ void getExclBackwardsSlice (Instruction *I, std::stack<Instruction *> &Slice,
183
+ bool ForSinking = false );
184
184
185
185
// Returns true if the condition of the select is highly predictable.
186
186
bool isSelectHighlyPredictable (const SelectInst *SI);
@@ -329,6 +329,10 @@ getTrueOrFalseValue(SelectInst *SI, bool isTrue,
329
329
330
330
void SelectOptimize::convertProfitableSIGroups (SelectGroups &ProfSIGroups) {
331
331
for (SelectGroup &ASI : ProfSIGroups) {
332
+ // The code transformation here is a modified version of the sinking
333
+ // transformation in CodeGenPrepare::optimizeSelectInst with a more
334
+ // aggressive strategy of which instructions to sink.
335
+ //
332
336
// TODO: eliminate the redundancy of logic transforming selects to branches
333
337
// by removing CodeGenPrepare::optimizeSelectInst and optimizing here
334
338
// selects for all cases (with and without profile information).
@@ -342,13 +346,73 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
342
346
// start:
343
347
// %cmp = cmp uge i32 %a, %b
344
348
// %cmp.frozen = freeze %cmp
345
- // br i1 %cmp.frozen, label %select.end, label %select.false
349
+ // br i1 %cmp.frozen, label %select.true, label %select.false
350
+ // select.true:
351
+ // br label %select.end
346
352
// select.false:
347
353
// br label %select.end
348
354
// select.end:
349
- // %sel = phi i32 [ %c, %start ], [ %d, %select.false ]
355
+ // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
350
356
//
351
357
// %cmp should be frozen, otherwise it may introduce undefined behavior.
358
+ // In addition, we may sink instructions that produce %c or %d into the
359
+ // destination(s) of the new branch.
360
+ // If the true or false blocks do not contain a sunken instruction, that
361
+ // block and its branch may be optimized away. In that case, one side of the
362
+ // first branch will point directly to select.end, and the corresponding PHI
363
+ // predecessor block will be the start block.
364
+
365
+ // Find all the instructions that can be soundly sunk to the true/false
366
+ // blocks. These are instructions that are computed solely for producing the
367
+ // operands of the select instructions in the group and can be sunk without
368
+ // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
369
+ // with side effects).
370
+ SmallVector<std::stack<Instruction *>, 2 > TrueSlices, FalseSlices;
371
+ typedef std::stack<Instruction *>::size_type StackSizeType;
372
+ StackSizeType maxTrueSliceLen = 0 , maxFalseSliceLen = 0 ;
373
+ for (SelectInst *SI : ASI) {
374
+ // For each select, compute the sinkable dependence chains of the true and
375
+ // false operands.
376
+ if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue ())) {
377
+ std::stack<Instruction *> TrueSlice;
378
+ getExclBackwardsSlice (TI, TrueSlice, true );
379
+ maxTrueSliceLen = std::max (maxTrueSliceLen, TrueSlice.size ());
380
+ TrueSlices.push_back (TrueSlice);
381
+ }
382
+ if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue ())) {
383
+ std::stack<Instruction *> FalseSlice;
384
+ getExclBackwardsSlice (FI, FalseSlice, true );
385
+ maxFalseSliceLen = std::max (maxFalseSliceLen, FalseSlice.size ());
386
+ FalseSlices.push_back (FalseSlice);
387
+ }
388
+ }
389
+ // In the case of multiple select instructions in the same group, the order
390
+ // of non-dependent instructions (instructions of different dependence
391
+ // slices) in the true/false blocks appears to affect performance.
392
+ // Interleaving the slices seems to experimentally be the optimal approach.
393
+ // This interleaving scheduling allows for more ILP (with a natural downside
394
+ // of slightly increasing register pressure) compared to a simple ordering of
395
+ // one whole chain after another. One would expect that this ordering would
396
+ // not matter since the scheduling in the backend of the compiler would
397
+ // take care of it, but apparently the scheduler fails to deliver optimal
398
+ // ILP with a naive ordering here.
399
+ SmallVector<Instruction *, 2 > TrueSlicesInterleaved, FalseSlicesInterleaved;
400
+ for (StackSizeType IS = 0 ; IS < maxTrueSliceLen; ++IS) {
401
+ for (auto &S : TrueSlices) {
402
+ if (!S.empty ()) {
403
+ TrueSlicesInterleaved.push_back (S.top ());
404
+ S.pop ();
405
+ }
406
+ }
407
+ }
408
+ for (StackSizeType IS = 0 ; IS < maxFalseSliceLen; ++IS) {
409
+ for (auto &S : FalseSlices) {
410
+ if (!S.empty ()) {
411
+ FalseSlicesInterleaved.push_back (S.top ());
412
+ S.pop ();
413
+ }
414
+ }
415
+ }
352
416
353
417
// We split the block containing the select(s) into two blocks.
354
418
SelectInst *SI = ASI.front ();
@@ -374,24 +438,55 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
374
438
}
375
439
376
440
// These are the new basic blocks for the conditional branch.
377
- // For now, no instruction sinking to the true/false blocks.
378
- // Thus both True and False blocks will be empty.
441
+ // At least one will become an actual new basic block.
379
442
BasicBlock *TrueBlock = nullptr , *FalseBlock = nullptr ;
380
-
381
- // Use the 'false' side for a new input value to the PHI.
382
- FalseBlock = BasicBlock::Create (SI->getContext (), " select.false" ,
383
- EndBlock->getParent (), EndBlock);
384
- auto *FalseBranch = BranchInst::Create (EndBlock, FalseBlock);
385
- FalseBranch->setDebugLoc (SI->getDebugLoc ());
386
-
387
- // For the 'true' side the path originates from the start block from the
388
- // point view of the new PHI.
389
- TrueBlock = StartBlock;
443
+ BranchInst *TrueBranch = nullptr , *FalseBranch = nullptr ;
444
+ if (!TrueSlicesInterleaved.empty ()) {
445
+ TrueBlock = BasicBlock::Create (LastSI->getContext (), " select.true.sink" ,
446
+ EndBlock->getParent (), EndBlock);
447
+ TrueBranch = BranchInst::Create (EndBlock, TrueBlock);
448
+ TrueBranch->setDebugLoc (LastSI->getDebugLoc ());
449
+ for (Instruction *TrueInst : TrueSlicesInterleaved)
450
+ TrueInst->moveBefore (TrueBranch);
451
+ }
452
+ if (!FalseSlicesInterleaved.empty ()) {
453
+ FalseBlock = BasicBlock::Create (LastSI->getContext (), " select.false.sink" ,
454
+ EndBlock->getParent (), EndBlock);
455
+ FalseBranch = BranchInst::Create (EndBlock, FalseBlock);
456
+ FalseBranch->setDebugLoc (LastSI->getDebugLoc ());
457
+ for (Instruction *FalseInst : FalseSlicesInterleaved)
458
+ FalseInst->moveBefore (FalseBranch);
459
+ }
460
+ // If there was nothing to sink, then arbitrarily choose the 'false' side
461
+ // for a new input value to the PHI.
462
+ if (TrueBlock == FalseBlock) {
463
+ assert (TrueBlock == nullptr &&
464
+ " Unexpected basic block transform while optimizing select" );
465
+
466
+ FalseBlock = BasicBlock::Create (SI->getContext (), " select.false" ,
467
+ EndBlock->getParent (), EndBlock);
468
+ auto *FalseBranch = BranchInst::Create (EndBlock, FalseBlock);
469
+ FalseBranch->setDebugLoc (SI->getDebugLoc ());
470
+ }
390
471
391
472
// Insert the real conditional branch based on the original condition.
473
+ // If we did not create a new block for one of the 'true' or 'false' paths
474
+ // of the condition, it means that side of the branch goes to the end block
475
+ // directly and the path originates from the start block from the point of
476
+ // view of the new PHI.
392
477
BasicBlock *TT, *FT;
393
- TT = EndBlock;
394
- FT = FalseBlock;
478
+ if (TrueBlock == nullptr ) {
479
+ TT = EndBlock;
480
+ FT = FalseBlock;
481
+ TrueBlock = StartBlock;
482
+ } else if (FalseBlock == nullptr ) {
483
+ TT = TrueBlock;
484
+ FT = EndBlock;
485
+ FalseBlock = StartBlock;
486
+ } else {
487
+ TT = TrueBlock;
488
+ FT = FalseBlock;
489
+ }
395
490
IRBuilder<> IB (SI);
396
491
auto *CondFr =
397
492
IB.CreateFreeze (SI->getCondition (), SI->getName () + " .frozen" );
@@ -586,12 +681,13 @@ bool SelectOptimize::hasExpensiveColdOperand(
586
681
HotWeight = TrueWeight;
587
682
}
588
683
if (ColdI) {
589
- SmallVector <Instruction *, 2 > ColdSlice;
684
+ std::stack <Instruction *> ColdSlice;
590
685
getExclBackwardsSlice (ColdI, ColdSlice);
591
686
InstructionCost SliceCost = 0 ;
592
- for (auto *ColdII : ColdSlice) {
593
- SliceCost +=
594
- TTI->getInstructionCost (ColdII, TargetTransformInfo::TCK_Latency);
687
+ while (!ColdSlice.empty ()) {
688
+ SliceCost += TTI->getInstructionCost (ColdSlice.top (),
689
+ TargetTransformInfo::TCK_Latency);
690
+ ColdSlice.pop ();
595
691
}
596
692
// The colder the cold value operand of the select is the more expensive
597
693
// the cmov becomes for computing the cold value operand every time. Thus,
@@ -613,8 +709,9 @@ bool SelectOptimize::hasExpensiveColdOperand(
613
709
// (sufficiently-accurate in practice), we populate this set with the
614
710
// instructions of the backwards dependence slice that only have one-use and
615
711
// form a one-use chain that leads to the source instruction.
616
- void SelectOptimize::getExclBackwardsSlice (
617
- Instruction *I, SmallVector<Instruction *, 2 > &Slice) {
712
+ void SelectOptimize::getExclBackwardsSlice (Instruction *I,
713
+ std::stack<Instruction *> &Slice,
714
+ bool ForSinking) {
618
715
SmallPtrSet<Instruction *, 2 > Visited;
619
716
std::queue<Instruction *> Worklist;
620
717
Worklist.push (I);
@@ -630,13 +727,20 @@ void SelectOptimize::getExclBackwardsSlice(
630
727
if (!II->hasOneUse ())
631
728
continue ;
632
729
730
+ // Cannot soundly sink instructions with side-effects.
731
+ // Terminator or phi instructions cannot be sunk.
732
+ // Avoid sinking other select instructions (should be handled separately).
733
+ if (ForSinking && (II->isTerminator () || II->mayHaveSideEffects () ||
734
+ isa<SelectInst>(II) || isa<PHINode>(II)))
735
+ continue ;
736
+
633
737
// Avoid considering instructions with less frequency than the source
634
738
// instruction (i.e., avoid colder code regions of the dependence slice).
635
739
if (BFI->getBlockFreq (II->getParent ()) < BFI->getBlockFreq (I->getParent ()))
636
740
continue ;
637
741
638
742
// Eligible one-use instruction added to the dependence slice.
639
- Slice.push_back (II);
743
+ Slice.push (II);
640
744
641
745
// Explore all the operands of the current instruction to expand the slice.
642
746
for (unsigned k = 0 ; k < II->getNumOperands (); ++k)
0 commit comments