@@ -4568,6 +4568,71 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 }
 }
 
+// This function returns true if the loop:
+//  1. Has a valid cost, and
+//  2. Has a cost within the supplied budget.
+// Otherwise it returns false.
+static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
+                                   InstructionCost Budget,
+                                   unsigned *FinalSize) {
+  // Estimate the size of the loop.
+  InstructionCost LoopCost = 0;
+
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+
+      LoopCost += Cost;
+      if (LoopCost > Budget)
+        return false;
+    }
+  }
+
+  if (FinalSize)
+    *FinalSize = *LoopCost.getValue();
+  return true;
+}
+
+static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                      AArch64TTIImpl &TTI) {
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Make sure the loop size is <= 5.
+  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
+    return false;
+
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
+  if (Blocks.size() != 2)
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  return true;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use if the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4582,43 +4647,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (!L->isInnermost() || L->getNumBlocks() > 8)
     return;
 
+  // Loops with multiple exits are handled by common code.
+  if (!L->getExitBlock())
+    return;
+
   const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
   if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
       (SE.getSmallConstantMaxTripCount(L) > 0 &&
        SE.getSmallConstantMaxTripCount(L) <= 32))
     return;
-  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
-    return;
-
-  int64_t Size = 0;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
-        return;
-      SmallVector<const Value *, 4> Operands(I.operand_values());
-      Size +=
-          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
-    }
-  }
 
-  // Small search loops with multiple exits can be highly beneficial to unroll.
-  if (!L->getExitBlock()) {
-    if (L->getNumBlocks() == 2 && Size < 6 &&
-        all_of(
-            L->getBlocks(),
-            [](BasicBlock *BB) {
-              return isa<BranchInst>(BB->getTerminator());
-            })) {
-      UP.RuntimeUnrollMultiExit = true;
-      UP.Runtime = true;
-      // Limit unroll count.
-      UP.DefaultUnrollRuntimeCount = 4;
-      // Allow slightly more costly trip-count expansion to catch search loops
-      // with pointer inductions.
-      UP.SCEVExpansionBudget = 5;
-    }
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
     return;
-  }
 
   if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
     return;
@@ -4630,7 +4670,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // dependencies, to expose more parallel memory access streams.
   BasicBlock *Header = L->getHeader();
   if (Header == L->getLoopLatch()) {
-    if (Size > 8)
+    // Estimate the size of the loop.
+    unsigned Size;
+    if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
       return;
 
     SmallPtrSet<Value *, 8> LoadedValues;
@@ -4727,6 +4769,25 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
+  // Scan the loop: don't unroll loops with calls as this could prevent
+  // inlining. Don't unroll vector loops either, as they don't benefit much from
+  // unrolling.
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      // Don't unroll vectorised loop.
+      if (I.getType()->isVectorTy())
+        return;
+
+      if (isa<CallBase>(I)) {
+        if (isa<CallInst>(I) || isa<InvokeInst>(I))
+          if (const Function *F = cast<CallBase>(I).getCalledFunction())
+            if (!isLoweredToCall(F))
+              continue;
+        return;
+      }
+    }
+  }
+
   // Apply subtarget-specific unrolling preferences.
   switch (ST->getProcFamily()) {
   case AArch64Subtarget::AppleA14:
@@ -4743,23 +4804,17 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
-  // Scan the loop: don't unroll loops with calls as this could prevent
-  // inlining. Don't unroll vector loops either, as they don't benefit much from
-  // unrolling.
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
-        return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
-    }
+  // If this is a small, multi-exit loop similar to something like std::find,
+  // then there is typically a performance improvement achieved by unrolling.
+  if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = 4;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+    return;
   }
 
   // Enable runtime unrolling for in-order models
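For reference, here is a minimal sketch (not part of the patch; the function name is
made up for illustration) of the kind of std::find-style search loop the new
shouldUnrollMultiExitLoop heuristic targets: an innermost loop with an unknown trip
count and exactly two exiting blocks, each ending in a conditional branch.

    // Exit 1: the bounds check (It != Last). Exit 2: the match check (*It == Value).
    // With this change, such a loop becomes a candidate for runtime unrolling on
    // AArch64 (unroll count 4, SCEV expansion budget 5).
    const int *find_value(const int *First, const int *Last, int Value) {
      for (const int *It = First; It != Last; ++It)
        if (*It == Value)
          return It;
      return Last;
    }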