@@ -4528,6 +4528,95 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loops.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallBase>(I)) {
+    if (isa<CallInst>(I) || isa<InvokeInst>(I))
+      if (const Function *F = cast<CallBase>(I).getCalledFunction())
+        if (!TTI.isLoweredToCall(F))
+          return true;
+    return false;
+  }
+
+  return true;
+}
+
+// This function returns true if the loop:
+//  1. Contains only instructions that should be unrolled,
+//  2. Has a valid cost, and
+//  3. Has a cost within the supplied budget.
+// Otherwise it returns false.
+static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
+                                   InstructionCost Budget,
+                                   unsigned *FinalSize) {
+  // Estimate the size of the loop.
+  InstructionCost LoopCost = 0;
+
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return false;
+
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+
+      LoopCost += Cost;
+      if (LoopCost > Budget)
+        return false;
+    }
+  }
+
+  if (FinalSize)
+    *FinalSize = *LoopCost.getValue();
+  return true;
+}
+
+static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                      AArch64TTIImpl &TTI) {
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
+    return false;
+
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
+  if (Blocks.size() != 2)
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  return true;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
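As a point of reference, here is a hypothetical source loop (not part of this patch) of the shape shouldUnrollMultiExitLoop targets: after lowering it becomes a two-block loop in which both blocks end in conditional branches, one exiting early on a match and one exiting when the end of the range is reached, with a trip count that is unknown at compile time.

int findKey(const int *Data, int N, int Key) {
  for (int I = 0; I != N; ++I)
    if (Data[I] == Key) // early exit: key found
      return I;
  return -1; // reached via the latch exit: key not found
}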
@@ -4542,43 +4631,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (!L->isInnermost() || L->getNumBlocks() > 8)
     return;
 
+  // Multi-exit loops are handled by the common code in
+  // getUnrollingPreferences.
+  if (!L->getExitBlock())
+    return;
+
   const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
   if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
       (SE.getSmallConstantMaxTripCount(L) > 0 &&
        SE.getSmallConstantMaxTripCount(L) <= 32))
     return;
-  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
-    return;
 
-  int64_t Size = 0;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
-        return;
-      SmallVector<const Value *, 4> Operands(I.operand_values());
-      Size +=
-          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
-    }
-  }
-
-  // Small search loops with multiple exits can be highly beneficial to unroll.
-  if (!L->getExitBlock()) {
-    if (L->getNumBlocks() == 2 && Size < 6 &&
-        all_of(
-            L->getBlocks(),
-            [](BasicBlock *BB) {
-              return isa<BranchInst>(BB->getTerminator());
-            })) {
-      UP.RuntimeUnrollMultiExit = true;
-      UP.Runtime = true;
-      // Limit unroll count.
-      UP.DefaultUnrollRuntimeCount = 4;
-      // Allow slightly more costly trip-count expansion to catch search loops
-      // with pointer inductions.
-      UP.SCEVExpansionBudget = 5;
-    }
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
     return;
-  }
 
   if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
     return;
@@ -4590,7 +4654,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // dependencies, to expose more parallel memory access streams.
   BasicBlock *Header = L->getHeader();
   if (Header == L->getLoopLatch()) {
-    if (Size > 8)
+    // Estimate the size of the loop.
+    unsigned Size;
+    if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
       return;
 
     SmallPtrSet<Value *, 8> LoadedValues;
@@ -4703,22 +4769,24 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = 4;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+    return;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
 
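For illustration, assuming the hypothetical findKey loop shown earlier, this is a rough sketch of the shape produced by runtime unrolling with DefaultUnrollRuntimeCount = 4. RuntimeUnrollMultiExit lets each unrolled copy keep its own early exit, and a remainder loop covers trip counts that are not a multiple of 4.

int findKeyUnrolled(const int *Data, int N, int Key) {
  int I = 0;
  // Main unrolled body: four iterations per trip, each with its own exit.
  for (; I + 4 <= N; I += 4) {
    if (Data[I] == Key)     return I;
    if (Data[I + 1] == Key) return I + 1;
    if (Data[I + 2] == Key) return I + 2;
    if (Data[I + 3] == Key) return I + 3;
  }
  // Remainder loop for the final zero to three iterations.
  for (; I != N; ++I)
    if (Data[I] == Key)
      return I;
  return -1;
}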