@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);

+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
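
The new SmallMultiExitLoopUF option defaults to 0, so the heuristic added below stays disabled unless the flag is passed explicitly. A hypothetical way to exercise it in isolation; only the flag name comes from this patch, the surrounding pipeline choice is an assumption:

    # Assumed usage: run the runtime unroller standalone with the new
    # cl::opt enabled and an unroll factor of 4.
    opt -mtriple=aarch64 -passes=loop-unroll \
        -small-multi-exit-loop-unroll-factor=4 -S input.ll -o unrolled.ll
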
@@ -4375,6 +4380,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }

+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
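
For context, the loop shape shouldUnrollSmallMultiExitLoop accepts is a small search loop: exactly two exiting blocks (an early-exit test plus the latch), a backedge-taken count that is symbolic rather than a small constant, and a body whose code-size estimate stays under the threshold of 6. A hand-written C++ equivalent of such a loop; illustrative only, not taken from the patch or its tests:

    // Two exits: the early `*p == c` match exit and the `p != end` latch
    // exit. The trip count depends on `end - p`, so it is symbolic, and
    // the body is only a load, a compare, and a pointer increment.
    const char *find(const char *p, const char *end, char c) {
      for (; p != end; ++p)
        if (*p == c)
          return p;
      return nullptr;
    }
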
@@ -4550,22 +4619,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }

+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
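
To see what these preferences buy, here is a source-level sketch of the find loop above after runtime unrolling by a factor of 2 (i.e. SmallMultiExitLoopUF=2, which becomes UP.DefaultUnrollRuntimeCount). This approximates the transformation's effect; it is not the IR the unroller actually emits:

    const char *find_unrolled2(const char *p, const char *end, char c) {
      // Main unrolled body: two probes per iteration, each keeping its own
      // early exit, which is the shape UP.RuntimeUnrollMultiExit permits.
      while (end - p >= 2) {
        if (p[0] == c) return p;
        if (p[1] == c) return p + 1;
        p += 2;
      }
      // Remainder loop for a leftover element, analogous to the runtime
      // unroller's epilogue loop.
      for (; p != end; ++p)
        if (*p == c)
          return p;
      return nullptr;
    }
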