@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
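A note on the new option (editorial reading, hedged): SmallMultiExitLoopUF defaults to 0 and the guard added in the last hunk below tests it for truthiness, so the whole feature is off unless the flag is given a non-zero factor. Presumably (not shown in this commit) it can be exercised directly as opt -small-multi-exit-loop-unroll-factor=2, or from clang via -mllvm -small-multi-exit-loop-unroll-factor=2.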
@@ -4370,6 +4375,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
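As a rough illustration (this example is mine, not part of the commit), shouldUnrollSmallMultiExitLoop appears to accept loops shaped like the search loop below: two blocks, two exiting branches (the bounds check and the early match), a symbolic rather than constant backedge-taken count, and a body whose code-size cost stays under 6:

// Hypothetical source-level example of a qualifying loop; names are mine.
int find(const int *P, int N, int X) {
  for (int I = 0; I != N; ++I) // exiting block 1: trip count reached
    if (P[I] == X)             // exiting block 2: early exit on match
      return I;
  return -1;
}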
@@ -4545,22 +4614,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
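For intuition (again my sketch, assuming -small-multi-exit-loop-unroll-factor=2, not the commit's actual output), runtime multi-exit unrolling rewrites the find loop above along these lines: the body is cloned per unroll factor, each copy keeps its early exit, and a remainder loop handles any leftover iteration. The raised SCEVExpansionBudget of 5 permits the somewhat more costly trip-count expressions that pointer-induction search loops need, per the in-code comment.

// Hypothetical hand-unrolled equivalent of find() with factor 2.
int find_unrolled(const int *P, int N, int X) {
  int I = 0;
  for (; I + 1 < N; I += 2) {   // main loop handles pairs of iterations
    if (P[I] == X)              // early exit kept in copy 1
      return I;
    if (P[I + 1] == X)          // early exit kept in copy 2
      return I + 1;
  }
  for (; I < N; ++I)            // remainder loop for a leftover iteration
    if (P[I] == X)
      return I;
  return -1;
}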