@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4237,6 +4242,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loops.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4412,22 +4481,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
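
For context, the kind of loop the new heuristic is after is a small search loop with a pointer induction and two exiting blocks. A hypothetical sketch (the function and names are invented for illustration, not taken from the patch):

// Hypothetical multi-exit search loop: exactly two exiting blocks (the
// bounds check and the match check), a pointer induction, and a trip count
// that is unknown at compile time but has a symbolic SCEV expression, so
// shouldUnrollSmallMultiExitLoop() could consider it for runtime unrolling.
const char *find_byte(const char *P, const char *End, char C) {
  while (P != End) { // exit 1: reached the end of the buffer
    if (*P == C)     // exit 2: found the byte we are searching for
      return P;
    ++P;
  }
  return nullptr;
}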
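Since SmallMultiExitLoopUF is initialized to 0 and gates the new path, the behaviour is off by default. Assuming the usual cl::opt plumbing, a non-zero factor could be passed with something like opt -mtriple=aarch64 -passes=loop-unroll -small-multi-exit-loop-unroll-factor=4 in.ll, or from clang via -mllvm -small-multi-exit-loop-unroll-factor=4 (illustrative invocations, not taken from the commit).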