
Commit 9ebd338

[AArch64] Add flag to control unrolling for small multi-exit loops
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example is code that uses std::find, as provided by libc++, to search for a value. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this may change in newer releases, where the compiler is expected to either vectorise or unroll the loop itself.

This patch adds a new flag, -small-multi-exit-loop-unroll-factor, that controls the amount of unrolling for such loops. It is currently off by default; in a future patch I plan to enable it for some targets, together with details of any performance improvements.
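
For illustration, a minimal sketch of the kind of loop this flag targets (not part of the patch; the function and variable names below are made up): a small search loop with two exits, one taken when the value is found and one when the end of the range is reached, which is roughly what a call to std::find reduces to after inlining.

// Hedged example, not from the commit: a small two-exit search loop.
// After loop rotation this typically becomes two basic blocks, each ending in
// a conditional branch that can leave the loop (end of range reached, or value
// found), with a trip count that is unknown at compile time.
const int *find_value(const int *First, const int *Last, int Value) {
  for (const int *It = First; It != Last; ++It) { // exit 1: end of the range
    if (*It == Value)                             // exit 2: value found
      return It;
  }
  return Last;
}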
1 parent 5c65a32 commit 9ebd338

2 files changed: +793 -10 lines changed
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 80 additions & 10 deletions
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4375,6 +4380,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use if the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4550,22 +4619,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
 
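
As a usage sketch (assumed, not shown in this commit), the new hidden cl::opt can be passed to opt directly or forwarded from clang with -mllvm; the unroll factor of 4 and the file names below are purely illustrative.

opt -mtriple=aarch64 -passes=loop-unroll -small-multi-exit-loop-unroll-factor=4 -S in.ll -o out.ll
clang --target=aarch64-linux-gnu -O3 -mllvm -small-multi-exit-loop-unroll-factor=4 -c find.cpp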
