
Commit 64c9960

[AArch64] Add flag to control unrolling for small multi-exit loops
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array, for example code that uses the libc++ implementation of std::find. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases, where the compiler is expected to either vectorise or unroll the loop itself. This patch adds a new flag, -small-multi-exit-loop-unroll-factor, that controls the amount of unrolling for such loops. It is currently off by default, but in a future patch I plan to enable it for some targets, along with details of any performance improvements.
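
For illustration, the kind of loop this targets looks roughly like the sketch below (my own example, not code from this patch; the function name find_value is made up). It is a small two-block loop with two exits, one taken when the value is found and one when the end of the range is reached, which is about what std::find reduces to after inlining:

// Sketch only, not part of the patch: a small search loop with two exiting
// branches, the shape the new heuristic is looking for.
const int *find_value(const int *first, const int *last, int value) {
  for (; first != last; ++first) // exit 1: end of range reached
    if (*first == value)         // exit 2: value found
      return first;
  return last;
}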
1 parent f6b1b91 commit 64c9960

File tree

2 files changed: +793 -10 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 80 additions & 10 deletions
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4237,6 +4242,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use if the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4412,22 +4481,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }

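Since the new option is a hidden cl::opt rather than a driver flag, it is reached through the usual backend plumbing. One possible way to exercise it, sketched here with an arbitrary unroll factor of 4 and made-up file names:

# Forward the backend option from clang when building for an AArch64 target.
clang++ -O3 --target=aarch64-linux-gnu -S -mllvm -small-multi-exit-loop-unroll-factor=4 find.cpp

# Or pass it to opt directly, together with an AArch64 triple so the
# AArch64 unrolling preferences are consulted by the loop-unroll pass.
opt -mtriple=aarch64-linux-gnu -passes=loop-unroll -small-multi-exit-loop-unroll-factor=4 -S find.ll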