
Commit 7f15657

[AArch64] Add flag to control unrolling for small multi-exit loops
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be something that uses the libc++ implementation of std::find to search for a value. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases where the compiler is expected to either vectorise or unroll itself.

This patch adds a new flag, -small-multi-exit-loop-unroll-factor, that controls the amount of unrolling for such loops. It is currently off by default; in a future patch I plan to enable it for some targets, along with details of any performance improvements.
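For illustration, the kind of loop this targets is a simple std::find-style search, which at the IR level typically becomes a two-block loop with an exit out of each block. A hand-written sketch, not taken from the patch or its tests (the function name is made up):

// One block tests the current element and exits on a match; the other
// advances the pointer and exits once the end of the range is reached.
// With -small-multi-exit-loop-unroll-factor=N such a loop becomes a
// candidate for runtime unrolling by a factor of N.
const int *find_value(const int *first, const int *last, int value) {
  for (const int *it = first; it != last; ++it)
    if (*it == value)
      return it;
  return last;
}

Since the option is a hidden cl::opt, it would typically be passed as an internal LLVM option, e.g. -mllvm -small-multi-exit-loop-unroll-factor=4 from clang, or given directly to opt/llc.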
1 parent 4775e6d commit 7f15657

File tree: 2 files changed, +793 -10 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 80 additions & 10 deletions
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4370,6 +4375,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use if the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4545,22 +4614,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
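Conceptually, with an unroll factor of 4 the runtime-unrolled form of a find_value-style search loop ends up shaped roughly like the following. This is a hand-written, source-level sketch of multi-exit runtime unrolling, not actual compiler output:

// The main loop runs while at least four elements remain and keeps an early
// exit in each unrolled copy (this is what UP.RuntimeUnrollMultiExit allows);
// a remainder loop then handles the final zero to three elements.
const int *find_value_unrolled(const int *first, const int *last, int value) {
  const int *it = first;
  for (; last - it >= 4; it += 4) {
    if (it[0] == value) return it + 0;
    if (it[1] == value) return it + 1;
    if (it[2] == value) return it + 2;
    if (it[3] == value) return it + 3;
  }
  for (; it != last; ++it)
    if (*it == value)
      return it;
  return last;
}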
