Skip to content

Commit 712c213

Browse files
authored
[AArch64] Enable unrolling for small multi-exit loops (#131998)
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be code that uses std::find to locate a value, as in libc++. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases, where the compiler is instead expected to either vectorise the loop or unroll it itself.
1 parent 08e080e commit 712c213

File tree

3 files changed

+1020
-167
lines changed

3 files changed

+1020
-167
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 103 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -4568,6 +4568,71 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
45684568
}
45694569
}
45704570

4571+
// This function returns true if the loop:
4572+
// 1. Has a valid cost, and
4573+
// 2. Has a cost within the supplied budget.
4574+
// Otherwise it returns false.
4575+
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
4576+
InstructionCost Budget,
4577+
unsigned *FinalSize) {
4578+
// Estimate the size of the loop.
4579+
InstructionCost LoopCost = 0;
4580+
4581+
for (auto *BB : L->getBlocks()) {
4582+
for (auto &I : *BB) {
4583+
SmallVector<const Value *, 4> Operands(I.operand_values());
4584+
InstructionCost Cost =
4585+
TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4586+
// This can happen with intrinsics that don't currently have a cost model
4587+
// or for some operations that require SVE.
4588+
if (!Cost.isValid())
4589+
return false;
4590+
4591+
LoopCost += Cost;
4592+
if (LoopCost > Budget)
4593+
return false;
4594+
}
4595+
}
4596+
4597+
if (FinalSize)
4598+
*FinalSize = *LoopCost.getValue();
4599+
return true;
4600+
}
4601+
4602+
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4603+
AArch64TTIImpl &TTI) {
4604+
// Only consider loops with unknown trip counts for which we can determine
4605+
// a symbolic expression. Multi-exit loops with small known trip counts will
4606+
// likely be unrolled anyway.
4607+
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4608+
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
4609+
return false;
4610+
4611+
// It might not be worth unrolling loops with low max trip counts. Restrict
4612+
// this to max trip counts > 32 for now.
4613+
unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4614+
if (MaxTC > 0 && MaxTC <= 32)
4615+
return false;
4616+
4617+
// Make sure the loop size is <= 5.
4618+
if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
4619+
return false;
4620+
4621+
// Small search loops with multiple exits can be highly beneficial to unroll.
4622+
// We only care about loops with exactly two exiting blocks, although each
4623+
// block could jump to the same exit block.
4624+
ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4625+
if (Blocks.size() != 2)
4626+
return false;
4627+
4628+
if (any_of(Blocks, [](BasicBlock *BB) {
4629+
return !isa<BranchInst>(BB->getTerminator());
4630+
}))
4631+
return false;
4632+
4633+
return true;
4634+
}
4635+
45714636
/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
45724637
/// OOO engine's wide instruction window and various predictors.
45734638
static void
@@ -4582,43 +4647,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
45824647
if (!L->isInnermost() || L->getNumBlocks() > 8)
45834648
return;
45844649

4650+
// Loops with multiple exits are handled by common code.
4651+
if (!L->getExitBlock())
4652+
return;
4653+
45854654
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
45864655
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
45874656
(SE.getSmallConstantMaxTripCount(L) > 0 &&
45884657
SE.getSmallConstantMaxTripCount(L) <= 32))
45894658
return;
4590-
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4591-
return;
4592-
4593-
int64_t Size = 0;
4594-
for (auto *BB : L->getBlocks()) {
4595-
for (auto &I : *BB) {
4596-
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4597-
return;
4598-
SmallVector<const Value *, 4> Operands(I.operand_values());
4599-
Size +=
4600-
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4601-
}
4602-
}
46034659

4604-
// Small search loops with multiple exits can be highly beneficial to unroll.
4605-
if (!L->getExitBlock()) {
4606-
if (L->getNumBlocks() == 2 && Size < 6 &&
4607-
all_of(
4608-
L->getBlocks(),
4609-
[](BasicBlock *BB) {
4610-
return isa<BranchInst>(BB->getTerminator());
4611-
})) {
4612-
UP.RuntimeUnrollMultiExit = true;
4613-
UP.Runtime = true;
4614-
// Limit unroll count.
4615-
UP.DefaultUnrollRuntimeCount = 4;
4616-
// Allow slightly more costly trip-count expansion to catch search loops
4617-
// with pointer inductions.
4618-
UP.SCEVExpansionBudget = 5;
4619-
}
4660+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
46204661
return;
4621-
}
46224662

46234663
if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
46244664
return;
@@ -4630,7 +4670,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
46304670
// dependencies, to expose more parallel memory access streams.
46314671
BasicBlock *Header = L->getHeader();
46324672
if (Header == L->getLoopLatch()) {
4633-
if (Size > 8)
4673+
// Estimate the size of the loop.
4674+
unsigned Size;
4675+
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
46344676
return;
46354677

46364678
SmallPtrSet<Value *, 8> LoadedValues;
@@ -4727,6 +4769,25 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
47274769
// Disable partial & runtime unrolling on -Os.
47284770
UP.PartialOptSizeThreshold = 0;
47294771

4772+
// Scan the loop: don't unroll loops with calls as this could prevent
4773+
// inlining. Don't unroll vector loops either, as they don't benefit much from
4774+
// unrolling.
4775+
for (auto *BB : L->getBlocks()) {
4776+
for (auto &I : *BB) {
4777+
// Don't unroll vectorised loop.
4778+
if (I.getType()->isVectorTy())
4779+
return;
4780+
4781+
if (isa<CallBase>(I)) {
4782+
if (isa<CallInst>(I) || isa<InvokeInst>(I))
4783+
if (const Function *F = cast<CallBase>(I).getCalledFunction())
4784+
if (!isLoweredToCall(F))
4785+
continue;
4786+
return;
4787+
}
4788+
}
4789+
}
4790+
47304791
// Apply subtarget-specific unrolling preferences.
47314792
switch (ST->getProcFamily()) {
47324793
case AArch64Subtarget::AppleA14:
@@ -4743,23 +4804,17 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
47434804
break;
47444805
}
47454806

4746-
// Scan the loop: don't unroll loops with calls as this could prevent
4747-
// inlining. Don't unroll vector loops either, as they don't benefit much from
4748-
// unrolling.
4749-
for (auto *BB : L->getBlocks()) {
4750-
for (auto &I : *BB) {
4751-
// Don't unroll vectorised loop.
4752-
if (I.getType()->isVectorTy())
4753-
return;
4754-
4755-
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4756-
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4757-
if (!isLoweredToCall(F))
4758-
continue;
4759-
}
4760-
return;
4761-
}
4762-
}
4807+
// If this is a small, multi-exit loop similar to something like std::find,
4808+
// then there is typically a performance improvement achieved by unrolling.
4809+
if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
4810+
UP.RuntimeUnrollMultiExit = true;
4811+
UP.Runtime = true;
4812+
// Limit unroll count.
4813+
UP.DefaultUnrollRuntimeCount = 4;
4814+
// Allow slightly more costly trip-count expansion to catch search loops
4815+
// with pointer inductions.
4816+
UP.SCEVExpansionBudget = 5;
4817+
return;
47634818
}
47644819

47654820
// Enable runtime unrolling for in-order models

0 commit comments

Comments
 (0)