Skip to content

Commit df8d20c

Browse files
committed
[AArch64] Enable unrolling for small multi-exit loops
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be something that uses std::find to find a value in libc++. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases where the compiler is expected to either vectorise or unroll itself.
1 parent f46d641 commit df8d20c

File tree

3 files changed

+1161
-57
lines changed

3 files changed

+1161
-57
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 109 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4568,6 +4568,95 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
45684568
}
45694569
}
45704570

4571+
/// Decide whether instruction \p I still permits unrolling of the loop that
/// contains it.
///
/// Returns false for vector-typed instructions and for call-like
/// instructions, except for calls/invokes whose callee is known and for which
/// \p TTI reports that the callee is not lowered to a call. Returns true for
/// every other instruction.
static bool shouldUnrollLoopWithInstruction(Instruction &I,
                                            AArch64TTIImpl &TTI) {
  // A vector-typed instruction means the loop is treated as vectorised;
  // such loops are not unrolled.
  if (I.getType()->isVectorTy())
    return false;

  // Anything that is not call-like is acceptable.
  if (!isa<CallBase>(I))
    return true;

  // Call-like instructions other than plain calls and invokes are rejected.
  if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
    return false;

  // Accept the call only when a callee function is available and TTI says
  // it is not lowered to a call; a null callee is rejected.
  const Function *Callee = cast<CallBase>(I).getCalledFunction();
  return Callee && !TTI.isLoweredToCall(Callee);
}
4587+
4588+
// This function returns true if the loop:
4589+
// 1. Contains only those instructions that should be unrolled,
4590+
// 2. Has a valid cost,
4591+
// 3. Has a cost within the supplied budget.
4592+
// Otherwise it returns false.
4593+
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
4594+
InstructionCost Budget,
4595+
unsigned *FinalSize) {
4596+
// Estimate the size of the loop.
4597+
InstructionCost LoopCost = 0;
4598+
4599+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4600+
return 0;
4601+
4602+
for (auto *BB : L->getBlocks()) {
4603+
for (auto &I : *BB) {
4604+
if (!shouldUnrollLoopWithInstruction(I, TTI))
4605+
return 0;
4606+
4607+
SmallVector<const Value *, 4> Operands(I.operand_values());
4608+
InstructionCost Cost =
4609+
TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4610+
// This can happen with intrinsics that don't currently have a cost model
4611+
// or for some operations that require SVE.
4612+
if (!Cost.isValid())
4613+
return 0;
4614+
4615+
LoopCost += Cost;
4616+
if (LoopCost > Budget)
4617+
return 0;
4618+
}
4619+
}
4620+
4621+
if (FinalSize)
4622+
*FinalSize = *LoopCost.getValue();
4623+
return true;
4624+
}
4625+
4626+
/// Returns true if \p L looks like a small search loop that is worth
/// runtime-unrolling: its symbolic maximum backedge-taken count is computable
/// but not constant, its constant maximum trip count (if any) exceeds 32, its
/// estimated size fits a budget of 5, and it consists of exactly two blocks
/// that both end in a branch instruction.
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
                                      AArch64TTIImpl &TTI) {
  // We need a symbolic, non-constant maximum backedge-taken count.
  // Multi-exit loops with small known trip counts will likely be unrolled
  // anyway, and loops we cannot reason about are skipped.
  const SCEV *MaxBackedgeCount = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(MaxBackedgeCount) ||
      isa<SCEVCouldNotCompute>(MaxBackedgeCount))
    return false;

  // Loops with a low maximum trip count might not be worth unrolling. For
  // now the threshold is 32; a result of zero is not treated as a small
  // maximum.
  const unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
  const bool HasSmallKnownMax = MaxTripCount != 0 && MaxTripCount <= 32;
  if (HasSmallKnownMax)
    return false;

  // The loop must be tiny; the final size itself is not needed here.
  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
    return false;

  // Small search loops with multiple exits can be highly beneficial to
  // unroll. Only loops made up of exactly two blocks are considered.
  ArrayRef<BasicBlock *> LoopBlocks = L->getBlocks();
  if (LoopBlocks.size() != 2)
    return false;

  // Both blocks must be terminated by a branch instruction.
  for (BasicBlock *BB : LoopBlocks)
    if (!isa<BranchInst>(BB->getTerminator()))
      return false;

  return true;
}
4659+
45714660
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
45724661
/// OOO engine's wide instruction window and various predictors.
45734662
static void
@@ -4582,43 +4671,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
45824671
if (!L->isInnermost() || L->getNumBlocks() > 8)
45834672
return;
45844673

4674+
// This is handled by common code.
4675+
if (!L->getExitBlock())
4676+
return;
4677+
45854678
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
45864679
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
45874680
(SE.getSmallConstantMaxTripCount(L) > 0 &&
45884681
SE.getSmallConstantMaxTripCount(L) <= 32))
45894682
return;
4590-
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4591-
return;
45924683

4593-
int64_t Size = 0;
4594-
for (auto *BB : L->getBlocks()) {
4595-
for (auto &I : *BB) {
4596-
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4597-
return;
4598-
SmallVector<const Value *, 4> Operands(I.operand_values());
4599-
Size +=
4600-
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4601-
}
4602-
}
4603-
4604-
// Small search loops with multiple exits can be highly beneficial to unroll.
4605-
if (!L->getExitBlock()) {
4606-
if (L->getNumBlocks() == 2 && Size < 6 &&
4607-
all_of(
4608-
L->getBlocks(),
4609-
[](BasicBlock *BB) {
4610-
return isa<BranchInst>(BB->getTerminator());
4611-
})) {
4612-
UP.RuntimeUnrollMultiExit = true;
4613-
UP.Runtime = true;
4614-
// Limit unroll count.
4615-
UP.DefaultUnrollRuntimeCount = 4;
4616-
// Allow slightly more costly trip-count expansion to catch search loops
4617-
// with pointer inductions.
4618-
UP.SCEVExpansionBudget = 5;
4619-
}
4684+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
46204685
return;
4621-
}
46224686

46234687
if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
46244688
return;
@@ -4630,7 +4694,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
46304694
// dependencies, to expose more parallel memory access streams.
46314695
BasicBlock *Header = L->getHeader();
46324696
if (Header == L->getLoopLatch()) {
4633-
if (Size > 8)
4697+
// Estimate the size of the loop.
4698+
unsigned Size;
4699+
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
46344700
return;
46354701

46364702
SmallPtrSet<Value *, 8> LoadedValues;
@@ -4743,22 +4809,24 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
47434809
break;
47444810
}
47454811

4812+
if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
4813+
UP.RuntimeUnrollMultiExit = true;
4814+
UP.Runtime = true;
4815+
// Limit unroll count.
4816+
UP.DefaultUnrollRuntimeCount = 4;
4817+
// Allow slightly more costly trip-count expansion to catch search loops
4818+
// with pointer inductions.
4819+
UP.SCEVExpansionBudget = 5;
4820+
return;
4821+
}
4822+
47464823
// Scan the loop: don't unroll loops with calls as this could prevent
47474824
// inlining. Don't unroll vector loops either, as they don't benefit much from
47484825
// unrolling.
47494826
for (auto *BB : L->getBlocks()) {
47504827
for (auto &I : *BB) {
4751-
// Don't unroll vectorised loop.
4752-
if (I.getType()->isVectorTy())
4828+
if (!shouldUnrollLoopWithInstruction(I, *this))
47534829
return;
4754-
4755-
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4756-
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4757-
if (!isLoweredToCall(F))
4758-
continue;
4759-
}
4760-
return;
4761-
}
47624830
}
47634831
}
47644832

0 commit comments

Comments
 (0)