Skip to content

Commit a8f5914

Browse files
committed
[AArch64] Add flag to control unrolling for small multi-exit loops
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be something that uses std::find to find a value in libc++. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases where the compiler is expected to either vectorise or unroll itself.
1 parent 5812516 commit a8f5914

File tree

3 files changed

+1161
-57
lines changed

3 files changed

+1161
-57
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 109 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4528,6 +4528,95 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
45284528
}
45294529
}
45304530

4531+
/// Returns true if instruction \p I does not, by itself, rule out unrolling
/// the loop that contains it.
///
/// An instruction rules out unrolling when it has a vector type, or when it is
/// a call-like instruction other than a direct call/invoke whose callee
/// \p TTI reports as not being lowered to a call.
static bool shouldUnrollLoopWithInstruction(Instruction &I,
                                            AArch64TTIImpl &TTI) {
  // Don't unroll vectorised loops.
  if (I.getType()->isVectorTy())
    return false;

  // Anything that is not call-like is acceptable.
  if (!isa<CallBase>(I))
    return true;

  // Among call-like instructions only plain calls and invokes are considered;
  // every other CallBase kind is rejected outright.
  if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
    return false;

  // getCalledFunction() yields null when there is no statically known callee;
  // such calls are rejected, as are callees that are lowered to a call.
  const Function *Callee = cast<CallBase>(I).getCalledFunction();
  return Callee && !TTI.isLoweredToCall(Callee);
}
4547+
4548+
// This function returns true if the loop:
4549+
// 1. Contains only those instructions that should be unrolled,
4550+
// 2. Has a valid cost,
4551+
// 3. Has a cost within the supplied budget.
4552+
// Otherwise it returns false.
4553+
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
4554+
InstructionCost Budget,
4555+
unsigned *FinalSize) {
4556+
// Estimate the size of the loop.
4557+
InstructionCost LoopCost = 0;
4558+
4559+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4560+
return 0;
4561+
4562+
for (auto *BB : L->getBlocks()) {
4563+
for (auto &I : *BB) {
4564+
if (!shouldUnrollLoopWithInstruction(I, TTI))
4565+
return 0;
4566+
4567+
SmallVector<const Value *, 4> Operands(I.operand_values());
4568+
InstructionCost Cost =
4569+
TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
4570+
// This can happen with intrinsics that don't currently have a cost model
4571+
// or for some operations that require SVE.
4572+
if (!Cost.isValid())
4573+
return 0;
4574+
4575+
LoopCost += Cost;
4576+
if (LoopCost > Budget)
4577+
return 0;
4578+
}
4579+
}
4580+
4581+
if (FinalSize)
4582+
*FinalSize = *LoopCost.getValue();
4583+
return true;
4584+
}
4585+
4586+
/// Returns true if \p L looks like a small search loop that is worth
/// runtime-unrolling despite having multiple exits.
///
/// All of the following must hold:
///  - the symbolic max backedge-taken count is computable and not a constant,
///  - the small constant max trip count is either unknown (0) or above 32,
///  - the loop passes isLoopSizeWithinBudget with a budget of 5,
///  - the loop consists of exactly two blocks, each terminated by a branch.
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
                                      AArch64TTIImpl &TTI) {
  // Only loops with an unknown trip count that still has a symbolic
  // expression are of interest. Multi-exit loops with small known trip counts
  // will likely be unrolled anyway.
  const SCEV *SymbolicBTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(SymbolicBTC) || isa<SCEVCouldNotCompute>(SymbolicBTC))
    return false;

  // Loops with a low max trip count might not be worth unrolling. For now
  // only max trip counts > 32 (or unknown, reported as 0) are accepted.
  const unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
  const bool HasLowMaxTripCount = MaxTripCount > 0 && MaxTripCount <= 32;
  if (HasLowMaxTripCount)
    return false;

  // The loop body must be tiny; the computed size itself is not needed here.
  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
    return false;

  // Small search loops with multiple exits can be highly beneficial to
  // unroll. Restrict this to loops made of exactly two blocks; both blocks
  // may still jump to the same exit block.
  ArrayRef<BasicBlock *> LoopBlocks = L->getBlocks();
  if (LoopBlocks.size() != 2)
    return false;

  // Every block has to end in a plain branch.
  return all_of(LoopBlocks, [](BasicBlock *BB) {
    return isa<BranchInst>(BB->getTerminator());
  });
}
4619+
45314620
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
45324621
/// OOO engine's wide instruction window and various predictors.
45334622
static void
@@ -4542,43 +4631,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
45424631
if (!L->isInnermost() || L->getNumBlocks() > 8)
45434632
return;
45444633

4634+
// This is handled by common code.
4635+
if (!L->getExitBlock())
4636+
return;
4637+
45454638
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
45464639
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
45474640
(SE.getSmallConstantMaxTripCount(L) > 0 &&
45484641
SE.getSmallConstantMaxTripCount(L) <= 32))
45494642
return;
4550-
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4551-
return;
45524643

4553-
int64_t Size = 0;
4554-
for (auto *BB : L->getBlocks()) {
4555-
for (auto &I : *BB) {
4556-
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4557-
return;
4558-
SmallVector<const Value *, 4> Operands(I.operand_values());
4559-
Size +=
4560-
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4561-
}
4562-
}
4563-
4564-
// Small search loops with multiple exits can be highly beneficial to unroll.
4565-
if (!L->getExitBlock()) {
4566-
if (L->getNumBlocks() == 2 && Size < 6 &&
4567-
all_of(
4568-
L->getBlocks(),
4569-
[](BasicBlock *BB) {
4570-
return isa<BranchInst>(BB->getTerminator());
4571-
})) {
4572-
UP.RuntimeUnrollMultiExit = true;
4573-
UP.Runtime = true;
4574-
// Limit unroll count.
4575-
UP.DefaultUnrollRuntimeCount = 4;
4576-
// Allow slightly more costly trip-count expansion to catch search loops
4577-
// with pointer inductions.
4578-
UP.SCEVExpansionBudget = 5;
4579-
}
4644+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
45804645
return;
4581-
}
45824646

45834647
if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
45844648
return;
@@ -4590,7 +4654,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
45904654
// dependencies, to expose more parallel memory access streams.
45914655
BasicBlock *Header = L->getHeader();
45924656
if (Header == L->getLoopLatch()) {
4593-
if (Size > 8)
4657+
// Estimate the size of the loop.
4658+
unsigned Size;
4659+
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
45944660
return;
45954661

45964662
SmallPtrSet<Value *, 8> LoadedValues;
@@ -4703,22 +4769,24 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
47034769
break;
47044770
}
47054771

4772+
if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
4773+
UP.RuntimeUnrollMultiExit = true;
4774+
UP.Runtime = true;
4775+
// Limit unroll count.
4776+
UP.DefaultUnrollRuntimeCount = 4;
4777+
// Allow slightly more costly trip-count expansion to catch search loops
4778+
// with pointer inductions.
4779+
UP.SCEVExpansionBudget = 5;
4780+
return;
4781+
}
4782+
47064783
// Scan the loop: don't unroll loops with calls as this could prevent
47074784
// inlining. Don't unroll vector loops either, as they don't benefit much from
47084785
// unrolling.
47094786
for (auto *BB : L->getBlocks()) {
47104787
for (auto &I : *BB) {
4711-
// Don't unroll vectorised loop.
4712-
if (I.getType()->isVectorTy())
4788+
if (!shouldUnrollLoopWithInstruction(I, *this))
47134789
return;
4714-
4715-
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
4716-
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
4717-
if (!isLoweredToCall(F))
4718-
continue;
4719-
}
4720-
return;
4721-
}
47224790
}
47234791
}
47244792

0 commit comments

Comments
 (0)