Skip to content

[AArch64] Enable unrolling for small multi-exit loops #131998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 103 additions & 48 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4568,6 +4568,71 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}

// Returns true iff every instruction in the loop has a valid code-size cost
// and the accumulated cost stays within \p Budget; otherwise returns false.
// On success, the total size is optionally reported through \p FinalSize.
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
                                   InstructionCost Budget,
                                   unsigned *FinalSize) {
  // Accumulate the code-size cost of the whole loop body.
  InstructionCost TotalCost = 0;

  for (BasicBlock *Block : L->getBlocks()) {
    for (Instruction &Inst : *Block) {
      SmallVector<const Value *, 4> Ops(Inst.operand_values());
      InstructionCost InstCost =
          TTI.getInstructionCost(&Inst, Ops, TTI::TCK_CodeSize);
      // Bail out on invalid costs. These can occur for intrinsics that don't
      // currently have a cost model, or for operations that require SVE.
      if (!InstCost.isValid())
        return false;

      TotalCost += InstCost;
      // Give up as soon as the budget is exceeded.
      if (TotalCost > Budget)
        return false;
    }
  }

  if (FinalSize)
    *FinalSize = *TotalCost.getValue();
  return true;
}

// Decides whether a small multi-exit (search-style) loop is worth runtime
// unrolling on AArch64. Returns true only when every heuristic below passes.
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
                                      AArch64TTIImpl &TTI) {
  // Only consider loops with unknown trip counts for which we can determine
  // a symbolic expression. Multi-exit loops with small known trip counts will
  // likely be unrolled anyway.
  const SCEV *MaxBTC = SE.getSymbolicMaxBackedgeTakenCount(L);
  if (isa<SCEVConstant>(MaxBTC) || isa<SCEVCouldNotCompute>(MaxBTC))
    return false;

  // It might not be worth unrolling loops with low max trip counts. Restrict
  // this to max trip counts > 32 for now (a count of 0 means "unknown").
  if (unsigned MaxTrips = SE.getSmallConstantMaxTripCount(L))
    if (MaxTrips <= 32)
      return false;

  // The loop body must be very small: code-size budget of 5.
  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
    return false;

  // Small search loops with multiple exits can be highly beneficial to
  // unroll. We only care about loops with exactly two exiting blocks
  // (although each block could jump to the same exit block), and every block
  // must end in a plain branch.
  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
  if (Blocks.size() != 2)
    return false;

  return all_of(Blocks, [](BasicBlock *BB) {
    return isa<BranchInst>(BB->getTerminator());
  });
}

/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
/// OOO engine's wide instruction window and various predictors.
static void
Expand All @@ -4582,43 +4647,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->isInnermost() || L->getNumBlocks() > 8)
return;

// Loops with multiple exits are handled by common code.
if (!L->getExitBlock())
return;

const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
(SE.getSmallConstantMaxTripCount(L) > 0 &&
SE.getSmallConstantMaxTripCount(L) <= 32))
return;
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
return;

int64_t Size = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
return;
SmallVector<const Value *, 4> Operands(I.operand_values());
Size +=
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
}
}

// Small search loops with multiple exits can be highly beneficial to unroll.
if (!L->getExitBlock()) {
if (L->getNumBlocks() == 2 && Size < 6 &&
all_of(
L->getBlocks(),
[](BasicBlock *BB) {
return isa<BranchInst>(BB->getTerminator());
})) {
UP.RuntimeUnrollMultiExit = true;
UP.Runtime = true;
// Limit unroll count.
UP.DefaultUnrollRuntimeCount = 4;
// Allow slightly more costly trip-count expansion to catch search loops
// with pointer inductions.
UP.SCEVExpansionBudget = 5;
}
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
return;
}

if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
return;
Expand All @@ -4630,7 +4670,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// dependencies, to expose more parallel memory access streams.
BasicBlock *Header = L->getHeader();
if (Header == L->getLoopLatch()) {
if (Size > 8)
// Estimate the size of the loop.
unsigned Size;
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
return;

SmallPtrSet<Value *, 8> LoadedValues;
Expand Down Expand Up @@ -4727,6 +4769,25 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;

// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
// unrolling.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Don't unroll vectorised loop.
if (I.getType()->isVectorTy())
return;

if (isa<CallBase>(I)) {
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
if (!isLoweredToCall(F))
continue;
return;
}
}
}

// Apply subtarget-specific unrolling preferences.
switch (ST->getProcFamily()) {
case AArch64Subtarget::AppleA14:
Expand All @@ -4743,23 +4804,17 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
break;
}

// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
// unrolling.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Don't unroll vectorised loop.
if (I.getType()->isVectorTy())
return;

if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (!isLoweredToCall(F))
continue;
}
return;
}
}
// If this is a small, multi-exit loop similar to something like std::find,
// then there is typically a performance improvement achieved by unrolling.
if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
UP.RuntimeUnrollMultiExit = true;
UP.Runtime = true;
// Limit unroll count.
UP.DefaultUnrollRuntimeCount = 4;
// Allow slightly more costly trip-count expansion to catch search loops
// with pointer inductions.
UP.SCEVExpansionBudget = 5;
return;
}

// Enable runtime unrolling for in-order models
Expand Down
Loading