
Commit c2ab633

[AArch64] Unroll some loops with early-continues on Apple Silicon.
Try to runtime-unroll loops with early-continues that depend on loop-varying loads; this helps branch prediction for the early-continues and can significantly improve performance for such loops.
1 parent 4993a30 commit c2ab633
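
To illustrate the pattern being targeted (a sketch of mine, not code from the commit): the early-continue is controlled by a value loaded inside the loop, so the branch flips unpredictably between iterations, and runtime unrolling gives the predictor several independent copies of that branch to learn.

// Hypothetical example of a loop this heuristic aims at: the continue
// condition depends on a load that varies with the induction variable.
long sumAboveThreshold(const int *Vals, long N, int Threshold) {
  long Sum = 0;
  for (long I = 0; I < N; ++I) {
    if (Vals[I] < Threshold) // loop-varying load feeds the early-continue
      continue;              // hard-to-predict branch
    Sum += Vals[I];
  }
  return Sum;
}

In IR terms this is roughly what the patch checks for below: the header's conditional branch compares a value that traces back (up to depth 8) to a load inside the loop, and the latch has multiple in-loop predecessors, one of them the header.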

File tree

2 files changed: +305 −42 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 72 additions & 37 deletions
@@ -4068,51 +4068,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
 
   // Try to unroll small, single block loops, if they have load/store
   // dependencies, to expose more parallel memory access streams.
-  if (L->getHeader() != L->getLoopLatch() || Size > 8)
-    return;
+  BasicBlock *Header = L->getHeader();
+  if (Header == L->getLoopLatch()) {
+    if (Size > 8)
+      return;
 
-  SmallPtrSet<Value *, 8> LoadedValues;
-  SmallVector<StoreInst *> Stores;
-  for (auto *BB : L->blocks()) {
-    for (auto &I : *BB) {
-      Value *Ptr = getLoadStorePointerOperand(&I);
-      if (!Ptr)
-        continue;
-      const SCEV *PtrSCEV = SE.getSCEV(Ptr);
-      if (SE.isLoopInvariant(PtrSCEV, L))
-        continue;
-      if (isa<LoadInst>(&I))
-        LoadedValues.insert(&I);
-      else
-        Stores.push_back(cast<StoreInst>(&I));
+    SmallPtrSet<Value *, 8> LoadedValues;
+    SmallVector<StoreInst *> Stores;
+    for (auto *BB : L->blocks()) {
+      for (auto &I : *BB) {
+        Value *Ptr = getLoadStorePointerOperand(&I);
+        if (!Ptr)
+          continue;
+        const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+        if (SE.isLoopInvariant(PtrSCEV, L))
+          continue;
+        if (isa<LoadInst>(&I))
+          LoadedValues.insert(&I);
+        else
+          Stores.push_back(cast<StoreInst>(&I));
+      }
     }
-  }
 
-  // Try to find an unroll count that maximizes the use of the instruction
-  // window, i.e. trying to fetch as many instructions per cycle as possible.
-  unsigned MaxInstsPerLine = 16;
-  unsigned UC = 1;
-  unsigned BestUC = 1;
-  unsigned SizeWithBestUC = BestUC * Size;
-  while (UC <= 8) {
-    unsigned SizeWithUC = UC * Size;
-    if (SizeWithUC > 48)
-      break;
-    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
-        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
-      BestUC = UC;
-      SizeWithBestUC = BestUC * Size;
+    // Try to find an unroll count that maximizes the use of the instruction
+    // window, i.e. trying to fetch as many instructions per cycle as possible.
+    unsigned MaxInstsPerLine = 16;
+    unsigned UC = 1;
+    unsigned BestUC = 1;
+    unsigned SizeWithBestUC = BestUC * Size;
+    while (UC <= 8) {
+      unsigned SizeWithUC = UC * Size;
+      if (SizeWithUC > 48)
+        break;
+      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+        BestUC = UC;
+        SizeWithBestUC = BestUC * Size;
+      }
+      UC++;
     }
-    UC++;
+
+    if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+          return LoadedValues.contains(SI->getOperand(0));
+        }))
+      return;
+
+    UP.Runtime = true;
+    UP.DefaultUnrollRuntimeCount = BestUC;
+    return;
   }
 
-  if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
-        return LoadedValues.contains(SI->getOperand(0));
-      }))
+  // Try to runtime-unroll loops with early-continues depending on loop-varying
+  // loads; this helps with branch-prediction for the early-continues.
+  auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
+  auto *Latch = L->getLoopLatch();
+  SmallVector<BasicBlock *> Preds(predecessors(Latch));
+  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
+      none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
+      none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
     return;
 
-  UP.Runtime = true;
-  UP.DefaultUnrollRuntimeCount = BestUC;
+  std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
+      [&](Instruction *I, unsigned Depth) -> bool {
+    if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(I))
+      return true;
+
+    return any_of(I->operands(), [&](Value *V) {
+      auto *I = dyn_cast<Instruction>(V);
+      return I && DependsOnLoopLoad(I, Depth + 1);
+    });
+  };
+  CmpInst::Predicate Pred;
+  Instruction *I;
+  if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
+                       m_Value())) &&
+      DependsOnLoopLoad(I, 0)) {
+    UP.Runtime = true;
+  }
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
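
A note on the unroll-count search retained in the single-block branch above: it can be reproduced standalone, as in the following sketch (pickUnrollCount is my name, not an LLVM API; the constants mirror the patch: 16-instruction fetch lines, a 48-instruction budget, unroll counts 1 through 8).

#include <cstdio>

// Standalone re-derivation of the patch's unroll-count search; illustrative
// only. Picks the count whose unrolled body either fills 16-instruction
// fetch lines exactly or leaves the fullest partial line, within 48 insts.
unsigned pickUnrollCount(unsigned Size) {
  const unsigned MaxInstsPerLine = 16;
  unsigned BestUC = 1;
  unsigned SizeWithBestUC = BestUC * Size;
  for (unsigned UC = 1; UC <= 8; ++UC) {
    unsigned SizeWithUC = UC * Size;
    if (SizeWithUC > 48) // instruction-window budget exceeded
      break;
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = BestUC * Size;
    }
  }
  return BestUC;
}

int main() {
  // Size = 6 picks 8 (48 = three exactly-full lines); Size = 7 picks 2
  // (14 nearly fills one line, and no in-budget multiple of 16 exists).
  printf("%u %u\n", pickUnrollCount(6), pickUnrollCount(7));
}

Note that the patch only adopts BestUC when some store in the loop forwards a previously loaded value (the none_of(Stores, ...) check), which is what exposes the parallel memory access streams the comment mentions.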
