@@ -4068,51 +4068,86 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4068
4068
4069
4069
// Try to unroll small, single block loops, if they have load/store
4070
4070
// dependencies, to expose more parallel memory access streams.
4071
- if (L->getHeader () != L->getLoopLatch () || Size > 8 )
4072
- return ;
4071
+ BasicBlock *Header = L->getHeader ();
4072
+ if (Header == L->getLoopLatch ()) {
4073
+ if (Size > 8 )
4074
+ return ;
4073
4075
4074
- SmallPtrSet<Value *, 8 > LoadedValues;
4075
- SmallVector<StoreInst *> Stores;
4076
- for (auto *BB : L->blocks ()) {
4077
- for (auto &I : *BB) {
4078
- Value *Ptr = getLoadStorePointerOperand (&I);
4079
- if (!Ptr)
4080
- continue ;
4081
- const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4082
- if (SE.isLoopInvariant (PtrSCEV, L))
4083
- continue ;
4084
- if (isa<LoadInst>(&I))
4085
- LoadedValues.insert (&I);
4086
- else
4087
- Stores.push_back (cast<StoreInst>(&I));
4076
+ SmallPtrSet<Value *, 8 > LoadedValues;
4077
+ SmallVector<StoreInst *> Stores;
4078
+ for (auto *BB : L->blocks ()) {
4079
+ for (auto &I : *BB) {
4080
+ Value *Ptr = getLoadStorePointerOperand (&I);
4081
+ if (!Ptr)
4082
+ continue ;
4083
+ const SCEV *PtrSCEV = SE.getSCEV (Ptr);
4084
+ if (SE.isLoopInvariant (PtrSCEV, L))
4085
+ continue ;
4086
+ if (isa<LoadInst>(&I))
4087
+ LoadedValues.insert (&I);
4088
+ else
4089
+ Stores.push_back (cast<StoreInst>(&I));
4090
+ }
4088
4091
}
4089
- }
4090
4092
4091
- // Try to find an unroll count that maximizes the use of the instruction
4092
- // window, i.e. trying to fetch as many instructions per cycle as possible.
4093
- unsigned MaxInstsPerLine = 16 ;
4094
- unsigned UC = 1 ;
4095
- unsigned BestUC = 1 ;
4096
- unsigned SizeWithBestUC = BestUC * Size;
4097
- while (UC <= 8 ) {
4098
- unsigned SizeWithUC = UC * Size;
4099
- if (SizeWithUC > 48 )
4100
- break ;
4101
- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4102
- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4103
- BestUC = UC;
4104
- SizeWithBestUC = BestUC * Size;
4093
+ // Try to find an unroll count that maximizes the use of the instruction
4094
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
4095
+ unsigned MaxInstsPerLine = 16 ;
4096
+ unsigned UC = 1 ;
4097
+ unsigned BestUC = 1 ;
4098
+ unsigned SizeWithBestUC = BestUC * Size;
4099
+ while (UC <= 8 ) {
4100
+ unsigned SizeWithUC = UC * Size;
4101
+ if (SizeWithUC > 48 )
4102
+ break ;
4103
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4104
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4105
+ BestUC = UC;
4106
+ SizeWithBestUC = BestUC * Size;
4107
+ }
4108
+ UC++;
4105
4109
}
4106
- UC++;
4110
+
4111
+ if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4112
+ return LoadedValues.contains (SI->getOperand (0 ));
4113
+ }))
4114
+ return ;
4115
+
4116
+ UP.Runtime = true ;
4117
+ UP.DefaultUnrollRuntimeCount = BestUC;
4118
+ return ;
4107
4119
}
4108
4120
4109
- if (BestUC == 1 || none_of (Stores, [&LoadedValues](StoreInst *SI) {
4110
- return LoadedValues.contains (SI->getOperand (0 ));
4111
- }))
4121
+ // Try to runtime-unroll loops with early-continues depending on loop-varying
4122
+ // loads; this helps with branch-prediction for the early-continues.
4123
+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator ());
4124
+ auto *Latch = L->getLoopLatch ();
4125
+ SmallVector<BasicBlock *> Preds (predecessors (Latch));
4126
+ if (!Term || !Term->isConditional () || Preds.size () == 1 ||
4127
+ none_of (Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
4128
+ none_of (Preds, [L](BasicBlock *Pred) { return L->contains (Pred); }))
4112
4129
return ;
4113
4130
4114
- UP.Runtime = true ;
4115
- UP.DefaultUnrollRuntimeCount = BestUC;
4131
+ std::function<bool (Instruction *, unsigned )> DependsOnLoopLoad =
4132
+ [&](Instruction *I, unsigned Depth) -> bool {
4133
+ if (isa<PHINode>(I) || L->isLoopInvariant (I) || Depth > 8 )
4134
+ return false ;
4135
+
4136
+ if (auto *LI = dyn_cast<LoadInst>(I))
4137
+ return true ;
4138
+
4139
+ return any_of (I->operands (), [&](Value *V) {
4140
+ auto *I = dyn_cast<Instruction>(V);
4141
+ return I && DependsOnLoopLoad (I, Depth + 1 );
4142
+ });
4143
+ };
4144
+ CmpInst::Predicate Pred;
4145
+ Instruction *I;
4146
+ if (match (Term, m_Br (m_ICmp (Pred, m_Instruction (I), m_Value ()), m_Value (),
4147
+ m_Value ())) &&
4148
+ DependsOnLoopLoad (I, 0 )) {
4149
+ UP.Runtime = true ;
4150
+ }
4116
4151
}
4117
4152
4118
4153
void AArch64TTIImpl::getUnrollingPreferences (Loop *L, ScalarEvolution &SE,
0 commit comments