Skip to content

Commit f1b6330

Browse files
juliannagele and fhahn authored
[cherrypick][LV] Vectorize Epilogues for loops with small VF but high IC (#9666)
* [SCEV] Collect and merge loop guards through PHI nodes with multiple incoming values (llvm#113915)

  This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node.

  The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in [Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll), discovered via llvm#108190

  Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u

  PR: llvm#113915

  (cherry picked from commit 7c8e05a)

* [SCEV] Address post-commit comments for llvm#113915.

  Address post-commit comments for llvm#113915.

  (cherry picked from commit feb9b37)

* [LV] Vectorize Epilogues for loops with small VF but high IC (llvm#108190)

  - Consider MainLoopVF * IC when determining whether Epilogue Vectorization is profitable
  - Allow the same VF for the Epilogue as for the main loop
  - Use an upper bound for the trip count of the Epilogue when choosing the Epilogue VF

  PR: llvm#108190

  ---------

  Co-authored-by: Florian Hahn <[email protected]>

  (cherry picked from commit a8538b9)

---------

Co-authored-by: Florian Hahn <[email protected]>
1 parent 42f3e8e commit f1b6330

30 files changed

+3960
-1529
lines changed

llvm/include/llvm/Analysis/ScalarEvolution.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,25 @@ class ScalarEvolution {
13071307

13081308
LoopGuards(ScalarEvolution &SE) : SE(SE) {}
13091309

1310+
/// Recursively collect loop guards in \p Guards, starting from
1311+
/// block \p Block with predecessor \p Pred. The intended starting point
1312+
/// is to collect from a loop header and its predecessor.
1313+
static void
1314+
collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
1315+
const BasicBlock *Block, const BasicBlock *Pred,
1316+
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
1317+
unsigned Depth = 0);
1318+
1319+
/// Collect loop guards in \p Guards, starting from PHINode \p
1320+
/// Phi, by calling \p collectFromBlock on the incoming blocks of
1321+
/// \p Phi and trying to merge the found constraints into a single
1322+
/// combined one for \p Phi.
1323+
static void collectFromPHI(
1324+
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
1325+
const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
1326+
SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
1327+
unsigned Depth);
1328+
13101329
public:
13111330
/// Collect rewrite map for loop guards for loop \p L, together with flags
13121331
/// indicating if NUW and NSW can be preserved during rewriting.

llvm/lib/Analysis/ScalarEvolution.cpp

Lines changed: 101 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,10 @@ static cl::opt<unsigned> RangeIterThreshold(
222222
cl::desc("Threshold for switching to iteratively computing SCEV ranges"),
223223
cl::init(32));
224224

225+
static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
226+
"scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
227+
cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1));
228+
225229
static cl::opt<bool>
226230
ClassifyExpressions("scalar-evolution-classify-expressions",
227231
cl::Hidden, cl::init(true),
@@ -10608,7 +10612,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
1060810612
if (const Loop *L = LI.getLoopFor(BB))
1060910613
return {L->getLoopPredecessor(), L->getHeader()};
1061010614

10611-
return {nullptr, nullptr};
10615+
return {nullptr, BB};
1061210616
}
1061310617

1061410618
/// SCEV structural equivalence is usually sufficient for testing whether two
@@ -15089,7 +15093,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
1508915093

1509015094
ScalarEvolution::LoopGuards
1509115095
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
15096+
BasicBlock *Header = L->getHeader();
15097+
BasicBlock *Pred = L->getLoopPredecessor();
1509215098
LoopGuards Guards(SE);
15099+
SmallPtrSet<const BasicBlock *, 8> VisitedBlocks;
15100+
collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks);
15101+
return Guards;
15102+
}
15103+
15104+
void ScalarEvolution::LoopGuards::collectFromPHI(
15105+
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
15106+
const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
15107+
SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
15108+
unsigned Depth) {
15109+
if (!SE.isSCEVable(Phi.getType()))
15110+
return;
15111+
15112+
using MinMaxPattern = std::pair<const SCEVConstant *, SCEVTypes>;
15113+
auto GetMinMaxConst = [&](unsigned IncomingIdx) -> MinMaxPattern {
15114+
const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx);
15115+
if (!VisitedBlocks.insert(InBlock).second)
15116+
return {nullptr, scCouldNotCompute};
15117+
auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE));
15118+
if (Inserted)
15119+
collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks,
15120+
Depth + 1);
15121+
auto &RewriteMap = G->second.RewriteMap;
15122+
if (RewriteMap.empty())
15123+
return {nullptr, scCouldNotCompute};
15124+
auto S = RewriteMap.find(SE.getSCEV(Phi.getIncomingValue(IncomingIdx)));
15125+
if (S == RewriteMap.end())
15126+
return {nullptr, scCouldNotCompute};
15127+
auto *SM = dyn_cast_if_present<SCEVMinMaxExpr>(S->second);
15128+
if (!SM)
15129+
return {nullptr, scCouldNotCompute};
15130+
if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0)))
15131+
return {C0, SM->getSCEVType()};
15132+
return {nullptr, scCouldNotCompute};
15133+
};
15134+
auto MergeMinMaxConst = [](MinMaxPattern P1,
15135+
MinMaxPattern P2) -> MinMaxPattern {
15136+
auto [C1, T1] = P1;
15137+
auto [C2, T2] = P2;
15138+
if (!C1 || !C2 || T1 != T2)
15139+
return {nullptr, scCouldNotCompute};
15140+
switch (T1) {
15141+
case scUMaxExpr:
15142+
return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1};
15143+
case scSMaxExpr:
15144+
return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1};
15145+
case scUMinExpr:
15146+
return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1};
15147+
case scSMinExpr:
15148+
return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1};
15149+
default:
15150+
llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
15151+
}
15152+
};
15153+
auto P = GetMinMaxConst(0);
15154+
for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) {
15155+
if (!P.first)
15156+
break;
15157+
P = MergeMinMaxConst(P, GetMinMaxConst(In));
15158+
}
15159+
if (P.first) {
15160+
const SCEV *LHS = SE.getSCEV(const_cast<PHINode *>(&Phi));
15161+
SmallVector<const SCEV *, 2> Ops({P.first, LHS});
15162+
const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops);
15163+
Guards.RewriteMap.insert({LHS, RHS});
15164+
}
15165+
}
15166+
15167+
void ScalarEvolution::LoopGuards::collectFromBlock(
15168+
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
15169+
const BasicBlock *Block, const BasicBlock *Pred,
15170+
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks, unsigned Depth) {
1509315171
SmallVector<const SCEV *> ExprsToRewrite;
1509415172
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
1509515173
const SCEV *RHS,
@@ -15428,14 +15506,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
1542815506
}
1542915507
};
1543015508

15431-
BasicBlock *Header = L->getHeader();
1543215509
SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
1543315510
// First, collect information from assumptions dominating the loop.
1543415511
for (auto &AssumeVH : SE.AC.assumptions()) {
1543515512
if (!AssumeVH)
1543615513
continue;
1543715514
auto *AssumeI = cast<CallInst>(AssumeVH);
15438-
if (!SE.DT.dominates(AssumeI, Header))
15515+
if (!SE.DT.dominates(AssumeI, Block))
1543915516
continue;
1544015517
Terms.emplace_back(AssumeI->getOperand(0), true);
1544115518
}
@@ -15446,27 +15523,42 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
1544615523
if (GuardDecl)
1544715524
for (const auto *GU : GuardDecl->users())
1544815525
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
15449-
if (Guard->getFunction() == Header->getParent() &&
15450-
SE.DT.dominates(Guard, Header))
15526+
if (Guard->getFunction() == Block->getParent() &&
15527+
SE.DT.dominates(Guard, Block))
1545115528
Terms.emplace_back(Guard->getArgOperand(0), true);
1545215529

1545315530
// Third, collect conditions from dominating branches. Starting at the loop
1545415531
// predecessor, climb up the predecessor chain, as long as there are
1545515532
// predecessors that can be found that have unique successors leading to the
1545615533
// original header.
1545715534
// TODO: share this logic with isLoopEntryGuardedByCond.
15458-
for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
15459-
L->getLoopPredecessor(), Header);
15460-
Pair.first;
15535+
std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
15536+
for (; Pair.first;
1546115537
Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
15462-
15538+
VisitedBlocks.insert(Pair.second);
1546315539
const BranchInst *LoopEntryPredicate =
1546415540
dyn_cast<BranchInst>(Pair.first->getTerminator());
1546515541
if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
1546615542
continue;
1546715543

1546815544
Terms.emplace_back(LoopEntryPredicate->getCondition(),
1546915545
LoopEntryPredicate->getSuccessor(0) == Pair.second);
15546+
15547+
// If we are recursively collecting guards stop after 2
15548+
// predecessors to limit compile-time impact for now.
15549+
if (Depth > 0 && Terms.size() == 2)
15550+
break;
15551+
}
15552+
// Finally, if we stopped climbing the predecessor chain because
15553+
// there wasn't a unique one to continue, try to collect conditions
15554+
// for PHINodes by recursively following all of their incoming
15555+
// blocks and try to merge the found conditions to build a new one
15556+
// for the Phi.
15557+
if (Pair.second->hasNPredecessorsOrMore(2) &&
15558+
Depth < MaxLoopGuardCollectionDepth) {
15559+
SmallDenseMap<const BasicBlock *, LoopGuards> IncomingGuards;
15560+
for (auto &Phi : Pair.second->phis())
15561+
collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth);
1547015562
}
1547115563

1547215564
// Now apply the information from the collected conditions to
@@ -15523,7 +15615,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
1552315615
Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)});
1552415616
}
1552515617
}
15526-
return Guards;
1552715618
}
1552815619

1552915620
const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,12 @@ class LoopVectorizationPlanner {
457457
bool isMoreProfitable(const VectorizationFactor &A,
458458
const VectorizationFactor &B) const;
459459

460+
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
461+
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
462+
bool isMoreProfitable(const VectorizationFactor &A,
463+
const VectorizationFactor &B,
464+
const unsigned MaxTripCount) const;
465+
460466
/// Determines if we have the infrastructure to vectorize the loop and its
461467
/// epilogue, assuming the main loop is vectorized by \p VF.
462468
bool isCandidateForEpilogueVectorization(const ElementCount VF) const;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,7 +1554,10 @@ class LoopVectorizationCostModel {
15541554
/// Returns true if epilogue vectorization is considered profitable, and
15551555
/// false otherwise.
15561556
/// \p VF is the vectorization factor chosen for the original loop.
1557-
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1557+
/// \p Multiplier is an additional scaling factor applied to VF before
1558+
/// comparing to EpilogueVectorizationMinVF.
1559+
bool isEpilogueVectorizationProfitable(const ElementCount VF,
1560+
const unsigned Multiplier) const;
15581561

15591562
/// Returns the execution time cost of an instruction for a given vector
15601563
/// width. Vector width of one means scalar.
@@ -4293,12 +4296,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42934296
}
42944297

42954298
bool LoopVectorizationPlanner::isMoreProfitable(
4296-
const VectorizationFactor &A, const VectorizationFactor &B) const {
4299+
const VectorizationFactor &A, const VectorizationFactor &B,
4300+
const unsigned MaxTripCount) const {
42974301
InstructionCost CostA = A.Cost;
42984302
InstructionCost CostB = B.Cost;
42994303

4300-
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4301-
43024304
// Improve estimate for the vector width if it is scalable.
43034305
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
43044306
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4347,6 +4349,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
43474349
return CmpFn(RTCostA, RTCostB);
43484350
}
43494351

4352+
bool LoopVectorizationPlanner::isMoreProfitable(
4353+
const VectorizationFactor &A, const VectorizationFactor &B) const {
4354+
const unsigned MaxTripCount =
4355+
PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4356+
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4357+
}
4358+
43504359
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
43514360
OptimizationRemarkEmitter *ORE,
43524361
Loop *TheLoop) {
@@ -4626,7 +4635,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46264635
}
46274636

46284637
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4629-
const ElementCount VF) const {
4638+
const ElementCount VF, const unsigned Multiplier) const {
46304639
// FIXME: We need a much better cost-model to take different parameters such
46314640
// as register pressure, code size increase and cost of extra branches into
46324641
// account. For now we apply a very crude heuristic and only consider loops
@@ -4641,9 +4650,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46414650
if (TTI.getMaxInterleaveFactor(VF) <= 1)
46424651
return false;
46434652

4644-
unsigned Multiplier = 1;
4645-
if (VF.isScalable())
4646-
Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
46474653
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
46484654
return true;
46494655
return false;
@@ -4690,7 +4696,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
46904696
return Result;
46914697
}
46924698

4693-
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
4699+
unsigned Multiplier = IC;
4700+
if (MainLoopVF.isScalable())
4701+
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);
4702+
4703+
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
46944704
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
46954705
"this loop\n");
46964706
return Result;
@@ -4709,16 +4719,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47094719
ScalarEvolution &SE = *PSE.getSE();
47104720
Type *TCType = Legal->getWidestInductionType();
47114721
const SCEV *RemainingIterations = nullptr;
4722+
unsigned MaxTripCount = 0;
47124723
for (auto &NextVF : ProfitableVFs) {
47134724
// Skip candidate VFs without a corresponding VPlan.
47144725
if (!hasPlanWithVF(NextVF.Width))
47154726
continue;
47164727

4717-
// Skip candidate VFs with widths >= the estimate runtime VF (scalable
4718-
// vectors) or the VF of the main loop (fixed vectors).
4728+
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4729+
// vectors) or > the VF of the main loop (fixed vectors).
47194730
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
47204731
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4721-
ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
4732+
(NextVF.Width.isScalable() &&
4733+
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4734+
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4735+
ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
47224736
continue;
47234737

47244738
// If NextVF is greater than the number of remaining iterations, the
@@ -4729,6 +4743,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47294743
const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
47304744
RemainingIterations = SE.getURemExpr(
47314745
TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4746+
MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4747+
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4748+
SE.getConstant(TCType, MaxTripCount))) {
4749+
MaxTripCount =
4750+
SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4751+
}
4752+
LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4753+
<< MaxTripCount << "\n");
47324754
}
47334755
if (SE.isKnownPredicate(
47344756
CmpInst::ICMP_UGT,
@@ -4737,7 +4759,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47374759
continue;
47384760
}
47394761

4740-
if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
4762+
if (Result.Width.isScalar() ||
4763+
isMoreProfitable(NextVF, Result, MaxTripCount))
47414764
Result = NextVF;
47424765
}
47434766

0 commit comments

Comments
 (0)