Skip to content

[cherrypick][LV] Vectorize Epilogues for loops with small VF but high IC #9666

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/include/llvm/Analysis/ScalarEvolution.h
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,25 @@ class ScalarEvolution {

/// Construct a guard set whose SE member is initialized from \p SE.
LoopGuards(ScalarEvolution &SE) : SE(SE) {}

/// Recursively collect loop guards in \p Guards, starting from
/// block \p Block with predecessor \p Pred. The intended starting point
/// is to collect from a loop header and its predecessor.
/// \p VisitedBlocks accumulates the blocks walked so far, so that PHI-based
/// collection does not follow the same incoming block twice.
/// \p Depth is the current recursion depth; the top-level call uses the
/// default of 0 and each recursion through a PHI's incoming block adds one.
static void
collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const BasicBlock *Block, const BasicBlock *Pred,
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
unsigned Depth = 0);

/// Collect loop guards in \p Guards, starting from PHINode \p
/// Phi, by calling \p collectFromBlock on the incoming blocks of
/// \p Phi and trying to merge the found constraints into a single
/// combined one for \p Phi.
/// \p VisitedBlocks holds the blocks already followed; an incoming block
/// that is already present contributes no constraint.
/// \p IncomingGuards caches the guards collected per incoming block.
/// \p Depth is the recursion depth of the caller; incoming blocks are
/// collected at \p Depth + 1.
static void collectFromPHI(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
unsigned Depth);

public:
/// Collect rewrite map for loop guards for loop \p L, together with flags
/// indicating if NUW and NSW can be preserved during rewriting.
Expand Down
111 changes: 101 additions & 10 deletions llvm/lib/Analysis/ScalarEvolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ static cl::opt<unsigned> RangeIterThreshold(
cl::desc("Threshold for switching to iteratively computing SCEV ranges"),
cl::init(32));

// Upper bound on how deep loop guard collection may recurse through the
// incoming blocks of PHI nodes; recursion is only attempted while the current
// depth is below this value.
static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
    "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
    cl::desc("Maximum depth for recursive loop guard collection"), cl::init(1));

static cl::opt<bool>
ClassifyExpressions("scalar-evolution-classify-expressions",
cl::Hidden, cl::init(true),
Expand Down Expand Up @@ -10608,7 +10612,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
if (const Loop *L = LI.getLoopFor(BB))
return {L->getLoopPredecessor(), L->getHeader()};

return {nullptr, nullptr};
return {nullptr, BB};
}

/// SCEV structural equivalence is usually sufficient for testing whether two
Expand Down Expand Up @@ -15089,7 +15093,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,

/// Collect the guards that apply on entry to loop \p L by walking upwards
/// from the loop header and its predecessor.
ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
  LoopGuards Result(SE);
  // Fresh visited set for this top-level walk; the recursive helpers add to it.
  SmallPtrSet<const BasicBlock *, 8> Seen;
  // Depth is left at its default (0) for the top-level collection.
  collectFromBlock(SE, Result, L->getHeader(), L->getLoopPredecessor(), Seen);
  return Result;
}

/// Try to derive a single min/max guard for \p Phi by collecting the guards
/// that hold on each of its incoming edges and merging their constant bounds.
/// On success one entry mapping the SCEV of \p Phi to the merged min/max
/// expression is inserted into the rewrite map of \p Guards.
void ScalarEvolution::LoopGuards::collectFromPHI(
    ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
    const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
    SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
    unsigned Depth) {
  // PHIs whose type SCEV cannot represent are ignored.
  if (!SE.isSCEVable(Phi.getType()))
    return;

  // A constant bound together with the kind of min/max expression it was
  // taken from. A null constant means that no bound is known.
  using BoundAndKind = std::pair<const SCEVConstant *, SCEVTypes>;

  // Returns the constant bound recorded for the incoming value at index
  // \p Idx, collecting the guards for that incoming edge first if needed.
  auto BoundForIncoming = [&](unsigned Idx) -> BoundAndKind {
    const BoundAndKind Unknown{nullptr, scCouldNotCompute};

    const BasicBlock *From = Phi.getIncomingBlock(Idx);
    // An incoming block that was already visited yields no bound.
    if (!VisitedBlocks.insert(From).second)
      return Unknown;

    // Only collect guards for this edge if none are cached yet.
    auto Slot = IncomingGuards.try_emplace(From, LoopGuards(SE));
    if (Slot.second)
      collectFromBlock(SE, Slot.first->second, Phi.getParent(), From,
                       VisitedBlocks, Depth + 1);

    auto &EdgeRewrites = Slot.first->second.RewriteMap;
    if (EdgeRewrites.empty())
      return Unknown;

    auto Found = EdgeRewrites.find(SE.getSCEV(Phi.getIncomingValue(Idx)));
    if (Found == EdgeRewrites.end())
      return Unknown;

    // Only rewrites of the form min/max(Constant, ...) are usable.
    const auto *MinMax = dyn_cast_if_present<SCEVMinMaxExpr>(Found->second);
    if (!MinMax)
      return Unknown;
    const auto *Bound = dyn_cast<SCEVConstant>(MinMax->getOperand(0));
    if (!Bound)
      return Unknown;
    return {Bound, MinMax->getSCEVType()};
  };

  // Combines two bounds of the same kind: for max expressions the smaller
  // constant is kept, for min expressions the larger one.
  auto MergeBounds = [](BoundAndKind Lhs, BoundAndKind Rhs) -> BoundAndKind {
    const SCEVConstant *A = Lhs.first;
    const SCEVConstant *B = Rhs.first;
    const SCEVTypes Kind = Lhs.second;
    if (!A || !B || Kind != Rhs.second)
      return {nullptr, scCouldNotCompute};

    const APInt &AV = A->getAPInt();
    const APInt &BV = B->getAPInt();
    bool KeepFirst = false;
    switch (Kind) {
    case scUMaxExpr:
      KeepFirst = AV.ult(BV);
      break;
    case scSMaxExpr:
      KeepFirst = AV.slt(BV);
      break;
    case scUMinExpr:
      KeepFirst = AV.ugt(BV);
      break;
    case scSMinExpr:
      KeepFirst = AV.sgt(BV);
      break;
    default:
      llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
    }
    return {KeepFirst ? A : B, Kind};
  };

  // Fold the bounds of all incoming values, giving up as soon as one of them
  // is unknown or incompatible.
  BoundAndKind Combined = BoundForIncoming(0);
  for (unsigned Idx = 1; Idx < Phi.getNumIncomingValues() && Combined.first;
       ++Idx)
    Combined = MergeBounds(Combined, BoundForIncoming(Idx));

  if (!Combined.first)
    return;

  // Record Phi -> minmax(Bound, Phi) as the guard for the PHI itself.
  const SCEV *PhiExpr = SE.getSCEV(const_cast<PHINode *>(&Phi));
  SmallVector<const SCEV *, 2> Operands({Combined.first, PhiExpr});
  const SCEV *Guarded = SE.getMinMaxExpr(Combined.second, Operands);
  Guards.RewriteMap.insert({PhiExpr, Guarded});
}

void ScalarEvolution::LoopGuards::collectFromBlock(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const BasicBlock *Block, const BasicBlock *Pred,
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks, unsigned Depth) {
SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS,
Expand Down Expand Up @@ -15428,14 +15506,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
}
};

BasicBlock *Header = L->getHeader();
SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
// First, collect information from assumptions dominating the loop.
for (auto &AssumeVH : SE.AC.assumptions()) {
if (!AssumeVH)
continue;
auto *AssumeI = cast<CallInst>(AssumeVH);
if (!SE.DT.dominates(AssumeI, Header))
if (!SE.DT.dominates(AssumeI, Block))
continue;
Terms.emplace_back(AssumeI->getOperand(0), true);
}
Expand All @@ -15446,27 +15523,42 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
if (Guard->getFunction() == Header->getParent() &&
SE.DT.dominates(Guard, Header))
if (Guard->getFunction() == Block->getParent() &&
SE.DT.dominates(Guard, Block))
Terms.emplace_back(Guard->getArgOperand(0), true);

// Third, collect conditions from dominating branches. Starting at the loop
// predecessor, climb up the predecessor chain, as long as there are
// predecessors that can be found that have unique successors leading to the
// original header.
// TODO: share this logic with isLoopEntryGuardedByCond.
for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
L->getLoopPredecessor(), Header);
Pair.first;
std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
for (; Pair.first;
Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {

VisitedBlocks.insert(Pair.second);
const BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
continue;

Terms.emplace_back(LoopEntryPredicate->getCondition(),
LoopEntryPredicate->getSuccessor(0) == Pair.second);

// If we are recursively collecting guards stop after 2
// predecessors to limit compile-time impact for now.
if (Depth > 0 && Terms.size() == 2)
break;
}
// Finally, if we stopped climbing the predecessor chain because
// there wasn't a unique one to continue, try to collect conditions
// for PHINodes by recursively following all of their incoming
// blocks and try to merge the found conditions to build a new one
// for the Phi.
if (Pair.second->hasNPredecessorsOrMore(2) &&
Depth < MaxLoopGuardCollectionDepth) {
SmallDenseMap<const BasicBlock *, LoopGuards> IncomingGuards;
for (auto &Phi : Pair.second->phis())
collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth);
}

// Now apply the information from the collected conditions to
Expand Down Expand Up @@ -15523,7 +15615,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)});
}
}
return Guards;
}

const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,12 @@ class LoopVectorizationPlanner {
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B) const;

/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
/// The two-argument overload forwards to this one with the small constant
/// maximum trip count of the original loop.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
const unsigned MaxTripCount) const;

/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
bool isCandidateForEpilogueVectorization(const ElementCount VF) const;
Expand Down
49 changes: 36 additions & 13 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1554,7 +1554,10 @@ class LoopVectorizationCostModel {
/// Returns true if epilogue vectorization is considered profitable, and
/// false otherwise.
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
/// \p Multiplier is an additional scaling factor applied to VF before
/// comparing to EpilogueVectorizationMinVF.
bool isEpilogueVectorizationProfitable(const ElementCount VF,
const unsigned Multiplier) const;

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
Expand Down Expand Up @@ -4293,12 +4296,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
}

bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
const VectorizationFactor &A, const VectorizationFactor &B,
const unsigned MaxTripCount) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;

unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);

// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
Expand Down Expand Up @@ -4347,6 +4349,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
return CmpFn(RTCostA, RTCostB);
}

bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
const unsigned MaxTripCount =
PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
}

static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
OptimizationRemarkEmitter *ORE,
Loop *TheLoop) {
Expand Down Expand Up @@ -4626,7 +4635,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
const ElementCount VF) const {
const ElementCount VF, const unsigned Multiplier) const {
// FIXME: We need a much better cost-model to take different parameters such
// as register pressure, code size increase and cost of extra branches into
// account. For now we apply a very crude heuristic and only consider loops
Expand All @@ -4641,9 +4650,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;

unsigned Multiplier = 1;
if (VF.isScalable())
Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
Expand Down Expand Up @@ -4690,7 +4696,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
unsigned Multiplier = IC;
if (MainLoopVF.isScalable())
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);

if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
Expand All @@ -4709,16 +4719,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
for (auto &NextVF : ProfitableVFs) {
// Skip candidate VFs without a corresponding VPlan.
if (!hasPlanWithVF(NextVF.Width))
continue;

// Skip candidate VFs with widths >= the estimate runtime VF (scalable
// vectors) or the VF of the main loop (fixed vectors).
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
// vectors) or > the VF of the main loop (fixed vectors).
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
(NextVF.Width.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
continue;

// If NextVF is greater than the number of remaining iterations, the
Expand All @@ -4729,6 +4743,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
RemainingIterations = SE.getURemExpr(
TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
MaxTripCount =
SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
}
LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
<< MaxTripCount << "\n");
}
if (SE.isKnownPredicate(
CmpInst::ICMP_UGT,
Expand All @@ -4737,7 +4759,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
continue;
}

if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
if (Result.Width.isScalar() ||
isMoreProfitable(NextVF, Result, MaxTripCount))
Result = NextVF;
}

Expand Down
Loading