-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV] Vectorize Epilogues for loops with small VF but high IC #108190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
adac0aa
db8a41c
3c75a59
aeed89a
0978b38
b6f44e0
add1358
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1516,7 +1516,10 @@ class LoopVectorizationCostModel { | |
/// Returns true if epilogue vectorization is considered profitable, and | ||
/// false otherwise. | ||
/// \p VF is the vectorization factor chosen for the original loop. | ||
bool isEpilogueVectorizationProfitable(const ElementCount VF) const; | ||
/// \p Multiplier is an additional scaling factor applied to VF before | ||
/// comparing to EpilogueVectorizationMinVF. | ||
bool isEpilogueVectorizationProfitable(const ElementCount VF, | ||
juliannagele marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only in the non-SVE case, for SVE it's |
||
const unsigned Multiplier) const; | ||
|
||
/// Returns the execution time cost of an instruction for a given vector | ||
/// width. Vector width of one means scalar. | ||
|
@@ -4289,12 +4292,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { | |
} | ||
|
||
bool LoopVectorizationPlanner::isMoreProfitable( | ||
const VectorizationFactor &A, const VectorizationFactor &B) const { | ||
const VectorizationFactor &A, const VectorizationFactor &B, | ||
const unsigned MaxTripCount) const { | ||
InstructionCost CostA = A.Cost; | ||
InstructionCost CostB = B.Cost; | ||
|
||
unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); | ||
|
||
// Improve estimate for the vector width if it is scalable. | ||
unsigned EstimatedWidthA = A.Width.getKnownMinValue(); | ||
unsigned EstimatedWidthB = B.Width.getKnownMinValue(); | ||
|
@@ -4343,6 +4345,12 @@ bool LoopVectorizationPlanner::isMoreProfitable( | |
return CmpFn(RTCostA, RTCostB); | ||
} | ||
|
||
/// Convenience overload of isMoreProfitable that takes no explicit trip-count | ||
/// bound: it obtains the bound from PSE.getSmallConstantMaxTripCount() and | ||
/// forwards \p A, \p B and that value to the three-argument overload, | ||
/// returning its result unchanged. | ||
bool LoopVectorizationPlanner::isMoreProfitable( | ||
const VectorizationFactor &A, const VectorizationFactor &B) const { | ||
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); | ||
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); | ||
} | ||
|
||
void LoopVectorizationPlanner::emitInvalidCostRemarks( | ||
OptimizationRemarkEmitter *ORE) { | ||
using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; | ||
|
@@ -4661,7 +4669,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( | |
} | ||
|
||
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( | ||
const ElementCount VF) const { | ||
const ElementCount VF, const unsigned Multiplier) const { | ||
// FIXME: We need a much better cost-model to take different parameters such | ||
// as register pressure, code size increase and cost of extra branches into | ||
// account. For now we apply a very crude heuristic and only consider loops | ||
|
@@ -4676,9 +4684,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( | |
if (TTI.getMaxInterleaveFactor(VF) <= 1) | ||
return false; | ||
|
||
unsigned Multiplier = 1; | ||
if (VF.isScalable()) | ||
Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); | ||
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) | ||
return true; | ||
return false; | ||
|
@@ -4724,7 +4729,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
return Result; | ||
} | ||
|
||
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { | ||
unsigned Multiplier = IC; | ||
if (MainLoopVF.isScalable()) | ||
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this could be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Possibly, but I didn't really look at SVE and since the remaining part of this change depends on using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this change looks incorrect. Previously in
i.e. for fixed-width VFs
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @fhahn can you verify this as well? I think it should be one of the following:
or
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't say it is incorrect, it just keeps the original behavior for scalable vectors as I think @juliannagele doesn't have access to HW with scalable vectors, which would be needed to evaluate the impact of changing this for scalable vectors. At this point already picked the VF for the main loop, so the only change is that we consider epilogue vectorization for more cases with fixed vectors. To avoid regression with fixed vectors, this patch relies on code that checks the number of remaining iterations, which currently doesn't support scalable vectors (look for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can volunteer for that, but I am really keen that this lands first. :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, okay, didn't realise that, thanks! |
||
|
||
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) { | ||
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " | ||
"this loop\n"); | ||
return Result; | ||
|
@@ -4743,16 +4752,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
ScalarEvolution &SE = *PSE.getSE(); | ||
Type *TCType = Legal->getWidestInductionType(); | ||
const SCEV *RemainingIterations = nullptr; | ||
unsigned MaxTripCount = 0; | ||
for (auto &NextVF : ProfitableVFs) { | ||
// Skip candidate VFs without a corresponding VPlan. | ||
if (!hasPlanWithVF(NextVF.Width)) | ||
continue; | ||
|
||
// Skip candidate VFs with widths >= the estimate runtime VF (scalable | ||
// vectors) or the VF of the main loop (fixed vectors). | ||
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable | ||
// vectors) or > the VF of the main loop (fixed vectors). | ||
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && | ||
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || | ||
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) | ||
(NextVF.Width.isScalable() && | ||
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || | ||
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && | ||
ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) | ||
continue; | ||
|
||
// If NextVF is greater than the number of remaining iterations, the | ||
|
@@ -4766,6 +4779,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
"Trip count SCEV must be computable"); | ||
RemainingIterations = SE.getURemExpr( | ||
TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); | ||
MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; | ||
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, | ||
SE.getConstant(TCType, MaxTripCount))) { | ||
MaxTripCount = | ||
SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); | ||
} | ||
LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " | ||
<< MaxTripCount << "\n"); | ||
} | ||
if (SE.isKnownPredicate( | ||
CmpInst::ICMP_UGT, | ||
|
@@ -4774,7 +4795,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
continue; | ||
} | ||
|
||
if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) | ||
if (Result.Width.isScalar() || | ||
isMoreProfitable(NextVF, Result, MaxTripCount)) | ||
Result = NextVF; | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.