Skip to content

Commit 5c606ce

Browse files
author
Sjoerd Meijer
committed
[LV] Scalar Epilogue Lowering. NFC.
This refactors the boolean 'OptForSize' that was passed around in a lot of places. It controlled whether the tail loop (the scalar epilogue) is folded into the main loop, but code size may not be the only reason to do this. Thus, this is a first step towards generalising the concept of tail-loop folding: OptForSize has been replaced by an enum, ScalarEpilogueLowering, stored in the cost model as ScalarEpilogueStatus, that records how the epilogue should be lowered. This will be followed up by D65197, which picks up the predicate loop hint and performs the tail-loop folding. Differential Revision: https://reviews.llvm.org/D64916 llvm-svn: 366993
1 parent 85d2fe7 commit 5c606ce

File tree

2 files changed

+66
-57
lines changed

2 files changed

+66
-57
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,11 @@ class LoopVectorizationPlanner {
228228

229229
/// Plan how to best vectorize, return the best VF and its cost, or None if
230230
/// vectorization and interleaving should be avoided up front.
231-
Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF);
231+
Optional<VectorizationFactor> plan(unsigned UserVF);
232232

233233
/// Use the VPlan-native path to plan how to best vectorize, return the best
234234
/// VF and its cost.
235-
VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);
235+
VectorizationFactor planInVPlanNativePath(unsigned UserVF);
236236

237237
/// Finalize the best decision and dispose of all other VPlans.
238238
void setBestPlan(unsigned VF, unsigned UF);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 64 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
836836

837837
namespace llvm {
838838

839+
// Loop vectorization cost-model hints describing how the scalar epilogue
840+
// loop should be lowered.
841+
enum ScalarEpilogueLowering {
842+
CM_ScalarEpilogueAllowed,
843+
CM_ScalarEpilogueNotAllowedOptSize,
844+
CM_ScalarEpilogueNotAllowedLowTripLoop
845+
};
846+
839847
/// LoopVectorizationCostModel - estimates the expected speedups due to
840848
/// vectorization.
841849
/// In many cases vectorization is not profitable. This can happen because of
@@ -845,20 +853,22 @@ namespace llvm {
845853
/// different operations.
846854
class LoopVectorizationCostModel {
847855
public:
848-
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
856+
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
857+
PredicatedScalarEvolution &PSE,
849858
LoopInfo *LI, LoopVectorizationLegality *Legal,
850859
const TargetTransformInfo &TTI,
851860
const TargetLibraryInfo *TLI, DemandedBits *DB,
852861
AssumptionCache *AC,
853862
OptimizationRemarkEmitter *ORE, const Function *F,
854863
const LoopVectorizeHints *Hints,
855864
InterleavedAccessInfo &IAI)
856-
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
857-
AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
865+
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
866+
LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
867+
TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
858868

859869
/// \return An upper bound for the vectorization factor, or None if
860870
/// vectorization and interleaving should be avoided up front.
861-
Optional<unsigned> computeMaxVF(bool OptForSize);
871+
Optional<unsigned> computeMaxVF();
862872

863873
/// \return The most profitable vectorization factor and the cost of that VF.
864874
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -881,8 +891,7 @@ class LoopVectorizationCostModel {
881891
/// If interleave count has been specified by metadata it will be returned.
882892
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
883893
/// are the selected vectorization factor and the cost of the selected VF.
884-
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
885-
unsigned LoopCost);
894+
unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
886895

887896
/// Memory access instruction may be vectorized in more than one way.
888897
/// Form of instruction after vectorization depends on cost.
@@ -1157,11 +1166,14 @@ class LoopVectorizationCostModel {
11571166
/// to handle accesses with gaps, and there is nothing preventing us from
11581167
/// creating a scalar epilogue.
11591168
bool requiresScalarEpilogue() const {
1160-
return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1169+
return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
11611170
}
11621171

1163-
/// Returns true if a scalar epilogue is not allowed due to optsize.
1164-
bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1172+
/// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
1173+
/// due to optsize, a low trip count, or a loop hint annotation.
1174+
bool isScalarEpilogueAllowed() const {
1175+
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1176+
}
11651177

11661178
/// Returns true if all loop blocks should be masked to fold tail loop.
11671179
bool foldTailByMasking() const { return FoldTailByMasking; }
@@ -1187,7 +1199,7 @@ class LoopVectorizationCostModel {
11871199

11881200
/// \return An upper bound for the vectorization factor, larger than zero.
11891201
/// One is returned if vectorization should best be avoided due to cost.
1190-
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1202+
unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
11911203

11921204
/// The vectorization cost is a combination of the cost itself and a boolean
11931205
/// indicating whether any of the contributing operations will actually
@@ -1270,13 +1282,13 @@ class LoopVectorizationCostModel {
12701282
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
12711283

12721284
/// Records whether it is allowed to have the original scalar loop execute at
1273-
/// least once. This may be needed as a fallback loop in case runtime
1285+
/// least once. This may be needed as a fallback loop in case runtime
12741286
/// aliasing/dependence checks fail, or to handle the tail/remainder
12751287
/// iterations when the trip count is unknown or doesn't divide by the VF,
12761288
/// or as a peel-loop to handle gaps in interleave-groups.
12771289
/// Under optsize and when the trip count is very small we don't allow any
12781290
/// iterations to execute in the scalar loop.
1279-
bool IsScalarEpilogueAllowed = true;
1291+
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
12801292

12811293
/// All blocks of loop are to be masked to fold tail of scalar iterations.
12821294
bool FoldTailByMasking = false;
@@ -4452,10 +4464,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
44524464
// Check if masking is required.
44534465
// A Group may need masking for one of two reasons: it resides in a block that
44544466
// needs predication, or it was decided to use masking to deal with gaps.
4455-
bool PredicatedAccessRequiresMasking =
4467+
bool PredicatedAccessRequiresMasking =
44564468
Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4457-
bool AccessWithGapsRequiresMasking =
4458-
Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4469+
bool AccessWithGapsRequiresMasking =
4470+
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
44594471
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
44604472
return true;
44614473

@@ -4675,7 +4687,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
46754687
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
46764688
}
46774689

4678-
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4690+
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
46794691
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
46804692
// TODO: It may be useful to do since it's still likely to be dynamically
46814693
// uniform if the target can skip.
@@ -4690,8 +4702,11 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
46904702
}
46914703

46924704
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4693-
if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694-
return computeFeasibleMaxVF(OptForSize, TC);
4705+
if (isScalarEpilogueAllowed())
4706+
return computeFeasibleMaxVF(TC);
4707+
4708+
LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" <<
4709+
"LV: Performing code size checks.\n");
46954710

46964711
if (Legal->getRuntimePointerChecking()->Need) {
46974712
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
@@ -4740,15 +4755,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
47404755
// Record that scalar epilogue is not allowed.
47414756
LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
47424757

4743-
IsScalarEpilogueAllowed = !OptForSize;
4744-
47454758
// We don't create an epilogue when optimizing for size.
47464759
// Invalidate interleave groups that require an epilogue if we can't mask
47474760
// the interleave-group.
4748-
if (!useMaskedInterleavedAccesses(TTI))
4761+
if (!useMaskedInterleavedAccesses(TTI))
47494762
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
47504763

4751-
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4764+
unsigned MaxVF = computeFeasibleMaxVF(TC);
47524765

47534766
if (TC > 0 && TC % MaxVF == 0) {
47544767
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
@@ -4779,8 +4792,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
47794792
}
47804793

47814794
unsigned
4782-
LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4783-
unsigned ConstTripCount) {
4795+
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
47844796
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
47854797
unsigned SmallestType, WidestType;
47864798
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -4818,8 +4830,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
48184830
}
48194831

48204832
unsigned MaxVF = MaxVectorSize;
4821-
if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4822-
(MaximizeBandwidth && !OptForSize)) {
4833+
if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4834+
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
48234835
// Collect all viable vectorization factors larger than the default MaxVF
48244836
// (i.e. MaxVectorSize).
48254837
SmallVector<unsigned, 8> VFs;
@@ -4958,8 +4970,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
49584970
return {MinWidth, MaxWidth};
49594971
}
49604972

4961-
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4962-
unsigned VF,
4973+
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
49634974
unsigned LoopCost) {
49644975
// -- The interleave heuristics --
49654976
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4975,8 +4986,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
49754986
// 3. We don't interleave if we think that we will spill registers to memory
49764987
// due to the increased register pressure.
49774988

4978-
// When we optimize for size, we don't interleave.
4979-
if (OptForSize)
4989+
if (!isScalarEpilogueAllowed())
49804990
return 1;
49814991

49824992
// We used the distance for the interleave count.
@@ -5626,8 +5636,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
56265636
}
56275637

56285638
// Calculate the cost of the whole interleaved group.
5629-
bool UseMaskForGaps =
5630-
Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5639+
bool UseMaskForGaps =
5640+
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
56315641
unsigned Cost = TTI.getInterleavedMemoryOpCost(
56325642
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
56335643
Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -6167,8 +6177,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
61676177
}
61686178

61696179
VectorizationFactor
6170-
LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6171-
unsigned UserVF) {
6180+
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
61726181
unsigned VF = UserVF;
61736182
// Outer loop handling: They may require CFG and instruction level
61746183
// transformations before even evaluating whether vectorization is profitable.
@@ -6207,10 +6216,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
62076216
return VectorizationFactor::Disabled();
62086217
}
62096218

6210-
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
6211-
unsigned UserVF) {
6219+
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
62126220
assert(OrigLoop->empty() && "Inner loop expected.");
6213-
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
6221+
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
62146222
if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
62156223
return None;
62166224

@@ -7213,8 +7221,15 @@ static bool processLoopInVPlanNativePath(
72137221
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
72147222
Function *F = L->getHeader()->getParent();
72157223
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7216-
LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7217-
&Hints, IAI);
7224+
7225+
ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7226+
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7227+
(F->hasOptSize() ||
7228+
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7229+
SEL = CM_ScalarEpilogueNotAllowedOptSize;
7230+
7231+
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI,
7232+
DB, AC, ORE, F, &Hints, IAI);
72187233
// Use the planner for outer loop vectorization.
72197234
// TODO: CM is not used at this point inside the planner. Turn CM into an
72207235
// optional argument if we don't need it in the future.
@@ -7223,15 +7238,8 @@ static bool processLoopInVPlanNativePath(
72237238
// Get user vectorization factor.
72247239
const unsigned UserVF = Hints.getWidth();
72257240

7226-
// Check the function attributes and profiles to find out if this function
7227-
// should be optimized for size.
7228-
bool OptForSize =
7229-
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7230-
(F->hasOptSize() ||
7231-
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7232-
72337241
// Plan how to best vectorize, return the best VF and its cost.
7234-
const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7242+
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
72357243

72367244
// If we are stress testing VPlan builds, do not attempt to generate vector
72377245
// code. Masked vector code generation support will follow soon.
@@ -7310,10 +7318,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
73107318

73117319
// Check the function attributes and profiles to find out if this function
73127320
// should be optimized for size.
7313-
bool OptForSize =
7314-
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7321+
ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7322+
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
73157323
(F->hasOptSize() ||
7316-
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
7324+
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7325+
SEL = CM_ScalarEpilogueNotAllowedOptSize;
73177326

73187327
// Entrance to the VPlan-native vectorization path. Outer loops are processed
73197328
// here. They may require CFG and instruction level transformations before
@@ -7365,7 +7374,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
73657374
// Loops with a very small trip count are considered for vectorization
73667375
// under OptForSize, thereby making sure the cost of their loop body is
73677376
// dominant, free of runtime guards and scalar iteration overheads.
7368-
OptForSize = true;
7377+
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
73697378
}
73707379
}
73717380

@@ -7411,8 +7420,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
74117420
}
74127421

74137422
// Use the cost model.
7414-
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7415-
&Hints, IAI);
7423+
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI,
7424+
DB, AC, ORE, F, &Hints, IAI);
74167425
CM.collectValuesToIgnore();
74177426

74187427
// Use the planner for vectorization.
@@ -7422,7 +7431,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
74227431
unsigned UserVF = Hints.getWidth();
74237432

74247433
// Plan how to best vectorize, return the best VF and its cost.
7425-
Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
7434+
Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
74267435

74277436
VectorizationFactor VF = VectorizationFactor::Disabled();
74287437
unsigned IC = 1;
@@ -7431,7 +7440,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
74317440
if (MaybeVF) {
74327441
VF = *MaybeVF;
74337442
// Select the interleave count.
7434-
IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
7443+
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
74357444
}
74367445

74377446
// Identify the diagnostic messages that should be produced.

0 commit comments

Comments
 (0)