Skip to content

Commit 6efcff1

Browse files
committed
[LV][AArch64] Prefer Fixed over Scalable if cost-model is equal (Neoverse V2)
For the Neoverse V2, we would like to prefer fixed width over scalable vectorisation if the cost-model assigns an equal cost for certain loops. This improves 7 kernels from TSVC-2 by about 2x, and does not affect SPEC2017 INT and FP. This also adds a new TTI hook that can steer the loop vectoriser towards preferring fixed width vectorisation, which can be set per CPU. For now, this is only enabled for the Neoverse V2. This tends to benefit small kernels, like the ones in TSVC, for a number of reasons: processing the predicates does not come entirely for free, NEON tends to generate slightly less code, which can have a big impact on these small kernels, and then there are second-order effects in that SVE codegen is slightly less optimal in some areas. This codegen strategy of generating more NEON is in line with GCC's codegen strategy, which is actually even more aggressive in generating NEON when no predication is required. We could be smarter and more aggressive too about generating more NEON (and improve performance), but this seems to be a good and straightforward first step.
1 parent 35ddc17 commit 6efcff1

File tree

9 files changed

+274
-5
lines changed

9 files changed

+274
-5
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,6 +1674,11 @@ class TargetTransformInfo {
16741674
false; ///< If op is an fp min/max, whether NaNs may be present.
16751675
};
16761676

1677+
/// \returns True if the target prefers fixed width vectorization if the
1678+
/// loop vectorizer's cost-model assigns an equal cost to the fixed and
1679+
/// scalable version of the vectorized loop.
1680+
bool preferFixedOverScalableIfEqualCost() const;
1681+
16771682
/// \returns True if the target prefers reductions in loop.
16781683
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
16791684
ReductionFlags Flags) const;
@@ -2143,6 +2148,7 @@ class TargetTransformInfo::Concept {
21432148
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
21442149
unsigned ChainSizeInBytes,
21452150
VectorType *VecTy) const = 0;
2151+
virtual bool preferFixedOverScalableIfEqualCost() const = 0;
21462152
virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
21472153
ReductionFlags) const = 0;
21482154
virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -2873,6 +2879,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
28732879
VectorType *VecTy) const override {
28742880
return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
28752881
}
2882+
bool preferFixedOverScalableIfEqualCost() const override {
2883+
return Impl.preferFixedOverScalableIfEqualCost();
2884+
}
28762885
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
28772886
ReductionFlags Flags) const override {
28782887
return Impl.preferInLoopReduction(Opcode, Ty, Flags);

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,8 @@ class TargetTransformInfoImplBase {
913913
return VF;
914914
}
915915

916+
bool preferFixedOverScalableIfEqualCost() const { return false; }
917+
916918
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
917919
TTI::ReductionFlags Flags) const {
918920
return false;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1282,6 +1282,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
12821282
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
12831283
}
12841284

1285+
bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
1286+
return TTIImpl->preferFixedOverScalableIfEqualCost();
1287+
}
1288+
12851289
bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
12861290
ReductionFlags Flags) const {
12871291
return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,10 @@ def FeatureExperimentalZeroingPseudos
244244
def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl",
245245
"UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">;
246246

247+
def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-equal-cost",
248+
"UseFixedOverScalableIfEqualCost", "true",
249+
"Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">;
250+
247251
def FeatureBF16 : Extension<"bf16", "BF16",
248252
"Enable BFloat16 Extension (FEAT_BF16)", [],
249253
"FEAT_BF16", "+bf16", 280>;

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
489489
FeatureALULSLFast,
490490
FeaturePostRAScheduler,
491491
FeatureEnableSelectOptimize,
492+
FeatureUseFixedOverScalableIfEqualCost,
492493
FeaturePredictableSelectIsExpensive]>;
493494

494495
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,19 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
371371
return TailFoldingStyle::DataWithoutLaneMask;
372372
}
373373

374+
bool preferFixedOverScalableIfEqualCost() const {
375+
// TODO: Ideally we only check getVScaleForTuning() == 1, but we do
376+
// also check if the CPU has the useFixed feature enabled, which was
377+
// introduced to reduce the impact of this for other targets.
378+
//
379+
// With the getVScaleForTuning() == 1 check, we ask if we're tuning based
380+
// on the assumption the SVE registers are no bigger than the NEON ones.
381+
// If this is the case, and the loop vectorisation cost-model is a tie,
382+
// we prefer NEON as there should be no advantage of using SVE.
383+
return ST->useFixedOverScalableIfEqualCost() &&
384+
ST->getVScaleForTuning() == 1;
385+
}
386+
374387
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
375388

376389
bool supportsScalableVectors() const { return ST->hasSVE(); }

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,10 +447,13 @@ class LoopVectorizationPlanner {
447447
VectorizationFactor
448448
selectVectorizationFactor(const ElementCountSet &CandidateVFs);
449449

450+
bool preferFixedOverScalableIfEqualCost(const Loop *L, ElementCount VF,
451+
unsigned IC) const;
452+
450453
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
451454
/// that of B.
452455
bool isMoreProfitable(const VectorizationFactor &A,
453-
const VectorizationFactor &B) const;
456+
const VectorizationFactor &B, unsigned IC = 0) const;
454457

455458
/// Determines if we have the infrastructure to vectorize the loop and its
456459
/// epilogue, assuming the main loop is vectorized by \p VF.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4760,8 +4760,42 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
47604760
return TTI.getVScaleForTuning();
47614761
}
47624762

4763-
bool LoopVectorizationPlanner::isMoreProfitable(
4764-
const VectorizationFactor &A, const VectorizationFactor &B) const {
4763+
bool LoopVectorizationPlanner::preferFixedOverScalableIfEqualCost(
4764+
const Loop *L, ElementCount VF, unsigned IC) const {
4765+
// Check if the Subtarget has the feature enabled that it might prefer fixed
4766+
// over scalable vectorisation.
4767+
if (!TTI.preferFixedOverScalableIfEqualCost())
4768+
return false;
4769+
4770+
// With an interleaving count of 1, we don't expect the potential use of
4771+
// LDP/STP, which are instructions that SVE lacks, to make a difference for
4772+
// fixed width vectorisation.
4773+
if (IC == 1)
4774+
return false;
4775+
4776+
for (BasicBlock *BB : L->blocks()) {
4777+
for (Instruction &I : *BB) {
4778+
if (!(isa<LoadInst>(I) || isa<StoreInst>(I)))
4779+
continue;
4780+
4781+
// TODO: This could be more sophisticated, but the initial idea here is
4782+
// that if the cost-model is a tie, and gathers/scatters or predication
4783+
// is required, then SVE is probably more efficient so favour SVE in
4784+
// these cases.
4785+
auto Decision = CM.getWideningDecision(&I, VF);
4786+
if (Decision == LoopVectorizationCostModel::CM_GatherScatter)
4787+
return false;
4788+
else if (Decision == LoopVectorizationCostModel::CM_Widen)
4789+
return !Legal->isMaskRequired(&I);
4790+
}
4791+
}
4792+
4793+
return false;
4794+
}
4795+
4796+
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4797+
const VectorizationFactor &B,
4798+
unsigned IC) const {
47654799
InstructionCost CostA = A.Cost;
47664800
InstructionCost CostB = B.Cost;
47674801

@@ -4780,7 +4814,10 @@ bool LoopVectorizationPlanner::isMoreProfitable(
47804814
// Assume vscale may be larger than 1 (or the value being tuned for),
47814815
// so that scalable vectorization is slightly favorable over fixed-width
47824816
// vectorization.
4783-
bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
4817+
bool PreferScalable = false;
4818+
if (!preferFixedOverScalableIfEqualCost(OrigLoop, A.Width, IC))
4819+
PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
4820+
47844821
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
47854822
const InstructionCost &RHS) {
47864823
return PreferScalable ? LHS <= RHS : LHS < RHS;
@@ -5100,7 +5137,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
51005137
continue;
51015138
}
51025139

5103-
if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5140+
if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result, IC))
51045141
Result = NextVF;
51055142
}
51065143

0 commit comments

Comments
 (0)