[LV]Set tailfolding styles before computing feasible max VF. #91403
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
Changes: Trying to set tail folding styles before actually doing the analysis for the max vector factor.
Patch is 31.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/91403.diff
3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3be0102bea3e3..d27391142b5f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1554,6 +1554,12 @@ class LoopVectorizationCostModel {
}
}
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1642,6 +1648,14 @@ class LoopVectorizationCostModel {
ElementCount MaxSafeVF,
bool FoldTailByMasking);
+ /// True if scalable vectorization is supported and enabled.
+ std::optional<bool> IsScalableVectorizationAllowed;
+
+ /// Checks if the scalable vectorization is supported and enabled. The result
+ /// is stored in \p IsScalableVectorizationAllowed and used later, if
+ /// requested.
+ bool isScalableVectorizationAllowed();
+
/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
@@ -4079,9 +4093,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
// needs predication, or it was decided to use masking to deal with gaps
// (either a gap at the end of a load-access that may result in a speculative
// load, or any gaps in a store-access).
- bool PredicatedAccessRequiresMasking =
- blockNeedsPredicationForAnyReason(I->getParent()) &&
- Legal->isMaskRequired(I);
+ bool PredicatedAccessRequiresMasking = isPredicatedInst(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
!isScalarEpilogueAllowed();
@@ -4397,15 +4409,17 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
+ if (IsScalableVectorizationAllowed)
+ return *IsScalableVectorizationAllowed;
+ IsScalableVectorizationAllowed = false;
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
- return ElementCount::getScalable(0);
+ return false;
if (Hints->isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
@@ -4425,7 +4439,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
"Scalable vectorization not supported for the reduction "
"operations found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
// Disable scalable vectorization if the loop contains any instructions
@@ -4437,9 +4451,20 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
reportVectorizationInfo("Scalable vectorization is not supported "
"for all element types found in this loop.",
"ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
+ return false;
}
+ IsScalableVectorizationAllowed = true;
+ return true;
+}
+
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!isScalableVectorizationAllowed())
+ return ElementCount::getScalable(0);
+
+ auto MaxScalableVF = ElementCount::getScalable(
+ std::numeric_limits<ElementCount::ScalarTy>::max());
if (Legal->isSafeForAnyVectorWidth())
return MaxScalableVF;
@@ -4642,6 +4667,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(isScalableVectorizationAllowed(), UserIC);
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
@@ -4673,15 +4703,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -6096,7 +6122,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I)) {
+ if (isPredicatedInst(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
} else {
@@ -6150,7 +6176,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ I->getOpcode(), VectorTy, Ptr, isPredicatedInst(I), Alignment,
TargetTransformInfo::TCK_RecipThroughput, I);
}
@@ -6180,7 +6206,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
+ AS, CostKind, isPredicatedInst(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
@@ -6675,7 +6701,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
+ bool MaskRequired = isPredicatedInst(CI);
for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
@@ -7072,8 +7098,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI::CastContextHint::Interleave;
case LoopVectorizationCostModel::CM_Scalarize:
case LoopVectorizationCostModel::CM_Widen:
- return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
- : TTI::CastContextHint::Normal;
+ return isPredicatedInst(I) ? TTI::CastContextHint::Masked
+ : TTI::CastContextHint::Normal;
case LoopVectorizationCostModel::CM_Widen_Reverse:
return TTI::CastContextHint::Reversed;
case LoopVectorizationCostModel::CM_Unknown:
@@ -8121,7 +8147,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
return nullptr;
VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(I))
+ if (CM.isPredicatedInst(I))
Mask = getBlockInMask(I->getParent());
// Determine if the pointer operand of the access is either consecutive or
@@ -8329,7 +8355,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// vector variant at this VF requires a mask, so we synthesize an
// all-true mask.
VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(CI))
+ if (CM.isPredicatedInst(CI))
Mask = getBlockInMask(CI->getParent());
else
Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 0b495bc680f0c..404c48facbefe 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -13,18 +13,18 @@
define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-LABEL: @interleave(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
-; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
-; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
@@ -36,8 +36,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i64 0
; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
@@ -46,12 +46,12 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
-; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
-; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]]
; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
@@ -64,18 +64,18 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP17]]
+; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 0
; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
-; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP35]]
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -84,10 +84,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP39]], [[TMP38]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -98,35 +98,49 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
;
; NO-VP-LABEL: @interleave(
; NO-VP-NEXT: entry:
-; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NO-VP: vector.ph:
-; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
-; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
-; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
-; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
-; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
-; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
-; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
+; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
+; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<...
[truncated]
Ping!
Is this just the refactoring to move the point where tail-folding styles are computed? I might have missed this, but can the test changes be avoided?
I hoped to make it a refactoring, but unfortunately it is not. It looks like there is a bug in the current implementation of the loop vectorizer. When calling Legal->prepareToFoldTailByMasking(), it fills LoopVectorizationLegality::MaskedOp, which is later used in Legal->isMaskRequired() to check whether an instruction requires masking. But if, after Legal->prepareToFoldTailByMasking(), the compiler decides to turn off tail folding, it does not clear LoopVectorizationLegality::MaskedOp and still uses this data. This leads to incorrect cost modeling. Instead of Legal->isMaskRequired(), the pass should use LoopVectorizationCostModel::isPredicatedInst(), which checks that tail folding is enabled before checking Legal->isMaskRequired(). But that causes the changes in the tests, which pass the Legal->prepareToFoldTailByMasking() check but later disable tail folding for some reason (say, via an option).
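To make that interaction concrete, here is a small self-contained sketch of the pitfall; the classes below are toy stand-ins for illustration only, not LLVM's actual LoopVectorizationLegality / LoopVectorizationCostModel:

```cpp
// Toy reproduction of the stale-state pitfall described above (stand-in
// classes, not LLVM's real ones).
#include <cassert>
#include <set>

struct Instr {};

struct LegalityToy {
  std::set<const Instr *> MaskedOp; // filled while preparing to fold the tail

  void prepareToFoldTailByMasking(const Instr &I) { MaskedOp.insert(&I); }

  // Keeps answering "yes" even if tail folding is abandoned later, because
  // MaskedOp is never cleared.
  bool isMaskRequired(const Instr &I) const { return MaskedOp.count(&I) != 0; }
};

struct CostModelSketch {
  LegalityToy &Legal;
  bool FoldTail; // the folding decision; may be reverted after the fact

  // Guarded query, loosely in the spirit of
  // LoopVectorizationCostModel::isPredicatedInst(): only report a mask
  // requirement while tail folding is still planned.
  bool isPredicatedInst(const Instr &I) const {
    return FoldTail && Legal.isMaskRequired(I);
  }
};

int main() {
  LegalityToy Legal;
  Instr Load;
  Legal.prepareToFoldTailByMasking(Load);

  CostModelSketch CM{Legal, /*FoldTail=*/true};
  assert(CM.isPredicatedInst(Load));   // masked while the tail is folded

  CM.FoldTail = false;                 // the cost model backs out of folding
  assert(Legal.isMaskRequired(Load));  // stale answer: mask still "required"
  assert(!CM.isPredicatedInst(Load));  // guarded query gives the right answer
  return 0;
}
```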
Ping!
Ping!
Do you think splitting up checking if tail-folding is possible from collecting masked ops would help simplify this change? There's #77612 which I could update if it would help.
Yes, I think so.
Hmm, I just rebased, but there's a single test failure I need to check out.
Ping!
Ping!
Ping!
@@ -5997,7 +6023,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
          "Stride should be 1 or -1 for consecutive memory access");
   const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
-  if (Legal->isMaskRequired(I)) {
+  if (isPredicatedInst(I)) {
Are those changes still needed for the latest version?
I think we'd better use this function; it has some extra checks for predicated instructions (like dropping some uniform instructions).
Possible, but this could be done separately (+ test showing an improvement) if it is not needed to change where the styles are set?
Created using spr 1.3.5
Thanks for the update. Would be good to update the title/description (remove reference to isPredicatedInst, reword the first sentence as it isn't trying to set it earlier but doing so)
Created using spr 1.3.5
Done
LGTM, thanks! Also added @ayalz for visibility; would be good to wait till Monday before landing in case he has additional suggestions.
This patch seems to involve two parts, which would best be committed separately, unless there's a reason not to?
- introducing isScalableVectorizationAllowed() to refactor getMaxLegalScalableVF(), which spends most of its time checking if isScalableVectorizationAllowed() and only then computes and returns the desired VF. This seems to be consistent with having getMaxVF() also take care of isThereAnyAllowedVF(), sigh.
- setting the tail folding style before being sure there is a tail to fold, and unsetting it later when sure there isn't. This part lacks an explanation of why this is needed, in a comment and in a test, in addition to the commit message.
The first part should be a simple standalone NFC. Regarding the second part, note VPlan's roadmap direction to first strip-mine the loop considering its tail folded, and later consider whether to unfold the tail - potentially leading to epilogue vectorization, etc.
// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
This comment would best continue to appear below, before the if (foldTailByMasking()) part which deals with actually folding the tail, rather than here, which tries to avoid tail folding if the precise trip count is known - to be a multiple of any VF we choose, possibly times UserIC? (i.e., not necessarily a power of 2)
Some other comment is needed here to explain why the tail folding style is being set here (before being sure there is a tail, possibly to be reset below when we're sure there isn't), before calling computeFeasibleMaxVF(MaxTC, UserVF, /* FoldTail */ true), rather than below, before the first time it is checked explicitly via foldTailByMasking().
Perhaps that last boolean parameter of computeFeasibleMaxVF() is insufficient/redundant?
@@ -1540,6 +1540,14 @@ class LoopVectorizationCostModel {
     }
   }

+  /// Disables previously chosen tail folding policy, sets it to None. Expects
+  /// that the tail policy was selected.
Independent of this patch: perhaps this should set a new policy/style - one which explicitly states there is NoTail, i.e. no tail to fold nor to unfold - rather than "disabling" and having None mean both an unfolded tail and no tail.
Maybe. This needs to be investigated separately.
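Purely as an illustration of that suggestion (the enum and the NoTail value below are hypothetical, not part of this patch or of the existing TailFoldingStyle):

```cpp
// Hypothetical sketch only: separating "there is no tail" from "there is a
// tail but we leave it unfolded", instead of overloading None with both
// meanings. None and DataWithEVL correspond to styles mentioned in the diff;
// NoTail is the reviewer's proposed addition and does not exist today.
enum class TailFoldingStyleSketch {
  None,        // a tail exists but is not folded (scalar remainder loop)
  NoTail,      // trip count is a known multiple of VF * UF: nothing to fold
  DataWithEVL, // fold the tail using an explicit vector length
};
```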
/// Checks if the scalable vectorization is supported and enabled. The result
/// is stored in \p IsScalableVectorizationAllowed and used later, if
/// requested.
Suggested change:
-/// Checks if the scalable vectorization is supported and enabled. The result
-/// is stored in \p IsScalableVectorizationAllowed and used later, if
-/// requested.
+/// Checks if scalable vectorization is supported and enabled. Caches the result to avoid repeated debug dumps for repeated queries.
    return ElementCount::getScalable(0);

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());
Independent of this patch - this potentially overrides MaxSafeElements - worth some assert?
The report below refers to getScalable(MaxSafeElements / *MaxVScale) returning false, rather than getMaxVScale() returning false?
- Do not know how to add the assertion here without looking into internals of the LoopAccessAnalysis. Legal->isSafeForAnyVectorWidth() checks that MaxSafeVectorWidthInBits == UINT_MAX, and MaxSafeElements = bit_floor(MaxSafeVectorWidthInBits / WidestType).
- Not only. If MaxSafeElements < MaxVScale too.
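As a concrete, purely illustrative instance of the relation quoted above (the numbers here are assumptions, not taken from any particular loop):

```cpp
// Illustrative only: how MaxSafeElements relates to the quantities discussed
// above. Assume a widest element type of 32 bits and a maximum safe dependence
// width of 256 bits (both values are made up for this example).
// Requires C++20 for std::bit_floor.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t MaxSafeVectorWidthInBits = 256; // assumed result of dependence analysis
  uint64_t WidestTypeBits = 32;            // assumed widest scalar type in the loop
  uint64_t MaxSafeElements =
      std::bit_floor(MaxSafeVectorWidthInBits / WidestTypeBits);
  assert(MaxSafeElements == 8);
  // A scalable VF of "vscale x 4" stays safe only while vscale <= 2 here,
  // which is why clamping against MaxSafeElements (or MaxVScale) matters.
  return 0;
}
```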
/// Checks if the scalable vectorization is supported and enabled. The result
/// is stored in \p IsScalableVectorizationAllowed and used later, if
/// requested.
bool isScalableVectorizationAllowed();
Not const because of debug dumps?
No, it may change the value of IsScalableVectorizationAllowed, if it is not set yet.
The second part is described here: "This change is required for supporting safe max dist for predicated vectorization (DataWithEVL tail folding mode)."
…xLegalScalableVF().
Adds isScalableVectorizationAllowed() and the corresponding data member to query if the scalable vectorization is supported rather than performing the analysis each time the scalable vector factor is requested.
Part of #91403
Reviewers: ayalz, fhahn
Reviewed By: fhahn, ayalz
Pull Request: #98916
…xLegalScalableVF().
Summary: Adds isScalableVectorizationAllowed() and the corresponding data member to query if the scalable vectorization is supported rather than performing the analysis each time the scalable vector factor is requested.
Part of #91403
Differential Revision: https://phabricator.intern.facebook.com/D60251745
Set tail folding styles before actually doing the analysis for the max vector factor. This change is required for supporting a safe max dist for predicated vectorization (DataWithEVL tail folding mode). DataWithEVL tail-folded loops still use scalable vectorization with the special check for max safe distance, which allows supporting non-power-of-2 distances.
This change required introducing a disableTailFolding() function to disable tail folding, if previously set, when it is known that the trip count modulo VF is zero.
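For readers skimming the diff above, here is a condensed, self-contained toy model of the reordered flow. Every name below is a simplified stand-in, and only the ordering of the steps (choose styles, compute the feasible max VF, then disable folding when no tail remains) reflects the actual change:

```cpp
// Self-contained toy model of the reordered computeMaxVF() flow after this
// patch. All types and helpers here are simplified stand-ins, not LLVM's
// actual classes; only the ordering of the steps mirrors the diff above.
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>

enum class Style { None, DataWithEVL };

struct CostModelToy {
  std::optional<std::pair<Style, Style>> ChosenTailFoldingStyle;

  // Step 1 helper: pick the styles before the max-VF analysis.
  void setTailFoldingStyles(bool ScalableAllowed, unsigned /*UserIC*/) {
    ChosenTailFoldingStyle = std::make_pair(
        ScalableAllowed ? Style::DataWithEVL : Style::None, Style::None);
  }
  // Mirrors the new disableTailFolding() helper: reset a previously chosen style.
  void disableTailFolding() {
    ChosenTailFoldingStyle = std::make_pair(Style::None, Style::None);
  }
  bool foldTailByMasking() const {
    return ChosenTailFoldingStyle &&
           ChosenTailFoldingStyle->first != Style::None;
  }
  // Stand-in for computeFeasibleMaxVF(MaxTC, UserVF, /*FoldTail=*/true).
  unsigned computeFeasibleMaxVF() const { return 8; }

  unsigned computeMaxVF(uint64_t TC, unsigned UserIC) {
    // 1. Choose tail-folding styles *before* computing the feasible max VF,
    //    so the style (e.g. DataWithEVL) can influence that analysis.
    setTailFoldingStyles(/*ScalableAllowed=*/true, UserIC);
    // 2. Compute the feasible maximum VF with tail folding assumed.
    unsigned MaxVF = computeFeasibleMaxVF();
    // 3. If the trip count is a known multiple of VF * UserIC, there is no
    //    tail after all: undo the earlier choice.
    if (TC != 0 && TC % (uint64_t(MaxVF) * UserIC) == 0)
      disableTailFolding();
    return MaxVF;
  }
};

int main() {
  CostModelToy CM;
  unsigned VF = CM.computeMaxVF(/*TC=*/64, /*UserIC=*/1);
  std::cout << "TC=64: VF=" << VF << " foldTail=" << CM.foldTailByMasking()
            << "\n"; // trip count divides VF -> folding disabled
  VF = CM.computeMaxVF(/*TC=*/70, /*UserIC=*/1);
  std::cout << "TC=70: VF=" << VF << " foldTail=" << CM.foldTailByMasking()
            << "\n"; // remainder left -> tail folded by masking/EVL
  return 0;
}
```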