-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV]Initial support for safe distance in predicated DataWithEVL vectorization mode. #102897
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV]Initial support for safe distance in predicated DataWithEVL vectorization mode. #102897
Conversation
Created using spr 1.3.5
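@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: Enabled initial support for max safe distance in DataWithEVL mode. If a max safe distance is required, special code (an icmp/select that clamps AVL to MAX_SAFE_DISTANCE before the call to llvm.experimental.get.vector.length) needs to be emitted while vectorizing the loop in DataWithEVL tail folding mode. Patch is 23.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102897.diff 6 Files Affected: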
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 542d74ef0e1ef1..3b1c4830159969 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1445,9 +1445,8 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- /// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1470,12 +1469,9 @@ class LoopVectorizationCostModel {
// Override forced styles if needed.
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
- bool EVLIsLegal =
- IsScalableVF && UserIC <= 1 &&
- TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ bool EVLIsLegal = UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1493,6 +1489,14 @@ class LoopVectorizationCostModel {
}
}
+ /// Disables previously chosen tail folding policy, sets it to None. Expects,
+ /// that the tail policy was selected.
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1500,6 +1504,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ /// Return maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ /// TODO: need to consider adjusting cost model to use this value as a
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxEVLSafeElements() const {
+ return MaxEVLSafeElements;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1651,6 +1663,10 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
+ /// Maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ std::optional<unsigned> MaxEVLSafeElements;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -3903,9 +3919,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// dependence distance).
unsigned MaxSafeElements =
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxScalableSafeElements = MaxSafeElements;
+ if (foldTailWithEVL() && !Legal->isSafeForAnyVectorWidth()) {
+ MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+ MaxEVLSafeElements = MaxSafeElements;
+ }
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
@@ -4075,7 +4096,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(UserIC);
+ FixedScalableVFPair MaxFactors =
+ computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -4106,15 +4133,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -8496,8 +8519,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() &&
- !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+ *Plan, CM.getMaxEVLSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 016ad75c21ceb0..c80538c03b5751 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1239,6 +1239,13 @@ class VPInstruction : public VPRecipeWithIRFlags {
SLPLoad,
SLPStore,
ActiveLaneMask,
+ /// Creates special scalar explicit-vector-length instruction, which
+ /// calculates the vectorization factor (number of iterations, that can be
+ /// executed simultaneously) at runtime.
+ /// Has two mandatory parameters - EVL (effective vector length) on the
+ /// previous iteration and original trip count.
+ /// Also, has one optional parameter - max safe distance, allowed for the
+ /// loop.
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a93f275a39f5f..cd7b854ea848d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+ if (getNumOperands() == 3) {
+ Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+ AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+ MaxSafeVF);
+ }
Value *EVL = State.Builder.CreateIntrinsic(
State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, State.Builder.getTrue()});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 045f6c356669fa..17fffa0ac2e0a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+ VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
- auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
- {EVLPhi, Plan.getTripCount()});
+ SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+ if (MaxEVLSafeElements)
+ Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+ CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+ DebugLoc());
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..8158c832f1a951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
- static bool tryAddExplicitVectorLength(VPlan &Plan);
+ static bool
+ tryAddExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8a7..a1401a247a53e9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -103,24 +103,38 @@ define void @test_may_clobber1(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 4)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 100
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -245,24 +259,38 @@ define void @test_may_clobber3(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 10
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 2)
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP8]], 10
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -422,28 +450,38 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
-; IF-EVL-NEX...
[truncated]
|
Ping! |
@@ -3903,9 +3919,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( | |||
// dependence distance). | |||
unsigned MaxSafeElements = | |||
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); | |||
unsigned MaxScalableSafeElements = MaxSafeElements; | |||
if (foldTailWithEVL() && !Legal->isSafeForAnyVectorWidth()) { | |||
MaxScalableSafeElements = std::numeric_limits<unsigned>::max(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be good to document this. IIUC this means if we have something like MaxSafeElements == 4, we will still try VFs > 4, even though we know at most 4 lanes will execute? Is this desirable?
Or should this limit to the next power-of-2 of MaxSafeElements?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about this, and at first I thought it might be good to use max here. But after some thought, I think next-power-of-2 is good here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Where should I document this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be good to document this. IIUC this means if we have something like MaxSafeElements == 4, we will still try VFs > 4, even though we know at most 4 lanes will execute? Is this desirable?
IIUC, we need to "try" a single "VF" - in the sense of the factor that sets the vector-trip-count - regardless of max dependence distance. In the sense of the static type, that could be set to any scalable or fixed(?) value that is large enough - at-least max dependence distance - to maximize vector lengths.
@@ -4106,15 +4133,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { | |||
if (Rem->isZero()) { | |||
// Accept MaxFixedVF if we do not have a tail. | |||
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); | |||
disableTailFolding(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is needed because we first need to compute the max VFs, assuming tail folding, right?
Could we end up in a scenario where MaxSafeNumberOfElements is 3, a max scalable VF of 4 is picked, and then tail-folding is disabled here so no EVL will be used, vectorizing incorrectly with VF 4?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- Right.
- Yes, looks so. Need to do extra processing here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this still pending or is there a check now elsewhere?
Would it be simpler to just increase the scalable VF when tail-folding with EVL below where we already adjust the max fixed VF, instead of needing to reset the tail folding decision?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- Added the extra check above, when setting MaxPowerOf2RuntimeVF
- I'm afraid there might be some side effects, if we'll keep tail-folding mode ON, while it is OFF. Better explicitly set it to OFF, I think
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- I'm afraid there might be some side effects, if we'll keep tail-folding mode ON, while it is OFF. Better explicitly set it to OFF, I think
Agreed, I was suggesting checking whether it is possible to keep setTailFoldingStyles at its original place and then try to maximize MaxScalableVF below, where we already deal with the EVL case (the `if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {` below).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it can be done right now, but the following patch still requires moving it up (for-non-power-2 support)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If possible to keep it in the original place without much extra work that would be preferable, moving when actually needed?
Would it suffice to introduce a VPInstruction for computing "SAFE_AVL = std::min(AVL, MAX_SAFE_DISTANCE)"? |
Instead of 3rd operand in the recipe - yes, but it will require adding special VPInstruction |
Created using spr 1.3.5
Created using spr 1.3.5
Ping! |
Created using spr 1.3.5
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm still somewhat confused about how EVL tail-folding, MaxVF, max-dependence-distance, and powers-of-2 are all related. Posting various thoughts and comments.
if (MaxEVLSafeElements) | ||
VPEVL->insertAfter(AVL); | ||
else | ||
VPEVL->insertBefore(*Header, Header->getFirstNonPhi()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: using VPBuilder would probably help generate the phi and sequence of non-phi instructions more easily, setting its insertion point once (to first non-phi) rather than inserting each recipe individually. BTW, EVLPhi might also be inserted there, instead of immediately after CanonicalIVPHI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
EVLPhi better do in the separate patch
|
||
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); | ||
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); | ||
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements); | ||
|
||
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF | ||
<< ".\n"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Also dump max EVL safe VF?)
(While we're here, the next `else` is redundant - it follows a `return`.)
; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[EVL_BASED_IV]], 1024 | ||
; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP3]], i64 [[EVL_BASED_IV]], i64 1024 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Must the (max) dependence distance be a power of 2, currently?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, need to check if vectorization is safe for any VF before setting MaxSafeElements
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is power-of-2 here, no problem
// If scalable vector factor is not set or power-of-2 or tail folding with | ||
// EVL is not set, try to avoid tail folding if the trip count is known to | ||
// be a multiple of any chosen VF. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Explanation above talks about the complementary condition checked in the code below, better have them consistent.
What if tail folding with EVL is set - but MaxEVLSafeElements is not a power of 2? Unclear why the latter matters - EVL should presumably handle any max safe distance(?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If MaxEVLSafeElements is not power-of-2, cannot set MaxPowerOf2RuntimeVF and need to use regular vectorization with min(AVL, MaxSafeElements). With power of 2 can avoid this by disabling tail folding completely.
// found modulo the vectorization factor is not zero, try to fold the tail | ||
// by masking. | ||
// FIXME: look for a smaller MaxVF that does divide TC rather than masking. | ||
setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); | ||
if (foldTailByMasking()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Following the above-mentioned thought, should the following correction for over-speculated MaxFactors take place in the `else`, i.e. the `(!foldTailByMasking())` case - `MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, false);`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I said, some special processing will be required for EVL case with non-power-of-2 safe distance
@@ -3903,9 +3919,14 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( | |||
// dependence distance). | |||
unsigned MaxSafeElements = | |||
llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); | |||
unsigned MaxScalableSafeElements = MaxSafeElements; | |||
if (foldTailWithEVL() && !Legal->isSafeForAnyVectorWidth()) { | |||
MaxScalableSafeElements = std::numeric_limits<unsigned>::max(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be good to document this. IIUC this means if we have something like MaxSafeElements == 4, we will still try VFs > 4, even though we know at most 4 lanes will execute? Is this desirable?
IIUC, we need to "try" a single "VF" - in the sense of the factor that sets the vector-trip-count - regardless of max dependence distance. In the sense of the static type, that could be set to any scalable or fixed(?) value that is large enough - at-least max dependence distance - to maximize vector lengths.
Created using spr 1.3.5
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Ping! |
1 similar comment
Ping! |
Created using spr 1.3.5
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 200, [[EVL_BASED_IV]] | ||
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) | ||
; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0 | ||
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] | ||
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0 | ||
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP5]]) | ||
; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[TMP6]], 200 | ||
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] | ||
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 | ||
; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 8 [[TMP11]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP5]]) | ||
; IF-EVL-NEXT: [[TMP12:%.*]] = zext i32 [[TMP5]] to i64 | ||
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] | ||
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] | ||
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] | ||
; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] | ||
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] | ||
; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just renaming
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
Outdated
Show resolved
Hide resolved
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
Looks like there is one pending doc suggestion
Created using spr 1.3.5
Fixed |
Enabled initial support for max safe distance in DataWithEVL mode. If
max safe distance is required, need to emit special code:
CMP = icmp ult AVL, MAX_SAFE_DISTANCE
SAFE_AVL = select CMP, AVL, MAX_SAFE_DISTANCE
EVL = call i32 @llvm.experimental.get.vector.length(i64 SAFE_AVL)
while vectorizing the loop in DataWithEVL tail folding mode.