Skip to content

Commit f148d57

Browse files
[LV] Initial support for safe distance in predicated DataWithEVL vectorization mode.
Enabled initial support for max safe distance in DataWithEVL mode. If a max safe distance is required, the following special code must be emitted while vectorizing the loop in DataWithEVL tail-folding mode: `CMP = icmp ult AVL, MAX_SAFE_DISTANCE`; `SAFE_AVL = select CMP, AVL, MAX_SAFE_DISTANCE`; `EVL = call i32 @llvm.experimental.get.vector.length(i64 SAFE_AVL)`. Reviewers: fhahn. Reviewed By: fhahn. Pull Request: #102897
1 parent 7f2e937 commit f148d57

File tree

5 files changed

+81
-31
lines changed

5 files changed

+81
-31
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,12 +1427,9 @@ class LoopVectorizationCostModel {
14271427
// Override forced styles if needed.
14281428
// FIXME: use actual opcode/data type for analysis here.
14291429
// FIXME: Investigate opportunity for fixed vector factor.
1430-
bool EVLIsLegal =
1431-
IsScalableVF && UserIC <= 1 &&
1432-
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1433-
!EnableVPlanNativePath &&
1434-
// FIXME: implement support for max safe dependency distance.
1435-
Legal->isSafeForAnyVectorWidth();
1430+
bool EVLIsLegal = UserIC <= 1 &&
1431+
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1432+
!EnableVPlanNativePath;
14361433
if (!EVLIsLegal) {
14371434
// If for some reason EVL mode is unsupported, fallback to
14381435
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1457,6 +1454,15 @@ class LoopVectorizationCostModel {
14571454
return getTailFoldingStyle() != TailFoldingStyle::None;
14581455
}
14591456

1457+
/// Return maximum safe number of elements to be processed per vector
1458+
/// iteration, which do not prevent store-load forwarding and are safe with
1459+
/// regard to the memory dependencies. Required for EVL-based VPlans to
1460+
/// correctly calculate AVL (application vector length) as min(remaining AVL,
1461+
/// MaxSafeElements).
1462+
/// TODO: need to consider adjusting cost model to use this value as a
1463+
/// vectorization factor for EVL-based vectorization.
1464+
std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1465+
14601466
/// Returns true if the instructions in this block requires predication
14611467
/// for any reason, e.g. because tail folding now requires a predicate
14621468
/// or because the block in the original loop was predicated.
@@ -1608,6 +1614,12 @@ class LoopVectorizationCostModel {
16081614
/// true if scalable vectorization is supported and enabled.
16091615
std::optional<bool> IsScalableVectorizationAllowed;
16101616

1617+
/// Maximum safe number of elements to be processed per vector iteration,
1618+
/// which do not prevent store-load forwarding and are safe with regard to the
1619+
/// memory dependencies. Required for EVL-based vectorization, where this
1620+
/// value is used as the upper bound of the safe AVL.
1621+
std::optional<unsigned> MaxSafeElements;
1622+
16111623
/// A map holding scalar costs for different vectorization factors. The
16121624
/// presence of a cost for an instruction in the mapping indicates that the
16131625
/// instruction will be scalarized when vectorizing with the associated
@@ -3858,6 +3870,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
38583870

38593871
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
38603872
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3873+
if (!Legal->isSafeForAnyVectorWidth())
3874+
this->MaxSafeElements = MaxSafeElements;
38613875

38623876
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
38633877
<< ".\n");
@@ -8686,8 +8700,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
86868700
VPlanTransforms::optimize(*Plan);
86878701
// TODO: try to put it close to addActiveLaneMask().
86888702
// Discard the plan if it is not EVL-compatible
8689-
if (CM.foldTailWithEVL() &&
8690-
!VPlanTransforms::tryAddExplicitVectorLength(*Plan))
8703+
if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
8704+
*Plan, CM.getMaxSafeElements()))
86918705
break;
86928706
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
86938707
VPlans.push_back(std::move(Plan));

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
392392
return true;
393393
switch (Opcode) {
394394
case Instruction::ICmp:
395+
case Instruction::Select:
395396
case VPInstruction::BranchOnCond:
396397
case VPInstruction::BranchOnCount:
397398
case VPInstruction::CalculateTripCountMinusVF:
@@ -440,9 +441,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
440441
return Builder.CreateCmp(getPredicate(), A, B, Name);
441442
}
442443
case Instruction::Select: {
443-
Value *Cond = State.get(getOperand(0));
444-
Value *Op1 = State.get(getOperand(1));
445-
Value *Op2 = State.get(getOperand(2));
444+
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
445+
Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
446+
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
447+
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
446448
return Builder.CreateSelect(Cond, Op1, Op2, Name);
447449
}
448450
case VPInstruction::ActiveLaneMask: {
@@ -742,6 +744,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
742744
default:
743745
return false;
744746
case Instruction::ICmp:
747+
case Instruction::Select:
745748
case VPInstruction::PtrAdd:
746749
// TODO: Cover additional opcodes.
747750
return vputils::onlyFirstLaneUsed(this);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,7 +1439,24 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
14391439
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
14401440
/// ...
14411441
///
1442-
bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
1442+
/// If MaxSafeElements is provided, the function adds the following recipes:
1443+
/// vector.ph:
1444+
/// ...
1445+
///
1446+
/// vector.body:
1447+
/// ...
1448+
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
1449+
/// [ %NextEVLIV, %vector.body ]
1450+
/// %AVL = sub original TC, %EVLPhi
1451+
/// %cmp = cmp ult %AVL, MaxSafeElements
1452+
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
1453+
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
1454+
/// ...
1455+
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1456+
/// ...
1457+
///
1458+
bool VPlanTransforms::tryAddExplicitVectorLength(
1459+
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
14431460
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
14441461
// The transform updates all users of inductions to work based on EVL, instead
14451462
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1464,14 +1481,19 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
14641481
// Create the ExplicitVectorLengthPhi recipe in the main loop.
14651482
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
14661483
EVLPhi->insertAfter(CanonicalIVPHI);
1467-
// TODO: Add support for MaxSafeDist for correct loop emission.
1484+
VPBuilder Builder(Header, Header->getFirstNonPhi());
14681485
// Compute original TC - IV as the AVL (application vector length).
1469-
auto *AVL = new VPInstruction(Instruction::Sub, {Plan.getTripCount(), EVLPhi},
1470-
DebugLoc(), "avl");
1471-
AVL->insertBefore(*Header, Header->getFirstNonPhi());
1472-
auto *VPEVL =
1473-
new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
1474-
VPEVL->insertAfter(AVL);
1486+
VPValue *AVL = Builder.createNaryOp(
1487+
Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
1488+
if (MaxSafeElements) {
1489+
// Support for MaxSafeDist for correct loop emission.
1490+
VPValue *AVLSafe = Plan.getOrAddLiveIn(
1491+
ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements));
1492+
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
1493+
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl");
1494+
}
1495+
auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
1496+
DebugLoc());
14751497

14761498
auto *CanonicalIVIncrement =
14771499
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ struct VPlanTransforms {
108108
/// VPCanonicalIVPHIRecipe is only used to control the loop after
109109
/// this transformation.
110110
/// \returns true if the transformation succeeds, or false if it doesn't.
111-
static bool tryAddExplicitVectorLength(VPlan &Plan);
111+
static bool
112+
tryAddExplicitVectorLength(VPlan &Plan,
113+
const std::optional<unsigned> &MaxEVLSafeElements);
112114

113115
// For each Interleave Group in \p InterleaveGroups replace the Recipes
114116
// widening its memory instructions with a single VPInterleaveRecipe at its

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -422,28 +422,37 @@ define void @no_high_lmul_or_interleave(ptr %p) {
422422
; IF-EVL-NEXT: entry:
423423
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
424424
; IF-EVL: vector.ph:
425+
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
426+
; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[TMP7]], 1
427+
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 3002, [[TMP1]]
428+
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
429+
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
430+
; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
425431
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
426432
; IF-EVL: vector.body:
427433
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
428-
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
429-
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
430-
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
431-
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
432-
; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
434+
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
435+
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
436+
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult i64 [[AVL]], 1024
437+
; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024
438+
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true)
439+
; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[EVL_BASED_IV]], 0
433440
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
434441
; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
435-
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
442+
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
436443
; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1024
437444
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
438445
; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
439-
; IF-EVL-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
440-
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
441-
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
442-
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
446+
; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP6]], <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 [[TMP10]])
447+
; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
448+
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
449+
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
450+
; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
451+
; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
443452
; IF-EVL: middle.block:
444453
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
445454
; IF-EVL: scalar.ph:
446-
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
455+
; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
447456
; IF-EVL-NEXT: br label [[LOOP:%.*]]
448457
; IF-EVL: loop:
449458
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]

0 commit comments

Comments
 (0)