Skip to content

Commit 4480a22

Browse files
authored
[LV][EVL] Emit vp.merge intrinsic to enable out-loop reduction in EVL vectorization. (#101641)
Following #90184, this patch emits the vp.merge intrinsic, which is used to set the inactive lanes of a select operation to the RHS instead of undef. Currently, it is applied to out-loop reductions in EVL vectorization. This patch transforms select(header_mask, LHS, RHS) into vp.merge(all-true, LHS, RHS, EVL), and always uses the predicated reduction select to set the incoming value of the reduction phi, in order to support out-loop reductions when using tail folding with EVL. TODO: Postpone the adjustment of the predicated reduction select to VPlanTransform. The current adjustment might be too early, which could lead to a situation where the predicated reduction select is adjusted but the EVL recipes cannot be successfully generated during VPlanTransform.
1 parent 84ce230 commit 4480a22

10 files changed

+1201
-237
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,18 @@ class LoopVectorizationCostModel {
14841484
return InLoopReductions.contains(Phi);
14851485
}
14861486

1487+
/// Decides whether the reduction phi's incoming value should be taken from
1488+
/// the predicated reduction select.
/// \p Opcode and \p PhiTy are forwarded unchanged to
/// TTI.preferPredicatedReductionSelect.
/// \returns true when the tail is folded with EVL, when
/// PreferPredicatedReductionSelect is set, or when the TTI hook asks for it.
1489+
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1490+
// With EVL tail folding the predicated reduction select is forced rather
1491+
// than optional, because the EVL of the second-to-last iteration might not
// be VF*UF.
1492+
if (foldTailWithEVL())
1493+
return true;
1494+
// Otherwise use the select only if PreferPredicatedReductionSelect is set
// or the target prefers it for this opcode and phi type.
return PreferPredicatedReductionSelect ||
1495+
TTI.preferPredicatedReductionSelect(
1496+
Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1497+
}
1498+
14871499
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
14881500
/// with factor VF. Return the cost of the instruction, including
14891501
/// scalarization overhead if it's needed.
@@ -9453,10 +9465,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94539465
cast<VPInstruction>(&U)->getOpcode() ==
94549466
VPInstruction::ComputeReductionResult;
94559467
});
9456-
if (PreferPredicatedReductionSelect ||
9457-
TTI.preferPredicatedReductionSelect(
9458-
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9459-
TargetTransformInfo::ReductionFlags()))
9468+
if (CM.usePredicatedReductionSelect(
9469+
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
94609470
PhiR->setOperand(1, NewExitingVPV);
94619471
}
94629472

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,12 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
16721672
!Attrs.hasFnAttr(Attribute::WillReturn);
16731673
}
16741674

1675+
/// Convenience overload that accepts the call arguments as a braced
/// std::initializer_list. It wraps \p CallArguments in an
/// ArrayRef<VPValue *> and delegates to the ArrayRef-based constructor,
/// passing \p VectorIntrinsicID, \p Ty and \p DL through unchanged.
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1676+
std::initializer_list<VPValue *> CallArguments,
1677+
Type *Ty, DebugLoc DL = {})
1678+
: VPWidenIntrinsicRecipe(VectorIntrinsicID,
1679+
ArrayRef<VPValue *>(CallArguments), Ty, DL) {}
1680+
16751681
~VPWidenIntrinsicRecipe() override = default;
16761682

16771683
VPWidenIntrinsicRecipe *clone() override {

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,17 @@ template <typename Class> struct bind_ty {
5555
}
5656
};
5757

58+
/// Matcher that succeeds only for one specific VPValue, compared by pointer
/// identity.
59+
struct specificval_ty {
60+
// The VPValue that candidates are compared against.
const VPValue *Val;
61+
62+
// Non-explicit: m_Specific returns a bare VPValue pointer and relies on this
// implicit conversion.
specificval_ty(const VPValue *V) : Val(V) {}
63+
64+
// Returns true iff \p VPV is the very same object as Val.
bool match(VPValue *VPV) const { return VPV == Val; }
65+
};
66+
67+
/// Builds a specificval_ty matcher for \p VPV; the returned pointer is
/// implicitly converted through specificval_ty's constructor.
inline specificval_ty m_Specific(const VPValue *VPV) { return VPV; }
68+
5869
/// Match a specified integer value or vector of all elements of that
5970
/// value. \p BitWidth optionally specifies the bitwidth the matched constant
6071
/// must have. If it is 0, the matched constant can have any bitwidth.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1442,8 +1442,11 @@ void VPlanTransforms::addActiveLaneMask(
14421442

14431443
/// Replace recipes with their EVL variants.
14441444
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
1445+
using namespace llvm::VPlanPatternMatch;
1446+
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1447+
VPTypeAnalysis TypeInfo(CanonicalIVType);
1448+
LLVMContext &Ctx = CanonicalIVType->getContext();
14451449
SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
1446-
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
14471450
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
14481451
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
14491452
auto *CurRecipe = cast<VPRecipeBase>(U);
@@ -1480,7 +1483,23 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
14801483
TypeInfo.inferScalarType(Sel),
14811484
Sel->getDebugLoc());
14821485
})
1483-
1486+
.Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
1487+
VPValue *LHS, *RHS;
1488+
// Transform select with a header mask condition
1489+
// select(header_mask, LHS, RHS)
1490+
// into vector predication merge.
1491+
// vp.merge(all-true, LHS, RHS, EVL)
1492+
if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
1493+
m_VPValue(RHS))))
1494+
return nullptr;
1495+
// Use all true as the condition because this transformation is
1496+
// limited to selects whose condition is a header mask.
1497+
VPValue *AllTrue =
1498+
Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx));
1499+
return new VPWidenIntrinsicRecipe(
1500+
Intrinsic::vp_merge, {AllTrue, LHS, RHS, &EVL},
1501+
TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
1502+
})
14841503
.Default([&](VPRecipeBase *R) { return nullptr; });
14851504

14861505
if (!NewRecipe)
@@ -1553,14 +1572,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
15531572
return isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
15541573
&Phi);
15551574
});
1556-
// FIXME: Remove this once we can transform (select header_mask, true_value,
1557-
// false_value) into vp.merge.
1558-
bool ContainsOutloopReductions =
1559-
any_of(Header->phis(), [&](VPRecipeBase &Phi) {
1560-
auto *R = dyn_cast<VPReductionPHIRecipe>(&Phi);
1561-
return R && !R->isInLoop();
1562-
});
1563-
if (ContainsWidenInductions || ContainsOutloopReductions)
1575+
if (ContainsWidenInductions)
15641576
return false;
15651577

15661578
auto *CanonicalIVPHI = Plan.getCanonicalIV();

0 commit comments

Comments
 (0)