Skip to content

[LV] Use original trip-count as the vector-trip-count if use predicated EVL instructions for tail-folding. #132675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2427,6 +2427,15 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
Value *TC = getTripCount();
IRBuilder<> Builder(InsertBlock->getTerminator());

// Use original trip count as the vector trip count if use predicated EVL
// instructions for tail-folding.
if (VF.isVector() && Cost->foldTailWithEVL()) {
assert(!Cost->requiresScalarEpilogue(true) &&
"Use predicated EVL instructions for tail-folding does not allow "
"scalar epilogue");
return VectorTripCount = TC;
}

Comment on lines +2430 to +2438
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look like it is the right place to update. The vector trip count is guaranteed to be a multiple of VF.

The caller would need updating to not use it/request when folding the tail with EVL

Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
Value *Step = createStepForVF(Builder, Ty, VF, UF);
Expand Down Expand Up @@ -4468,7 +4477,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPVectorPointerSC:
case VPDef::VPVectorEndPointerSC:
case VPDef::VPExpandSCEVSC:
case VPDef::VPEVLBasedIVPHISC:
case VPDef::VPPredInstPHISC:
case VPDef::VPBranchOnMaskSC:
continue;
Expand Down
44 changes: 0 additions & 44 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
static inline bool classof(const VPRecipeBase *R) {
switch (R->getVPDefID()) {
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPEVLBasedIVPHISC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
Expand Down Expand Up @@ -2919,49 +2918,6 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
#endif
};

/// A recipe for generating the phi node for the current index of elements,
/// adjusted in accordance with EVL value. It starts at the start value of the
/// canonical induction and gets incremented by EVL in each iteration of the
/// vector loop.
class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
public:
VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
: VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}

~VPEVLBasedIVPHIRecipe() override = default;

VPEVLBasedIVPHIRecipe *clone() override {
llvm_unreachable("cloning not implemented yet");
}

VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)

void execute(VPTransformState &State) override {
llvm_unreachable("cannot execute this recipe, should be replaced by a "
"scalar phi recipe");
}

/// Return the cost of this VPEVLBasedIVPHIRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override {
// For now, match the behavior of the legacy cost model.
return 0;
}

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
return true;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
};

/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
public VPUnrollPartAccessor<1> {
Expand Down
15 changes: 7 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,14 +245,13 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
[this](const auto *R) {
// Handle header phi recipes, except VPWidenIntOrFpInduction
// which needs special handling due it being possibly truncated.
// TODO: consider inferring/caching type of siblings, e.g.,
// backedge value, here and in cases below.
return inferScalarType(R->getStartValue());
})
VPWidenPointerInductionRecipe>([this](const auto *R) {
// Handle header phi recipes, except VPWidenIntOrFpInduction which
// needs special handling due it being possibly truncated.
// TODO: consider inferring/caching type of siblings, e.g., backedge
// value, here and in cases below.
return inferScalarType(R->getStartValue());
})
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
Expand Down
11 changes: 0 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3749,14 +3749,3 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

printAsOperand(O, SlotTracker);
O << " = phi ";
printOperands(O, SlotTracker);
}
#endif
61 changes: 21 additions & 40 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2021,10 +2021,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
}

/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
/// is used only for loop iterations counting after this transformation.
/// Add a VPInstruction::ExplicitVectorLength and related recipes to \p Plan and
/// adjust the increment of the canonical induction variable to an explicit
/// vector length.
///
/// The function uses the following definitions:
/// %StartV is the canonical induction start value.
Expand All @@ -2035,29 +2034,25 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
/// [ %NextEVLIV, %vector.body ]
/// %AVL = sub original TC, %EVLPhi
/// %IVPhi = CANONICAL-INDUCTION ir<0>, %IV.NEXT
/// %AVL = sub original TC, %IVPhi
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// %IV.NEXT = add %IVPhi, IVSize (cast i32 %VPEVL to IVSize)
/// ...
///
/// If MaxSafeElements is provided, the function adds the following recipes:
/// vector.ph:
/// ...
///
/// vector.body:
/// ...
/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
/// [ %NextEVLIV, %vector.body ]
/// %IVPhi = CANONICAL-INDUCTION ir<0>, %IV.NEXT
/// %AVL = sub original TC, %EVLPhi
/// %cmp = cmp ult %AVL, MaxSafeElements
/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
/// %IV.NEXT = add %IVPhi, IVSize (cast i32 %VPEVL to IVSize)
/// ...
///
bool VPlanTransforms::tryAddExplicitVectorLength(
Expand All @@ -2073,15 +2068,11 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
return false;

auto *CanonicalIVPHI = Plan.getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();

// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
VPBuilder Builder(Header, Header->getFirstNonPhi());
// Compute original TC - IV as the AVL (application vector length).
VPValue *AVL = Builder.createNaryOp(
Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl");
Instruction::Sub, {&Plan.getVectorTripCount(), CanonicalIVPHI},
DebugLoc(), "avl");
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
VPValue *AVLSafe = Plan.getOrAddLiveIn(
Expand All @@ -2094,27 +2085,20 @@ bool VPlanTransforms::tryAddExplicitVectorLength(

auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
Builder.setInsertPoint(CanonicalIVIncrement);
VPSingleDefRecipe *OpVPEVL = VPEVL;
if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
IVSize != 32) {
OpVPEVL = Builder.createScalarCast(
IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL,
CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc());
}
auto *NextEVLIV = Builder.createOverflowingOp(
Instruction::Add, {OpVPEVL, EVLPhi},
{CanonicalIVIncrement->hasNoUnsignedWrap(),
CanonicalIVIncrement->hasNoSignedWrap()},
CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
EVLPhi->addOperand(NextEVLIV);

transformRecipestoEVLRecipes(Plan, *VPEVL);

// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
// Adjust the increment of the canonical induction variable to an explicit
// vector length.
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
CanonicalIVIncrement->setOperand(1, OpVPEVL);
// TODO: support unroll factor > 1.
Plan.setUF(1);
return true;
Expand Down Expand Up @@ -2298,17 +2282,14 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
if (!isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(&R))
continue;
auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
StringRef Name =
isa<VPCanonicalIVPHIRecipe>(PhiR) ? "index" : "evl.based.iv";
auto *ScalarR = new VPInstruction(
Instruction::PHI, {PhiR->getStartValue(), PhiR->getBackedgeValue()},
PhiR->getDebugLoc(), Name);
ScalarR->insertBefore(PhiR);
PhiR->replaceAllUsesWith(ScalarR);
PhiR->eraseFromParent();
if (auto *PhiR = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
auto *ScalarR = new VPInstruction(
Instruction::PHI, {PhiR->getStartValue(), PhiR->getBackedgeValue()},
PhiR->getDebugLoc(), "index");
ScalarR->insertBefore(PhiR);
PhiR->replaceAllUsesWith(ScalarR);
PhiR->eraseFromParent();
}
Comment on lines +2285 to +2292
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VPCanonicalIVPHIRecipe's step must be loop invariant, otherwise it isn't an induction.

If you want to do this transform, you must replace the canonical IV with a plain scalar phi recipe, like done in convertToConcreteRecipes

}
}
}
Expand Down
8 changes: 3 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,9 @@ struct VPlanTransforms {
VPlan &Plan,
const std::function<bool(BasicBlock *)> &BlockNeedsPredication);

/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// Add a VPInstruction::ExplicitVectorLength and related recipes to \p Plan
/// and adjust the increment of the canonical induction variable to an
/// explicit vector length.
/// \returns true if the transformation succeeds, or false if it doesn't.
static bool
tryAddExplicitVectorLength(VPlan &Plan,
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@ class VPDef {
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,16 +156,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
return false;
}
if (I->getNumUsers() != 1) {
errs() << "EVL is used in VPInstruction:Add with multiple "
"users\n";
return false;
}
if (!isa<VPEVLBasedIVPHIRecipe>(*I->users().begin())) {
errs() << "Result of VPInstruction::Add with EVL operand is "
"not used by VPEVLBasedIVPHIRecipe\n";
return false;
}
return true;
})
.Default([&](const VPUser *U) {
Expand Down
Loading