[LV] Create block in mask up-front if needed. #76635

Merged: 1 commit, Jan 9, 2024
52 changes: 30 additions & 22 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7947,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (ECEntryIt != EdgeMaskCache.end())
return ECEntryIt->second;

VPValue *SrcMask = createBlockInMask(Src, Plan);
VPValue *SrcMask = getBlockInMask(Src);

// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8009,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
BlockMaskCache[Header] = BlockMask;
}

VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

// Look for cached value.
BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
if (BCEntryIt != BlockMaskCache.end())
return BCEntryIt->second;
VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
// Return the cached value.
BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
assert(BCEntryIt != BlockMaskCache.end() &&
"Trying to access mask for block without one.");
return BCEntryIt->second;
}

void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
"Loop header must have cached block mask");

@@ -8026,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
return BlockMaskCache[BB] = EdgeMask;
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
BlockMaskCache[BB] = EdgeMask;
Collaborator: Should retain early-exit and return here?

Contributor Author: Yep, re-added in 8b7bbed
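For context, here is a minimal sketch of the function with that early return restored (as re-added in 8b7bbed). It is illustrative only: the elided declaration of BlockMask is filled in from the surrounding context, and it is not the exact upstream code.

void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
  assert(OrigLoop->getHeader() != BB &&
         "Loop header must have cached block mask");

  // A nullptr mask denotes an all-one (all-true) mask.
  VPValue *BlockMask = nullptr;
  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      BlockMaskCache[BB] = EdgeMask;
      return; // Early return: later predecessors cannot change an all-one mask.
    }
    if (!BlockMask) // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
    else
      BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }
  BlockMaskCache[BB] = BlockMask;
}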

}

if (!BlockMask) { // BlockMask has its initialized nullptr value.
BlockMask = EdgeMask;
@@ -8037,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}

return BlockMaskCache[BB] = BlockMask;
BlockMaskCache[BB] = BlockMask;
}

VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
@@ -8065,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,

VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
Mask = createBlockInMask(I->getParent(), *Plan);
Mask = getBlockInMask(I->getParent());

// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8287,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// all-true mask.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(CI))
Mask = createBlockInMask(CI->getParent(), *Plan);
Mask = getBlockInMask(CI->getParent());
else
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8330,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
VPValue *Mask = getBlockInMask(I->getParent());
VPValue *One = Plan->getVPValueOrAddLiveIn(
ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
@@ -8424,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.
BlockInMask = createBlockInMask(I->getParent(), Plan);
BlockInMask = getBlockInMask(I->getParent());
}

auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8659,23 +8663,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);

// Proactively create header mask. Masks for other blocks are created on
// demand.
RecipeBuilder.createHeaderMask(*Plan);

// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);

VPBasicBlock *VPBB = HeaderVPBB;
bool NeedsMasks = CM.foldTailByMasking() ||
any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
});
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
if (VPBB != HeaderVPBB)
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);

if (VPBB == HeaderVPBB)
RecipeBuilder.createHeaderMask(*Plan);
Collaborator: nit: this retains the previous behavior of always creating a mask for HeaderVPBB, proactively. But we can refrain from doing so if !NeedsMasks.

Contributor Author: Unfortunately, prepareToFoldTailByMasking is currently used both to check whether tail folding is possible and to collect the instructions that need masking under tail folding. So even if tail folding wasn't picked by the cost model, loads/stores (and others) will be marked as requiring a mask, which then leads to looking up a header mask. This could potentially be untangled, as in #77612.
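To make the trade-off concrete, a rough sketch of the block-visit loop as structured in this patch is shown below; the comment on the header branch marks where the reviewer's !NeedsMasks guard would go, which per the reply above depends on untangling the masking requirements (#77612). This is illustrative, not verbatim upstream code.

bool NeedsMasks = CM.foldTailByMasking() ||
                  any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
                    return Legal->blockNeedsPredication(BB);
                  });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
  Builder.setInsertPoint(VPBB);
  if (VPBB == HeaderVPBB)
    RecipeBuilder.createHeaderMask(*Plan); // Still created unconditionally;
                                           // could be skipped when !NeedsMasks
                                           // once masking is untangled.
  else if (NeedsMasks)
    RecipeBuilder.createBlockInMask(BB, *Plan); // Non-header masks only when needed.
  // ... introduce the instructions of BB into the VPlan as recipes ...
}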

else if (NeedsMasks)
RecipeBuilder.createBlockInMask(BB, *Plan);

// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
@@ -9024,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(CurrentLink);
CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
CondOp = RecipeBuilder.getBlockInMask(BB);
}

VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9052,8 +9061,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
auto *OrigExitingVPV = PhiR->getBackedgeValue();
auto *NewExitingVPV = PhiR->getBackedgeValue();
if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
7 changes: 5 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,8 +138,11 @@ class VPRecipeBuilder {

/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True or the loop mask when
/// tail folding. It returns the *entry* mask for the block BB.
VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
/// tail folding.
void createBlockInMask(BasicBlock *BB, VPlan &Plan);

/// Returns the *entry* mask for the block \p BB.
VPValue *getBlockInMask(BasicBlock *BB) const;
Contributor: Shouldn't these two be createMaskInBlock and getMaskInBlock respectively?
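Whichever names are settled on, the intended contract of the pair can be illustrated with a short sketch (paraphrasing call sites from this patch, not verbatim code): masks are created once per block while the plan is built, and later consumers only perform cached lookups.

// While building the VPlan, visiting the loop body in RPO:
RecipeBuilder.createBlockInMask(BB, *Plan);      // compute and cache the mask

// Later, e.g. when widening a masked memory access:
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
  Mask = RecipeBuilder.getBlockInMask(I->getParent()); // cached lookup only;
                                                       // asserts a mask exists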


/// A helper function that computes the predicate of the edge between SRC
/// and DST.
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -120,6 +120,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return false;
case VPInstructionSC:
switch (cast<VPInstruction>(this)->getOpcode()) {
case Instruction::Or:
Collaborator: Fix independent of this patch? Worth noting somewhere that the default is conservative and should be revisited when new opcodes are introduced?

Contributor Author: Yep, it is technically independent, but effectively dead code without this change, hence the direct inclusion here.
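For reference, an abridged sketch of the surrounding switch, showing the conservative default the comment refers to (illustrative; the exact set of listed opcodes may differ upstream):

case VPInstructionSC:
  switch (cast<VPInstruction>(this)->getOpcode()) {
  case Instruction::Or:     // newly treated as side-effect free by this patch
  case Instruction::ICmp:
  case Instruction::Select:
  case VPInstruction::Not:
    return false;           // known not to have side effects
  default:
    return true;            // conservative: assume side effects for any
                            // opcode not explicitly listed
  }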

case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::Not:
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -175,9 +175,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP7]])
; TFCOMMON-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP6]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i1> zeroinitializer
; TFCOMMON-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i1> [[TMP7]], [[TMP10]]
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFCOMMON-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i1> [[TMP7]], [[TMP10]]
; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[TMP12]])
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
@@ -298,9 +298,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP8]])
; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP10]])
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP8]], [[TMP10]]
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]]
; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP8]], [[TMP10]]
; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[TMP13]])
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
@@ -1250,14 +1250,14 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
@@ -130,10 +130,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
@@ -309,10 +309,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
@@ -479,15 +479,15 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])