Skip to content

Commit 51afb10

Browse files
authored
[LV] Create block in mask up-front if needed. (#76635)
At the moment, block and edge masks are created on demand, which means that they are inserted at the point where they are first requested and then cached. It is possible that the mask for a block is looked up later at a point that is not dominated by the point where the mask was inserted. To avoid this, create masks up front on entry to the corresponding basic block and leave it to VPlan simplification to remove unneeded masks. Note that we need to create masks for all blocks if any of the blocks in the loop needs predication, as computing the mask of a block depends on the masks of its predecessors. Needed for #76090. PR: #76635
1 parent f92b928 commit 51afb10

26 files changed

+148
-136
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7947,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
79477947
if (ECEntryIt != EdgeMaskCache.end())
79487948
return ECEntryIt->second;
79497949

7950-
VPValue *SrcMask = createBlockInMask(Src, Plan);
7950+
VPValue *SrcMask = getBlockInMask(Src);
79517951

79527952
// The terminator has to be a branch inst!
79537953
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8009,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
80098009
BlockMaskCache[Header] = BlockMask;
80108010
}
80118011

8012-
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8013-
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8014-
8015-
// Look for cached value.
8016-
BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8017-
if (BCEntryIt != BlockMaskCache.end())
8018-
return BCEntryIt->second;
8012+
VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8013+
// Return the cached value.
8014+
BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8015+
assert(BCEntryIt != BlockMaskCache.end() &&
8016+
"Trying to access mask for block without one.");
8017+
return BCEntryIt->second;
8018+
}
80198019

8020+
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8021+
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8022+
assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
80208023
assert(OrigLoop->getHeader() != BB &&
80218024
"Loop header must have cached block mask");
80228025

@@ -8026,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
80268029
// This is the block mask. We OR all incoming edges.
80278030
for (auto *Predecessor : predecessors(BB)) {
80288031
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8029-
if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8030-
return BlockMaskCache[BB] = EdgeMask;
8032+
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8033+
BlockMaskCache[BB] = EdgeMask;
8034+
}
80318035

80328036
if (!BlockMask) { // BlockMask has its initialized nullptr value.
80338037
BlockMask = EdgeMask;
@@ -8037,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
80378041
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
80388042
}
80398043

8040-
return BlockMaskCache[BB] = BlockMask;
8044+
BlockMaskCache[BB] = BlockMask;
80418045
}
80428046

80438047
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
@@ -8065,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
80658069

80668070
VPValue *Mask = nullptr;
80678071
if (Legal->isMaskRequired(I))
8068-
Mask = createBlockInMask(I->getParent(), *Plan);
8072+
Mask = getBlockInMask(I->getParent());
80698073

80708074
// Determine if the pointer operand of the access is either consecutive or
80718075
// reverse consecutive.
@@ -8287,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
82878291
// all-true mask.
82888292
VPValue *Mask = nullptr;
82898293
if (Legal->isMaskRequired(CI))
8290-
Mask = createBlockInMask(CI->getParent(), *Plan);
8294+
Mask = getBlockInMask(CI->getParent());
82918295
else
82928296
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
82938297
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8330,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
83308334
// div/rem operation itself. Otherwise fall through to general handling below.
83318335
if (CM.isPredicatedInst(I)) {
83328336
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8333-
VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
8337+
VPValue *Mask = getBlockInMask(I->getParent());
83348338
VPValue *One = Plan->getVPValueOrAddLiveIn(
83358339
ConstantInt::get(I->getType(), 1u, false));
83368340
auto *SafeRHS =
@@ -8424,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
84248428
// added initially. Masked replicate recipes will later be placed under an
84258429
// if-then construct to prevent side-effects. Generate recipes to compute
84268430
// the block mask for this region.
8427-
BlockInMask = createBlockInMask(I->getParent(), Plan);
8431+
BlockInMask = getBlockInMask(I->getParent());
84288432
}
84298433

84308434
auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8659,23 +8663,28 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
86598663
bool HasNUW = Style == TailFoldingStyle::None;
86608664
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
86618665

8662-
// Proactively create header mask. Masks for other blocks are created on
8663-
// demand.
8664-
RecipeBuilder.createHeaderMask(*Plan);
8665-
86668666
// Scan the body of the loop in a topological order to visit each basic block
86678667
// after having visited its predecessor basic blocks.
86688668
LoopBlocksDFS DFS(OrigLoop);
86698669
DFS.perform(LI);
86708670

86718671
VPBasicBlock *VPBB = HeaderVPBB;
8672+
bool NeedsMasks = CM.foldTailByMasking() ||
8673+
any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8674+
return Legal->blockNeedsPredication(BB);
8675+
});
86728676
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
86738677
// Relevant instructions from basic block BB will be grouped into VPRecipe
86748678
// ingredients and fill a new VPBasicBlock.
86758679
if (VPBB != HeaderVPBB)
86768680
VPBB->setName(BB->getName());
86778681
Builder.setInsertPoint(VPBB);
86788682

8683+
if (VPBB == HeaderVPBB)
8684+
RecipeBuilder.createHeaderMask(*Plan);
8685+
else if (NeedsMasks)
8686+
RecipeBuilder.createBlockInMask(BB, *Plan);
8687+
86798688
// Introduce each ingredient into VPlan.
86808689
// TODO: Model and preserve debug intrinsics in VPlan.
86818690
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
@@ -9024,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
90249033
if (CM.blockNeedsPredicationForAnyReason(BB)) {
90259034
VPBuilder::InsertPointGuard Guard(Builder);
90269035
Builder.setInsertPoint(CurrentLink);
9027-
CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
9036+
CondOp = RecipeBuilder.getBlockInMask(BB);
90289037
}
90299038

90309039
VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9052,8 +9061,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
90529061
auto *OrigExitingVPV = PhiR->getBackedgeValue();
90539062
auto *NewExitingVPV = PhiR->getBackedgeValue();
90549063
if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9055-
VPValue *Cond =
9056-
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
9064+
VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
90579065
assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
90589066
"reduction recipe must be defined before latch");
90599067
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,11 @@ class VPRecipeBuilder {
138138

139139
/// A helper function that computes the predicate of the block BB, assuming
140140
/// that the header block of the loop is set to True or the loop mask when
141-
/// tail folding. It returns the *entry* mask for the block BB.
142-
VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
141+
/// tail folding.
142+
void createBlockInMask(BasicBlock *BB, VPlan &Plan);
143+
144+
/// Returns the *entry* mask for the block \p BB.
145+
VPValue *getBlockInMask(BasicBlock *BB) const;
143146

144147
/// A helper function that computes the predicate of the edge between SRC
145148
/// and DST.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
120120
return false;
121121
case VPInstructionSC:
122122
switch (cast<VPInstruction>(this)->getOpcode()) {
123+
case Instruction::Or:
123124
case Instruction::ICmp:
124125
case Instruction::Select:
125126
case VPInstruction::Not:

llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
175175
; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP7]])
176176
; TFCOMMON-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP6]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
177177
; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i1> zeroinitializer
178+
; TFCOMMON-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i1> [[TMP7]], [[TMP10]]
178179
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
179180
; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
180-
; TFCOMMON-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i1> [[TMP7]], [[TMP10]]
181181
; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[TMP12]])
182182
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
183183
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
@@ -298,9 +298,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
298298
; TFCOMMON-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP8]])
299299
; TFCOMMON-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer
300300
; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP10]])
301+
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP8]], [[TMP10]]
301302
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]]
302303
; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
303-
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP8]], [[TMP10]]
304304
; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[TMP13]])
305305
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
306306
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)

llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,14 +1250,14 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
12501250
; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
12511251
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
12521252
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
1253-
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
12541253
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
1254+
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]]
12551255
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP14]], i32 0
12561256
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
12571257
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
12581258
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
1259-
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
12601259
; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
1260+
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
12611261
; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
12621262
; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
12631263
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
130130
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
131131
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
132132
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
133+
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
133134
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
134135
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
135136
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
136-
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
137137
; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
138138
; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
139139
; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
@@ -309,10 +309,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
309309
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
310310
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
311311
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
312+
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
312313
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
313314
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
314315
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
315-
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
316316
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP13]])
317317
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
318318
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
@@ -479,15 +479,15 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
479479
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
480480
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
481481
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
482+
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
482483
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
483484
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
484-
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
485485
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP10]])
486486
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
487+
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
487488
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
488489
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
489490
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
490-
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
491491
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
492492
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
493493
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])

0 commit comments

Comments
 (0)