
[LV] Don't predicate divs with invariant divisor when folding tail #98904

Merged: 16 commits, Jul 25, 2024
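A minimal sketch of the kind of loop this change targets, loosely modeled on the reduce_udiv test updated below (the function name and exact shape are illustrative, not taken verbatim from the tests): the udiv's divisor %x is defined outside the loop, so with tail folding the division itself no longer needs to be predicated.

define i16 @reduce_udiv_sketch(ptr %src, i16 %x, i64 %N) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
  %gep = getelementptr i16, ptr %src, i64 %iv
  %l = load i16, ptr %gep, align 2
  %div = udiv i16 %l, %x                      ; divisor is loop-invariant
  %red.next = or i16 %div, %red
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %N
  br i1 %ec, label %exit, label %loop

exit:
  ret i16 %red
}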
63 changes: 36 additions & 27 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3339,45 +3339,54 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
}
}

// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
if (!blockNeedsPredicationForAnyReason(I->getParent()))
// If predication is not needed, avoid it.
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning for isSafeToSpeculativelyExecute.
if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
isSafeToSpeculativelyExecute(I) ||
(isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
isa<BranchInst, PHINode>(I))
return false;

// Can we prove this instruction is safe to unconditionally execute?
// If not, we must use some form of predication.
Collaborator:

Is this comment removal intended?

Contributor Author:

The code was restructured in the latest version, with the separate if removed.

// If the instruction was executed conditionally in the original scalar loop,
// predication is needed with a mask whose lanes are all possibly inactive.
if (Legal->blockNeedsPredication(I->getParent()))
return true;

// All that remain are instructions with side-effects originally executed in
// the loop unconditionally, but now execute under a tail-fold mask (only)
// having at least one active lane (the first). If the side-effects of the
// instruction are invariant, executing it w/o (the tail-folding) mask is safe
// - it will cause the same side-effects as when masked.
switch(I->getOpcode()) {
default:
return false;
llvm_unreachable(
"instruction should have been considered by earlier checks");
case Instruction::Call:
// Side-effects of a Call are assumed to be non-invariant, needing a
// (fold-tail) mask.
assert(Legal->isMaskRequired(I) &&
"should have returned earlier for calls not needing a mask");
return true;
case Instruction::Load:
// If the address is loop invariant no predication is needed.
return !Legal->isInvariant(getLoadStorePointerOperand(I));
case Instruction::Store: {
if (!Legal->isMaskRequired(I))
return false;
// When we know the load's address is loop invariant and the instruction
// in the original scalar loop was unconditionally executed then we
// don't need to mark it as a predicated instruction. Tail folding may
// introduce additional predication, but we're guaranteed to always have
// at least one active lane. We call Legal->blockNeedsPredication here
// because it doesn't query tail-folding. For stores, we need to prove
// both speculation safety (which follows from the same argument as loads),
// but also must prove the value being stored is correct. The easiest
// form of the later is to require that all values stored are the same.
if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
(isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
return true;
// For stores, we need to prove both speculation safety (which follows from
// the same argument as loads), but also must prove the value being stored
// is correct. The easiest form of the later is to require that all values
// stored are the same.
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning
return !isSafeToSpeculativelyExecute(I);
case Instruction::Call:
return Legal->isMaskRequired(I);
// If the divisor is loop-invariant no predication is needed.
return !TheLoop->isLoopInvariant(I->getOperand(1));
Collaborator:

Suggested change:
return !TheLoop->isLoopInvariant(I->getOperand(1));
case Instruction::Call:
  // Side-effects of a Call are assumed to be non-invariant, needing a (fold-tail) mask.
  return true;

Contributor Author:

Updated, thanks!

}
}

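To make the cases in the restructured switch concrete, here is a hedged sketch of a single-block scalar loop (hypothetical, not one of the tests in this PR), annotated with what the new isPredicatedInst logic would decide under tail folding, assuming the loop body was originally executed unconditionally:

define void @predication_cases(ptr %p, ptr %q, ptr %dst, i64 %c, i64 %d, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %inv = load i64, ptr %p, align 8       ; invariant address: no predication needed
  %div = udiv i64 %inv, %d               ; loop-invariant divisor: no predication needed
  store i64 %c, ptr %q, align 8          ; invariant address and invariant stored value:
                                         ; no predication needed
  %gep = getelementptr i64, ptr %dst, i64 %iv
  store i64 %div, ptr %gep, align 8      ; varying address and value: still needs the fold-tail mask
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}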
@@ -274,50 +274,38 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP19]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_1_I]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP22:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP24:%.*]] = urem <vscale x 2 x i64> [[VEC_IND]], [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP26:%.*]] = udiv <vscale x 2 x i64> [[TMP24]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP28:%.*]] = urem <vscale x 2 x i64> [[TMP24]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP30:%.*]] = udiv <vscale x 2 x i64> [[TMP28]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP32:%.*]] = urem <vscale x 2 x i64> [[TMP28]], [[TMP31]]
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP22]], i32 0
; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[X]], [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 2 x i64> [[TMP26]], i32 0
; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP34]], [[TMP35]]
; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], [[X]]
; CHECK-NEXT: [[TMP38:%.*]] = extractelement <vscale x 2 x i64> [[TMP30]], i32 0
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP37]], [[TMP38]]
; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], [[X]]
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <vscale x 2 x i64> [[TMP32]], i32 0
; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP40]], [[TMP41]]
; CHECK-NEXT: [[TMP43:%.*]] = shl i64 [[TMP42]], 32
; CHECK-NEXT: [[TMP44:%.*]] = ashr i64 [[TMP43]], 32
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP44]]
; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i64, ptr [[TMP45]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP22]], ptr [[TMP46]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
Collaborator:

This seems right - broadcasting the invariant divisors rather than filling poison for masked lanes, or computing a single unmasked lane if only it is used. Doing so for fixed vectors in X86/divs-with-tail-folding.ll led to abandoning vectorization - see below.
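A reduced illustration of what this comment describes (operand names are made up; the splat-of-1 constant mirrors the pattern in the old CHECK lines): before, inactive lanes received a divisor of 1 via a select feeding the udiv; after, the splatted invariant divisor is used directly, since every lane divides by the same value the scalar loop divides by.

define <vscale x 2 x i64> @divisor_select_before(<vscale x 2 x i64> %vec, <vscale x 2 x i64> %divisor.splat, <vscale x 2 x i1> %mask) {
  ; inactive lanes get a divisor of 1 so the udiv cannot divide by zero or poison there
  %safe.divisor = select <vscale x 2 x i1> %mask, <vscale x 2 x i64> %divisor.splat, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
  %q = udiv <vscale x 2 x i64> %vec, %safe.divisor
  ret <vscale x 2 x i64> %q
}

define <vscale x 2 x i64> @divisor_select_after(<vscale x 2 x i64> %vec, <vscale x 2 x i64> %divisor.splat) {
  ; the divisor is loop-invariant, so every lane can use it directly
  %q = udiv <vscale x 2 x i64> %vec, %divisor.splat
  ret <vscale x 2 x i64> %q
}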

; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], [[TMP27]]
; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], [[X]]
; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], [[TMP28]]
; CHECK-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32
; CHECK-NEXT: [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -380,41 +380,40 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
Collaborator:

Is this change from 8 to 4 intentional and desirable?

Contributor Author:

The costs of both options are very close together. Previously, the need for a vector select in addition to the vector udiv tipped the balance in favor of 8. With the current costs, this seems to be working as intended.

; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP2]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; PRED-NEXT: [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
; PRED-NEXT: [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
; PRED-NEXT: [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
; PRED-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; PRED-NEXT: [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; PRED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
; PRED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]