Commit 1782e6c

[LV] Don't predicate uniform divides with loop-invariant divisor.
When folding the tail, at least one of the lanes must execute unconditionally. If the divisor is loop-invariant, no predication is needed, as predication would not prevent the divide-by-0 on the executed lane. Depends on llvm#98892.
1 parent b6ab904 commit 1782e6c
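
To illustrate the case this targets, here is a minimal, hand-written IR sketch (function and value names are made up; it is not taken from the patch or its tests): a tail-folding candidate whose udiv has a loop-invariant divisor %d and a lane-uniform result, so predicating the divide would not avoid a potential divide-by-0 on the lane that is guaranteed to execute anyway.

define void @uniform_udiv_sketch(ptr %dst, i64 %x, i64 %d, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Both udiv operands are loop-invariant, so every lane computes the same
  ; quotient; masking the divide would not make it any safer.
  %q = udiv i64 %x, %d
  %gep = getelementptr i64, ptr %dst, i64 %iv
  store i64 %q, ptr %gep, align 8
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

The updated AArch64 test further below shows the practical effect: several masked vector udiv/urem operations guarded by selects collapse to plain scalar divides.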

File tree: 4 files changed, +70 -128 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 27 additions & 12 deletions
@@ -1402,7 +1402,8 @@ class LoopVectorizationCostModel {
   /// Returns true if \p I is an instruction that needs to be predicated
   /// at runtime. The result is independent of the predication mechanism.
   /// Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I) const;
+  bool isPredicatedInst(Instruction *I, ElementCount VF,
+                        bool IsKnownUniform = false) const;
 
   /// Return the costs for our two available strategies for lowering a
   /// div/rem operation which requires speculating at least one lane.
@@ -3637,7 +3638,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
 
 bool LoopVectorizationCostModel::isScalarWithPredication(
     Instruction *I, ElementCount VF) const {
-  if (!isPredicatedInst(I))
+  if (!isPredicatedInst(I, VF))
     return false;
 
   // Do we have a non-scalar lowering for this predicated
@@ -3676,7 +3677,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
   }
 }
 
-bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
+bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I,
+                                                  ElementCount VF,
+                                                  bool IsKnownUniform) const {
   if (!blockNeedsPredicationForAnyReason(I->getParent()))
     return false;
 
@@ -3710,6 +3713,15 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
+    // When folding the tail, at least one of the lanes must execute
+    // unconditionally. If the divisor is loop-invariant no predication is
+    // needed, as predication would not prevent the divide-by-0 on the executed
+    // lane.
+    if (foldTailByMasking() && !Legal->blockNeedsPredication(I->getParent()) &&
+        TheLoop->isLoopInvariant(I->getOperand(1)) &&
+        (IsKnownUniform || isUniformAfterVectorization(I, VF)))
+      return false;
+
     // TODO: We can use the loop-preheader as context point here and get
     // context sensitive reasoning
     return !isSafeToSpeculativelyExecute(I);
@@ -3917,7 +3929,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
                         << *I << "\n");
       return;
     }
-    if (isPredicatedInst(I)) {
+    if (isPredicatedInst(I, VF, true)) {
       LLVM_DEBUG(
           dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                  << "\n");
@@ -5634,7 +5646,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert((isPredicatedInst(I)) &&
+  assert((isPredicatedInst(I, VF)) &&
          "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) &&
@@ -5913,7 +5925,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
-  if (isPredicatedInst(I)) {
+  if (isPredicatedInst(I, VF)) {
     Cost /= getReciprocalPredBlockProb();
 
     // Add the cost of an i1 extract and a branch
@@ -6773,7 +6785,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
-    if (VF.isVector() && isPredicatedInst(I)) {
+    if (VF.isVector() && isPredicatedInst(I, VF)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
@@ -8445,7 +8457,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
 
 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                            ArrayRef<VPValue *> Operands,
-                                           VPBasicBlock *VPBB) {
+                                           VPBasicBlock *VPBB, VFRange &Range) {
   switch (I->getOpcode()) {
   default:
     return nullptr;
@@ -8455,7 +8467,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   case Instruction::URem: {
     // If not provably safe, use a select to form a safe divisor before widening the
     // div/rem operation itself. Otherwise fall through to general handling below.
-    if (CM.isPredicatedInst(I)) {
+    bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+        [&](ElementCount VF) -> bool { return CM.isPredicatedInst(I, VF); },
+        Range);
+    if (IsPredicated) {
       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
       VPValue *Mask = getBlockInMask(I->getParent());
       VPValue *One =
@@ -8505,8 +8520,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
 
-  bool IsPredicated = CM.isPredicatedInst(I);
-
+  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);
   // Even if the instruction is not marked as uniform, there are certain
   // intrinsic calls that can be effectively treated as such, so we check for
   // them here. Conservatively, we only do this for scalable vectors, since
@@ -8626,7 +8641,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                    *CI);
   }
 
-  return tryToWiden(Instr, Operands, VPBB);
+  return tryToWiden(Instr, Operands, VPBB, Range);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
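
For contrast with the tryToWiden change above, here is a rough, hand-written sketch of the safe-divisor lowering that is still used when a div/rem genuinely needs predication (fixed-width vectors and made-up names for readability; this is not output produced by the patch): the block mask selects the real divisor on active lanes and 1 on inactive ones, so the widened divide cannot trap on masked-off lanes.

define <4 x i64> @masked_udiv_sketch(<4 x i64> %x, <4 x i64> %d, <4 x i1> %mask) {
  ; Inactive lanes divide by 1 instead of the (possibly zero) real divisor.
  %safe.d = select <4 x i1> %mask, <4 x i64> %d, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %q = udiv <4 x i64> %x, %safe.d
  ret <4 x i64> %q
}

The deleted CHECK lines in the test below show this same pattern with scalable vectors and the active lane mask.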

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ class VPRecipeBuilder {
   /// if it can. The function should only be called if the cost-model indicates
   /// that widening should be performed.
   VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
-                            VPBasicBlock *VPBB);
+                            VPBasicBlock *VPBB, VFRange &Range);
 
 public:
   VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 12 additions & 23 deletions
@@ -274,50 +274,39 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
 ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP19]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_1_I]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[X]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP22:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = urem <vscale x 2 x i64> [[VEC_IND]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP26:%.*]] = udiv <vscale x 2 x i64> [[TMP24]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP28:%.*]] = urem <vscale x 2 x i64> [[TMP24]], [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP29:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP30:%.*]] = udiv <vscale x 2 x i64> [[TMP28]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP32:%.*]] = urem <vscale x 2 x i64> [[TMP28]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
+; CHECK-NEXT: [[TMP35:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
+; CHECK-NEXT: [[TMP38:%.*]] = udiv i64 [[TMP26]], [[X]]
+; CHECK-NEXT: [[TMP41:%.*]] = urem i64 [[TMP26]], [[X]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP30]], i32 0
 ; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[X]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 2 x i64> [[TMP26]], i32 0
 ; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP34]], [[TMP35]]
 ; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], [[X]]
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <vscale x 2 x i64> [[TMP30]], i32 0
 ; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP37]], [[TMP38]]
 ; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], [[X]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <vscale x 2 x i64> [[TMP32]], i32 0
 ; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP40]], [[TMP41]]
 ; CHECK-NEXT: [[TMP43:%.*]] = shl i64 [[TMP42]], 32
 ; CHECK-NEXT: [[TMP44:%.*]] = ashr i64 [[TMP43]], 32
 ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP44]]
 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i64, ptr [[TMP45]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP22]], ptr [[TMP46]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP30]], ptr [[TMP46]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
 ; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[TMP28]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
 ; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
