Skip to content

Commit b6a8f54

Browse files
committed
[LV] Consider all exit branch conditions uniform.
If we vectorize a loop with multiple exits, all exiting branches should be considered uniform, as the resulting loop will be controlled by the canonical IV only. Previously we were overestimating the cost of values contributing to the other exits.
1 parent 19d2d3f commit b6a8f54

File tree

2 files changed

+28
-24
lines changed

2 files changed

+28
-24
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4167,7 +4167,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
41674167

41684168
// Worklist containing uniform instructions demanding lane 0.
41694169
SetVector<Instruction *> Worklist;
4170-
BasicBlock *Latch = TheLoop->getLoopLatch();
41714170

41724171
// Add uniform instructions demanding lane 0 to the worklist. Instructions
41734172
// that are scalar with predication must not be considered uniform after
@@ -4189,12 +4188,16 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
41894188
Worklist.insert(I);
41904189
};
41914190

4192-
// Start with the conditional branch. If the branch condition is an
4193-
// instruction contained in the loop that is only used by the branch, it is
4194-
// uniform.
4195-
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4196-
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4197-
addToWorklistIfAllowed(Cmp);
4191+
// Start with the conditional branches exiting the loop. If the branch
4192+
// condition is an instruction contained in the loop that is only used by the
4193+
// branch, it is uniform.
4194+
SmallVector<BasicBlock *> Exiting;
4195+
TheLoop->getExitingBlocks(Exiting);
4196+
for (BasicBlock *E : Exiting) {
4197+
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
4198+
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4199+
addToWorklistIfAllowed(Cmp);
4200+
}
41984201

41994202
auto PrevVF = VF.divideCoefficientBy(2);
42004203
// Return true if all lanes perform the same memory operation, and we can
@@ -4335,6 +4338,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
43354338
// nodes separately. An induction variable will remain uniform if all users
43364339
// of the induction variable and induction variable update remain uniform.
43374340
// The code below handles both pointer and non-pointer induction variables.
4341+
BasicBlock *Latch = TheLoop->getLoopLatch();
43384342
for (const auto &Induction : Legal->getInductionVars()) {
43394343
auto *Ind = Induction.first;
43404344
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,37 +9,37 @@ define i32 @multi_exit_iv_uniform(i32 %a, i64 %N, ptr %dst) {
99
; CHECK-NEXT: entry:
1010
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 2147483648)
1111
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
12-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
12+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 8
1313
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1414
; CHECK: vector.ph:
15-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
15+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
1616
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
17-
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
17+
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 8, i64 [[N_MOD_VF]]
1818
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
19-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
20-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
19+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
20+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
2121
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
2222
; CHECK: vector.body:
2323
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
24-
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
25-
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
24+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
25+
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
2626
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
27-
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
27+
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
2828
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP3]]
2929
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP4]]
30-
; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i32> [[BROADCAST_SPLAT]] to <2 x i64>
30+
; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64>
3131
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
32-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 2
33-
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 8
34-
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP9]], align 8
35-
; CHECK-NEXT: [[TMP10]] = add <2 x i32> [[VEC_PHI]], <i32 -1, i32 -1>
36-
; CHECK-NEXT: [[TMP11]] = add <2 x i32> [[VEC_PHI1]], <i32 -1, i32 -1>
37-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
32+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 4
33+
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP8]], align 8
34+
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP9]], align 8
35+
; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI]], <i32 -1, i32 -1, i32 -1, i32 -1>
36+
; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI1]], <i32 -1, i32 -1, i32 -1, i32 -1>
37+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
3838
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
3939
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4040
; CHECK: middle.block:
41-
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP11]], [[TMP10]]
42-
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
41+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]]
42+
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
4343
; CHECK-NEXT: br label [[SCALAR_PH]]
4444
; CHECK: scalar.ph:
4545
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

0 commit comments

Comments (0)