Skip to content

Commit 8df64ed

Browse files
committed
[LV] Don't consider IV increments uniform if exit value is used outside.
In some cases, there might be a chain of uniform instructions producing the exit value. To generate correct code in all cases, do not consider the IV increment uniform if it has users outside the loop. Instead, let VPlan narrow the IV where possible, using the logic from 3ff1d01. The test case from #122602 was verified with Alive2: https://alive2.llvm.org/ce/z/bA4EGj Fixes #122496. Fixes #122602.
1 parent b4ce29a commit 8df64ed

File tree

5 files changed

+101
-32
lines changed

5 files changed

+101
-32
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3806,7 +3806,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
38063806
// uniform after vectorization.
38073807
bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
38083808
auto *I = cast<Instruction>(U);
3809-
return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3809+
return I == Ind || Worklist.count(I) ||
38103810
IsVectorizedMemAccessUse(I, IndUpdate);
38113811
});
38123812
if (!UniformIndUpdate)

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -621,28 +621,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
621621
Def->replaceAllUsesWith(Clone);
622622
}
623623

624-
// Check if any uniform VPReplicateRecipes using the phi recipe are used by
625-
// ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
626-
// ensure the final value is available.
627-
// TODO: Remove once uniformity analysis is done on VPlan.
628-
for (VPUser *U : Users) {
629-
auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
630-
VPValue *Op;
631-
if (!ExitIRI || !match(ExitIRI->getOperand(0),
632-
m_VPInstruction<VPInstruction::ExtractFromEnd>(
633-
m_VPValue(Op), m_VPValue())))
634-
continue;
635-
auto *RepR = dyn_cast<VPReplicateRecipe>(Op);
636-
if (!RepR || !RepR->isUniform())
637-
continue;
638-
assert(!RepR->isPredicated() && "RepR must not be predicated");
639-
Instruction *I = RepR->getUnderlyingInstr();
640-
auto *Clone =
641-
new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false);
642-
Clone->insertAfter(RepR);
643-
RepR->replaceAllUsesWith(Clone);
644-
}
645-
646624
// Replace wide pointer inductions which have only their scalars used by
647625
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
648626
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {

llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ for.end:
5151

5252
; CHECK-LABEL: goo
5353
; Check %exitcond is a uniform instruction. %indvars.iv and %indvars.iv.next are used outside of the loop, so they are no longer expected to be reported as uniform.
54-
; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
55-
; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
5654
; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599
5755

5856
define i64 @goo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) #0 {

llvm/test/Transforms/LoopVectorize/iv_outside_user.ll

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,7 +1176,6 @@ e.exit:
11761176
}
11771177

11781178
; Test case for https://github.com/llvm/llvm-project/issues/122496.
1179-
; FIXME: Currently an incorrect live-out is used.
11801179
define i32 @iv_ext_used_outside( ptr %dst) {
11811180
; VEC-LABEL: define i32 @iv_ext_used_outside(
11821181
; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1186,15 +1185,19 @@ define i32 @iv_ext_used_outside( ptr %dst) {
11861185
; VEC-NEXT: br label %[[VECTOR_BODY:.*]]
11871186
; VEC: [[VECTOR_BODY]]:
11881187
; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1188+
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
11891189
; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
11901190
; VEC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
11911191
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]]
11921192
; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
11931193
; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4
1194-
; VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1
1194+
; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1)
1195+
; VEC-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
11951196
; VEC-NEXT: [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32
1196-
; VEC-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32
1197+
; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
1198+
; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32
11971199
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
1200+
; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
11981201
; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
11991202
; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
12001203
; VEC: [[MIDDLE_BLOCK]]:
@@ -1213,7 +1216,7 @@ define i32 @iv_ext_used_outside( ptr %dst) {
12131216
; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128
12141217
; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}}
12151218
; VEC: [[EXIT]]:
1216-
; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
1219+
; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
12171220
; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]]
12181221
;
12191222
; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside(
@@ -1274,7 +1277,6 @@ exit:
12741277
}
12751278

12761279
; Test case for https://github.com/llvm/llvm-project/issues/122602.
1277-
; FIXME: Currently an incorrect live-out is used.
12781280
define i64 @test_iv_increment_incremented(ptr %dst) {
12791281
; VEC-LABEL: define i64 @test_iv_increment_incremented(
12801282
; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1288,8 +1290,9 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
12881290
; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1
12891291
; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2
12901292
; VEC-NEXT: [[TMP3:%.*]] = add i64 2, -1
1293+
; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1
12911294
; VEC-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1
1292-
; VEC-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 1
1295+
; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1
12931296
; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]]
12941297
; VEC: [[MIDDLE_BLOCK]]:
12951298
; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -1307,7 +1310,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
13071310
; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1
13081311
; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
13091312
; VEC: [[EXIT]]:
1310-
; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
1313+
; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
13111314
; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]]
13121315
;
13131316
; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented(
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -scalable-vectorization=on -force-target-supports-scalable-vectors=true -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck %s
3+
4+
define i32 @iv_live_out_wide(ptr %dst) {
5+
; CHECK-LABEL: define i32 @iv_live_out_wide(
6+
; CHECK-SAME: ptr [[DST:%.*]]) {
7+
; CHECK-NEXT: [[ENTRY:.*]]:
8+
; CHECK-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32
9+
; CHECK-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
10+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
11+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
12+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]]
13+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
14+
; CHECK: [[VECTOR_PH]]:
15+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
16+
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
17+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]]
18+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]]
19+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
20+
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2
21+
; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2
22+
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
23+
; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i32> [[TMP7]], splat (i32 1)
24+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP8]]
25+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
26+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
27+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
28+
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
29+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
30+
; CHECK: [[VECTOR_BODY]]:
31+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
32+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
33+
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
34+
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0
35+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP9]]
36+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
37+
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
38+
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
39+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]]
40+
; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP11]], align 2
41+
; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP14]], align 2
42+
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i32> [[BROADCAST_SPLAT2]], [[STEP_ADD]]
43+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
44+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
45+
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
46+
; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
47+
; CHECK: [[MIDDLE_BLOCK]]:
48+
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
49+
; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 2
50+
; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
51+
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i32> [[TMP15]], i32 [[TMP19]]
52+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]]
53+
; CHECK-NEXT: br i1 [[CMP_N]], label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
54+
; CHECK: [[SCALAR_PH]]:
55+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
56+
; CHECK-NEXT: br label %[[LOOP:.*]]
57+
; CHECK: [[LOOP]]:
58+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
59+
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
60+
; CHECK-NEXT: store i16 0, ptr [[GEP_DST]], align 2
61+
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
62+
; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 2000
63+
; CHECK-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
64+
; CHECK: [[E_EXIT]]:
65+
; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
66+
; CHECK-NEXT: ret i32 [[RES]]
67+
;
68+
entry:
69+
%step.1 = sext i8 0 to i32
70+
%step.2 = add nsw i32 %step.1, 1
71+
br label %loop
72+
73+
loop:
74+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
75+
%gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
76+
store i16 0, ptr %gep.dst, align 2
77+
%iv.next = add i32 %step.2, %iv
78+
%cmp.i = icmp slt i32 %iv.next, 2000
79+
br i1 %cmp.i, label %loop, label %e.exit
80+
81+
e.exit:
82+
%res = phi i32 [ %iv.next, %loop ]
83+
ret i32 %res
84+
}
85+
;.
86+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
87+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
88+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
89+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
90+
;.

0 commit comments

Comments
 (0)