Skip to content

Commit eaf0d82

Browse files
authored
[LV] Disable fold tail by masking when IV is used outside (llvm#81609)
When induction variables are used outside the loop body, folding the tail by masking miscompiles, because for users outside of the loop the final value of the induction variable is computed separately from the vector loop. Fixes llvm#76069. Fixes llvm#51677.
1 parent 5c54f72 commit eaf0d82

File tree

2 files changed

+29
-52
lines changed

2 files changed

+29
-52
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,6 +1552,19 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
15521552
}
15531553
}
15541554

1555+
for (const auto &Entry : getInductionVars()) {
1556+
PHINode *OrigPhi = Entry.first;
1557+
for (User *U : OrigPhi->users()) {
1558+
auto *UI = cast<Instruction>(U);
1559+
if (!TheLoop->contains(UI)) {
1560+
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
1561+
"outside user for "
1562+
<< *UI << "\n");
1563+
return false;
1564+
}
1565+
}
1566+
}
1567+
15551568
// The list of pointers that we can safely read and write to remains empty.
15561569
SmallPtrSet<Value *, 8> SafePointers;
15571570

llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll

Lines changed: 16 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
22
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
33

4-
; FIXME: The vectorizer should refuse to fold the tail by masking because
4+
5+
; The vectorizer should refuse to fold the tail by masking because
56
; %conv is used outside of the loop. Test for this by checking that
67
; %n.vec, the vector trip count, is rounded down to the next multiple of
78
; 4. If folding the tail, it would have been rounded up instead.
@@ -14,7 +15,8 @@ define i32 @test(ptr %arr, i64 %n) {
1415
; CHECK-NEXT: br i1 [[CMP1]], label [[PREHEADER:%.*]], label [[DONE:%.*]]
1516
; CHECK: preheader:
1617
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
17-
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
18+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
19+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
1820
; CHECK: vector.scevcheck:
1921
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2
2022
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i8
@@ -30,69 +32,31 @@ define i32 @test(ptr %arr, i64 %n) {
3032
; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP6]], [[TMP11]]
3133
; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
3234
; CHECK: vector.ph:
33-
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
34-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
35-
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
35+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
36+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
3637
; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]]
3738
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8
3839
; CHECK-NEXT: [[IND_END1:%.*]] = add i8 1, [[DOTCAST]]
39-
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
40-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
41-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
4240
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
4341
; CHECK: vector.body:
44-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
45-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE10]] ]
42+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
4643
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
4744
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
4845
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1
4946
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2
5047
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
51-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
52-
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer
53-
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3>
54-
; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
55-
; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
56-
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP17]], i32 0
57-
; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
58-
; CHECK: pred.store.if:
59-
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
60-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP20]]
61-
; CHECK-NEXT: store i32 65, ptr [[TMP21]], align 4
62-
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
63-
; CHECK: pred.store.continue:
64-
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP17]], i32 1
65-
; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
66-
; CHECK: pred.store.if5:
67-
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
68-
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
69-
; CHECK-NEXT: store i32 65, ptr [[TMP24]], align 4
70-
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
71-
; CHECK: pred.store.continue6:
72-
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP17]], i32 2
73-
; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
74-
; CHECK: pred.store.if7:
75-
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
76-
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP26]]
77-
; CHECK-NEXT: store i32 65, ptr [[TMP27]], align 4
78-
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
79-
; CHECK: pred.store.continue8:
80-
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP17]], i32 3
81-
; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
82-
; CHECK: pred.store.if9:
83-
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
84-
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP29]]
85-
; CHECK-NEXT: store i32 65, ptr [[TMP30]], align 4
86-
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
87-
; CHECK: pred.store.continue10:
88-
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
89-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
90-
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
91-
; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
48+
; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[TMP13]], -1
49+
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP17]]
50+
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
51+
; CHECK-NEXT: store <4 x i32> <i32 65, i32 65, i32 65, i32 65>, ptr [[TMP19]], align 4
52+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
53+
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
54+
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
9255
; CHECK: middle.block:
56+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
9357
; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1
9458
; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]]
95-
; CHECK-NEXT: br i1 true, label [[LOAD_VAL:%.*]], label [[SCALAR_PH]]
59+
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOAD_VAL:%.*]], label [[SCALAR_PH]]
9660
; CHECK: scalar.ph:
9761
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]
9862
; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]

0 commit comments

Comments
 (0)