Skip to content

Commit 67d840b

Browse files
committed
[VPlan] Relax over-aggressive assertion in VPTransformState::get().
There are cases where a vector value has some users that demand the single scalar value only (NeedsScalar), while other users demand the vector value (see attached test cases). In those cases, the NeedsScalar users should only demand the first lane. Fixes #91883.
1 parent 90109d4 commit 67d840b

File tree

2 files changed

+227
-1
lines changed

2 files changed

+227
-1
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
246246

247247
Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
248248
if (NeedsScalar) {
249-
assert((VF.isScalar() || Def->isLiveIn() ||
249+
assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
250250
(hasScalarValue(Def, VPIteration(Part, 0)) &&
251251
Data.PerPartScalars[Def][Part].size() == 1)) &&
252252
"Trying to access a single scalar per part but has multiple scalars "
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -S -passes=loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
3+
4+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
5+
6+
; Test cases based on https://github.com/llvm/llvm-project/issues/91883.
7+
; Regression test for the over-aggressive assertion in
; VPTransformState::get() (issue #91883): the widened IV step %iv.4 has
; both a vector user (the stored value) and a user that, once
; vectorized, only needs the first scalar lane (the %or address
; computation). The CHECK lines below are autogenerated by
; update_test_checks.py (see file header) — regenerate, don't hand-edit.
define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) {
8+
; CHECK-LABEL: define void @iv.4_used_as_vector_and_first_lane(
9+
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
10+
; CHECK-NEXT: entry:
11+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
12+
; CHECK: vector.ph:
13+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
14+
; CHECK: vector.body:
15+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
16+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
17+
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
18+
; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
19+
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4>
20+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
21+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
22+
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
23+
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
24+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
25+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
26+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP2]]
27+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP3]]
28+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
29+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4
30+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8
31+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12
32+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
33+
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
34+
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
35+
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
36+
; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
37+
; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
38+
; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i64> [[STEP_ADD1]], <i64 4, i64 4, i64 4, i64 4>
39+
; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4>
40+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], <i64 128, i64 128, i64 128, i64 128>
41+
; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], <i64 128, i64 128, i64 128, i64 128>
42+
; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD5]], <i64 128, i64 128, i64 128, i64 128>
43+
; CHECK-NEXT: [[TMP19:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD6]], <i64 128, i64 128, i64 128, i64 128>
44+
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0
45+
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1
46+
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP13]], i32 0
47+
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 1
48+
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP14]], i32 0
49+
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 1
50+
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP15]], i32 0
51+
; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 1
52+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP21]]
53+
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
54+
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP25]]
55+
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]]
56+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[TMP28]], i32 0
57+
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i64, ptr [[TMP28]], i32 4
58+
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP28]], i32 8
59+
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[TMP28]], i32 12
60+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP12]], ptr [[TMP32]], i32 4, <4 x i1> [[TMP16]])
61+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP13]], ptr [[TMP33]], i32 4, <4 x i1> [[TMP17]])
62+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP14]], ptr [[TMP34]], i32 4, <4 x i1> [[TMP18]])
63+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP15]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP19]])
64+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
65+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], <i64 4, i64 4, i64 4, i64 4>
66+
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
67+
; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
68+
; CHECK: middle.block:
69+
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
70+
; CHECK: scalar.ph:
71+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
72+
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
73+
; CHECK: loop.header:
74+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
75+
; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
76+
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8
77+
; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4
78+
; CHECK-NEXT: [[C:%.*]] = icmp ule i64 [[L]], 128
79+
; CHECK-NEXT: br i1 [[C]], label [[LOOP_THEN:%.*]], label [[LOOP_LATCH]]
80+
; CHECK: loop.then:
81+
; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV_4]], 1
82+
; CHECK-NEXT: [[G_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OR]]
83+
; CHECK-NEXT: store i64 [[IV_4]], ptr [[G_DST]], align 4
84+
; CHECK-NEXT: br label [[LOOP_LATCH]]
85+
; CHECK: loop.latch:
86+
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
87+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 32
88+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
89+
; CHECK: exit:
90+
; CHECK-NEXT: ret void
91+
;
92+
entry:
93+
br label %loop.header
94+
95+
loop.header:
96+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
97+
%g.src = getelementptr inbounds i64, ptr %src, i64 %iv
98+
%l = load i64, ptr %g.src
99+
; %iv.4 is the value with mixed demands: stored as a vector below, but
; also used (via %or) to compute the store address, which only needs
; lane 0 per part after vectorization.
%iv.4 = add nuw nsw i64 %iv, 4
100+
%c = icmp ule i64 %l, 128
101+
br i1 %c, label %loop.then, label %loop.latch
102+
103+
loop.then:
104+
%or = or disjoint i64 %iv.4, 1
105+
%g.dst = getelementptr inbounds i64, ptr %dst, i64 %or
106+
store i64 %iv.4, ptr %g.dst, align 4
107+
br label %loop.latch
108+
109+
loop.latch:
110+
%iv.next = add nuw nsw i64 %iv, 1
111+
%exitcond = icmp eq i64 %iv.next, 32
112+
br i1 %exitcond, label %exit, label %loop.header
113+
114+
exit:
115+
ret void
116+
}
117+
118+
; Companion test to @iv.4_used_as_vector_and_first_lane: here the stored
; value is %l rather than %iv.4, so %iv.4's only use is the %or address
; computation — after vectorization, only the first lane of each part is
; demanded. The CHECK lines are autogenerated by update_test_checks.py
; (see file header) — regenerate, don't hand-edit.
define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) {
119+
; CHECK-LABEL: define void @iv.4_used_as_first_lane(
120+
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
121+
; CHECK-NEXT: entry:
122+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
123+
; CHECK: vector.ph:
124+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
125+
; CHECK: vector.body:
126+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
127+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
128+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
129+
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
130+
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
131+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]]
132+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
133+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP2]]
134+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP3]]
135+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
136+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4
137+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8
138+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12
139+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
140+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
141+
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
142+
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
143+
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP0]], 4
144+
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP1]], 4
145+
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP2]], 4
146+
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP3]], 4
147+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], <i64 128, i64 128, i64 128, i64 128>
148+
; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD1]], <i64 128, i64 128, i64 128, i64 128>
149+
; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD2]], <i64 128, i64 128, i64 128, i64 128>
150+
; CHECK-NEXT: [[TMP19:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD3]], <i64 128, i64 128, i64 128, i64 128>
151+
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP12]], 1
152+
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP13]], 1
153+
; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP14]], 1
154+
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP15]], 1
155+
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP20]]
156+
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP21]]
157+
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP22]]
158+
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
159+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[TMP24]], i32 0
160+
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[TMP24]], i32 4
161+
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP24]], i32 8
162+
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP24]], i32 12
163+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD]], ptr [[TMP28]], i32 4, <4 x i1> [[TMP16]])
164+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD1]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP17]])
165+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD2]], ptr [[TMP30]], i32 4, <4 x i1> [[TMP18]])
166+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD3]], ptr [[TMP31]], i32 4, <4 x i1> [[TMP19]])
167+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
168+
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
169+
; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
170+
; CHECK: middle.block:
171+
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
172+
; CHECK: scalar.ph:
173+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
174+
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
175+
; CHECK: loop.header:
176+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
177+
; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
178+
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8
179+
; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4
180+
; CHECK-NEXT: [[C:%.*]] = icmp ule i64 [[L]], 128
181+
; CHECK-NEXT: br i1 [[C]], label [[LOOP_THEN:%.*]], label [[LOOP_LATCH]]
182+
; CHECK: loop.then:
183+
; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV_4]], 1
184+
; CHECK-NEXT: [[G_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OR]]
185+
; CHECK-NEXT: store i64 [[L]], ptr [[G_DST]], align 4
186+
; CHECK-NEXT: br label [[LOOP_LATCH]]
187+
; CHECK: loop.latch:
188+
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
189+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 32
190+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
191+
; CHECK: exit:
192+
; CHECK-NEXT: ret void
193+
;
194+
entry:
195+
br label %loop.header
196+
197+
loop.header:
198+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
199+
%g.src = getelementptr inbounds i64, ptr %src, i64 %iv
200+
%l = load i64, ptr %g.src
201+
; Unlike the test above, %iv.4 is NOT stored; its sole use is the %or
; address computation below.
%iv.4 = add nuw nsw i64 %iv, 4
202+
%c = icmp ule i64 %l, 128
203+
br i1 %c, label %loop.then, label %loop.latch
204+
205+
loop.then:
206+
%or = or disjoint i64 %iv.4, 1
207+
%g.dst = getelementptr inbounds i64, ptr %dst, i64 %or
208+
store i64 %l, ptr %g.dst, align 4
209+
br label %loop.latch
210+
211+
loop.latch:
212+
%iv.next = add nuw nsw i64 %iv, 1
213+
%exitcond = icmp eq i64 %iv.next, 32
214+
br i1 %exitcond, label %exit, label %loop.header
215+
216+
exit:
217+
ret void
218+
}
219+
;.
220+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
221+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
222+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
223+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
224+
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
225+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
226+
;.

0 commit comments

Comments
 (0)