Skip to content

Commit d216615

Browse files
committed
[LV] Process dead interleave pointer ops in reverse order.
Process dead interleave pointer ops in reverse order. This also catches cases where the same base pointer is used by multiple different interleave groups. This fixes another case where the legacy cost model inaccurately estimates cost, surfaced by b841e2e.
1 parent 8d28a41 commit d216615

File tree

2 files changed

+200
-2
lines changed

2 files changed

+200
-2
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6998,7 +6998,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
69986998
// Ignore ephemeral values.
69996999
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
70007000

7001-
SmallSetVector<Value *, 4> DeadInterleavePointerOps;
7001+
SmallVector<Value *> InitialInterleavePointersOps;
70027002
for (BasicBlock *BB : TheLoop->blocks())
70037003
for (Instruction &I : *BB) {
70047004
// Find all stores to invariant variables. Since they are going to sink
@@ -7016,10 +7016,13 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
70167016
if (Group->getInsertPos() == &I)
70177017
continue;
70187018
Value *PointerOp = getLoadStorePointerOperand(&I);
7019-
DeadInterleavePointerOps.insert(PointerOp);
7019+
InitialInterleavePointersOps.push_back(PointerOp);
70207020
}
70217021
}
70227022

7023+
SmallSetVector<Value *, 4> DeadInterleavePointerOps(
7024+
InitialInterleavePointersOps.rbegin(),
7025+
InitialInterleavePointersOps.rend());
70237026
// Mark ops feeding interleave group members as free, if they are only used
70247027
// by other dead computations.
70257028
for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {

llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,204 @@ loop:
182182
exit:
183183
ret void
184184
}
185+
186+
define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr %arg2) #0 {
187+
; CHECK-LABEL: define void @geps_feeding_interleave_groups_with_reuse(
188+
; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
189+
; CHECK-NEXT: [[ENTRY:.*]]:
190+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG1]], 1
191+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 30
192+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
193+
; CHECK: [[VECTOR_SCEVCHECK]]:
194+
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG2]], i64 8
195+
; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]])
196+
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
197+
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
198+
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
199+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
200+
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
201+
; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
202+
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 12
203+
; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]])
204+
; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
205+
; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
206+
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 0, [[MUL_RESULT3]]
207+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
208+
; CHECK-NEXT: [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
209+
; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
210+
; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[ARG2]], i64 4
211+
; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]])
212+
; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0
213+
; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1
214+
; CHECK-NEXT: [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT7]]
215+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]]
216+
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[SCEVGEP5]]
217+
; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW8]]
218+
; CHECK-NEXT: [[MUL9:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[ARG1]])
219+
; CHECK-NEXT: [[MUL_RESULT10:%.*]] = extractvalue { i64, i1 } [[MUL9]], 0
220+
; CHECK-NEXT: [[MUL_OVERFLOW11:%.*]] = extractvalue { i64, i1 } [[MUL9]], 1
221+
; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[MUL_RESULT10]]
222+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[MUL_RESULT10]]
223+
; CHECK-NEXT: [[TMP15:%.*]] = icmp ult ptr [[TMP14]], [[ARG2]]
224+
; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW11]]
225+
; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP4]], [[TMP8]]
226+
; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[TMP12]]
227+
; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[TMP16]]
228+
; CHECK-NEXT: br i1 [[TMP19]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
229+
; CHECK: [[VECTOR_MEMCHECK]]:
230+
; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[ARG1]], 4
231+
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 16
232+
; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[TMP21]]
233+
; CHECK-NEXT: [[TMP22:%.*]] = shl i64 [[ARG1]], 5
234+
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP22]], 32
235+
; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[TMP23]]
236+
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP13]]
237+
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG]], [[SCEVGEP12]]
238+
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
239+
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
240+
; CHECK: [[VECTOR_PH]]:
241+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
242+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
243+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
244+
; CHECK: [[VECTOR_BODY]]:
245+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
246+
; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 0
247+
; CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 5
248+
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[TMP25]]
249+
; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP24]], 4
250+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[TMP27]]
251+
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP26]], i32 0
252+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP29]], align 4
253+
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 0, i32 8>
254+
; CHECK-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 1, i32 9>
255+
; CHECK-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 2, i32 10>
256+
; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 3, i32 11>
257+
; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 4, i32 12>
258+
; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 5, i32 13>
259+
; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 6, i32 14>
260+
; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> <i32 7, i32 15>
261+
; CHECK-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[STRIDED_VEC]], [[STRIDED_VEC17]]
262+
; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x float> [[TMP30]], zeroinitializer
263+
; CHECK-NEXT: [[TMP32:%.*]] = fadd <2 x float> [[STRIDED_VEC14]], [[STRIDED_VEC18]]
264+
; CHECK-NEXT: [[TMP33:%.*]] = fmul <2 x float> [[TMP32]], zeroinitializer
265+
; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x float> [[STRIDED_VEC15]], [[STRIDED_VEC19]]
266+
; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer
267+
; CHECK-NEXT: [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]]
268+
; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer
269+
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12
270+
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3
271+
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
272+
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
273+
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
274+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP42]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
275+
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP39]], align 4
276+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
277+
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
278+
; CHECK-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
279+
; CHECK: [[MIDDLE_BLOCK]]:
280+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
281+
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
282+
; CHECK: [[SCALAR_PH]]:
283+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
284+
; CHECK-NEXT: br label %[[LOOP:.*]]
285+
; CHECK: [[LOOP]]:
286+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
287+
; CHECK-NEXT: [[SHL_IV_5:%.*]] = shl i64 [[IV]], 5
288+
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[SHL_IV_5]]
289+
; CHECK-NEXT: [[ADD_5:%.*]] = or disjoint i64 [[SHL_IV_5]], 16
290+
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[ADD_5]]
291+
; CHECK-NEXT: [[SHL_IV_4:%.*]] = shl i64 [[IV]], 4
292+
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[SHL_IV_4]]
293+
; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_1]], align 4
294+
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_2]], align 4
295+
; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_1]], [[L_2]]
296+
; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[ADD_1]], 0.000000e+00
297+
; CHECK-NEXT: store float [[MUL_1]], ptr [[GEP_3]], align 4
298+
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 4
299+
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_4]], align 4
300+
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr i8, ptr [[GEP_2]], i64 4
301+
; CHECK-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_5]], align 4
302+
; CHECK-NEXT: [[ADD_2:%.*]] = fadd float [[L_3]], [[L_4]]
303+
; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[ADD_2]], 0.000000e+00
304+
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 4
305+
; CHECK-NEXT: store float [[MUL_2]], ptr [[GEP_6]], align 4
306+
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 8
307+
; CHECK-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_7]], align 4
308+
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[GEP_2]], i64 8
309+
; CHECK-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_8]], align 4
310+
; CHECK-NEXT: [[ADD_3:%.*]] = fadd float [[L_5]], [[L_6]]
311+
; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[ADD_3]], 0.000000e+00
312+
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 8
313+
; CHECK-NEXT: store float [[MUL_3]], ptr [[GEP_9]], align 4
314+
; CHECK-NEXT: [[I27:%.*]] = getelementptr i8, ptr [[GEP_1]], i64 12
315+
; CHECK-NEXT: [[L_7:%.*]] = load float, ptr [[I27]], align 4
316+
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr i8, ptr [[GEP_2]], i64 12
317+
; CHECK-NEXT: [[L_8:%.*]] = load float, ptr [[GEP_10]], align 4
318+
; CHECK-NEXT: [[ADD_4:%.*]] = fadd float [[L_7]], [[L_8]]
319+
; CHECK-NEXT: [[MUL_4:%.*]] = fmul float [[ADD_4]], 0.000000e+00
320+
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr i8, ptr [[GEP_3]], i64 12
321+
; CHECK-NEXT: store float [[MUL_4]], ptr [[GEP_11]], align 4
322+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
323+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[ARG1]]
324+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
325+
; CHECK: [[EXIT]]:
326+
; CHECK-NEXT: ret void
327+
;
328+
entry:
329+
br label %loop
330+
331+
loop:
332+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
333+
%shl.iv.5 = shl i64 %iv, 5
334+
%gep.1 = getelementptr i8, ptr %arg, i64 %shl.iv.5
335+
%add.5 = or disjoint i64 %shl.iv.5, 16
336+
%gep.2 = getelementptr i8, ptr %arg, i64 %add.5
337+
%shl.iv.4 = shl i64 %iv, 4
338+
%gep.3 = getelementptr i8, ptr %arg2, i64 %shl.iv.4
339+
%l.1 = load float, ptr %gep.1, align 4
340+
%l.2 = load float, ptr %gep.2, align 4
341+
%add.1 = fadd float %l.1, %l.2
342+
%mul.1 = fmul float %add.1, 0.000000e+00
343+
store float %mul.1, ptr %gep.3, align 4
344+
%gep.4 = getelementptr i8, ptr %gep.1, i64 4
345+
%l.3 = load float, ptr %gep.4, align 4
346+
%gep.5 = getelementptr i8, ptr %gep.2, i64 4
347+
%l.4 = load float, ptr %gep.5, align 4
348+
%add.2 = fadd float %l.3, %l.4
349+
%mul.2 = fmul float %add.2, 0.000000e+00
350+
%gep.6 = getelementptr i8, ptr %gep.3, i64 4
351+
store float %mul.2, ptr %gep.6, align 4
352+
%gep.7 = getelementptr i8, ptr %gep.1, i64 8
353+
%l.5 = load float, ptr %gep.7, align 4
354+
%gep.8 = getelementptr i8, ptr %gep.2, i64 8
355+
%l.6 = load float, ptr %gep.8, align 4
356+
%add.3 = fadd float %l.5, %l.6
357+
%mul.3 = fmul float %add.3, 0.000000e+00
358+
%gep.9 = getelementptr i8, ptr %gep.3, i64 8
359+
store float %mul.3, ptr %gep.9, align 4
360+
%i27 = getelementptr i8, ptr %gep.1, i64 12
361+
%l.7 = load float, ptr %i27, align 4
362+
%gep.10 = getelementptr i8, ptr %gep.2, i64 12
363+
%l.8 = load float, ptr %gep.10, align 4
364+
%add.4 = fadd float %l.7, %l.8
365+
%mul.4 = fmul float %add.4, 0.000000e+00
366+
%gep.11 = getelementptr i8, ptr %gep.3, i64 12
367+
store float %mul.4, ptr %gep.11, align 4
368+
%iv.next = add i64 %iv, 1
369+
%ec = icmp eq i64 %iv, %arg1
370+
br i1 %ec, label %exit, label %loop
371+
372+
exit:
373+
ret void
374+
}
375+
376+
attributes #0 = { "target-features"="+sse4.2" }
377+
185378
;.
186379
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
187380
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
188381
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
189382
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
383+
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
384+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
190385
;.

0 commit comments

Comments
 (0)