Skip to content

Commit 81e9ede

Browse files
committed
[VectorCombine] forward walk through instructions to improve chaining of transforms
This is split off from D79799, where I was proposing to fully iterate over a function until there are no more transforms. I suspect we are still going to want to do something like that eventually. But we can achieve the same gains much more efficiently on the current set of regression tests just by reversing the order in which we visit the instructions. This may also reduce the motivation for D79078, but we are still not getting the optimal pattern for a reduction.
1 parent 43017ce commit 81e9ede

File tree

4 files changed

+41
-42
lines changed

4 files changed

+41
-42
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -381,11 +381,10 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
381381
if (!DT.isReachableFromEntry(&BB))
382382
continue;
383383
// Do not delete instructions under here and invalidate the iterator.
384-
// Walk the block backwards for efficiency. We're matching a chain of
385-
// use->defs, so we're more likely to succeed by starting from the bottom.
384+
// Walk the block forwards to enable simple iterative chains of transforms.
386385
// TODO: It could be more efficient to remove dead instructions
387386
// iteratively in this loop rather than waiting until the end.
388-
for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
387+
for (Instruction &I : BB) {
389388
if (isa<DbgInfoIntrinsic>(I))
390389
continue;
391390
MadeChange |= foldExtractExtract(I, TTI);

llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,19 @@
55
target triple = "x86_64--"
66
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
77

8+
; FIXME: This should only need 2 'or' instructions.
9+
810
define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
911
; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
1012
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
1113
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
1214
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
13-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
14-
; CHECK-NEXT: [[Z2:%.*]] = extractelement <4 x i32> [[Z]], i32 2
15-
; CHECK-NEXT: [[Z012:%.*]] = or i32 [[TMP3]], [[Z2]]
16-
; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[Z]], i32 3
17-
; CHECK-NEXT: [[Z0123:%.*]] = or i32 [[Z012]], [[Z3]]
18-
; CHECK-NEXT: ret i32 [[Z0123]]
15+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
16+
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
17+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
18+
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
19+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
20+
; CHECK-NEXT: ret i32 [[TMP7]]
1921
;
2022
%z = and <4 x i32> %x, %y
2123
%z0 = extractelement <4 x i32> %z, i32 0
@@ -32,10 +34,10 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
3234
; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
3335
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
3436
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[X]]
35-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
36-
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
37-
; CHECK-NEXT: [[X210:%.*]] = add i32 [[TMP3]], [[X2]]
38-
; CHECK-NEXT: ret i32 [[X210]]
37+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
38+
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
39+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 0
40+
; CHECK-NEXT: ret i32 [[TMP5]]
3941
;
4042
%x0 = extractelement <4 x i32> %x, i32 0
4143
%x1 = extractelement <4 x i32> %x, i32 1
@@ -47,14 +49,14 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
4749

4850
define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
4951
; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
50-
; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 1
51-
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
52-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
53-
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
54-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
55-
; CHECK-NEXT: [[Y210:%.*]] = add i32 [[TMP3]], [[Y1]]
56-
; CHECK-NEXT: [[X2Y210:%.*]] = add i32 [[Y210]], [[Y2]]
57-
; CHECK-NEXT: ret i32 [[X2Y210]]
52+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
53+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
54+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
55+
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[Y]]
56+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP1]]
57+
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[TMP2]]
58+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
59+
; CHECK-NEXT: ret i32 [[TMP7]]
5860
;
5961
%y0 = extractelement <4 x i32> %y, i32 0
6062
%y1 = extractelement <4 x i32> %y, i32 1

llvm/test/Transforms/VectorCombine/X86/extract-binop.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -492,12 +492,12 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
492492
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[X:%.*]], [[Y:%.*]]
493493
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
494494
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
495-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
496-
; CHECK-NEXT: [[Z2:%.*]] = extractelement <4 x i32> [[Z]], i32 2
497-
; CHECK-NEXT: [[Z012:%.*]] = or i32 [[TMP3]], [[Z2]]
498-
; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[Z]], i32 3
499-
; CHECK-NEXT: [[Z0123:%.*]] = or i32 [[Z3]], [[Z012]]
500-
; CHECK-NEXT: ret i32 [[Z0123]]
495+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
496+
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
497+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
498+
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[TMP4]]
499+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0
500+
; CHECK-NEXT: ret i32 [[TMP7]]
501501
;
502502
%z = and <4 x i32> %x, %y
503503
%z0 = extractelement <4 x i32> %z, i32 0
@@ -514,10 +514,10 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
514514
; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
515515
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
516516
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[X]]
517-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
518-
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
519-
; CHECK-NEXT: [[X210:%.*]] = add i32 [[X2]], [[TMP3]]
520-
; CHECK-NEXT: ret i32 [[X210]]
517+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
518+
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]]
519+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i64 0
520+
; CHECK-NEXT: ret i32 [[TMP5]]
521521
;
522522
%x0 = extractelement <4 x i32> %x, i32 0
523523
%x1 = extractelement <4 x i32> %x, i32 1
@@ -531,12 +531,12 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x
531531
; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
532532
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
533533
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[Y]]
534-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
535-
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[Y]], i32 2
536-
; CHECK-NEXT: [[Y210:%.*]] = add i32 [[Y2]], [[TMP3]]
537-
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
538-
; CHECK-NEXT: [[X2Y210:%.*]] = add i32 [[X2]], [[Y210]]
539-
; CHECK-NEXT: ret i32 [[X2Y210]]
534+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
535+
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[TMP2]]
536+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
537+
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
538+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i64 0
539+
; CHECK-NEXT: ret i32 [[TMP7]]
540540
;
541541
%y0 = extractelement <4 x i32> %y, i32 0
542542
%y1 = extractelement <4 x i32> %y, i32 1

llvm/test/Transforms/VectorCombine/X86/insert-binop.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,9 @@ define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
5151
define <2 x i64> @ins1_ins1_iterate(i64 %w, i64 %x, i64 %y, i64 %z) {
5252
; CHECK-LABEL: @ins1_ins1_iterate(
5353
; CHECK-NEXT: [[S0_SCALAR:%.*]] = sub i64 [[W:%.*]], [[X:%.*]]
54-
; CHECK-NEXT: [[S0:%.*]] = insertelement <2 x i64> undef, i64 [[S0_SCALAR]], i64 1
55-
; CHECK-NEXT: [[I2:%.*]] = insertelement <2 x i64> undef, i64 [[Y:%.*]], i32 1
56-
; CHECK-NEXT: [[S1:%.*]] = or <2 x i64> [[S0]], [[I2]]
57-
; CHECK-NEXT: [[I3:%.*]] = insertelement <2 x i64> undef, i64 [[Z:%.*]], i32 1
58-
; CHECK-NEXT: [[S2:%.*]] = shl <2 x i64> [[I3]], [[S1]]
54+
; CHECK-NEXT: [[S1_SCALAR:%.*]] = or i64 [[S0_SCALAR]], [[Y:%.*]]
55+
; CHECK-NEXT: [[S2_SCALAR:%.*]] = shl i64 [[Z:%.*]], [[S1_SCALAR]]
56+
; CHECK-NEXT: [[S2:%.*]] = insertelement <2 x i64> undef, i64 [[S2_SCALAR]], i64 1
5957
; CHECK-NEXT: ret <2 x i64> [[S2]]
6058
;
6159
%i0 = insertelement <2 x i64> undef, i64 %w, i64 1

0 commit comments

Comments
 (0)