Skip to content

Commit 32994cc

Browse files
[SLP]Improve findReusedOrderedScalars and graph rotation.
Patch syncs the code in findReusedOrderedScalars with cost estimation/codegen. It tries to use similar logic to better determine best order. Before, it just tried to find previously vectorized node without checking if it is possible to use the vectorized value in the shuffle. Now it relies on the more generalized version. If it determines, that a single vector must be reordered (using same mechanism, as codegen and cost estimation), it generates better order. The comparison between new/ref ordering: Metric: SLP.NumVectorInstructions Program SLP.NumVectorInstructions results results0 diff test-suite :: MultiSource/Benchmarks/nbench/nbench.test 139.00 140.00 0.7% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 344.00 346.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 1293.00 1292.00 -0.1% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 5176.00 5170.00 -0.1% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 5173.00 5167.00 -0.1% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11692.00 11660.00 -0.3% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 1621.00 1615.00 -0.4% test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 795.00 792.00 -0.4% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26499.00 26338.00 -0.6% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 7343.00 7281.00 -0.8% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 1104.00 1094.00 -0.9% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 2216.00 2180.00 -1.6% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 787.00 637.00 -19.1% Less 0% is better. Most of the benchmarks see more vectorized code. The first ones just have shuffles removed. The ordering analysis still may require some improvements (e.g. for alternate nodes), but this one should be produce better results. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #77529
1 parent 3168af5 commit 32994cc

13 files changed

+447
-175
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 360 additions & 86 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,10 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
7676
; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
7777
; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
7878
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
79-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
79+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 0, i32 2>
8080
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8181
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
82-
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
82+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 1, i32 3>
8383
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8484
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
8585
; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
@@ -107,12 +107,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
107107
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
108108
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
109109
; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
110-
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
110+
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
111111
; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
112112
; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
113113
; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
114114
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
115-
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
115+
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
116116
; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
117117
; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
118118
; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
@@ -152,12 +152,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
152152
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
153153
; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
154154
; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
155-
; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
155+
; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
156156
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
157157
; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
158158
; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
159159
; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
160-
; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
160+
; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
161161
; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
162162
; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
163163
; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]
@@ -166,9 +166,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
166166
; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]]
167167
; CHECK: while.end166:
168168
; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ]
169-
; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
169+
; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
170170
; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4
171-
; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
171+
; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
172172
; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4
173173
; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1
174174
; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4

llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2) {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
88
; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
9-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
9+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
1010
; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]]
1111
; CHECK: for.cond15.preheader:
1212
; CHECK-NEXT: br label [[IF_END:%.*]]
@@ -26,14 +26,15 @@ define i32 @foo(i32 %v1, double %v2) {
2626
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
2727
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
2828
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
29-
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
29+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
30+
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]])
3031
; CHECK-NEXT: br label [[SW_EPILOG:%.*]]
3132
; CHECK: sw.bb195:
3233
; CHECK-NEXT: br label [[SW_EPILOG]]
3334
; CHECK: do.body:
3435
; CHECK-NEXT: unreachable
3536
; CHECK: sw.epilog:
36-
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ]
37+
; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP9]], [[SW_BB]] ]
3738
; CHECK-NEXT: ret i32 undef
3839
; CHECK: if.end.1:
3940
; CHECK-NEXT: br label [[FOR_COND15_1:%.*]]

llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,17 @@ define void @s116_modified(ptr %a) {
2020
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
2121
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
2222
; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
23-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
24-
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
25-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
26-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
27-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
28-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
29-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
30-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
31-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
32-
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
33-
; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4
23+
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
24+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
25+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
26+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
27+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
28+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
29+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
30+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
31+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
32+
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]]
33+
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4
3434
; CHECK-NEXT: ret void
3535
;
3636
%gep1 = getelementptr inbounds float, ptr %a, i64 1

llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -143,16 +143,17 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
143143
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
144144
; CHECK-NEXT: entry:
145145
; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
146-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
147-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP0]], i32 1
148-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
149-
; CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> zeroinitializer)
150-
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP2]], 0.000000e+00
146+
; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
147+
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
148+
; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
149+
; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
150+
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
151151
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
152152
; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
153-
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX163]], align 4
154-
; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
155-
; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX2_I_I_I278]], align 4
153+
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
154+
; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
155+
; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
156+
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
156157
; CHECK-NEXT: ret void
157158
;
158159
entry:
@@ -183,19 +184,18 @@ define i32 @reorder_indices_1(float %0) {
183184
; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
184185
; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
185186
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
186-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
187-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
187+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
188+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
188189
; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
189190
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
190191
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
191192
; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
192-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
193-
; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
194-
; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
195-
; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
196-
; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
197-
; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
198-
; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
193+
; CHECK-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
194+
; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
195+
; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
196+
; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
197+
; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
198+
; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
199199
; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
200200
; CHECK-NEXT: ret i32 0
201201
;

llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ define void @pr35497() local_unnamed_addr #0 {
6868
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
6969
; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
7070
; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1
71-
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0
72-
; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
73-
; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
74-
; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
71+
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
72+
; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
73+
; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
74+
; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
7575
; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
7676
; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
7777
; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
@@ -88,10 +88,10 @@ define void @pr35497() local_unnamed_addr #0 {
8888
; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
8989
; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
9090
; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1
91-
; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0
92-
; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], <i64 2, i64 2>
93-
; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 20, i64 20>
94-
; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
91+
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
92+
; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
93+
; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
94+
; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
9595
; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
9696
; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
9797
; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1

0 commit comments

Comments
 (0)