Skip to content

Commit 4b1b51a

Browse files
committed
[SLP]Initial non-power-of-2 support (but still whole register) for reductions
Enables initial non-power-of-2 support (but still requires number of elements, forming whole registers) for reductions. Enables extra vectorization for MultiSource/Benchmarks/7zip/7zip-benchmark, CINT2006/464.h264ref and CFP2017rate/526.blender_r (checked for SSE2) Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #112361
1 parent 622e398 commit 4b1b51a

File tree

9 files changed

+381
-239
lines changed

9 files changed

+381
-239
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 341 additions & 182 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,17 @@
1717

1818
define void @s116_modified(ptr %a) {
1919
; CHECK-LABEL: @s116_modified(
20-
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
21-
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
20+
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
21+
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
2222
; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
2323
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
2424
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
25-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
2625
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
27-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
26+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
2827
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
29-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 1, i32 2>
28+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
3029
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
31-
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4
30+
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
3231
; CHECK-NEXT: ret void
3332
;
3433
%gep1 = getelementptr inbounds float, ptr %a, i64 1

llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
77
; NON-POWER-OF-2-NEXT: entry:
88
; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
99
; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
10-
; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
11-
; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
10+
; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
1211
; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
1312
; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
1413
; NON-POWER-OF-2-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,21 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
1111
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
1212
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
1313
; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
14-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
14+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1515
; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8
1616
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4
1717
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4
1818
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4
19-
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2
20-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
21-
; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0)
19+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
20+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
21+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3
22+
; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0)
2223
; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer
2324
; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
2425
; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
2526
; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
26-
; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4
27+
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
28+
; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
2729
; CHECK-NEXT: br label [[IF_END:%.*]]
2830
; CHECK: entry.if.end72_crit_edge:
2931
; CHECK-NEXT: br label [[IF_END72:%.*]]
@@ -46,8 +48,7 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
4648
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
4749
; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
4850
; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
49-
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
50-
; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
51+
; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4
5152
; CHECK-NEXT: ret void
5253
;
5354
entry:

llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -318,22 +318,14 @@ entry:
318318
define float @f(ptr nocapture readonly %x) {
319319
; CHECK-LABEL: @f(
320320
; CHECK-NEXT: entry:
321-
; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
322-
; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
323-
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
324-
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
325-
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
326-
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
321+
; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
322+
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
327323
; CHECK-NEXT: ret float [[OP_RDX]]
328324
;
329325
; THRESHOLD-LABEL: @f(
330326
; THRESHOLD-NEXT: entry:
331-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
332-
; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
333-
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
334-
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
335-
; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
336-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
327+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
328+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
337329
; THRESHOLD-NEXT: ret float [[OP_RDX]]
338330
;
339331
entry:
@@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
606598
; CHECK-LABEL: @loadadd31(
607599
; CHECK-NEXT: entry:
608600
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
609-
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
610-
; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
611-
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
601+
; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
612602
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
613603
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
614604
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
615605
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
616606
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
617607
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
618-
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
619-
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
620-
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
608+
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
621609
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
622610
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
623611
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
627615
; THRESHOLD-LABEL: @loadadd31(
628616
; THRESHOLD-NEXT: entry:
629617
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
630-
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
631-
; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
632-
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
618+
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
633619
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
634620
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
635621
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
636622
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
637623
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
638624
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
639-
; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
640-
; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
641-
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
625+
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
642626
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
643627
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
644628
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) {
10131013
; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
10141014
; THRESH-NEXT: br label [[PP:%.*]]
10151015
; THRESH: pp:
1016-
; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
1017-
; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
1018-
; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4)
1019-
; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0)
1020-
; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2)
1016+
; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
1017+
; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
1018+
; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
1019+
; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4)
1020+
; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6)
10211021
; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]])
10221022
; THRESH-NEXT: ret i32 [[TMP8]]
10231023
;

llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,19 @@ define void @e(ptr %c, i64 %0) {
77
; CHECK-NEXT: [[ENTRY:.*:]]
88
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C]], align 8
99
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96
10-
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112
11-
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8
10+
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104
11+
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
1212
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C]], align 8
13-
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
13+
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8
14+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> <i32 1, i32 0>
1415
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0
1516
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer
1617
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2
1718
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <6 x ptr> [[TMP7]], ptr [[TMP1]], i32 3
1819
; CHECK-NEXT: [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0)
1920
; CHECK-NEXT: [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4)
2021
; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64>
21-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
22+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
2223
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0
2324
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer
2425
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]]

llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
77
; NON-POW2-NEXT: entry:
88
; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
99
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
10-
; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
11-
; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
10+
; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
1211
; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
1312
; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
1413
; NON-POW2-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) {
88
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
99
; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3
1010
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4
11-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3
11+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
12+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2
1213
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1314
; CHECK: for.body:
1415
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4
1516
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
16-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
17-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2
18-
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0)
17+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
18+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1
1919
; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
20-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
21-
; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
22-
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]]
20+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
21+
; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer
22+
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
2323
; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
2424
; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[RESULT]], align 4
2525
; CHECK-NEXT: br label [[FOR_BODY]]

0 commit comments

Comments
 (0)