Skip to content

Commit e588fd9

Browse files
committed
Revert "[SLP]Vectorize gathered loads"
This reverts commit dc2deb5 to fix the issue reported in https://lab.llvm.org/buildbot/#/builders/25/builds/2668
1 parent c64277d commit e588fd9

21 files changed

+835
-1286
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 118 additions & 753 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ define void @s116_modified(ptr %a) {
2323
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
2424
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
2525
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
26-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
26+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
2727
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
2828
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
29-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 1, i32 2>
30-
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
31-
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4
29+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
30+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
31+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
32+
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]]
33+
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4
3234
; CHECK-NEXT: ret void
3335
;
3436
%gep1 = getelementptr inbounds float, ptr %a, i64 1

llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,17 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POWER-OF-2 %s
3-
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POWER-OF-2 %s
2+
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
3+
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
44

55
define void @vec3_vectorize_call(ptr %Colour, float %0) {
6-
; NON-POWER-OF-2-LABEL: @vec3_vectorize_call(
7-
; NON-POWER-OF-2-NEXT: entry:
8-
; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
9-
; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
10-
; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
11-
; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
12-
; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
13-
; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
14-
; NON-POWER-OF-2-NEXT: ret void
15-
;
16-
; POWER-OF-2-LABEL: @vec3_vectorize_call(
17-
; POWER-OF-2-NEXT: entry:
18-
; POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
19-
; POWER-OF-2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
20-
; POWER-OF-2-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4
21-
; POWER-OF-2-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2
22-
; POWER-OF-2-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00)
23-
; POWER-OF-2-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4
24-
; POWER-OF-2-NEXT: ret void
6+
; CHECK-LABEL: @vec3_vectorize_call(
7+
; CHECK-NEXT: entry:
8+
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
9+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
10+
; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4
11+
; CHECK-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2
12+
; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00)
13+
; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4
14+
; CHECK-NEXT: ret void
2515
;
2616
entry:
2717
%1 = load float, ptr %Colour, align 4

llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -245,24 +245,34 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
245245
; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1
246246
; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
247247
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
248+
; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1
249+
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10
250+
; CHECK-NEXT: [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1
248251
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11
249252
; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
250253
; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
251254
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
252255
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
253256
; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
254-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1
255-
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
256-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
257-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
257+
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
258+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
259+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9
260+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10
258261
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
259262
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0)
260-
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12)
263+
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12)
261264
; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
262-
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
263-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> zeroinitializer
264-
; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP11]]
265-
; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[PTR]], align 2
265+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
266+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
267+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 9, i32 10, i32 11>
268+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9
269+
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10
270+
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11
271+
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
272+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
273+
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer
274+
; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]]
275+
; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2
266276
; CHECK-NEXT: ret void
267277
;
268278
entry:

llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n
1010
; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(
1111
; CHECK-NEXT: entry:
1212
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__FIRST:%.*]], align 8
13-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[__LAST:%.*]], align 8
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[TMP0]], i32 0
13+
; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__FIRST]], i64 0, i32 1
14+
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__LAST:%.*]], align 8
15+
; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__LAST]], i64 0, i32 1
16+
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_M_FIRST3_I_I83]], align 8
1517
; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]]
1618
; CHECK: while.cond.i.preheader:
1719
; CHECK-NEXT: br label [[WHILE_COND_I:%.*]]
@@ -20,8 +22,10 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n
2022
; CHECK: while.body.i:
2123
; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]]
2224
; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit:
23-
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x ptr> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ]
24-
; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[__FIRST]], align 8
25+
; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ]
26+
; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ]
27+
; CHECK-NEXT: store ptr [[TMP4]], ptr [[__FIRST]], align 8
28+
; CHECK-NEXT: store ptr [[TMP3]], ptr [[_M_FIRST3_I_I]], align 8
2529
; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]]
2630
; CHECK: if.then.i55:
2731
; CHECK-NEXT: br label [[WHILE_COND]]

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -837,18 +837,21 @@ define i32 @maxi8_mutiple_uses(i32) {
837837
; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
838838
; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
839839
; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
840-
; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
841-
; THRESH-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
842-
; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0
843-
; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1
844-
; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]]
845-
; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]
846-
; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
847-
; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
848-
; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
849-
; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]]
850-
; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4
851-
; THRESH-NEXT: store i32 [[TMP16]], ptr @var, align 8
840+
; THRESH-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
841+
; THRESH-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
842+
; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
843+
; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
844+
; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
845+
; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
846+
; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1
847+
; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
848+
; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
849+
; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
850+
; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
851+
; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
852+
; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]]
853+
; THRESH-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4
854+
; THRESH-NEXT: store i32 [[TMP19]], ptr @var, align 8
852855
; THRESH-NEXT: ret i32 [[OP_RDX5]]
853856
;
854857
%2 = load i32, ptr @arr, align 16

llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,21 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
100100

101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
103+
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16
104+
; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2
105+
; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8
106+
; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32
107+
; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float
108+
; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0
105109
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
108-
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110-
; CHECK-NEXT: ret <4 x float> [[TMP5]]
110+
; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32
111+
; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float
112+
; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
113+
; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32
114+
; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float
115+
; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
116+
; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
117+
; CHECK-NEXT: ret <4 x float> [[T15]]
111118
;
112119
%t1 = load i64, ptr %x, align 16
113120
%t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2

llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,21 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
100100

101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
103+
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16
104+
; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2
105+
; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8
106+
; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32
107+
; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float
108+
; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
105109
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
108-
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
109-
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
110-
; CHECK-NEXT: ret <4 x float> [[TMP5]]
110+
; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32
111+
; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float
112+
; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
113+
; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32
114+
; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float
115+
; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
116+
; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
117+
; CHECK-NEXT: ret <4 x float> [[T15]]
111118
;
112119
%t1 = load i64, ptr %x, align 16
113120
%t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2

0 commit comments

Comments
 (0)