Skip to content

Commit 37ae4ad

Browse files
[SLP] Support minbitwidth analysis for buildvector nodes.
Metric: size..text

Program | size..text exp | ref | diff
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test | 42906.00 | 42986.00 | 0.2%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test | 42909.00 | 42989.00 | 0.2%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test | 664581.00 | 664661.00 | 0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test | 664581.00 | 664661.00 | 0.0%

Less is better.

Replaces `buildvector <p x in> + trunc <p x in> to <p x im>` sequences with `buildvector <p x im> of { trunc in to im }` scalars, which is free in most cases and results in better code.

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: #88504
1 parent 040b5a1 commit 37ae4ad

File tree

6 files changed

+217
-137
lines changed

6 files changed

+217
-137
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 204 additions & 124 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,7 @@ define void @h() {
55
; CHECK-LABEL: define void @h() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
8-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
9-
; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
10-
; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
11-
; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
12-
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
13-
; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
8+
; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
149
; CHECK-NEXT: ret void
1510
;
1611
entry:

llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ define void @h() {
55
; CHECK-LABEL: define void @h() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
8-
; CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
8+
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 0 to i1
9+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP6]], i32 4
910
; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
1011
; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
1112
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>

llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@ define void @h() {
55
; CHECK-LABEL: define void @h() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
8-
; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
8+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i1
9+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 4
10+
; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
11+
; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]]
12+
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
13+
; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
914
; CHECK-NEXT: ret void
1015
;
1116
entry:

llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) {
77
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i128
88
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0
99
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
10-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i128> poison, i128 [[TMP2]], i32 0
11-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer
12-
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16>
10+
; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16
11+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0
12+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer
1313
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]]
1414
; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]])
1515
; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP9]] to i64

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ define void @test() {
1616
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
1717
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
1818
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
19-
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
20-
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
19+
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
2120
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
2221
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
2322
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])

0 commit comments

Comments
 (0)