Skip to content

Commit 014db29

Browse files
committed
Enable vectorizer-maximize-bandwidth by default.
Summary: vectorizer-maximize-bandwidth is generally useful in terms of performance. I've tested the impact of changing this to default on speccpu benchmarks on sandybridge machines. The result shows non-negative impact: spec/2006/fp/C++/444.namd 26.84 -0.31% spec/2006/fp/C++/447.dealII 46.19 +0.89% spec/2006/fp/C++/450.soplex 42.92 -0.44% spec/2006/fp/C++/453.povray 38.57 -2.25% spec/2006/fp/C/433.milc 24.54 -0.76% spec/2006/fp/C/470.lbm 41.08 +0.26% spec/2006/fp/C/482.sphinx3 47.58 -0.99% spec/2006/int/C++/471.omnetpp 22.06 +1.87% spec/2006/int/C++/473.astar 22.65 -0.12% spec/2006/int/C++/483.xalancbmk 33.69 +4.97% spec/2006/int/C/400.perlbench 33.43 +1.70% spec/2006/int/C/401.bzip2 23.02 -0.19% spec/2006/int/C/403.gcc 32.57 -0.43% spec/2006/int/C/429.mcf 40.35 +0.27% spec/2006/int/C/445.gobmk 26.96 +0.06% spec/2006/int/C/456.hmmer 24.4 +0.19% spec/2006/int/C/458.sjeng 27.91 -0.08% spec/2006/int/C/462.libquantum 57.47 -0.20% spec/2006/int/C/464.h264ref 46.52 +1.35% geometric mean +0.29% The regression on 453.povray seems real, but is due to secondary effects as all hot functions are bit-identical with and without the flag. I started this patch to consult upstream opinions on this. It will be greatly appreciated if the community can help test the performance impact of this change on other architectures so that we can decide if this should be target-dependent. Reviewers: hfinkel, mkuper, davidxl, chandlerc Reviewed By: chandlerc Subscribers: rengolin, sanjoy, javed.absar, bjope, dorit, magabari, RKSimon, llvm-commits, mzolotukhin Differential Revision: https://reviews.llvm.org/D33341 llvm-svn: 305960
1 parent 7b87161 commit 014db29

12 files changed

+77
-68
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
122122
"value."));
123123

124124
static cl::opt<bool> MaximizeBandwidth(
125-
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
125+
"vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden,
126126
cl::desc("Maximize bandwidth when selecting vectorization factor which "
127127
"will be determined by the smallest type in loop."));
128128

llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,9 @@ for.body: ; preds = %entry, %for.body
8888
}
8989

9090
; CHECK-LABEL: @add_c(
91-
; CHECK: load <8 x i8>, <8 x i8>*
92-
; CHECK: add <8 x i16>
93-
; CHECK: store <8 x i16>
91+
; CHECK: load <16 x i8>, <16 x i8>*
92+
; CHECK: add <16 x i16>
93+
; CHECK: store <16 x i16>
9494
; Function Attrs: nounwind
9595
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
9696
entry:
@@ -116,9 +116,9 @@ for.body: ; preds = %entry, %for.body
116116
}
117117

118118
; CHECK-LABEL: @add_d(
119-
; CHECK: load <4 x i16>
120-
; CHECK: add nsw <4 x i32>
121-
; CHECK: store <4 x i32>
119+
; CHECK: load <8 x i16>
120+
; CHECK: add nsw <8 x i32>
121+
; CHECK: store <8 x i32>
122122
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
123123
entry:
124124
%cmp7 = icmp sgt i32 %len, 0
@@ -187,16 +187,16 @@ for.body: ; preds = %for.body, %for.body
187187
}
188188

189189
; CHECK-LABEL: @add_f
190-
; CHECK: load <8 x i16>
191-
; CHECK: trunc <8 x i16>
192-
; CHECK: shl <8 x i8>
193-
; CHECK: add <8 x i8>
194-
; CHECK: or <8 x i8>
195-
; CHECK: mul <8 x i8>
196-
; CHECK: and <8 x i8>
197-
; CHECK: xor <8 x i8>
198-
; CHECK: mul <8 x i8>
199-
; CHECK: store <8 x i8>
190+
; CHECK: load <16 x i16>
191+
; CHECK: trunc <16 x i16>
192+
; CHECK: shl <16 x i8>
193+
; CHECK: add <16 x i8>
194+
; CHECK: or <16 x i8>
195+
; CHECK: mul <16 x i8>
196+
; CHECK: and <16 x i8>
197+
; CHECK: xor <16 x i8>
198+
; CHECK: mul <16 x i8>
199+
; CHECK: store <16 x i8>
200200
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
201201
entry:
202202
%cmp.32 = icmp sgt i32 %len, 0

llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,16 +123,16 @@ for.body:
123123
; }
124124
;
125125
; CHECK: vector.body:
126-
; CHECK: phi <8 x i16>
127-
; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
128-
; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
129-
; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
130-
; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
131-
; CHECK: add <8 x i16>
132-
; CHECK: add <8 x i16>
126+
; CHECK: phi <16 x i16>
127+
; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
128+
; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
129+
; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
130+
; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
131+
; CHECK: add <16 x i16>
132+
; CHECK: add <16 x i16>
133133
;
134134
; CHECK: middle.block:
135-
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
135+
; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
136136
; CHECK: zext i16 [[Rdx]] to i32
137137
;
138138
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {

llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ define void @example1() nounwind uwtable ssp {
3535
}
3636

3737
;CHECK-LABEL: @example10b(
38-
;CHECK: load <4 x i16>
39-
;CHECK: sext <4 x i16>
40-
;CHECK: store <4 x i32>
38+
;CHECK: load <8 x i16>
39+
;CHECK: sext <8 x i16>
40+
;CHECK: store <8 x i32>
4141
;CHECK: ret void
4242
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
4343
br label %1

llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ target triple = "x86_64-apple-macosx"
99

1010
; If we need to scalarize the fptoui and then use inserts to build up the
1111
; vector again, then there is certainly no value in going 256-bit wide.
12-
; CHECK-NOT: vpinsrd
12+
; But as we default to maximize bandwidth, we should convert it to 256-bit
13+
; anyway.
14+
; CHECK: vpinsrd
1315

1416
define void @convert() {
1517
entry:

llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,16 @@ define void @example1() nounwind uwtable ssp {
4444
ret void
4545
}
4646

47-
; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
4847
;CHECK-LABEL: @example10b(
49-
;CHECK: load <4 x i16>
50-
;CHECK: sext <4 x i16>
51-
;CHECK: store <4 x i32>
48+
;CHECK: load <8 x i16>
49+
;CHECK: sext <8 x i16>
50+
;CHECK: store <8 x i32>
5251
;CHECK: ret void
5352
;UNROLL-LABEL: @example10b(
54-
;UNROLL: load <4 x i16>
55-
;UNROLL: load <4 x i16>
56-
;UNROLL: store <4 x i32>
57-
;UNROLL: store <4 x i32>
53+
;UNROLL: load <8 x i16>
54+
;UNROLL: load <8 x i16>
55+
;UNROLL: store <8 x i32>
56+
;UNROLL: store <8 x i32>
5857
;UNROLL: ret void
5958
define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
6059
br label %1

llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -260,20 +260,28 @@ for.end: ; preds = %for.cond
260260
; }
261261
;}
262262

263-
;AVX-LABEL: @foo3
264-
;AVX: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
265-
;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
266-
;AVX: sitofp <4 x i32> %wide.load to <4 x double>
267-
;AVX: fadd <4 x double>
268-
;AVX: call void @llvm.masked.store.v4f64.p0v4f64
269-
;AVX: ret void
263+
;AVX1-LABEL: @foo3
264+
;AVX1: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
265+
;AVX1: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
266+
;AVX1: sitofp <4 x i32> %wide.load to <4 x double>
267+
;AVX1: fadd <4 x double>
268+
;AVX1: call void @llvm.masked.store.v4f64.p0v4f64
269+
;AVX1: ret void
270+
271+
;AVX2-LABEL: @foo3
272+
;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
273+
;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
274+
;AVX2: sitofp <8 x i32> %wide.load to <8 x double>
275+
;AVX2: fadd <8 x double>
276+
;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
277+
;AVX2: ret void
270278

271279
;AVX512-LABEL: @foo3
272-
;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
273-
;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
274-
;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
275-
;AVX512: fadd <8 x double>
276-
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
280+
;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100,
281+
;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
282+
;AVX512: sitofp <16 x i32> %wide.load to <16 x double>
283+
;AVX512: fadd <16 x double>
284+
;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
277285
;AVX512: ret void
278286

279287

@@ -502,19 +510,19 @@ for.end: ; preds = %for.cond
502510
; }
503511
;}
504512
;AVX2-LABEL: @foo6
505-
;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
506-
;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
507-
;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
508-
;AVX2: fadd <4 x double>
509-
;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
513+
;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer
514+
;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
515+
;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
516+
;AVX2: fadd <8 x double>
517+
;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
510518
;AVX2: ret void
511519

512520
;AVX512-LABEL: @foo6
513-
;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
514-
;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
515-
;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
516-
;AVX512: fadd <8 x double>
517-
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
521+
;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer
522+
;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32> <i32 15, i32 14, i32 13, i32 12
523+
;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
524+
;AVX512: fadd <16 x double>
525+
;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
518526
;AVX512: ret void
519527

520528

@@ -582,8 +590,8 @@ for.end: ; preds = %for.cond
582590
; }
583591

584592
;AVX512-LABEL: @foo7
585-
;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
586-
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
593+
;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>*
594+
;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
587595
;AVX512: ret void
588596

589597
define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
@@ -654,8 +662,8 @@ for.end: ; preds = %for.cond
654662
;}
655663

656664
;AVX512-LABEL: @foo8
657-
;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
658-
;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
665+
;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* %
666+
;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
659667
;AVX512: ret void
660668

661669
define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {

llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
44
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
5-
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
5+
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2)
66

77
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
88
target triple = "x86_64-apple-macosx10.10.0"

llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
55
; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
6-
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
6+
; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300)
77

88
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
99
target triple = "x86_64-apple-macosx10.10.0"

llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target triple = "i386-apple-darwin"
77
define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
88
; CHECK-LABEL: @test1(
99
; CHECK: preheader
10-
; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
10+
; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0
1111
; CHECK: vector.memcheck
1212

1313
bb:

llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
; DEBUG-OUTPUT-NOT: .loc
77
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
88

9-
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
9+
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
1010
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
1111
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
1212

llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
; DEBUG-OUTPUT-NOT: .loc
77
; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
88

9-
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
9+
; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
1010
; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
1111
; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
1212

0 commit comments

Comments
 (0)