Skip to content

Commit 29e75d2

Browse files
committed
[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets are still seeing some regressions due to these being implicitly widened to 512-bit ops in isel patterns and not in the DAG, so on those targets I've limited this to just 512-bit vectors for now. We still use lowerBuildVectorAsBroadcast on AVX512 targets if we're optimizing for size.
1 parent c59cc2b commit 29e75d2

File tree

111 files changed

+93989
-97482
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+93989
-97482
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 260 additions & 1 deletion
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7646,6 +7646,21 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
76467646
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
76477647
"Unsupported vector type for broadcast.");
76487648

7649+
// When optimizing for size, generate up to 5 extra bytes for a broadcast
7650+
// instruction to save 8 or more bytes of constant pool data.
7651+
// TODO: If multiple splats are generated to load the same constant,
7652+
// it may be detrimental to overall size. There needs to be a way to detect
7653+
// that condition to know if this is truly a size win.
7654+
bool OptForSize = DAG.shouldOptForSize();
7655+
7656+
// On AVX512VL targets we're better off keeping the full width constant load
7657+
// and letting X86FixupVectorConstantsPass handle conversion to
7658+
// broadcast/broadcast-fold.
7659+
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
7660+
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
7661+
BVOp->isConstant() && !OptForSize)
7662+
return SDValue();
7663+
76497664
// See if the build vector is a repeating sequence of scalars (inc. splat).
76507665
SDValue Ld;
76517666
BitVector UndefElements;
@@ -7771,12 +7786,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
77717786
unsigned ScalarSize = Ld.getValueSizeInBits();
77727787
bool IsGE256 = (VT.getSizeInBits() >= 256);
77737788

7774-
// When optimizing for size, generate up to 5 extra bytes for a broadcast
7775-
// instruction to save 8 or more bytes of constant pool data.
7776-
// TODO: If multiple splats are generated to load the same constant,
7777-
// it may be detrimental to overall size. There needs to be a way to detect
7778-
// that condition to know if this is truly a size win.
7779-
bool OptForSize = DAG.shouldOptForSize();
77807789

77817790
// Handle broadcasting a single constant scalar from the constant pool
77827791
// into a vector.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,10 +1478,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14781478
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14791479
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14801480
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1481-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1482-
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1483-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1484-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1481+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1482+
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
14851483
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
14861484
; AVX512F-NEXT: vzeroupper
14871485
; AVX512F-NEXT: retq
@@ -1493,10 +1491,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14931491
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14941492
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14951493
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1496-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1497-
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1498-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1499-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1494+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1495+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
15001496
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
15011497
; AVX512DQ-NEXT: vzeroupper
15021498
; AVX512DQ-NEXT: retq
@@ -3235,10 +3231,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32353231
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
32363232
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32373233
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3238-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3239-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3240-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3241-
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3234+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3235+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32423236
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32433237
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
32443238
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3252,10 +3246,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32523246
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
32533247
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32543248
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3255-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3256-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3257-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3258-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3249+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3250+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32593251
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32603252
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
32613253
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3762,8 +3754,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37623754
; AVX512BW: # %bb.0:
37633755
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
37643756
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
3765-
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
3766-
; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
3757+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,0,29,30,31]
37673758
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
37683759
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
37693760
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4015,8 +4006,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
40154006
; AVX512BW: # %bb.0:
40164007
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
40174008
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
4018-
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31]
4019-
; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
4009+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31]
40204010
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1
40214011
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
40224012
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,21 +1205,19 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
12051205
;
12061206
; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12071207
; AVX512F: # %bb.0:
1208-
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1209-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1210-
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
1211-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
1208+
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1209+
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1210+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12121211
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12131212
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
12141213
; AVX512F-NEXT: vzeroupper
12151214
; AVX512F-NEXT: retq
12161215
;
12171216
; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12181217
; AVX512DQ: # %bb.0:
1219-
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1220-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1221-
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
1222-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
1218+
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1219+
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1220+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12231221
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12241222
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
12251223
; AVX512DQ-NEXT: vzeroupper
@@ -2574,10 +2572,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
25742572
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
25752573
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
25762574
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2577-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2578-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
2579-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2))
2580-
; AVX512F-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2575+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
2576+
; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
25812577
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
25822578
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
25832579
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2589,10 +2585,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
25892585
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
25902586
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
25912587
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2592-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2593-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
2594-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2))
2595-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2588+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
2589+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
25962590
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
25972591
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
25982592
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)

llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,8 +1524,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() {
15241524
;
15251525
; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const:
15261526
; X64-AVX512VL: # %bb.0:
1527-
; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4]
1528-
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
1527+
; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4]
1528+
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A]
15291529
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
15301530
; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
15311531
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
@@ -1581,8 +1581,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() {
15811581
;
15821582
; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const:
15831583
; X64-AVX512VL: # %bb.0:
1584-
; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,4,4,4]
1585-
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
1584+
; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4]
1585+
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A]
15861586
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
15871587
; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
15881588
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte

llvm/test/CodeGen/X86/avx512-arith.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
303303
;
304304
; AVX512VL-LABEL: imulq128_bcast:
305305
; AVX512VL: # %bb.0:
306-
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
306+
; AVX512VL-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086]
307307
; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
308308
; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
309309
; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0

0 commit comments

Comments
 (0)