Skip to content

Commit bf2be9b

Browse files
committed
[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets still see some regressions because 128/256-bit ops are implicitly widened to 512-bit ops in the isel patterns rather than in the DAG, so on those targets I've limited this to just 512-bit vectors for now.
1 parent c938436 commit bf2be9b

File tree

107 files changed

+93647
-97174
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+93647
-97174
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 260 additions & 1 deletion
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7512,6 +7512,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
75127512
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
75137513
"Unsupported vector type for broadcast.");
75147514

7515+
// On AVX512VL targets we're better off keeping the full width constant load
7516+
// and letting X86FixupVectorConstantsPass handle conversion to
7517+
// broadcast/broadcast-fold.
7518+
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
7519+
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
7520+
BVOp->isConstant())
7521+
return SDValue();
7522+
75157523
// See if the build vector is a repeating sequence of scalars (inc. splat).
75167524
SDValue Ld;
75177525
BitVector UndefElements;

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,10 +1480,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14801480
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14811481
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14821482
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484-
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1485-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1486-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1483+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1484+
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
14871485
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
14881486
; AVX512F-NEXT: vzeroupper
14891487
; AVX512F-NEXT: retq
@@ -1495,10 +1493,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14951493
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14961494
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14971495
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499-
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1500-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1501-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1496+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1497+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
15021498
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
15031499
; AVX512DQ-NEXT: vzeroupper
15041500
; AVX512DQ-NEXT: retq
@@ -3253,10 +3249,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32533249
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
32543250
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32553251
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3256-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3257-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3258-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3259-
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3252+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3253+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32603254
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32613255
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
32623256
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3270,10 +3264,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32703264
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
32713265
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32723266
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3273-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3274-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3275-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3276-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3267+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3268+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32773269
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32783270
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
32793271
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,21 +1211,19 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
12111211
;
12121212
; AVX512F-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12131213
; AVX512F: # %bb.0:
1214-
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1215-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1216-
; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
1217-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
1214+
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
1215+
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1216+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12181217
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12191218
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
12201219
; AVX512F-NEXT: vzeroupper
12211220
; AVX512F-NEXT: retq
12221221
;
12231222
; AVX512DQ-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2:
12241223
; AVX512DQ: # %bb.0:
1225-
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
1226-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1227-
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
1228-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm0 ^ mem))
1224+
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
1225+
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = mem[0,1,0,1]
1226+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12291227
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
12301228
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
12311229
; AVX512DQ-NEXT: vzeroupper
@@ -2622,10 +2620,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
26222620
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
26232621
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
26242622
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2625-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2626-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
2627-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2))
2628-
; AVX512F-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2623+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
2624+
; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
26292625
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
26302626
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
26312627
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2637,10 +2633,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
26372633
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0
26382634
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
26392635
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1]
2640-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2641-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
2642-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2))
2643-
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm3, %ymm0
2636+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
2637+
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
26442638
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
26452639
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
26462640
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)

llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,8 +1524,8 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() {
15241524
;
15251525
; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const:
15261526
; X64-AVX512VL: # %bb.0:
1527-
; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4]
1528-
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
1527+
; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4]
1528+
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A]
15291529
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
15301530
; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
15311531
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
@@ -1581,8 +1581,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() {
15811581
;
15821582
; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const:
15831583
; X64-AVX512VL: # %bb.0:
1584-
; X64-AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,4,4,4]
1585-
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A]
1584+
; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4]
1585+
; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A]
15861586
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
15871587
; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A]
15881588
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte

llvm/test/CodeGen/X86/avx512-arith.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
303303
;
304304
; AVX512VL-LABEL: imulq128_bcast:
305305
; AVX512VL: # %bb.0:
306-
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
306+
; AVX512VL-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086]
307307
; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
308308
; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
309309
; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0

0 commit comments

Comments
 (0)