Commit 1e31a45

[X86] lowerShuffleWithPERMV - commute VPERMV3 shuffles so any load is on the RHS
This helps ensure we lower to VPERMI2/T2 instructions whose index argument can then be commuted between the VPERMT2 and VPERMI2 forms. Prep work for #79799
1 parent dbb21df commit 1e31a45
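
Background for the change below: swapping the two shuffle inputs only works if the shuffle mask is commuted at the same time, so that every index that selected a lane from the first input now selects the corresponding lane from the second input and vice versa. The following is an illustrative standalone sketch of that remapping (an assumption about the effect of ShuffleVectorSDNode::commuteMask, not LLVM's actual implementation); the worked 4-element case matches the partial_permute.ll update further down, where the index vector [0,4,6,1] becomes [4,0,2,5] once the load is moved to the right-hand side.

// Illustrative sketch only (assumption: models what commuting a two-input
// shuffle mask does; this is not the LLVM implementation itself).
#include <cstdio>
#include <vector>

// An index referring to input 0 (Idx < NumElts) must be remapped to refer to
// input 1 (Idx + NumElts) and vice versa; negative sentinel indices (undef
// lanes) are left untouched.
static void commuteShuffleMask(std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int &Idx : Mask) {
    if (Idx < 0)
      continue; // undef lane
    Idx = (Idx < NumElts) ? Idx + NumElts : Idx - NumElts;
  }
}

int main() {
  // Worked example matching the partial_permute.ll diff below:
  // [0,4,6,1] commutes to [4,0,2,5].
  std::vector<int> Mask = {0, 4, 6, 1};
  commuteShuffleMask(Mask);
  for (int Idx : Mask)
    std::printf("%d ", Idx); // prints: 4 0 2 5
  std::printf("\n");
  return 0;
}

This is why the updated tests below change the index constants alongside the operand order of the vpermi2*/vpermt2* instructions.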

File tree

7 files changed: +86, -105 lines


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 10 additions & 2 deletions
@@ -14300,9 +14300,17 @@ static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
 // the active subvector is extracted.
 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
-                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
-                                     const X86Subtarget &Subtarget,
+                                     ArrayRef<int> OriginalMask, SDValue V1,
+                                     SDValue V2, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
+  // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
+  SmallVector<int, 32> Mask(OriginalMask);
+  if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
+      !isShuffleFoldableLoad(V2)) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  }
+
   MVT MaskVT = VT.changeTypeToInteger();
   SDValue MaskNode;
   MVT ShuffleVT = VT;

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 12 additions & 12 deletions
@@ -2540,9 +2540,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,6,1]
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5]
+; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
@@ -2556,10 +2556,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5]
 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
@@ -4398,9 +4398,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
+; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
@@ -4423,11 +4423,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]
+; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [7,0,6,2]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;

llvm/test/CodeGen/X86/insert-into-constant-vector.ll

Lines changed: 2 additions & 3 deletions
@@ -447,9 +447,8 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 ; X64-AVX512F-LABEL: elt5_v8i64:
 ; X64-AVX512F: # %bb.0:
 ; X64-AVX512F-NEXT: vmovq %rdi, %xmm1
-; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7]
-; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [42,1,2,3,4,0,6,7]
-; X64-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,10,11,12,0,14,15]
+; X64-AVX512F-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
 ; X64-AVX512F-NEXT: retq
 %ins = insertelement <8 x i64> <i64 42, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, i64 %x, i32 5
 ret <8 x i64> %ins

llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll

Lines changed: 2 additions & 4 deletions
@@ -2516,10 +2516,8 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) {
 ;
 ; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm2
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4]
-; AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1
-; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [4,2,6,0]
+; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
 %1 = load <4 x float>, ptr %a1
 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>

llvm/test/CodeGen/X86/vector-shuffle-avx512.ll

Lines changed: 18 additions & 30 deletions
@@ -481,37 +481,33 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) {
 ; X86-AVX512-LABEL: test_masked_permps_v8f32:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vmovaps (%eax), %ymm2
-; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
-; X86-AVX512-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
-; X86-AVX512-NEXT: vmovaps %ymm1, %ymm0
+; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
+; X86-AVX512-NEXT: vpermt2ps (%eax), %ymm1, %ymm0
 ; X86-AVX512-NEXT: retl
 ;
 ; X64-AVX512-LABEL: test_masked_permps_v8f32:
 ; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovaps (%rdi), %ymm2
-; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
-; X64-AVX512-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
-; X64-AVX512-NEXT: vmovaps %ymm1, %ymm0
+; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7]
+; X64-AVX512-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0
 ; X64-AVX512-NEXT: retq
 ;
 ; X86-AVX512F-LABEL: test_masked_permps_v8f32:
 ; X86-AVX512F: # %bb.0:
 ; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT: vmovaps (%eax), %ymm1
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
-; X86-AVX512F-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
+; X86-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-AVX512F-NEXT: retl
 ;
 ; X64-AVX512F-LABEL: test_masked_permps_v8f32:
 ; X64-AVX512F: # %bb.0:
 ; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm1
-; X64-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
-; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X64-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
+; X64-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X64-AVX512F-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
@@ -523,35 +523,27 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) {
 ; X86-AVX512-LABEL: test_masked_permps_v16f32:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vmovaps (%eax), %zmm2
-; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X86-AVX512-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512-NEXT: vmovaps %zmm1, %zmm0
+; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X86-AVX512-NEXT: vpermt2ps (%eax), %zmm1, %zmm0
 ; X86-AVX512-NEXT: retl
 ;
 ; X64-AVX512-LABEL: test_masked_permps_v16f32:
 ; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovaps (%rdi), %zmm2
-; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X64-AVX512-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512-NEXT: vmovaps %zmm1, %zmm0
+; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X64-AVX512-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
 ; X64-AVX512-NEXT: retq
 ;
 ; X86-AVX512F-LABEL: test_masked_permps_v16f32:
 ; X86-AVX512F: # %bb.0:
 ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT: vmovaps (%eax), %zmm2
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X86-AVX512F-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; X86-AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X86-AVX512F-NEXT: vpermt2ps (%eax), %zmm1, %zmm0
 ; X86-AVX512F-NEXT: retl
 ;
 ; X64-AVX512F-LABEL: test_masked_permps_v16f32:
 ; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %zmm2
-; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
-; X64-AVX512F-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
-; X64-AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15]
+; X64-AVX512F-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
 ; X64-AVX512F-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 14, i32 12, i32 10, i32 8, i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>

llvm/test/CodeGen/X86/vector-shuffle-v48.ll

Lines changed: 3 additions & 4 deletions
@@ -79,10 +79,9 @@ define <32 x i8> @foo(ptr %x0) {
 ;
 ; AVX512VBMI-LABEL: foo:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
-; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,3,4,6,7,9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,33,34,36,37,39,40,42,43,45,46]
-; AVX512VBMI-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
+; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [32,33,35,36,38,39,41,42,44,45,47,48,50,51,53,54,56,57,59,60,62,63,1,2,4,5,7,8,10,11,13,14]
+; AVX512VBMI-NEXT: vpermi2b (%rdi), %ymm1, %ymm0
 ; AVX512VBMI-NEXT: retq
 %1 = load <48 x i8>, ptr %x0, align 1
 %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> <i32 0, i32 1, i32 3, i32 4, i32 6, i32 7, i32 9, i32 10, i32 12, i32 13, i32 15, i32 16, i32 18, i32 19, i32 21, i32 22, i32 24, i32 25, i32 27, i32 28, i32 30, i32 31, i32 33, i32 34, i32 36, i32 37, i32 39, i32 40, i32 42, i32 43, i32 45, i32 46>

llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 39 additions & 50 deletions
@@ -4895,11 +4895,10 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,0,2,0,8,0,6,0]
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,0,0,0,14,0]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
@@ -4997,11 +4996,10 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,1,32,3,32,5,32,7,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,32,25,32,27,32,29,32,31]
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63]
-; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -5411,39 +5409,36 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15]
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
-; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -5679,39 +5674,36 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7]
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -5938,39 +5930,36 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7]
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
