Skip to content

Commit 6ec350b

Browse files
committed
[X86] SimplifyDemandedVectorEltsForTargetShuffle - don't simplify constant mask if it has multiple uses
Avoid generating extra constant vectors
1 parent a711b04 commit 6ec350b

6 files changed

+2976
-2965
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40857,7 +40857,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40857 40857       SDValue BC = peekThroughOneUseBitcasts(Mask);
40858 40858       EVT BCVT = BC.getValueType();
40859 40859       auto *Load = dyn_cast<LoadSDNode>(BC);
40860       -     if (!Load)
      40860 +     if (!Load || !Load->getBasePtr().hasOneUse())
40861 40861         return false;
40862 40862
40863 40863       const Constant *C = getTargetConstantFromNode(Load);

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll

Lines changed: 2021 additions & 2007 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll

Lines changed: 229 additions & 230 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll

Lines changed: 225 additions & 225 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll

Lines changed: 40 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -4775,15 +4775,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
47754775
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20
47764776
; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k1}
47774777
; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm30[0,1,2,3],zmm14[4,5,6,7]
4778-
; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
4779-
; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm27, %zmm27
4778+
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm27 = zmm27[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
47804779
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm27 = zmm27[2,2,2,3,6,6,6,7]
47814780
; AVX512BW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492
47824781
; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
47834782
; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm27, %zmm20 {%k2}
47844783
; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm12[4,5,6,7]
4785-
; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
4786-
; AVX512BW-SLOW-NEXT: vpshufb %zmm27, %zmm24, %zmm24
4784+
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm24 = zmm24[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
47874785
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[2,2,2,3,6,6,6,7]
47884786
; AVX512BW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
47894787
; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3
@@ -4804,11 +4802,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
48044802
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16
48054803
; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1}
48064804
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14
4807-
; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14
4805+
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
48084806
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7]
48094807
; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2}
48104808
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12
4811-
; AVX512BW-SLOW-NEXT: vpshufb %zmm27, %zmm12, %zmm12
4809+
; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm12 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
48124810
; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7]
48134811
; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3}
48144812
; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
@@ -4882,24 +4880,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
48824880
; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8
48834881
; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10
48844882
; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
4885-
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
4886-
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0
4883+
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
4884+
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0
48874885
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
4888-
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1
4886+
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1
48894887
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
48904888
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
48914889
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
48924890
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
48934891
; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1
48944892
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
4895-
; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5
4893+
; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm6
48964894
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm17 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
4897-
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm0
4895+
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm6, %ymm0
48984896
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7
48994897
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm7, %ymm2
49004898
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
49014899
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
4902-
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31]
4900+
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31]
49034901
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
49044902
; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm12, %ymm2
49054903
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -4908,49 +4906,47 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
49084906
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
49094907
; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2
49104908
; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7]
4911-
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u>
4912-
; AVX512BW-FAST-NEXT: vpshufb %zmm13, %zmm1, %zmm1
4909+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
49134910
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
49144911
; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492
49154912
; AVX512BW-FAST-NEXT: kmovd %eax, %k2
49164913
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
49174914
; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1
49184915
; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7]
4919-
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15>
4920-
; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm11, %zmm11
4916+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm11[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
49214917
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7]
49224918
; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
49234919
; AVX512BW-FAST-NEXT: kmovq %rax, %k3
49244920
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3}
49254921
; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11
4926-
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm16
4922+
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm13
49274923
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14
4928-
; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6
4929-
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm16[0],ymm6[1],ymm16[1],ymm6[2],ymm16[2],ymm6[3],ymm16[3],ymm6[4],ymm16[4],ymm6[5],ymm16[5],ymm6[6],ymm16[6],ymm6[7],ymm16[7],ymm6[16],ymm16[16],ymm6[17],ymm16[17],ymm6[18],ymm16[18],ymm6[19],ymm16[19],ymm6[20],ymm16[20],ymm6[21],ymm16[21],ymm6[22],ymm16[22],ymm6[23],ymm16[23]
4930-
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
4931-
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31]
4932-
; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm9, %ymm9
4933-
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9
4924+
; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5
4925+
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23]
4926+
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
4927+
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31]
4928+
; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm9, %ymm9
4929+
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm9
49344930
; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19
4935-
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm19, %ymm6
4931+
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm19, %ymm5
49364932
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm20
4937-
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm20, %ymm16
4938-
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm16[0],ymm6[0],ymm16[1],ymm6[1],ymm16[2],ymm6[2],ymm16[3],ymm6[3],ymm16[4],ymm6[4],ymm16[5],ymm6[5],ymm16[6],ymm6[6],ymm16[7],ymm6[7],ymm16[16],ymm6[16],ymm16[17],ymm6[17],ymm16[18],ymm6[18],ymm16[19],ymm6[19],ymm16[20],ymm6[20],ymm16[21],ymm6[21],ymm16[22],ymm6[22],ymm16[23],ymm6[23]
4939-
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
4940-
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm20[8],ymm19[8],ymm20[9],ymm19[9],ymm20[10],ymm19[10],ymm20[11],ymm19[11],ymm20[12],ymm19[12],ymm20[13],ymm19[13],ymm20[14],ymm19[14],ymm20[15],ymm19[15],ymm20[24],ymm19[24],ymm20[25],ymm19[25],ymm20[26],ymm19[26],ymm20[27],ymm19[27],ymm20[28],ymm19[28],ymm20[29],ymm19[29],ymm20[30],ymm19[30],ymm20[31],ymm19[31]
4941-
; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm12, %ymm12
4942-
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6
4943-
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1}
4933+
; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm20, %ymm13
4934+
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[16],ymm5[16],ymm13[17],ymm5[17],ymm13[18],ymm5[18],ymm13[19],ymm5[19],ymm13[20],ymm5[20],ymm13[21],ymm5[21],ymm13[22],ymm5[22],ymm13[23],ymm5[23]
4935+
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
4936+
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm20[8],ymm19[8],ymm20[9],ymm19[9],ymm20[10],ymm19[10],ymm20[11],ymm19[11],ymm20[12],ymm19[12],ymm20[13],ymm19[13],ymm20[14],ymm19[14],ymm20[15],ymm19[15],ymm20[24],ymm19[24],ymm20[25],ymm19[25],ymm20[26],ymm19[26],ymm20[27],ymm19[27],ymm20[28],ymm19[28],ymm20[29],ymm19[29],ymm20[30],ymm19[30],ymm20[31],ymm19[31]
4937+
; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm12, %ymm12
4938+
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5
4939+
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm5 {%k1}
49444940
; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9
49454941
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4946-
; AVX512BW-FAST-NEXT: vpshufb %zmm13, %zmm8, %zmm8
4942+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u]
49474943
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7]
4948-
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2}
4944+
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm5 {%k2}
49494945
; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8
49504946
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10
4951-
; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm10, %zmm10
4947+
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
49524948
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7]
4953-
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3}
4949+
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm5 {%k3}
49544950
; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22
49554951
; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm13
49564952
; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm13, %xmm10
@@ -5029,17 +5025,17 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50295025
; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11
50305026
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30]
50315027
; AVX512BW-FAST-NEXT: vpermw %zmm19, %zmm20, %zmm11 {%k1}
5032-
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
5028+
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23]
50335029
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
5034-
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
5030+
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
50355031
; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
50365032
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15]
50375033
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
50385034
; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3
50395035
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5040-
; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm20, %zmm3 {%k1}
5041-
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
5042-
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5036+
; AVX512BW-FAST-NEXT: vpermw %zmm6, %zmm20, %zmm3 {%k1}
5037+
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7]
5038+
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50435039
; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
50445040
; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1]
50455041
; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9
@@ -5049,7 +5045,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50495045
; AVX512BW-FAST-NEXT: kmovd %eax, %k1
50505046
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1}
50515047
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5052-
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5048+
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50535049
; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
50545050
; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
50555051
; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8
@@ -5060,12 +5056,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50605056
; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2}
50615057
; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2
50625058
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5063-
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4
5059+
; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm6, %ymm4
50645060
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
50655061
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
50665062
; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
50675063
; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5068-
; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2
5064+
; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm6, %ymm2
50695065
; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1
50705066
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
50715067
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
@@ -5075,7 +5071,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
50755071
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax)
50765072
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, (%rax)
50775073
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax)
5078-
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax)
5074+
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax)
50795075
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax)
50805076
; AVX512BW-FAST-NEXT: vzeroupper
50815077
; AVX512BW-FAST-NEXT: retq

0 commit comments

Comments (0)