Commit 7c77a46

[X86] combineConcatVectorOps - convert ISD::VECTOR_SHUFFLE concatenation to use combineConcatVectorOps recursion (#130610)
Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is beneficial to concatenate.
1 parent f33dca4 commit 7c77a46
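
To make the pattern concrete, here is a minimal, hypothetical LLVM IR sketch of the shape this combine targets (the function name and masks are made up, not taken from the commit's tests): two narrow shuffles of the same operands whose results are concatenated into a wider vector. Per the commit message, folding the pair into one wide shuffle is now only done when recursively concatenating at least one of the shared operands is itself considered beneficial.

; Hypothetical reduced example (not from this commit's test files).
define <8 x i32> @concat_of_shuffles(<4 x i32> %a, <4 x i32> %b) {
  ; Lower and upper 128-bit halves are shuffles of the same two operands.
  %lo = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %hi = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; Concatenating the halves gives roughly the 256-bit concat-of-shuffles pattern
  ; that combineConcatVectorOps operates on during X86 lowering.
  %cat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %cat
}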

10 files changed: +267 −251 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 21 additions & 17 deletions
@@ -57973,24 +57973,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     // TODO: Relax VBMI requirement for repeated shuffle ops - currently
     // limited to targets that should always have good cross lane shuffles.
     if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
-        (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
-        (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
-         (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
-          Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
-          Subtarget.hasVBMI()))) {
-      int NumSubElts = Op0.getValueType().getVectorNumElements();
-      SmallVector<int> NewMask;
-      for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
-        M = M >= NumSubElts ? M + NumSubElts : M;
-        NewMask.push_back(M);
-      }
-      for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
-        if (0 <= M)
-          M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
-        NewMask.push_back(M);
+        (EltSizeInBits >= 32 || Subtarget.hasInt256())) {
+      SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+      SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+      if (Concat0 || Concat1 ||
+          (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+           Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+           Subtarget.hasVBMI())) {
+        int NumSubElts = Op0.getValueType().getVectorNumElements();
+        SmallVector<int> NewMask;
+        for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
+          M = M >= NumSubElts ? M + NumSubElts : M;
+          NewMask.push_back(M);
+        }
+        for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
+          if (0 <= M)
+            M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
+          NewMask.push_back(M);
+        }
+        Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
+        Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
+        return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
       }
-      return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
-                                  ConcatSubOperand(VT, Ops, 1), NewMask);
     }
     break;
   }

llvm/test/CodeGen/X86/gfni-rotates.ll

Lines changed: 3 additions & 3 deletions
@@ -255,9 +255,9 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind {
 ;
 ; GFNIAVX512BW-LABEL: splatvar_rotr_v16i8:
 ; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; GFNIAVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; GFNIAVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; GFNIAVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0

llvm/test/CodeGen/X86/known-bits-vector.ll

Lines changed: 4 additions & 4 deletions
@@ -384,19 +384,19 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
 define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; X86-LABEL: knownbits_mask_concat_uitofp:
 ; X86: # %bb.0:
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: knownbits_mask_concat_uitofp:
 ; X64: # %bb.0:
-; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7]
 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = and <4 x i32> %a0, <i32 131071, i32 -1, i32 131071, i32 -1>

llvm/test/CodeGen/X86/matrix-multiply.ll

Lines changed: 59 additions & 29 deletions
@@ -974,35 +974,65 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
 ; SSE-NEXT: movaps %xmm5, %xmm2
 ; SSE-NEXT: retq
 ;
-; AVX1OR2-LABEL: test_mul4x4_f32:
-; AVX1OR2: # %bb.0: # %entry
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
-; AVX1OR2-NEXT: vmulps %ymm4, %ymm5, %ymm4
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT: vmulps %ymm0, %ymm6, %ymm0
-; AVX1OR2-NEXT: vaddps %ymm4, %ymm0, %ymm0
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT: vmulps %ymm7, %ymm4, %ymm7
-; AVX1OR2-NEXT: vaddps %ymm7, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX1OR2-NEXT: vmulps %ymm2, %ymm1, %ymm2
-; AVX1OR2-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
-; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
-; AVX1OR2-NEXT: vmulps %ymm5, %ymm6, %ymm5
-; AVX1OR2-NEXT: vaddps %ymm2, %ymm5, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
-; AVX1OR2-NEXT: vmulps %ymm5, %ymm4, %ymm4
-; AVX1OR2-NEXT: vaddps %ymm4, %ymm2, %ymm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
-; AVX1OR2-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: test_mul4x4_f32:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
+; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6
+; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4
+; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mul4x4_f32:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1]
+; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1]
+; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
+; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test_mul4x4_f32:
 ; AVX512: # %bb.0: # %entry

llvm/test/CodeGen/X86/mulvi32.ll

Lines changed: 2 additions & 6 deletions
@@ -286,12 +286,8 @@ define <4 x i64> @_mul4xi32toi64c(<4 x i32>, <4 x i32>) {
 ;
 ; AVX2-LABEL: _mul4xi32toi64c:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>

llvm/test/CodeGen/X86/vector-fshr-rot-128.ll

Lines changed: 3 additions & 3 deletions
@@ -1194,9 +1194,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0

llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll

Lines changed: 18 additions & 15 deletions
@@ -671,15 +671,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX2-FP-LABEL: store_i8_stride3_vf16:
@@ -693,15 +694,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
 ;
 ; AVX2-FCP-LABEL: store_i8_stride3_vf16:
@@ -715,15 +717,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
 ; AVX512-LABEL: store_i8_stride3_vf16:

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

Lines changed: 13 additions & 6 deletions
@@ -1682,12 +1682,19 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64:
 ; AVX512VL-SLOW: # %bb.0:
