Commit c1af6ab

[X86] getFauxShuffleMask - recognise CONCAT(SUB0, SUB1) style patterns
Handles the INSERT_SUBVECTOR(INSERT_SUBVECTOR(UNDEF,SUB0,0),SUB1,N) pattern.
Currently limited to v8i64/v8f64 cases, as only AVX512 has decent cross-lane 2-input shuffles; the plan is to relax this as I deal with some regressions.
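
For illustration, a hypothetical IR input of the kind this fold is aimed at (not taken from this commit's tests) is a v8i64 concatenation of two 256-bit subvectors feeding a cross-lane shuffle; on AVX512 the concatenation would typically reach getFauxShuffleMask as the INSERT_SUBVECTOR chain above:

; Hypothetical sketch, not from this commit: a <8 x i64> concat of two
; <4 x i64> values feeding a cross-lane interleave.
define <8 x i64> @concat_interleave_v8i64(<4 x i64> %sub0, <4 x i64> %sub1) {
  %concat = shufflevector <4 x i64> %sub0, <4 x i64> %sub1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleave = shufflevector <8 x i64> %concat, <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i64> %interleave
}

With the fold, the concat contributes an identity 2-input shuffle mask that shuffle combining can typically merge with the outer interleave into a single cross-lane permute.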
1 parent c4e517f commit c1af6ab

6 files changed: 233 additions & 242 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
@@ -5858,6 +5858,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
       Ops.push_back(SubBCSrc);
       return true;
     }
+    // Handle CONCAT(SUB0, SUB1).
+    // Limit this to vXi64 512-bit vector cases to make the most of AVX512
+    // cross lane shuffles.
+    if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
+        NumBitsPerElt == 64 && NumSizeInBits == 512 &&
+        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        Src.getOperand(0).isUndef() &&
+        Src.getOperand(1).getValueType() == SubVT &&
+        Src.getConstantOperandVal(2) == 0) {
+      for (int i = 0; i != (int)NumSubElts; ++i)
+        Mask.push_back(i);
+      for (int i = 0; i != (int)NumSubElts; ++i)
+        Mask.push_back(i + NumElts);
+      Ops.push_back(Src.getOperand(1));
+      Ops.push_back(Sub);
+      return true;
+    }
     // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
     SmallVector<int, 64> SubMask;
     SmallVector<SDValue, 2> SubInputs;
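
For a v8i64 node of this shape, INSERT_SUBVECTOR(INSERT_SUBVECTOR(undef, SUB0, 0), SUB1, 4), the new branch records Ops = {SUB0, SUB1} and Mask = {0,1,2,3,8,9,10,11}, i.e. an identity concatenation mask that later shuffle combining can merge with surrounding permutes. That is what allows the vinserti64x4 + vpermw/vpermd pairs in the test diffs below to collapse into single vpermi2w/vpermi2d instructions.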

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll

Lines changed: 4 additions & 6 deletions
@@ -821,9 +821,8 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)
@@ -873,9 +872,8 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll

Lines changed: 12 additions & 16 deletions
@@ -762,10 +762,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -788,10 +787,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -814,10 +812,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -840,10 +837,9 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29,2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64

llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll

Lines changed: 8 additions & 12 deletions
@@ -227,9 +227,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm0
 ; AVX512-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512-FCP-NEXT:    vmovq %xmm1, 48(%rax)
@@ -279,9 +278,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512DQ-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm0
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512DQ-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 48(%rax)
@@ -331,9 +329,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)
@@ -383,9 +380,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
 ; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 48(%rax)
