
Commit 170ba6e

[X86] combineINSERT_SUBVECTOR - attempt to combine concatenated shuffles
If all the concatenated subvectors are target shuffle nodes, then call combineX86ShufflesRecursively to attempt to combine them. Unlike the existing shuffle concatenation in collectConcatOps, this isn't limited to splat cases and won't attempt to concat the source nodes prior to creating the larger shuffle node, so it will usually only combine to create cross-lane shuffles. This exposed a hidden issue in matchBinaryShuffle, which wasn't limiting v64i8/v32i16 UNPACK nodes to AVX512BW targets.
1 parent 0662791

9 files changed: +11166 −10046 lines
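
As a rough illustration of the pattern this combine targets, here is a hypothetical, reduced LLVM IR example (not taken from this commit's test files; the function name and shuffle masks are made up): both 256-bit halves of the 512-bit result are shuffles of the same two inputs, so once each half has been lowered to a target shuffle node, the new path in combineINSERT_SUBVECTOR can hand the whole concatenation to combineX86ShufflesRecursively, which will typically form a single cross-lane 512-bit shuffle on AVX-512 targets.

; Hypothetical reduced example (not from this commit's test files): the two
; 256-bit halves lower to target shuffle nodes, and the concatenation of the
; halves becomes a candidate for combineX86ShufflesRecursively.
define <8 x i64> @concat_of_shuffles(<4 x i64> %a, <4 x i64> %b) {
  %lo = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %hi = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %r = shufflevector <4 x i64> %lo, <4 x i64> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %r
}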

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 1 deletion
@@ -37317,7 +37317,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
-      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+      (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
+       (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
                               Subtarget)) {
       SrcVT = DstVT = MaskVT;
@@ -55005,6 +55006,15 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                          getZeroVector(OpVT, Subtarget, DAG, dl),
                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
+
+    // Attempt to recursively combine to a shuffle.
+    if (all_of(SubVectorOps, [](SDValue SubOp) {
+          return isTargetShuffle(SubOp.getOpcode());
+        })) {
+      SDValue Op(N, 0);
+      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+        return Res;
+    }
   }
 
   // If this is a broadcast insert into an upper undef, use a larger broadcast.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll

Lines changed: 1267 additions & 951 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll

Lines changed: 853 additions & 905 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll

Lines changed: 2579 additions & 2719 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll

Lines changed: 4292 additions & 3327 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll

Lines changed: 38 additions & 20 deletions
@@ -159,26 +159,44 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT: vzeroupper
 ; AVX2-ONLY-NEXT: retq
 ;
-; AVX512-LABEL: store_i64_stride4_vf4:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512-NEXT: vmovdqa (%rdx), %ymm2
-; AVX512-NEXT: vmovdqa (%rcx), %ymm3
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512-SLOW-LABEL: store_i64_stride4_vf4:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-SLOW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512-SLOW-NEXT: vmovdqa (%rdx), %ymm2
+; AVX512-SLOW-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
+; AVX512-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8)
+; AVX512-SLOW-NEXT: vmovdqa64 %zmm1, (%r8)
+; AVX512-SLOW-NEXT: vzeroupper
+; AVX512-SLOW-NEXT: retq
+;
+; AVX512-FAST-LABEL: store_i64_stride4_vf4:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-FAST-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512-FAST-NEXT: vmovdqa (%rdx), %ymm2
+; AVX512-FAST-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
+; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
+; AVX512-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512-FAST-NEXT: vmovdqa64 %zmm0, 64(%r8)
+; AVX512-FAST-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512-FAST-NEXT: vzeroupper
+; AVX512-FAST-NEXT: retq
   %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64
   %in.vec1 = load <4 x i64>, ptr %in.vecptr1, align 64
   %in.vec2 = load <4 x i64>, ptr %in.vecptr2, align 64
