Skip to content

Commit 24184db

Browse files
committed
[X86] Fold CONCAT(VPERMV3(X,Y,M0),VPERMV3(Z,W,M1)) -> VPERMV3(CONCAT(X,Z),CONCAT(Y,W),CONCAT(M0,M1))
Further preparatory work toward supporting different subvector sizes in combineX86ShufflesRecursively.
1 parent 6f13445 commit 24184db

File tree

2 files changed: +68 -42 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48813,6 +48813,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
4881348813
return DAG.getBitcast(VT, Res);
4881448814
}
4881548815
break;
48816+
case X86ISD::VPERMV3:
48817+
if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
48818+
MVT OpVT = Op0.getSimpleValueType();
48819+
int NumSrcElts = OpVT.getVectorNumElements();
48820+
SmallVector<int, 64> ConcatMask;
48821+
for (unsigned i = 0; i != NumOps; ++i) {
48822+
bool IsUnary;
48823+
SmallVector<int, 64> SubMask;
48824+
SmallVector<SDValue, 2> SubOps;
48825+
if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
48826+
SubMask, IsUnary))
48827+
break;
48828+
for (int M : SubMask) {
48829+
if (0 <= M) {
48830+
M += M < NumSrcElts ? 0 : NumSrcElts;
48831+
M += i * NumSrcElts;
48832+
}
48833+
ConcatMask.push_back(M);
48834+
}
48835+
}
48836+
if (ConcatMask.size() == (NumOps * NumSrcElts)) {
48837+
SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
48838+
Ops[1].getOperand(0), DAG, DL);
48839+
SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
48840+
Ops[1].getOperand(2), DAG, DL);
48841+
MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
48842+
MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
48843+
SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
48844+
return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
48845+
}
48846+
}
48847+
break;
4881648848
case X86ISD::VSHLI:
4881748849
case X86ISD::VSRAI:
4881848850
case X86ISD::VSRLI:

llvm/test/CodeGen/X86/vector-pack-512.ll

Lines changed: 36 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -145,11 +145,10 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
145145
; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1
146146
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
147147
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
148-
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
149-
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
150-
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
151-
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
152-
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
148+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
149+
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
150+
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
151+
; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
153152
; AVX512-NEXT: retq
154153
%1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
155154
%2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -166,11 +165,10 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
166165
; AVX512-NEXT: vpsrld $23, %zmm1, %zmm1
167166
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
168167
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
169-
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
170-
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
171-
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
172-
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
173-
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
168+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
169+
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
170+
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
171+
; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
174172
; AVX512-NEXT: retq
175173
%1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
176174
%2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -183,26 +181,25 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
183181
define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
184182
; AVX512F-LABEL: concat_trunc_packsswb_512:
185183
; AVX512F: # %bb.0:
186-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
187-
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
184+
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
185+
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
188186
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
189187
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
190188
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
189+
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
191190
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
192191
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
193-
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
194-
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
195-
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
192+
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
193+
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
196194
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
197195
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
198196
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
199-
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
197+
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
200198
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
201-
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
202-
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
203-
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
204-
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
205-
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
199+
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
200+
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
201+
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
202+
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
206203
; AVX512F-NEXT: retq
207204
;
208205
; AVX512BW-LABEL: concat_trunc_packsswb_512:
@@ -211,11 +208,10 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
211208
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
212209
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
213210
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
214-
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
215-
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
216-
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
217-
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
218-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
211+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
212+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
213+
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
214+
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
219215
; AVX512BW-NEXT: retq
220216
%1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
221217
%2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -228,26 +224,25 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
228224
define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
229225
; AVX512F-LABEL: concat_trunc_packuswb_512:
230226
; AVX512F: # %bb.0:
231-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
232-
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2
227+
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2
228+
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
233229
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
234230
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
235231
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
232+
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
236233
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
237234
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
238-
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
239-
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
240-
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
235+
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
236+
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
241237
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
242238
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
243239
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
244-
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
240+
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
245241
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
246-
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
247-
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
248-
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
249-
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
250-
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
242+
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
243+
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
244+
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
245+
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
251246
; AVX512F-NEXT: retq
252247
;
253248
; AVX512BW-LABEL: concat_trunc_packuswb_512:
@@ -256,11 +251,10 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
256251
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
257252
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
258253
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
259-
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
260-
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
261-
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
262-
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
263-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
254+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
255+
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
256+
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
257+
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
264258
; AVX512BW-NEXT: retq
265259
%1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
266260
%2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>

0 commit comments

Comments
 (0)