@@ -58493,14 +58493,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58493
58493
const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58494
58494
const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58495
58495
// concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58496
- // Only concat of subvector high halves which vperm2x128 is best at.
58496
+ // Only handle concats of high subvector halves, which vperm2x128 is best
58497
+ // at, or cases that should fold into a subvector broadcast.
58497
58498
if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58498
- SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
58499
- SrcIdx1 == (NumSrcElts1 / 2)) {
58500
- return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58501
- DAG.getBitcast(VT, Src0.getOperand(0)),
58502
- DAG.getBitcast(VT, Src1.getOperand(0)),
58503
- DAG.getTargetConstant(0x31, DL, MVT::i8));
58499
+ SrcVT1.is256BitVector()) {
58500
+ assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58501
+ (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58502
+ "Bad subvector index");
58503
+ if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58504
+ (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58505
+ unsigned Index = 0;
58506
+ Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58507
+ Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58508
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58509
+ DAG.getBitcast(VT, Src0.getOperand(0)),
58510
+ DAG.getBitcast(VT, Src1.getOperand(0)),
58511
+ DAG.getTargetConstant(Index, DL, MVT::i8));
58512
+ }
58504
58513
}
58505
58514
// Widen extract_subvector
58506
58515
// concat(extract_subvector(x,lo), extract_subvector(x,hi))
@@ -59313,6 +59322,45 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
59313
59322
return DAG.getBitcast(VT, Res);
59314
59323
}
59315
59324
59325
+ // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59326
+ if (!IsSplat && NumOps == 2 && VT == MVT::v4f64 &&
59327
+ all_of(Ops, [](SDValue Op) {
59328
+ return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
59329
+ Op.getOpcode() == X86ISD::SHUFP ||
59330
+ Op.getOpcode() == X86ISD::VPERMILPI ||
59331
+ Op.getOpcode() == X86ISD::BLENDI ||
59332
+ Op.getOpcode() == X86ISD::UNPCKL ||
59333
+ Op.getOpcode() == X86ISD::UNPCKH);
59334
+ })) {
59335
+ SmallVector<SDValue, 2> SrcOps0, SrcOps1;
59336
+ SmallVector<int, 8> SrcMask0, SrcMask1;
59337
+ if (getTargetShuffleMask(Ops[0], /*AllowSentinelZero=*/false, SrcOps0,
59338
+ SrcMask0) &&
59339
+ getTargetShuffleMask(Ops[1], /*AllowSentinelZero=*/false, SrcOps1,
59340
+ SrcMask1)) {
59341
+ assert(SrcMask0.size() == 2 && SrcMask1.size() == 2 && "Bad shuffles");
59342
+ SDValue LHS[] = {SrcOps0[SrcMask0[0] / 2], SrcOps1[SrcMask1[0] / 2]};
59343
+ SDValue RHS[] = {SrcOps0[SrcMask0[1] / 2], SrcOps1[SrcMask1[1] / 2]};
59344
+ SDValue Concat0 =
59345
+ combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59346
+ SDValue Concat1 =
59347
+ combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59348
+ if (Concat0 || Concat1) {
59349
+ unsigned SHUFPDMask = 0;
59350
+ SHUFPDMask |= (SrcMask0[0] & 1) << 0;
59351
+ SHUFPDMask |= (SrcMask0[1] & 1) << 1;
59352
+ SHUFPDMask |= (SrcMask1[0] & 1) << 2;
59353
+ SHUFPDMask |= (SrcMask1[1] & 1) << 3;
59354
+ Concat0 =
59355
+ Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59356
+ Concat1 =
59357
+ Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59358
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59359
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59360
+ }
59361
+ }
59362
+ }
59363
+
59316
59364
return SDValue();
59317
59365
}
59318
59366
0 commit comments