@@ -58492,14 +58492,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58492
58492
const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58493
58493
const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58494
58494
// concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58495
- // Only concat of subvector high halves which vperm2x128 is best at.
58495
+ // Only for concat of subvector high halves (which vperm2x128 is best at)
58496
+ // or if it should fold into a subvector broadcast.
58496
58497
if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58497
- SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
58498
- SrcIdx1 == (NumSrcElts1 / 2)) {
58499
- return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58500
- DAG.getBitcast(VT, Src0.getOperand(0)),
58501
- DAG.getBitcast(VT, Src1.getOperand(0)),
58502
- DAG.getTargetConstant(0x31, DL, MVT::i8));
58498
+ SrcVT1.is256BitVector()) {
58499
+ assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58500
+ (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58501
+ "Bad subvector index");
58502
+ if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58503
+ (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58504
+ unsigned Index = 0;
58505
+ Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58506
+ Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58507
+ return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58508
+ DAG.getBitcast(VT, Src0.getOperand(0)),
58509
+ DAG.getBitcast(VT, Src1.getOperand(0)),
58510
+ DAG.getTargetConstant(Index, DL, MVT::i8));
58511
+ }
58503
58512
}
58504
58513
// Widen extract_subvector
58505
58514
// concat(extract_subvector(x,lo), extract_subvector(x,hi))
@@ -59312,6 +59321,45 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
59312
59321
return DAG.getBitcast(VT, Res);
59313
59322
}
59314
59323
59324
+ // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59325
+ if (!IsSplat && NumOps == 2 && VT == MVT::v4f64 &&
59326
+ all_of(Ops, [](SDValue Op) {
59327
+ return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
59328
+ Op.getOpcode() == X86ISD::SHUFP ||
59329
+ Op.getOpcode() == X86ISD::VPERMILPI ||
59330
+ Op.getOpcode() == X86ISD::BLENDI ||
59331
+ Op.getOpcode() == X86ISD::UNPCKL ||
59332
+ Op.getOpcode() == X86ISD::UNPCKH);
59333
+ })) {
59334
+ SmallVector<SDValue, 2> SrcOps0, SrcOps1;
59335
+ SmallVector<int, 8> SrcMask0, SrcMask1;
59336
+ if (getTargetShuffleMask(Ops[0], /*AllowSentinelZero=*/false, SrcOps0,
59337
+ SrcMask0) &&
59338
+ getTargetShuffleMask(Ops[1], /*AllowSentinelZero=*/false, SrcOps1,
59339
+ SrcMask1)) {
59340
+ assert(SrcMask0.size() == 2 && SrcMask1.size() == 2 && "Bad shuffles");
59341
+ SDValue LHS[] = {SrcOps0[SrcMask0[0] / 2], SrcOps1[SrcMask1[0] / 2]};
59342
+ SDValue RHS[] = {SrcOps0[SrcMask0[1] / 2], SrcOps1[SrcMask1[1] / 2]};
59343
+ SDValue Concat0 =
59344
+ combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59345
+ SDValue Concat1 =
59346
+ combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59347
+ if (Concat0 || Concat1) {
59348
+ unsigned SHUFPDMask = 0;
59349
+ SHUFPDMask |= (SrcMask0[0] & 1) << 0;
59350
+ SHUFPDMask |= (SrcMask0[1] & 1) << 1;
59351
+ SHUFPDMask |= (SrcMask1[0] & 1) << 2;
59352
+ SHUFPDMask |= (SrcMask1[1] & 1) << 3;
59353
+ Concat0 =
59354
+ Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59355
+ Concat1 =
59356
+ Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59357
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59358
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59359
+ }
59360
+ }
59361
+ }
59362
+
59315
59363
return SDValue();
59316
59364
}
59317
59365
0 commit comments