@@ -39640,13 +39640,6 @@ static bool matchBinaryPermuteShuffle(
39640
39640
return false;
39641
39641
}
39642
39642
39643
- static SDValue combineX86ShuffleChainWithExtract(
39644
- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39645
- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39646
- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39647
- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39648
- const X86Subtarget &Subtarget);
39649
-
39650
39643
/// Combine an arbitrary chain of shuffles into a single instruction if
39651
39644
/// possible.
39652
39645
///
@@ -40191,14 +40184,6 @@ static SDValue combineX86ShuffleChain(
40191
40184
return DAG.getBitcast(RootVT, Res);
40192
40185
}
40193
40186
40194
- // If that failed and either input is extracted then try to combine as a
40195
- // shuffle with the larger type.
40196
- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40197
- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40198
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40199
- IsMaskedShuffle, DAG, DL, Subtarget))
40200
- return WideShuffle;
40201
-
40202
40187
// If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40203
40188
// (non-VLX will pad to 512-bit shuffles).
40204
40189
if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40364,14 +40349,6 @@ static SDValue combineX86ShuffleChain(
40364
40349
return DAG.getBitcast(RootVT, Res);
40365
40350
}
40366
40351
40367
- // If that failed and either input is extracted then try to combine as a
40368
- // shuffle with the larger type.
40369
- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40370
- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40371
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40372
- DAG, DL, Subtarget))
40373
- return WideShuffle;
40374
-
40375
40352
// If we have a dual input shuffle then lower to VPERMV3,
40376
40353
// (non-VLX will pad to 512-bit shuffles)
40377
40354
if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40397,148 +40374,6 @@ static SDValue combineX86ShuffleChain(
40397
40374
return SDValue();
40398
40375
}
40399
40376
40400
- // Combine an arbitrary chain of shuffles + extract_subvectors into a single
40401
- // instruction if possible.
40402
- //
40403
- // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40404
- // type size to attempt to combine:
40405
- // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40406
- // -->
40407
- // extract_subvector(shuffle(x,y,m2),0)
40408
- static SDValue combineX86ShuffleChainWithExtract(
40409
- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40410
- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40411
- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40412
- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40413
- const X86Subtarget &Subtarget) {
40414
- unsigned NumMaskElts = BaseMask.size();
40415
- unsigned NumInputs = Inputs.size();
40416
- if (NumInputs == 0)
40417
- return SDValue();
40418
-
40419
- unsigned RootSizeInBits = RootVT.getSizeInBits();
40420
- unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40421
- assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40422
-
40423
- // Peek through subvectors to find widest legal vector.
40424
- // TODO: Handle ISD::TRUNCATE
40425
- unsigned WideSizeInBits = RootSizeInBits;
40426
- for (SDValue Input : Inputs) {
40427
- Input = peekThroughBitcasts(Input);
40428
- while (1) {
40429
- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40430
- Input = peekThroughBitcasts(Input.getOperand(0));
40431
- continue;
40432
- }
40433
- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40434
- Input.getOperand(0).isUndef()) {
40435
- Input = peekThroughBitcasts(Input.getOperand(1));
40436
- continue;
40437
- }
40438
- break;
40439
- }
40440
- if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40441
- WideSizeInBits < Input.getValueSizeInBits())
40442
- WideSizeInBits = Input.getValueSizeInBits();
40443
- }
40444
-
40445
- // Bail if we fail to find a source larger than the existing root.
40446
- if (WideSizeInBits <= RootSizeInBits ||
40447
- (WideSizeInBits % RootSizeInBits) != 0)
40448
- return SDValue();
40449
-
40450
- // Create new mask for larger type.
40451
- SmallVector<int, 64> WideMask;
40452
- growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40453
-
40454
- // Attempt to peek through inputs and adjust mask when we extract from an
40455
- // upper subvector.
40456
- int AdjustedMasks = 0;
40457
- SmallVector<SDValue, 4> WideInputs(Inputs);
40458
- for (unsigned I = 0; I != NumInputs; ++I) {
40459
- SDValue &Input = WideInputs[I];
40460
- Input = peekThroughBitcasts(Input);
40461
- while (1) {
40462
- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40463
- Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40464
- uint64_t Idx = Input.getConstantOperandVal(1);
40465
- if (Idx != 0) {
40466
- ++AdjustedMasks;
40467
- unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40468
- Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40469
-
40470
- int lo = I * WideMask.size();
40471
- int hi = (I + 1) * WideMask.size();
40472
- for (int &M : WideMask)
40473
- if (lo <= M && M < hi)
40474
- M += Idx;
40475
- }
40476
- Input = peekThroughBitcasts(Input.getOperand(0));
40477
- continue;
40478
- }
40479
- // TODO: Handle insertions into upper subvectors.
40480
- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40481
- Input.getOperand(0).isUndef() &&
40482
- isNullConstant(Input.getOperand(2))) {
40483
- Input = peekThroughBitcasts(Input.getOperand(1));
40484
- continue;
40485
- }
40486
- break;
40487
- }
40488
- }
40489
-
40490
- // Remove unused/repeated shuffle source ops.
40491
- resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40492
- assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40493
-
40494
- // Bail if we're always extracting from the lowest subvectors,
40495
- // combineX86ShuffleChain should match this for the current width, or the
40496
- // shuffle still references too many inputs.
40497
- if (AdjustedMasks == 0 || WideInputs.size() > 2)
40498
- return SDValue();
40499
-
40500
- // Minor canonicalization of the accumulated shuffle mask to make it easier
40501
- // to match below. All this does is detect masks with sequential pairs of
40502
- // elements, and shrink them to the half-width mask. It does this in a loop
40503
- // so it will reduce the size of the mask to the minimal width mask which
40504
- // performs an equivalent shuffle.
40505
- while (WideMask.size() > 1) {
40506
- SmallVector<int, 64> WidenedMask;
40507
- if (!canWidenShuffleElements(WideMask, WidenedMask))
40508
- break;
40509
- WideMask = std::move(WidenedMask);
40510
- }
40511
-
40512
- // Canonicalization of binary shuffle masks to improve pattern matching by
40513
- // commuting the inputs.
40514
- if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40515
- ShuffleVectorSDNode::commuteMask(WideMask);
40516
- std::swap(WideInputs[0], WideInputs[1]);
40517
- }
40518
-
40519
- // Increase depth for every upper subvector we've peeked through.
40520
- Depth += AdjustedMasks;
40521
-
40522
- // Attempt to combine wider chain.
40523
- // TODO: Can we use a better Root?
40524
- SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40525
- WideInputs.back().getValueSizeInBits()
40526
- ? WideInputs.front()
40527
- : WideInputs.back();
40528
- assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40529
- "WideRootSize mismatch");
40530
-
40531
- if (SDValue WideShuffle = combineX86ShuffleChain(
40532
- WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40533
- Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40534
- IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40535
- WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40536
- return DAG.getBitcast(RootVT, WideShuffle);
40537
- }
40538
-
40539
- return SDValue();
40540
- }
40541
-
40542
40377
// Canonicalize the combined shuffle mask chain with horizontal ops.
40543
40378
// NOTE: This may update the Ops and Mask.
40544
40379
static SDValue canonicalizeShuffleMaskWithHorizOp(
@@ -40951,6 +40786,54 @@ static SDValue combineX86ShufflesRecursively(
40951
40786
OpMask.assign(NumElts, SM_SentinelUndef);
40952
40787
std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40953
40788
OpZero = OpUndef = APInt::getZero(NumElts);
40789
+ } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40790
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
40791
+ Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
40792
+ (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
40793
+ // Extracting from vector larger than RootVT - scale the mask and attempt to
40794
+ // fold the shuffle with the larger root type, then extract the lower
40795
+ // elements.
40796
+ unsigned NewRootSizeInBits = Op.getOperand(0).getValueSizeInBits();
40797
+ unsigned Scale = NewRootSizeInBits / RootSizeInBits;
40798
+ MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
40799
+ Scale * RootVT.getVectorNumElements());
40800
+ SmallVector<int, 64> NewRootMask;
40801
+ growShuffleMask(RootMask, NewRootMask, RootSizeInBits, NewRootSizeInBits);
40802
+ // If we're using the lowest subvector, just replace it directly in the src
40803
+ // ops/nodes.
40804
+ SmallVector<SDValue, 16> NewSrcOps(SrcOps);
40805
+ SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
40806
+ if (isNullConstant(Op.getOperand(1))) {
40807
+ NewSrcOps[SrcOpIndex] = Op.getOperand(0);
40808
+ NewSrcNodes.push_back(Op.getNode());
40809
+ }
40810
+ // Don't increase the combine depth - we're effectively working on the same
40811
+ // nodes, just with a wider type.
40812
+ if (SDValue WideShuffle = combineX86ShufflesRecursively(
40813
+ NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
40814
+ Depth, MaxDepth, AllowVariableCrossLaneMask,
40815
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
40816
+ return DAG.getBitcast(
40817
+ RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
40818
+ return SDValue();
40819
+ } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
40820
+ Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40821
+ Op.getOperand(1).getOperand(0).getValueSizeInBits() >
40822
+ RootSizeInBits) {
40823
+ // If we're inserting an subvector extracted from a vector larger than
40824
+ // RootVT, then combine the insert_subvector as a shuffle, the
40825
+ // extract_subvector will be folded in a later recursion.
40826
+ SDValue BaseVec = Op.getOperand(0);
40827
+ SDValue SubVec = Op.getOperand(1);
40828
+ int InsertIdx = Op.getConstantOperandVal(2);
40829
+ unsigned NumBaseElts = VT.getVectorNumElements();
40830
+ unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
40831
+ OpInputs.assign({BaseVec, SubVec});
40832
+ OpMask.resize(NumBaseElts);
40833
+ std::iota(OpMask.begin(), OpMask.end(), 0);
40834
+ std::iota(OpMask.begin() + InsertIdx,
40835
+ OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
40836
+ OpZero = OpUndef = APInt::getZero(NumBaseElts);
40954
40837
} else {
40955
40838
return SDValue();
40956
40839
}
@@ -41297,25 +41180,9 @@ static SDValue combineX86ShufflesRecursively(
41297
41180
AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41298
41181
IsMaskedShuffle, DAG, DL, Subtarget))
41299
41182
return Shuffle;
41300
-
41301
- // If all the operands come from the same larger vector, fallthrough and try
41302
- // to use combineX86ShuffleChainWithExtract.
41303
- SDValue LHS = peekThroughBitcasts(Ops.front());
41304
- SDValue RHS = peekThroughBitcasts(Ops.back());
41305
- if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41306
- (RootSizeInBits / Mask.size()) != 64 ||
41307
- LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41308
- RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41309
- LHS.getOperand(0) != RHS.getOperand(0))
41310
- return SDValue();
41311
41183
}
41312
41184
41313
- // If that failed and any input is extracted then try to combine as a
41314
- // shuffle with the larger type.
41315
- return combineX86ShuffleChainWithExtract(
41316
- Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41317
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41318
- DAG, DL, Subtarget);
41185
+ return SDValue();
41319
41186
}
41320
41187
41321
41188
/// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -43928,6 +43795,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43928
43795
case X86ISD::UNPCKL:
43929
43796
case X86ISD::UNPCKH:
43930
43797
case X86ISD::BLENDI:
43798
+ case X86ISD::SHUFP:
43931
43799
// Integer ops.
43932
43800
case X86ISD::PACKSS:
43933
43801
case X86ISD::PACKUS:
0 commit comments