@@ -39852,13 +39852,6 @@ static bool matchBinaryPermuteShuffle(
   return false;
 }

-static SDValue combineX86ShuffleChainWithExtract(
-    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
-    ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
-    bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
-    bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
-    const X86Subtarget &Subtarget);
-
 /// Combine an arbitrary chain of shuffles into a single instruction if
 /// possible.
 ///
@@ -40403,14 +40396,6 @@ static SDValue combineX86ShuffleChain(
     return DAG.getBitcast(RootVT, Res);
   }

-  // If that failed and either input is extracted then try to combine as a
-  // shuffle with the larger type.
-  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-          Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
-          AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
-          IsMaskedShuffle, DAG, DL, Subtarget))
-    return WideShuffle;
-
   // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
   // (non-VLX will pad to 512-bit shuffles).
   if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40576,14 +40561,6 @@ static SDValue combineX86ShuffleChain(
     return DAG.getBitcast(RootVT, Res);
   }

-  // If that failed and either input is extracted then try to combine as a
-  // shuffle with the larger type.
-  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
-          Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
-          AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
-          DAG, DL, Subtarget))
-    return WideShuffle;
-
   // If we have a dual input shuffle then lower to VPERMV3,
   // (non-VLX will pad to 512-bit shuffles)
   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40609,148 +40586,6 @@ static SDValue combineX86ShuffleChain(
   return SDValue();
 }

-// Combine an arbitrary chain of shuffles + extract_subvectors into a single
-// instruction if possible.
-//
-// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
-// type size to attempt to combine:
-// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
-// -->
-// extract_subvector(shuffle(x,y,m2),0)
-static SDValue combineX86ShuffleChainWithExtract(
-    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
-    ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
-    bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
-    bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
-    const X86Subtarget &Subtarget) {
-  unsigned NumMaskElts = BaseMask.size();
-  unsigned NumInputs = Inputs.size();
-  if (NumInputs == 0)
-    return SDValue();
-
-  unsigned RootSizeInBits = RootVT.getSizeInBits();
-  unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
-  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
-
-  // Peek through subvectors to find widest legal vector.
-  // TODO: Handle ISD::TRUNCATE
-  unsigned WideSizeInBits = RootSizeInBits;
-  for (SDValue Input : Inputs) {
-    Input = peekThroughBitcasts(Input);
-    while (1) {
-      if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-        Input = peekThroughBitcasts(Input.getOperand(0));
-        continue;
-      }
-      if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Input.getOperand(0).isUndef()) {
-        Input = peekThroughBitcasts(Input.getOperand(1));
-        continue;
-      }
-      break;
-    }
-    if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
-        WideSizeInBits < Input.getValueSizeInBits())
-      WideSizeInBits = Input.getValueSizeInBits();
-  }
-
-  // Bail if we fail to find a source larger than the existing root.
-  if (WideSizeInBits <= RootSizeInBits ||
-      (WideSizeInBits % RootSizeInBits) != 0)
-    return SDValue();
-
-  // Create new mask for larger type.
-  SmallVector<int, 64> WideMask;
-  growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
-
-  // Attempt to peek through inputs and adjust mask when we extract from an
-  // upper subvector.
-  int AdjustedMasks = 0;
-  SmallVector<SDValue, 4> WideInputs(Inputs);
-  for (unsigned I = 0; I != NumInputs; ++I) {
-    SDValue &Input = WideInputs[I];
-    Input = peekThroughBitcasts(Input);
-    while (1) {
-      if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-          Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
-        uint64_t Idx = Input.getConstantOperandVal(1);
-        if (Idx != 0) {
-          ++AdjustedMasks;
-          unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
-          Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
-
-          int lo = I * WideMask.size();
-          int hi = (I + 1) * WideMask.size();
-          for (int &M : WideMask)
-            if (lo <= M && M < hi)
-              M += Idx;
-        }
-        Input = peekThroughBitcasts(Input.getOperand(0));
-        continue;
-      }
-      // TODO: Handle insertions into upper subvectors.
-      if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Input.getOperand(0).isUndef() &&
-          isNullConstant(Input.getOperand(2))) {
-        Input = peekThroughBitcasts(Input.getOperand(1));
-        continue;
-      }
-      break;
-    }
-  }
-
-  // Remove unused/repeated shuffle source ops.
-  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
-  assert(!WideInputs.empty() && "Shuffle with no inputs detected");
-
-  // Bail if we're always extracting from the lowest subvectors,
-  // combineX86ShuffleChain should match this for the current width, or the
-  // shuffle still references too many inputs.
-  if (AdjustedMasks == 0 || WideInputs.size() > 2)
-    return SDValue();
-
-  // Minor canonicalization of the accumulated shuffle mask to make it easier
-  // to match below. All this does is detect masks with sequential pairs of
-  // elements, and shrink them to the half-width mask. It does this in a loop
-  // so it will reduce the size of the mask to the minimal width mask which
-  // performs an equivalent shuffle.
-  while (WideMask.size() > 1) {
-    SmallVector<int, 64> WidenedMask;
-    if (!canWidenShuffleElements(WideMask, WidenedMask))
-      break;
-    WideMask = std::move(WidenedMask);
-  }
-
-  // Canonicalization of binary shuffle masks to improve pattern matching by
-  // commuting the inputs.
-  if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
-    ShuffleVectorSDNode::commuteMask(WideMask);
-    std::swap(WideInputs[0], WideInputs[1]);
-  }
-
-  // Increase depth for every upper subvector we've peeked through.
-  Depth += AdjustedMasks;
-
-  // Attempt to combine wider chain.
-  // TODO: Can we use a better Root?
-  SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
-                             WideInputs.back().getValueSizeInBits()
-                         ? WideInputs.front()
-                         : WideInputs.back();
-  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
-         "WideRootSize mismatch");
-
-  if (SDValue WideShuffle = combineX86ShuffleChain(
-          WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
-          Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
-          IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
-    WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
-    return DAG.getBitcast(RootVT, WideShuffle);
-  }
-
-  return SDValue();
-}
-
 // Canonicalize the combined shuffle mask chain with horizontal ops.
 // NOTE: This may update the Ops and Mask.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
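Note on the helper removed above: its core rewrite is shuffle(extract_subvector(x,c1), extract_subvector(y,c2), m1) --> extract_subvector(shuffle(x,y,m2), 0), which depends on widening the root mask before retrying the combine. The snippet below is a standalone sketch of that mask-widening step in plain C++, not LLVM code; growMaskSketch is a hypothetical stand-in for what growShuffleMask is assumed to do (re-base each input's indices into the wider concatenated numbering and pad the new upper lanes with undef).

#include <cassert>
#include <vector>

// Hypothetical helper; the real growShuffleMask may differ in details.
static void growMaskSketch(const std::vector<int> &BaseMask,
                           std::vector<int> &WideMask,
                           unsigned RootSizeInBits, unsigned WideSizeInBits) {
  assert((WideSizeInBits % RootSizeInBits) == 0 && "not a whole multiple");
  unsigned Scale = WideSizeInBits / RootSizeInBits;
  unsigned NumElts = BaseMask.size();
  WideMask.clear();
  for (int M : BaseMask) {
    if (M < 0) {                              // undef/zero sentinels pass through
      WideMask.push_back(M);
      continue;
    }
    unsigned Input = unsigned(M) / NumElts;   // which source operand
    unsigned Elt = unsigned(M) % NumElts;     // element within that operand
    WideMask.push_back(int(Input * Scale * NumElts + Elt));
  }
  WideMask.resize(Scale * NumElts, -1);       // pad new upper lanes with undef
}

Under this sketch, widening a 128-bit root to 256 bits (Scale = 2) turns a v4i32 mask {4,1,6,3} into {8,1,10,3,-1,-1,-1,-1}; the wider shuffle can then be combined and the low 128 bits extracted.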
@@ -41163,6 +40998,54 @@ static SDValue combineX86ShufflesRecursively(
     OpMask.assign(NumElts, SM_SentinelUndef);
     std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
     OpZero = OpUndef = APInt::getZero(NumElts);
+  } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+             TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+             Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
+             (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
+    // Extracting from vector larger than RootVT - scale the mask and attempt to
+    // fold the shuffle with the larger root type, then extract the lower
+    // elements.
+    unsigned NewRootSizeInBits = Op.getOperand(0).getValueSizeInBits();
+    unsigned Scale = NewRootSizeInBits / RootSizeInBits;
+    MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
+                                     Scale * RootVT.getVectorNumElements());
+    SmallVector<int, 64> NewRootMask;
+    growShuffleMask(RootMask, NewRootMask, RootSizeInBits, NewRootSizeInBits);
+    // If we're using the lowest subvector, just replace it directly in the src
+    // ops/nodes.
+    SmallVector<SDValue, 16> NewSrcOps(SrcOps);
+    SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
+    if (isNullConstant(Op.getOperand(1))) {
+      NewSrcOps[SrcOpIndex] = Op.getOperand(0);
+      NewSrcNodes.push_back(Op.getNode());
+    }
+    // Don't increase the combine depth - we're effectively working on the same
+    // nodes, just with a wider type.
+    if (SDValue WideShuffle = combineX86ShufflesRecursively(
+            NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
+            Depth, MaxDepth, AllowVariableCrossLaneMask,
+            AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
+      return DAG.getBitcast(
+          RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
+    return SDValue();
+  } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+             Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+             Op.getOperand(1).getOperand(0).getValueSizeInBits() >
+                 RootSizeInBits) {
+    // If we're inserting a subvector extracted from a vector larger than
+    // RootVT, then combine the insert_subvector as a shuffle, the
+    // extract_subvector will be folded in a later recursion.
+    SDValue BaseVec = Op.getOperand(0);
+    SDValue SubVec = Op.getOperand(1);
+    int InsertIdx = Op.getConstantOperandVal(2);
+    unsigned NumBaseElts = VT.getVectorNumElements();
+    unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
+    OpInputs.assign({BaseVec, SubVec});
+    OpMask.resize(NumBaseElts);
+    std::iota(OpMask.begin(), OpMask.end(), 0);
+    std::iota(OpMask.begin() + InsertIdx,
+              OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
+    OpZero = OpUndef = APInt::getZero(NumBaseElts);
   } else {
     return SDValue();
   }
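The INSERT_SUBVECTOR case added above models the insertion as a two-input shuffle: an identity mask over the base vector, with the inserted lanes redirected to the subvector, whose elements start at NumBaseElts in the concatenated-input numbering. Below is a minimal standalone illustration of that mask construction (plain C++ mirroring the two std::iota calls, not the LLVM data structures).

#include <numeric>
#include <vector>

// Build the shuffle mask that represents
//   insert_subvector(Base, Sub, InsertIdx)
// as shuffle(Base, Sub): lanes [InsertIdx, InsertIdx + NumSubElts) read from
// the subvector, whose elements are numbered starting at NumBaseElts.
static std::vector<int> insertSubvectorMask(unsigned NumBaseElts,
                                            unsigned NumSubElts,
                                            unsigned InsertIdx) {
  std::vector<int> Mask(NumBaseElts);
  std::iota(Mask.begin(), Mask.end(), 0);          // identity over Base
  std::iota(Mask.begin() + InsertIdx,
            Mask.begin() + InsertIdx + NumSubElts,
            int(NumBaseElts));                     // redirect to Sub lanes
  return Mask;
}

// Example: NumBaseElts = 8, NumSubElts = 4, InsertIdx = 4
//   -> {0, 1, 2, 3, 8, 9, 10, 11}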
@@ -41509,25 +41392,9 @@ static SDValue combineX86ShufflesRecursively(
             AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
             IsMaskedShuffle, DAG, DL, Subtarget))
       return Shuffle;
-
-    // If all the operands come from the same larger vector, fallthrough and try
-    // to use combineX86ShuffleChainWithExtract.
-    SDValue LHS = peekThroughBitcasts(Ops.front());
-    SDValue RHS = peekThroughBitcasts(Ops.back());
-    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
-        (RootSizeInBits / Mask.size()) != 64 ||
-        LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
-        RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
-        LHS.getOperand(0) != RHS.getOperand(0))
-      return SDValue();
   }

-  // If that failed and any input is extracted then try to combine as a
-  // shuffle with the larger type.
-  return combineX86ShuffleChainWithExtract(
-      Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
-      AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
-      DAG, DL, Subtarget);
+  return SDValue();
 }

 /// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -44196,6 +44063,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::BLENDI:
+  case X86ISD::SHUFP:
   // Integer ops.
   case X86ISD::PACKSS:
   case X86ISD::PACKUS:
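Adding X86ISD::SHUFP to this opcode list lets SimplifyDemandedVectorEltsForTargetNode treat it like the other fixed-mask two-input shuffles listed here. As a purely conceptual sketch (not the LLVM API), demanded-elements propagation through such a shuffle just maps each demanded output lane back to the input lane named by the mask:

#include <cstdint>
#include <vector>

// Conceptual sketch: given a two-input shuffle mask (indices into the
// concatenation LHS|RHS, -1 = undef) and a bitmask of demanded output
// elements, compute which LHS/RHS elements are actually demanded.
// This mirrors the idea behind demanded-elts simplification, not the real
// X86TargetLowering implementation.
static void demandedShuffleInputs(const std::vector<int> &Mask,
                                  uint64_t DemandedOut, uint64_t &DemandedLHS,
                                  uint64_t &DemandedRHS) {
  const unsigned NumElts = Mask.size();
  DemandedLHS = DemandedRHS = 0;
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!(DemandedOut & (uint64_t(1) << I)) || Mask[I] < 0)
      continue; // output lane not demanded, or undef lane
    unsigned M = unsigned(Mask[I]);
    if (M < NumElts)
      DemandedLHS |= uint64_t(1) << M;             // reads from LHS
    else
      DemandedRHS |= uint64_t(1) << (M - NumElts); // reads from RHS
  }
}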