@@ -3577,6 +3577,16 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
     Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
 }
 
+/// Return true if every element in Mask is an in-place blend/select mask or is
+/// undef.
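+///
+/// Illustrative example (not part of the original comment): with NumElts == 4,
+/// <0, 5, 2, -1> is a blend mask, since each element either keeps the LHS
+/// element (M == I), takes the RHS element (M == I + 4), or is undef.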
+static bool isBlendOrUndef(ArrayRef<int> Mask) {
+  unsigned NumElts = Mask.size();
+  for (auto [I, M] : enumerate(Mask))
+    if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
+      return false;
+  return true;
+}
+
 /// Return true if every element in Mask, beginning
 /// from position Pos and ending in Pos + Size, falls within the specified
 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
@@ -40021,6 +40031,93 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
   return SDValue();
 }
 
+// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
+// iff we don't demand the same element index for both X and Y.
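+//
+// Illustrative v4i32 example (not from the original patch): with BlendMask
+// <0,5,2,7>, ScaledMask0 <2,3,0,1> and ScaledMask1 <1,1,3,3>, lanes 0/2 come
+// from X elements {2,0} and lanes 1/3 from Y elements {1,3}; the demanded
+// indices are disjoint, so this becomes
+// PERMUTE(BLEND(X,Y,<0,5,2,7>), <2,1,0,3>).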
+static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1,
+                                      ArrayRef<int> BlendMask,
+                                      const APInt &DemandedElts,
+                                      SelectionDAG &DAG, const SDLoc &DL) {
+  assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue BC0 = peekThroughOneUseBitcasts(N0);
+  SDValue BC1 = peekThroughOneUseBitcasts(N1);
+
+  // See if both operands are shuffles, and that we can scale the shuffle masks
+  // to the same width as the blend mask.
+  // TODO: Support SM_SentinelZero?
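+  // (For example, a v2i64 permute mask <1,0> under a v4i32 blend scales to
+  // the v4i32 mask <2,3,0,1>.)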
+  SmallVector<SDValue, 2> Ops0, Ops1;
+  SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
+  if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
+      !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
+      !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
+      !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
+    return SDValue();
+
+  // Determine the demanded elts from both permutes.
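+  // Demanded0/Demanded1 are the blend lanes sourced from N0/N1, and
+  // DemandedLHS0/DemandedLHS1 are the source elements each permute reads to
+  // feed those lanes.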
+  APInt Demanded0, DemandedLHS0, DemandedRHS0;
+  APInt Demanded1, DemandedLHS1, DemandedRHS1;
+  if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
+                              Demanded1,
+                              /*AllowUndefElts=*/true) ||
+      !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
+                              DemandedRHS0, /*AllowUndefElts=*/true) ||
+      !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
+                              DemandedRHS1, /*AllowUndefElts=*/true))
+    return SDValue();
+
+  // Confirm that we only use a single operand from both permutes and that we
+  // don't demand the same index from both.
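+  // (If both permutes demanded the same source index, NewBlendMask below
+  // would have to select it from the LHS and the RHS at once.)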
+  if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
+      DemandedLHS0.intersects(DemandedLHS1))
+    return SDValue();
+
+  // Use the permute demanded elts masks as the new blend mask.
+  // Create the new permute mask as a blend of the 2 original permute masks.
+  SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
+  for (int I = 0; I != (int)NumElts; ++I) {
+    if (Demanded0[I]) {
+      int M = ScaledMask0[I];
+      if (0 <= M) {
+        assert(isUndefOrEqual(NewBlendMask[M], M) &&
+               "BlendMask demands LHS AND RHS");
+        NewBlendMask[M] = M;
+        NewPermuteMask[I] = M;
+      }
+    } else if (Demanded1[I]) {
+      int M = ScaledMask1[I];
+      if (0 <= M) {
+        assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
+               "BlendMask demands LHS AND RHS");
+        NewBlendMask[M] = M + NumElts;
+        NewPermuteMask[I] = M;
+      }
+    }
+  }
+  assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
+  assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
+
+  // v16i16 shuffles can explode in complexity very easily; only accept them if
+  // the blend mask is the same in the 128-bit subvectors (or can widen to
+  // v8i32) and the permute can be widened as well.
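+  // (A lane-repeated i16 blend maps onto a single VPBLENDW immediate, and a
+  // widenable one onto VPBLENDD; anything else risks a costlier expansion.)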
+  if (VT == MVT::v16i16) {
+    if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
+        !canWidenShuffleElements(NewBlendMask))
+      return SDValue();
+    if (!canWidenShuffleElements(NewPermuteMask))
+      return SDValue();
+  }
+
+  SDValue NewBlend =
+      DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
+                           DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
+  return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
+                              NewPermuteMask);
+}
+
 // TODO - move this to TLI like isBinOp?
 static bool isUnaryOp(unsigned Opcode) {
   switch (Opcode) {
@@ -41773,6 +41870,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
     break;
   }
+  case X86ISD::BLENDI: {
+    SmallVector<int, 16> BlendMask;
+    DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
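+    // Fold BLENDI(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLENDI(X,Y)) when the
+    // permutes demand disjoint indices (see combineBlendOfPermutes above).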
+    if (SDValue R = combineBlendOfPermutes(VT.getSimpleVT(), Op.getOperand(0),
+                                           Op.getOperand(1), BlendMask,
+                                           DemandedElts, TLO.DAG, SDLoc(Op)))
+      return TLO.CombineTo(Op, R);
+    break;
+  }
   case X86ISD::BLENDV: {
     APInt SelUndef, SelZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,