@@ -10964,10 +10964,10 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10964
10964
// This entry crosses lanes, so there is no way to model this shuffle.
10965
10965
return false;
10966
10966
10967
- // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10968
- // Adjust second vector indices to start at LaneSize instead of Size.
10969
- int LocalM =
10970
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize ;
10967
+ // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10968
+ // later vector indices to start at multiples of LaneSize instead of Size.
10969
+ int LaneM = Mask[i] / Size;
10970
+ int LocalM = ( Mask[i] % LaneSize) + (LaneM * LaneSize) ;
10971
10971
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10972
10972
// This is the first non-undef entry in this slot of a 128-bit lane.
10973
10973
RepeatedMask[i % LaneSize] = LocalM;
@@ -36225,24 +36225,25 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
36225
36225
int NumEltsPerLane = NumElts / NumLanes;
36226
36226
int NumHalfEltsPerLane = NumEltsPerLane / 2;
36227
36227
MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36228
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36228
36229
36229
- // TODO: Add support for 256/512-bit vectors.
36230
- if (RootSizeInBits == 128 && NumEltsPerLane >= 4 &&
36230
+ if (NumEltsPerLane >= 4 &&
36231
36231
(isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36232
- SmallVector<int> ScaledMask;
36233
- if (scaleShuffleElements(Mask, 4, ScaledMask)) {
36232
+ SmallVector<int> LaneMask, ScaledMask;
36233
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36234
+ scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36234
36235
// See if we can remove the shuffle by resorting the HOP chain so that
36235
36236
// the HOP args are pre-shuffled.
36236
36237
// TODO: Generalize to any sized/depth chain.
36237
36238
// TODO: Add support for PACKSS/PACKUS.
36238
- if (isHoriz && NumEltsPerLane == 4 ) {
36239
+ if (isHoriz) {
36239
36240
// Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36240
36241
auto GetHOpSrc = [&](int M) {
36241
36242
if (M == SM_SentinelUndef)
36242
36243
return DAG.getUNDEF(VT0);
36243
36244
if (M == SM_SentinelZero)
36244
36245
return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36245
- SDValue Src0 = BC[M / NumElts ];
36246
+ SDValue Src0 = BC[M / 4 ];
36246
36247
SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36247
36248
if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36248
36249
return Src1.getOperand(M % 2);
@@ -36253,8 +36254,8 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
36253
36254
SDValue M2 = GetHOpSrc(ScaledMask[2]);
36254
36255
SDValue M3 = GetHOpSrc(ScaledMask[3]);
36255
36256
if (M0 && M1 && M2 && M3) {
36256
- SDValue LHS = DAG.getNode(Opcode0, DL, VT0 , M0, M1);
36257
- SDValue RHS = DAG.getNode(Opcode0, DL, VT0 , M2, M3);
36257
+ SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT , M0, M1);
36258
+ SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT , M2, M3);
36258
36259
return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36259
36260
}
36260
36261
}
@@ -36348,7 +36349,6 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
36348
36349
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
36349
36350
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36350
36351
// represents the LHS/RHS inputs for the lower/upper halves.
36351
- unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36352
36352
SmallVector<int, 16> TargetMask128, WideMask128;
36353
36353
if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36354
36354
scaleShuffleElements(TargetMask128, 2, WideMask128)) {
@@ -37564,29 +37564,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37564
37564
}
37565
37565
return SDValue();
37566
37566
}
37567
- case X86ISD::UNPCKL:
37568
- case X86ISD::UNPCKH: {
37569
- // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
37570
- // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
37571
- // Don't fold if hop(x,y) == hop(z,w).
37572
- // TODO: Merge this into canonicalizeShuffleMaskWithHorizOp?
37573
- SDValue N0 = N.getOperand(0);
37574
- SDValue N1 = N.getOperand(1);
37575
- if (VT.getScalarSizeInBits() == 32 && N0 != N1 &&
37576
- N0.getOpcode() == N1.getOpcode() && isHorizOp(N0.getOpcode())) {
37577
- unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
37578
- SDValue Res = DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(LoHi),
37579
- N1.getOperand(LoHi));
37580
- // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle
37581
- // combining and domain handling will simplify this later on.
37582
- EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
37583
- Res = DAG.getBitcast(ShuffleVT, Res);
37584
- Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
37585
- getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
37586
- return DAG.getBitcast(VT, Res);
37587
- }
37588
- return SDValue();
37589
- }
37590
37567
case X86ISD::VPERMI: {
37591
37568
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37592
37569
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
0 commit comments