@@ -4467,12 +4467,22 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4467
4467
}
4468
4468
4469
4469
// Return true if N is recognised as an all-zeros vector. One BITCAST wrapped
// around the value is looked through first. The remaining node qualifies if
// it is a SPLAT_VECTOR whose scalar operand is the constant 0, or if
// ISD::isBuildVectorAllZeros() accepts it.
static bool isZeroVector(SDValue N) {
  // Peel off a single level of bitcast, if present.
  SDValue Src = N;
  if (Src->getOpcode() == ISD::BITCAST)
    Src = Src->getOperand(0);

  if (Src->getOpcode() == ISD::SPLAT_VECTOR) {
    // A splat of a constant scalar is decided by that scalar alone. A splat
    // of a non-constant scalar falls through to the BUILD_VECTOR query.
    auto *SplatVal = dyn_cast<ConstantSDNode>(Src->getOperand(0));
    if (SplatVal)
      return SplatVal->getZExtValue() == 0;
  }

  return ISD::isBuildVectorAllZeros(Src.getNode());
}
4475
4477
4478
+ // Return the index of the zero/undef vector, or UINT32_MAX if not found.
4479
+ static uint32_t findZeroVectorIdx (SDValue *Ops, unsigned Num) {
4480
+ for (unsigned I = 0 ; I < Num ; I++)
4481
+ if (isZeroVector (Ops[I]))
4482
+ return I;
4483
+ return UINT32_MAX;
4484
+ }
4485
+
4476
4486
// Bytes is a VPERM-like permute vector, except that -1 is used for
4477
4487
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
4478
4488
// VSLDB or VPERM.
@@ -4491,9 +4501,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4491
4501
4492
4502
// Fall back on VPERM. Construct an SDNode for the permute vector. Try to
4493
4503
// eliminate a zero vector by reusing any zero index in the permute vector.
4494
- unsigned ZeroVecIdx =
4495
- isZeroVector (Ops[0 ]) ? 0 : (isZeroVector (Ops[1 ]) ? 1 : UINT_MAX);
4496
- if (ZeroVecIdx != UINT_MAX) {
4504
+ unsigned ZeroVecIdx = findZeroVectorIdx (&Ops[0 ], 2 );
4505
+ if (ZeroVecIdx != UINT32_MAX) {
4497
4506
bool MaskFirst = true ;
4498
4507
int ZeroIdx = -1 ;
4499
4508
for (unsigned I = 0 ; I < SystemZ::VectorBytes; ++I) {
@@ -4551,10 +4560,13 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4551
4560
namespace {
4552
4561
// Describes a general N-operand vector shuffle.
4553
4562
struct GeneralShuffle {
4554
- GeneralShuffle (EVT vt) : VT(vt) {}
4563
+ GeneralShuffle (EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
4555
4564
void addUndef ();
4556
4565
bool add (SDValue, unsigned );
4557
4566
SDValue getNode (SelectionDAG &, const SDLoc &);
4567
+ void tryPrepareForUnpack ();
4568
+ bool unpackWasPrepared () { return UnpackFromEltSize <= 4 ; }
4569
+ SDValue insertUnpackIfPrepared (SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
4558
4570
4559
4571
// The operands of the shuffle.
4560
4572
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
@@ -4566,6 +4578,9 @@ struct GeneralShuffle {
4566
4578
4567
4579
// The type of the shuffle result.
4568
4580
EVT VT;
4581
+
4582
+ // Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
4583
+ unsigned UnpackFromEltSize;
4569
4584
};
4570
4585
}
4571
4586
@@ -4648,6 +4663,9 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
4648
4663
if (Ops.size () == 0 )
4649
4664
return DAG.getUNDEF (VT);
4650
4665
4666
+ // Use a single unpack if possible as the last operation.
4667
+ tryPrepareForUnpack ();
4668
+
4651
4669
// Make sure that there are at least two shuffle operands.
4652
4670
if (Ops.size () == 1 )
4653
4671
Ops.push_back (DAG.getUNDEF (MVT::v16i8));
@@ -4713,13 +4731,117 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
4713
4731
// to VPERM.
4714
4732
unsigned OpNo0, OpNo1;
4715
4733
SDValue Op;
4716
- if (const Permute *P = matchPermute (Bytes, OpNo0, OpNo1))
4734
+ if (unpackWasPrepared () && Ops[1 ].isUndef ())
4735
+ Op = Ops[0 ];
4736
+ else if (const Permute *P = matchPermute (Bytes, OpNo0, OpNo1))
4717
4737
Op = getPermuteNode (DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
4718
4738
else
4719
4739
Op = getGeneralPermuteNode (DAG, DL, &Ops[0 ], Bytes);
4740
+
4741
+ Op = insertUnpackIfPrepared (DAG, DL, Op);
4742
+
4720
4743
return DAG.getNode (ISD::BITCAST, DL, VT, Op);
4721
4744
}
4722
4745
4746
+ #ifndef NDEBUG
4747
+ static void dumpBytes (const SmallVectorImpl<int > &Bytes, std::string Msg) {
4748
+ dbgs () << Msg.c_str () << " { " ;
4749
+ for (unsigned i = 0 ; i < Bytes.size (); i++)
4750
+ dbgs () << Bytes[i] << " " ;
4751
+ dbgs () << " }\n " ;
4752
+ }
4753
+ #endif
4754
+
4755
+ // If the Bytes vector matches an unpack operation, prepare to do the unpack
4756
+ // after all else by removing the zero vector and the effect of the unpack on
4757
+ // Bytes.
4758
+ void GeneralShuffle::tryPrepareForUnpack () {
4759
+ uint32_t ZeroVecOpNo = findZeroVectorIdx (&Ops[0 ], Ops.size ());
4760
+ if (ZeroVecOpNo == UINT32_MAX || Ops.size () == 1 )
4761
+ return ;
4762
+
4763
+ // Only do this if removing the zero vector reduces the depth, otherwise
4764
+ // the critical path will increase with the final unpack.
4765
+ if (Ops.size () > 2 &&
4766
+ Log2_32_Ceil (Ops.size ()) == Log2_32_Ceil (Ops.size () - 1 ))
4767
+ return ;
4768
+
4769
+ // Find an unpack that would allow removing the zero vector from Ops.
4770
+ UnpackFromEltSize = 1 ;
4771
+ for (; UnpackFromEltSize <= 4 ; UnpackFromEltSize *= 2 ) {
4772
+ bool MatchUnpack = true ;
4773
+ SmallVector<int , SystemZ::VectorBytes> SrcBytes;
4774
+ for (unsigned Elt = 0 ; Elt < SystemZ::VectorBytes; Elt++) {
4775
+ unsigned ToEltSize = UnpackFromEltSize * 2 ;
4776
+ bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
4777
+ if (!IsZextByte)
4778
+ SrcBytes.push_back (Bytes[Elt]);
4779
+ if (Bytes[Elt] != -1 ) {
4780
+ unsigned OpNo = unsigned (Bytes[Elt]) / SystemZ::VectorBytes;
4781
+ if (IsZextByte != (OpNo == ZeroVecOpNo)) {
4782
+ MatchUnpack = false ;
4783
+ break ;
4784
+ }
4785
+ }
4786
+ }
4787
+ if (MatchUnpack) {
4788
+ if (Ops.size () == 2 ) {
4789
+ // Don't use unpack if a single source operand needs rearrangement.
4790
+ for (unsigned i = 0 ; i < SystemZ::VectorBytes / 2 ; i++)
4791
+ if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int (i)) {
4792
+ UnpackFromEltSize = UINT_MAX;
4793
+ return ;
4794
+ }
4795
+ }
4796
+ break ;
4797
+ }
4798
+ }
4799
+ if (UnpackFromEltSize > 4 )
4800
+ return ;
4801
+
4802
+ LLVM_DEBUG (dbgs () << " Preparing for final unpack of element size "
4803
+ << UnpackFromEltSize << " . Zero vector is Op#" << ZeroVecOpNo
4804
+ << " .\n " ;
4805
+ dumpBytes (Bytes, " Original Bytes vector:" ););
4806
+
4807
+ // Apply the unpack in reverse to the Bytes array.
4808
+ unsigned B = 0 ;
4809
+ for (unsigned Elt = 0 ; Elt < SystemZ::VectorBytes;) {
4810
+ Elt += UnpackFromEltSize;
4811
+ for (unsigned i = 0 ; i < UnpackFromEltSize; i++, Elt++, B++)
4812
+ Bytes[B] = Bytes[Elt];
4813
+ }
4814
+ while (B < SystemZ::VectorBytes)
4815
+ Bytes[B++] = -1 ;
4816
+
4817
+ // Remove the zero vector from Ops
4818
+ Ops.erase (&Ops[ZeroVecOpNo]);
4819
+ for (unsigned I = 0 ; I < SystemZ::VectorBytes; ++I)
4820
+ if (Bytes[I] >= 0 ) {
4821
+ unsigned OpNo = unsigned (Bytes[I]) / SystemZ::VectorBytes;
4822
+ if (OpNo > ZeroVecOpNo)
4823
+ Bytes[I] -= SystemZ::VectorBytes;
4824
+ }
4825
+
4826
+ LLVM_DEBUG (dumpBytes (Bytes, " Resulting Bytes vector, zero vector removed:" );
4827
+ dbgs () << " \n " ;);
4828
+ }
4829
+
4830
+ SDValue GeneralShuffle::insertUnpackIfPrepared (SelectionDAG &DAG,
4831
+ const SDLoc &DL,
4832
+ SDValue Op) {
4833
+ if (!unpackWasPrepared ())
4834
+ return Op;
4835
+ unsigned InBits = UnpackFromEltSize * 8 ;
4836
+ EVT InVT = MVT::getVectorVT (MVT::getIntegerVT (InBits),
4837
+ SystemZ::VectorBits / InBits);
4838
+ SDValue PackedOp = DAG.getNode (ISD::BITCAST, DL, InVT, Op);
4839
+ unsigned OutBits = InBits * 2 ;
4840
+ EVT OutVT = MVT::getVectorVT (MVT::getIntegerVT (OutBits),
4841
+ SystemZ::VectorBits / OutBits);
4842
+ return DAG.getNode (SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
4843
+ }
4844
+
4723
4845
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
4724
4846
static bool isScalarToVector (SDValue Op) {
4725
4847
for (unsigned I = 1 , E = Op.getNumOperands (); I != E; ++I)
@@ -5114,9 +5236,8 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5114
5236
return DAG.getNode (ISD::BITCAST, DL, VT, Res);
5115
5237
}
5116
5238
5117
- SDValue
5118
- SystemZTargetLowering::lowerExtendVectorInreg (SDValue Op, SelectionDAG &DAG,
5119
- unsigned UnpackHigh) const {
5239
+ SDValue SystemZTargetLowering::
5240
+ lowerSIGN_EXTEND_VECTOR_INREG (SDValue Op, SelectionDAG &DAG) const {
5120
5241
SDValue PackedOp = Op.getOperand (0 );
5121
5242
EVT OutVT = Op.getValueType ();
5122
5243
EVT InVT = PackedOp.getValueType ();
@@ -5126,11 +5247,39 @@ SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
5126
5247
FromBits *= 2 ;
5127
5248
EVT OutVT = MVT::getVectorVT (MVT::getIntegerVT (FromBits),
5128
5249
SystemZ::VectorBits / FromBits);
5129
- PackedOp = DAG.getNode (UnpackHigh, SDLoc (PackedOp), OutVT, PackedOp);
5250
+ PackedOp =
5251
+ DAG.getNode (SystemZISD::UNPACK_HIGH, SDLoc (PackedOp), OutVT, PackedOp);
5130
5252
} while (FromBits != ToBits);
5131
5253
return PackedOp;
5132
5254
}
5133
5255
5256
+ // Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
5257
+ SDValue SystemZTargetLowering::
5258
+ lowerZERO_EXTEND_VECTOR_INREG (SDValue Op, SelectionDAG &DAG) const {
5259
+ SDValue PackedOp = Op.getOperand (0 );
5260
+ SDLoc DL (Op);
5261
+ EVT OutVT = Op.getValueType ();
5262
+ EVT InVT = PackedOp.getValueType ();
5263
+ unsigned InNumElts = InVT.getVectorNumElements ();
5264
+ unsigned OutNumElts = OutVT.getVectorNumElements ();
5265
+ unsigned NumInPerOut = InNumElts / OutNumElts;
5266
+
5267
+ SDValue ZeroVec =
5268
+ DAG.getSplatVector (InVT, DL, DAG.getConstant (0 , DL, InVT.getScalarType ()));
5269
+
5270
+ SmallVector<int , 16 > Mask (InNumElts);
5271
+ unsigned ZeroVecElt = InNumElts;
5272
+ for (unsigned PackedElt = 0 ; PackedElt < OutNumElts; PackedElt++) {
5273
+ unsigned MaskElt = PackedElt * NumInPerOut;
5274
+ unsigned End = MaskElt + NumInPerOut - 1 ;
5275
+ for (; MaskElt < End; MaskElt++)
5276
+ Mask[MaskElt] = ZeroVecElt++;
5277
+ Mask[MaskElt] = PackedElt;
5278
+ }
5279
+ SDValue Shuf = DAG.getVectorShuffle (InVT, DL, PackedOp, ZeroVec, Mask);
5280
+ return DAG.getNode (ISD::BITCAST, DL, OutVT, Shuf);
5281
+ }
5282
+
5134
5283
SDValue SystemZTargetLowering::lowerShift (SDValue Op, SelectionDAG &DAG,
5135
5284
unsigned ByScalar) const {
5136
5285
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5296,9 +5445,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
5296
5445
case ISD::EXTRACT_VECTOR_ELT:
5297
5446
return lowerEXTRACT_VECTOR_ELT (Op, DAG);
5298
5447
case ISD::SIGN_EXTEND_VECTOR_INREG:
5299
- return lowerExtendVectorInreg (Op, DAG, SystemZISD::UNPACK_HIGH );
5448
+ return lowerSIGN_EXTEND_VECTOR_INREG (Op, DAG);
5300
5449
case ISD::ZERO_EXTEND_VECTOR_INREG:
5301
- return lowerExtendVectorInreg (Op, DAG, SystemZISD::UNPACKL_HIGH );
5450
+ return lowerZERO_EXTEND_VECTOR_INREG (Op, DAG);
5302
5451
case ISD::SHL:
5303
5452
return lowerShift (Op, DAG, SystemZISD::VSHL_BY_SCALAR);
5304
5453
case ISD::SRL:
0 commit comments