Skip to content

Commit c5b4f03

Browse files
Anil Mahmud authored and lei137 committed
[PowerPC] Exploit xxspltiw and xxspltidp instructions
Exploits the VSX Vector Splat Immediate Word and VSX Vector Splat Immediate Double Precision instructions:
  xxspltiw XT,IMM32
  xxspltidp XT,IMM32

Differential Revision: https://reviews.llvm.org/D82911
1 parent 0670f85 commit c5b4f03

File tree

6 files changed

+539
-28
lines changed

6 files changed

+539
-28
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 93 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1473,6 +1473,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
14731473
case PPCISD::STFIWX: return "PPCISD::STFIWX";
14741474
case PPCISD::VPERM: return "PPCISD::VPERM";
14751475
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
1476+
case PPCISD::XXSPLTI_SP_TO_DP:
1477+
return "PPCISD::XXSPLTI_SP_TO_DP";
14761478
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
14771479
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
14781480
case PPCISD::VECSHL: return "PPCISD::VECSHL";
@@ -8966,19 +8968,21 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
89668968
// Vector related lowering.
89678969
//
89688970

8969-
/// BuildSplatI - Build a canonical splati of Val with an element size of
8970-
/// SplatSize. Cast the result to VT.
8971-
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
8972-
SelectionDAG &DAG, const SDLoc &dl) {
8971+
/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
8972+
/// element size of SplatSize. Cast the result to VT.
8973+
static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
8974+
SelectionDAG &DAG, const SDLoc &dl) {
89738975
static const MVT VTys[] = { // canonical VT to use for each size.
89748976
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
89758977
};
89768978

89778979
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
89788980

8979-
// Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
8980-
if (Val == -1)
8981+
// For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
8982+
if (Val == ((1LU << (SplatSize * 8)) - 1)) {
89818983
SplatSize = 1;
8984+
Val = 0xFF;
8985+
}
89828986

89838987
EVT CanonicalVT = VTys[SplatSize-1];
89848988

@@ -9113,6 +9117,34 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
91139117
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
91149118
}
91159119

9120+
// Convert the argument APFloat to a single precision APFloat if there is no
9121+
// loss in information during the conversion to single precision APFloat and the
9122+
// resulting number is not a denormal number. Return true if successful.
9123+
bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
9124+
// Convert a copy so that ArgAPFloat is only overwritten when the conversion
// is accepted below.
APFloat APFloatToConvert = ArgAPFloat;
9125+
// Start pessimistic; convert() is handed this flag by pointer to report
// whether the conversion lost information.
bool LosesInfo = true;
9126+
APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
9127+
&LosesInfo);
9128+
// Accept only a conversion that lost no information and whose result is not
// a denormal.
bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
9129+
if (Success)
9130+
ArgAPFloat = APFloatToConvert;
9131+
return Success;
9132+
}
9133+
9134+
// Bitcast the argument APInt to a double and convert it to a single precision
9135+
// APFloat, bitcast the APFloat to an APInt and assign it to the original
9136+
// argument if there is no loss in information during the conversion from
9137+
// double to single precision APFloat and the resulting number is not a denormal
9138+
// number. Return true if successful.
9139+
bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
9140+
// Reinterpret the integer's bits as a double and wrap it in an APFloat.
double DpValue = ArgAPInt.bitsToDouble();
9141+
APFloat APFloatDp(DpValue);
9142+
// Delegate to the APFloat overload, which rewrites APFloatDp in place only
// when it returns true.
bool Success = convertToNonDenormSingle(APFloatDp);
9143+
// On success, replace ArgAPInt with the bit pattern of the converted value;
// on failure ArgAPInt is left unchanged.
if (Success)
9144+
ArgAPInt = APFloatDp.bitcastToAPInt();
9145+
return Success;
9146+
}
9147+
91169148
// If this is a case we can't handle, return null and let the default
91179149
// expansion code take care of it. If we CAN select this case, and if it
91189150
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -9232,9 +9264,23 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
92329264
APInt APSplatBits, APSplatUndef;
92339265
unsigned SplatBitSize;
92349266
bool HasAnyUndefs;
9235-
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
9236-
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
9237-
SplatBitSize > 32) {
9267+
bool BVNIsConstantSplat =
9268+
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
9269+
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
9270+
9271+
// If it is a splat of a double, check if we can shrink it to a 32 bit
9272+
// non-denormal float which when converted back to double gives us the same
9273+
// double. This is to exploit the XXSPLTIDP instruction.
9274+
if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
9275+
(SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
9276+
convertToNonDenormSingle(APSplatBits)) {
9277+
SDValue SplatNode = DAG.getNode(
9278+
PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
9279+
DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
9280+
return DAG.getBitcast(Op.getValueType(), SplatNode);
9281+
}
9282+
9283+
if (!BVNIsConstantSplat || SplatBitSize > 32) {
92389284

92399285
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
92409286
// Handle load-and-splat patterns as we have instructions that will do this
@@ -9273,8 +9319,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
92739319
return SDValue();
92749320
}
92759321

9276-
unsigned SplatBits = APSplatBits.getZExtValue();
9277-
unsigned SplatUndef = APSplatUndef.getZExtValue();
9322+
uint64_t SplatBits = APSplatBits.getZExtValue();
9323+
uint64_t SplatUndef = APSplatUndef.getZExtValue();
92789324
unsigned SplatSize = SplatBitSize / 8;
92799325

92809326
// First, handle single instruction cases.
@@ -9289,17 +9335,30 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
92899335
return Op;
92909336
}
92919337

9292-
// We have XXSPLTIB for constant splats one byte wide
9293-
// FIXME: SplatBits is an unsigned int being cast to an int while passing it
9294-
// as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here.
9338+
// We have XXSPLTIW for constant splats four bytes wide.
9339+
// Given vector length is a multiple of 4, 2-byte splats can be replaced
9340+
// with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
9341+
// make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
9342+
// turned into a 4-byte splat of 0xABABABAB.
9343+
if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
9344+
return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
9345+
Op.getValueType(), DAG, dl);
9346+
9347+
if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
9348+
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
9349+
dl);
9350+
9351+
// We have XXSPLTIB for constant splats one byte wide.
92959352
if (Subtarget.hasP9Vector() && SplatSize == 1)
9296-
return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);
9353+
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
9354+
dl);
92979355

92989356
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
92999357
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
93009358
(32-SplatBitSize));
93019359
if (SextVal >= -16 && SextVal <= 15)
9302-
return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
9360+
return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
9361+
dl);
93039362

93049363
// Two instruction sequences.
93059364

@@ -9330,7 +9389,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93309389
// for fneg/fabs.
93319390
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
93329391
// Make -1 and vspltisw -1:
9333-
SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
9392+
SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
93349393

93359394
// Make the VSLW intrinsic, computing 0x8000_0000.
93369395
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
@@ -9358,7 +9417,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93589417

93599418
// vsplti + shl self.
93609419
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
9361-
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
9420+
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
93629421
static const unsigned IIDs[] = { // Intrinsic to use for each size.
93639422
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
93649423
Intrinsic::ppc_altivec_vslw
@@ -9369,7 +9428,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93699428

93709429
// vsplti + srl self.
93719430
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
9372-
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
9431+
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
93739432
static const unsigned IIDs[] = { // Intrinsic to use for each size.
93749433
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
93759434
Intrinsic::ppc_altivec_vsrw
@@ -9380,7 +9439,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93809439

93819440
// vsplti + sra self.
93829441
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
9383-
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
9442+
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
93849443
static const unsigned IIDs[] = { // Intrinsic to use for each size.
93859444
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
93869445
Intrinsic::ppc_altivec_vsraw
@@ -9392,7 +9451,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93929451
// vsplti + rol self.
93939452
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
93949453
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
9395-
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
9454+
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
93969455
static const unsigned IIDs[] = { // Intrinsic to use for each size.
93979456
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
93989457
Intrinsic::ppc_altivec_vrlw
@@ -9403,19 +9462,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
94039462

94049463
// t = vsplti c, result = vsldoi t, t, 1
94059464
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
9406-
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
9465+
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
94079466
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
94089467
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
94099468
}
94109469
// t = vsplti c, result = vsldoi t, t, 2
94119470
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
9412-
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
9471+
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
94139472
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
94149473
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
94159474
}
94169475
// t = vsplti c, result = vsldoi t, t, 3
94179476
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
9418-
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
9477+
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
94199478
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
94209479
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
94219480
}
@@ -10817,9 +10876,9 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
1081710876
if (Op.getValueType() == MVT::v4i32) {
1081810877
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
1081910878

10820-
SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
10821-
SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
10822-
10879+
SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
10880+
// +16 as shift amt.
10881+
SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
1082310882
SDValue RHSSwap = // = vrlw RHS, 16
1082410883
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
1082510884

@@ -16239,6 +16298,13 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
1623916298
return false;
1624016299
case MVT::f32:
1624116300
case MVT::f64:
16301+
if (Subtarget.hasPrefixInstrs()) {
16302+
// With prefixed instructions, we can materialize anything that can be
16303+
// represented with a 32-bit immediate, not just positive zero.
16304+
APFloat APFloatOfImm = Imm;
16305+
return convertToNonDenormSingle(APFloatOfImm);
16306+
}
16307+
LLVM_FALLTHROUGH;
1624216308
case MVT::ppcf128:
1624316309
return Imm.isPosZero();
1624416310
}

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,11 @@ namespace llvm {
9797
///
9898
XXSPLT,
9999

100+
/// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for
101+
/// converting immediate single precision numbers to double precision
102+
/// vector or scalar.
103+
XXSPLTI_SP_TO_DP,
104+
100105
/// VECINSERT - The PPC vector insert instruction
101106
///
102107
VECINSERT,
@@ -1273,6 +1278,9 @@ namespace llvm {
12731278
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
12741279
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
12751280

1281+
bool convertToNonDenormSingle(APInt &ArgAPInt);
1282+
bool convertToNonDenormSingle(APFloat &ArgAPFloat);
1283+
12761284
} // end namespace llvm
12771285

12781286
#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H

llvm/lib/Target/PowerPC/PPCInstrInfo.td

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,10 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
5050
SDTCisVec<1>, SDTCisInt<2>
5151
]>;
5252

53+
def SDT_PPCSpToDp : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>,
54+
SDTCisInt<1>
55+
]>;
56+
5357
def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
5458
SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
5559
]>;
@@ -194,6 +198,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
194198

195199
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
196200
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
201+
def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>;
197202
def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
198203
def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
199204
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
@@ -326,6 +331,23 @@ def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>;
326331
// PowerPC specific transformation functions and pattern fragments.
327332
//
328333

334+
// A floating point immediate that is not a positive zero and can be converted
335+
// to a single precision floating point non-denormal immediate without loss of
336+
// information.
337+
def nzFPImmAsi32 : PatLeaf<(fpimm), [{
338+
// Test the conversion on a local copy of the node's value; the node itself is
// not modified.
APFloat APFloatOfN = N->getValueAPF();
339+
// Match only when the conversion succeeds and the immediate is not +0.0.
return convertToNonDenormSingle(APFloatOfN) && !N->isExactlyValue(+0.0);
340+
}]>;
341+
342+
// Convert the floating point immediate into a 32 bit floating point immediate
343+
// and get a i32 with the resulting bits.
344+
def getFPAs32BitInt : SDNodeXForm<fpimm, [{
345+
APFloat APFloatOfN = N->getValueAPF();
346+
// NOTE(review): the boolean result is discarded here. If the conversion
// fails, APFloatOfN keeps its original value and the bits emitted below are
// those of the unconverted immediate. Presumably this transform is only
// applied to operands already matched by nzFPImmAsi32 -- confirm at each use.
convertToNonDenormSingle(APFloatOfN);
347+
// Emit the bit pattern of APFloatOfN as an i32 target constant.
return CurDAG->getTargetConstant(APFloatOfN.bitcastToAPInt().getZExtValue(),
348+
SDLoc(N), MVT::i32);
349+
}]>;
350+
329351
def SHL32 : SDNodeXForm<imm, [{
330352
// Transformation function: 31 - imm
331353
return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
@@ -392,6 +414,7 @@ def immZExt16 : PatLeaf<(imm), [{
392414
def immNonAllOneAnyExt8 : ImmLeaf<i32, [{
393415
return (isInt<8>(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF));
394416
}]>;
417+
def i32immNonAllOneNonZero : ImmLeaf<i32, [{ return Imm && (Imm != -1); }]>;
395418
def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;
396419

397420
// imm16Shifted* - These match immediates where the low 16-bits are zero. There

llvm/lib/Target/PowerPC/PPCInstrPrefix.td

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -704,7 +704,8 @@ let Predicates = [PrefixInstrs] in {
704704
def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT),
705705
(ins i32imm:$IMM32),
706706
"xxspltidp $XT, $IMM32", IIC_VecGeneral,
707-
[]>;
707+
[(set v2f64:$XT,
708+
(PPCxxspltidp i32:$IMM32))]>;
708709
def XXSPLTI32DX :
709710
8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
710711
(ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
@@ -822,3 +823,17 @@ let Predicates = [IsISA3_1] in {
822823
def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
823824
(v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
824825
}
826+
827+
let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
828+
def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
829+
i32immNonAllOneNonZero:$A,
830+
i32immNonAllOneNonZero:$A,
831+
i32immNonAllOneNonZero:$A)),
832+
(v4i32 (XXSPLTIW imm:$A))>;
833+
def : Pat<(f32 nzFPImmAsi32:$A),
834+
(COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
835+
VSFRC)>;
836+
def : Pat<(f64 nzFPImmAsi32:$A),
837+
(COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
838+
VSFRC)>;
839+
}

0 commit comments

Comments
 (0)