Skip to content

Commit dc9f16d

Browse files
committed
[AMDGPU] Promote uniform ops to I32 in ISel
Promote uniform binops, selects and setcc in GlobalISel & DAGISel instead of CGP. Solves #64591
1 parent 1f8f2ed commit dc9f16d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+9343
-10651
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
32993299
/// Return true if it's profitable to narrow operations of type SrcVT to
33003300
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
33013301
/// i32 to i16.
3302-
virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
3302+
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
33033303
return false;
33043304
}
33053305

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
70317031
if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
70327032
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
70337033
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
7034-
TLI.isNarrowingProfitable(VT, SrcVT))
7034+
TLI.isNarrowingProfitable(N, VT, SrcVT))
70357035
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
70367036
DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
70377037
DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1457414574
// ShLeftAmt will indicate how much a narrowed load should be shifted left.
1457514575
unsigned ShLeftAmt = 0;
1457614576
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
14577-
ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
14577+
ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
1457814578
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
1457914579
ShLeftAmt = N01->getZExtValue();
1458014580
N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1511815118
}
1511915119

1512015120
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
15121-
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
15122-
if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
15123-
TLI.isTruncateFree(SrcVT, VT)) {
15121+
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
15122+
TLI.isTruncateFree(SrcVT, VT)) {
15123+
if (!LegalOperations ||
15124+
(TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
15125+
TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
1512415126
SDLoc SL(N0);
1512515127
SDValue Cond = N0.getOperand(0);
1512615128
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2006120063
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2006220064
// The narrowing should be profitable, the load/store operation should be
2006320065
// legal (or custom) and the store size should be equal to the NewVT width.
20064-
while (NewBW < BitWidth &&
20065-
(NewVT.getStoreSizeInBits() != NewBW ||
20066-
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
20067-
!TLI.isNarrowingProfitable(VT, NewVT))) {
20066+
while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
20067+
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
20068+
!TLI.isNarrowingProfitable(N, VT, NewVT))) {
2006820069
NewBW = NextPowerOf2(NewBW);
2006920070
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2007020071
}

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
18411841
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
18421842
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
18431843
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
1844-
if (isNarrowingProfitable(VT, SmallVT) &&
1844+
if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
18451845
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
18461846
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
18471847
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
18651865
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
18661866
DemandedBits.countLeadingOnes() >= HalfWidth) {
18671867
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
1868-
if (isNarrowingProfitable(VT, HalfVT) &&
1868+
if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
18691869
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
18701870
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
18711871
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
19841984
if ((BitWidth % 2) == 0 && !VT.isVector()) {
19851985
APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
19861986
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
1987-
if (isNarrowingProfitable(VT, HalfVT) &&
1987+
if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
19881988
isTypeDesirableForOp(ISD::SRL, HalfVT) &&
19891989
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
19901990
(!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
47624762
case ISD::SETULT:
47634763
case ISD::SETULE: {
47644764
EVT newVT = N0.getOperand(0).getValueType();
4765+
// FIXME: Should use isNarrowingProfitable.
47654766
if (DCI.isBeforeLegalizeOps() ||
47664767
(isOperationLegal(ISD::SETCC, newVT) &&
4767-
isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
4768+
isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
4769+
isTypeDesirableForOp(ISD::SETCC, newVT))) {
47684770
EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
47694771
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
47704772

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,14 +1022,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
10221022
return Src == MVT::i32 && Dest == MVT::i64;
10231023
}
10241024

1025-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
1025+
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1026+
EVT DestVT) const {
1027+
switch (N->getOpcode()) {
1028+
case ISD::ADD:
1029+
case ISD::SUB:
1030+
case ISD::SHL:
1031+
case ISD::SRL:
1032+
case ISD::SRA:
1033+
case ISD::AND:
1034+
case ISD::OR:
1035+
case ISD::XOR:
1036+
case ISD::MUL:
1037+
case ISD::SETCC:
1038+
case ISD::SELECT:
1039+
if (Subtarget->has16BitInsts() &&
1040+
(DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
1041+
// Don't narrow back down to i16 if promoted to i32 already.
1042+
if (!N->isDivergent() && DestVT.isInteger() &&
1043+
DestVT.getScalarSizeInBits() > 1 &&
1044+
DestVT.getScalarSizeInBits() <= 16 &&
1045+
SrcVT.getScalarSizeInBits() > 16) {
1046+
return false;
1047+
}
1048+
}
1049+
return true;
1050+
default:
1051+
break;
1052+
}
1053+
10261054
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
10271055
// limited number of native 64-bit operations. Shrinking an operation to fit
10281056
// in a single 32-bit register should always be helpful. As currently used,
10291057
// this is much less general than the name suggests, and is only used in
10301058
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
10311059
// not profitable, and may actually be harmful.
1032-
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1060+
if (isa<LoadSDNode>(N))
1061+
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1062+
1063+
return true;
10331064
}
10341065

10351066
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
201201
NegatibleCost &Cost,
202202
unsigned Depth) const override;
203203

204-
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
204+
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
205205

206206
bool isDesirableToCommuteWithShift(const SDNode *N,
207207
CombineLevel Level) const override;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 149 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
894894
ISD::UADDO_CARRY,
895895
ISD::SUB,
896896
ISD::USUBO_CARRY,
897+
ISD::MUL,
897898
ISD::FADD,
898899
ISD::FSUB,
899900
ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
909910
ISD::UMIN,
910911
ISD::UMAX,
911912
ISD::SETCC,
913+
ISD::SELECT,
914+
ISD::SMIN,
915+
ISD::SMAX,
916+
ISD::UMIN,
917+
ISD::UMAX,
912918
ISD::AND,
913919
ISD::OR,
914920
ISD::XOR,
921+
ISD::SHL,
922+
ISD::SRL,
923+
ISD::SRA,
915924
ISD::FSHR,
916925
ISD::SINT_TO_FP,
917926
ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
19351944
switch (Op) {
19361945
case ISD::LOAD:
19371946
case ISD::STORE:
1938-
1939-
// These operations are done with 32-bit instructions anyway.
1940-
case ISD::AND:
1941-
case ISD::OR:
1942-
case ISD::XOR:
1943-
case ISD::SELECT:
1944-
// TODO: Extensions?
19451947
return true;
19461948
default:
19471949
return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
67466748
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
67476749
}
67486750

6751+
static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
6752+
switch (Op->getOpcode()) {
6753+
case ISD::SRA:
6754+
case ISD::SMIN:
6755+
case ISD::SMAX:
6756+
return ISD::SIGN_EXTEND;
6757+
case ISD::ADD:
6758+
case ISD::SUB:
6759+
case ISD::SRL:
6760+
case ISD::UMIN:
6761+
case ISD::UMAX:
6762+
return ISD::ZERO_EXTEND;
6763+
case ISD::AND:
6764+
case ISD::OR:
6765+
case ISD::XOR:
6766+
case ISD::SHL:
6767+
case ISD::SELECT:
6768+
case ISD::MUL:
6769+
// operation result won't be influenced by garbage high bits.
6770+
// TODO: are all of those cases correct, and are there more?
6771+
return ISD::ANY_EXTEND;
6772+
case ISD::SETCC: {
6773+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6774+
return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6775+
}
6776+
default:
6777+
llvm_unreachable("unexpected opcode!");
6778+
}
6779+
}
6780+
6781+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6782+
DAGCombinerInfo &DCI) const {
6783+
const unsigned Opc = Op.getOpcode();
6784+
assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6785+
Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6786+
Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6787+
Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6788+
Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6789+
6790+
EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6791+
: Op->getOperand(0).getValueType();
6792+
6793+
if (DCI.isBeforeLegalizeOps())
6794+
return SDValue();
6795+
6796+
// Promote only if:
6797+
// - We have 16 bit insts (not true 16 bit insts).
6798+
// - We don't have packed instructions (for vector types only).
6799+
// TODO: For vector types, the set of packed operations is more limited, so
6800+
// may want to promote some anyway.
6801+
if (!Subtarget->has16BitInsts() ||
6802+
(OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
6803+
return SDValue();
6804+
6805+
// Promote uniform scalar and vector integers between 2 and 16 bits.
6806+
if (Op->isDivergent() || !OpTy.isInteger() ||
6807+
OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
6808+
return SDValue();
6809+
6810+
auto &DAG = DCI.DAG;
6811+
6812+
SDLoc DL(Op);
6813+
SDValue LHS;
6814+
SDValue RHS;
6815+
if (Opc == ISD::SELECT) {
6816+
LHS = Op->getOperand(1);
6817+
RHS = Op->getOperand(2);
6818+
} else {
6819+
LHS = Op->getOperand(0);
6820+
RHS = Op->getOperand(1);
6821+
}
6822+
6823+
auto ExtTy = OpTy.changeElementType(MVT::i32);
6824+
6825+
const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6826+
LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6827+
RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6828+
6829+
// setcc always return i1/i1 vec so no need to truncate after.
6830+
if (Opc == ISD::SETCC) {
6831+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6832+
return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6833+
}
6834+
6835+
SDNodeFlags Flags;
6836+
switch (Op->getOpcode()) {
6837+
case ISD::ADD:
6838+
case ISD::SHL:
6839+
Flags.setNoUnsignedWrap(true);
6840+
Flags.setNoSignedWrap(true);
6841+
break;
6842+
case ISD::SUB:
6843+
Flags.setNoUnsignedWrap(Op->getFlags().hasNoUnsignedWrap());
6844+
Flags.setNoSignedWrap(true);
6845+
break;
6846+
case ISD::MUL:
6847+
Flags.setNoUnsignedWrap(true);
6848+
Flags.setNoSignedWrap(Op->getFlags().hasNoUnsignedWrap());
6849+
break;
6850+
default:
6851+
break;
6852+
}
6853+
6854+
Flags.setExact(Op->getFlags().hasExact());
6855+
6856+
// For other ops, we extend the operation's return type as well so we need to
6857+
// truncate back to the original type.
6858+
SDValue NewVal;
6859+
if (Opc == ISD::SELECT)
6860+
NewVal = DAG.getSelect(DL, ExtTy, Op->getOperand(0), LHS, RHS);
6861+
else
6862+
NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}, Flags);
6863+
6864+
return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6865+
}
6866+
67496867
// Custom lowering for vector multiplications and s_mul_u64.
67506868
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
67516869
EVT VT = Op.getValueType();
@@ -14682,8 +14800,32 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
1468214800

1468314801
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1468414802
DAGCombinerInfo &DCI) const {
14803+
switch (N->getOpcode()) {
14804+
case ISD::ADD:
14805+
case ISD::SUB:
14806+
case ISD::SHL:
14807+
case ISD::SRL:
14808+
case ISD::SRA:
14809+
case ISD::AND:
14810+
case ISD::OR:
14811+
case ISD::XOR:
14812+
case ISD::MUL:
14813+
case ISD::SETCC:
14814+
case ISD::SELECT:
14815+
case ISD::SMIN:
14816+
case ISD::SMAX:
14817+
case ISD::UMIN:
14818+
case ISD::UMAX:
14819+
if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14820+
return Res;
14821+
break;
14822+
default:
14823+
break;
14824+
}
14825+
1468514826
if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
1468614827
return SDValue();
14828+
1468714829
switch (N->getOpcode()) {
1468814830
case ISD::ADD:
1468914831
return performAddCombine(N, DCI);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
148148
SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const;
149149
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
150150
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
151+
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
151152
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
152153
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
153154
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
@@ -464,7 +465,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
464465
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
465466
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
466467
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
467-
468468
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
469469
SelectionDAG &DAG) const override;
470470

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34445,7 +34445,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
3444534445
return false;
3444634446
}
3444734447

34448-
bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34448+
bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
34449+
EVT DestVT) const {
3444934450
// i16 instructions are longer (0x66 prefix) and potentially slower.
3445034451
return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
3445134452
}

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1429,7 +1429,7 @@ namespace llvm {
14291429
/// Return true if it's profitable to narrow operations of type SrcVT to
14301430
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
14311431
/// from i32 to i16.
1432-
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
1432+
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
14331433

14341434
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
14351435
EVT VT) const override;

0 commit comments

Comments (0)