Skip to content

Commit 04fad5d

Browse files
committed
[AMDGPU] Promote uniform ops to I32 in ISel
Promote uniform binops, selects and setcc in GlobalISel & DAGISel instead of CGP. Fixes #64591
1 parent aa21ce4 commit 04fad5d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

59 files changed

+9360
-10766
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3306,7 +3306,7 @@ class TargetLoweringBase {
33063306
/// Return true if it's profitable to narrow operations of type SrcVT to
33073307
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
33083308
/// i32 to i16.
3309-
virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
3309+
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
33103310
return false;
33113311
}
33123312

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7050,7 +7050,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
70507050
if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
70517051
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
70527052
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
7053-
TLI.isNarrowingProfitable(VT, SrcVT))
7053+
TLI.isNarrowingProfitable(N, VT, SrcVT))
70547054
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
70557055
DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
70567056
DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14622,7 +14622,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
1462214622
// ShLeftAmt will indicate how much a narrowed load should be shifted left.
1462314623
unsigned ShLeftAmt = 0;
1462414624
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
14625-
ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
14625+
ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
1462614626
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
1462714627
ShLeftAmt = N01->getZExtValue();
1462814628
N0 = N0.getOperand(0);
@@ -15166,9 +15166,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1516615166
}
1516715167

1516815168
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
15169-
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
15170-
if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
15171-
TLI.isTruncateFree(SrcVT, VT)) {
15169+
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
15170+
TLI.isTruncateFree(SrcVT, VT)) {
15171+
if (!LegalOperations ||
15172+
(TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
15173+
TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
1517215174
SDLoc SL(N0);
1517315175
SDValue Cond = N0.getOperand(0);
1517415176
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20109,10 +20111,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2010920111
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2011020112
// The narrowing should be profitable, the load/store operation should be
2011120113
// legal (or custom) and the store size should be equal to the NewVT width.
20112-
while (NewBW < BitWidth &&
20113-
(NewVT.getStoreSizeInBits() != NewBW ||
20114-
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
20115-
!TLI.isNarrowingProfitable(VT, NewVT))) {
20114+
while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
20115+
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
20116+
!TLI.isNarrowingProfitable(N, VT, NewVT))) {
2011620117
NewBW = NextPowerOf2(NewBW);
2011720118
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2011820119
}

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
18411841
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
18421842
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
18431843
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
1844-
if (isNarrowingProfitable(VT, SmallVT) &&
1844+
if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
18451845
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
18461846
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
18471847
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
18651865
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
18661866
DemandedBits.countLeadingOnes() >= HalfWidth) {
18671867
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
1868-
if (isNarrowingProfitable(VT, HalfVT) &&
1868+
if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
18691869
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
18701870
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
18711871
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
19841984
if ((BitWidth % 2) == 0 && !VT.isVector()) {
19851985
APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
19861986
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
1987-
if (isNarrowingProfitable(VT, HalfVT) &&
1987+
if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
19881988
isTypeDesirableForOp(ISD::SRL, HalfVT) &&
19891989
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
19901990
(!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
47624762
case ISD::SETULT:
47634763
case ISD::SETULE: {
47644764
EVT newVT = N0.getOperand(0).getValueType();
4765+
// FIXME: Should use isNarrowingProfitable.
47654766
if (DCI.isBeforeLegalizeOps() ||
47664767
(isOperationLegal(ISD::SETCC, newVT) &&
4767-
isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
4768+
isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
4769+
isTypeDesirableForOp(ISD::SETCC, newVT))) {
47684770
EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
47694771
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
47704772

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,14 +1022,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
10221022
return Src == MVT::i32 && Dest == MVT::i64;
10231023
}
10241024

1025-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
1025+
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1026+
EVT DestVT) const {
1027+
switch (N->getOpcode()) {
1028+
case ISD::ADD:
1029+
case ISD::SUB:
1030+
case ISD::SHL:
1031+
case ISD::SRL:
1032+
case ISD::SRA:
1033+
case ISD::AND:
1034+
case ISD::OR:
1035+
case ISD::XOR:
1036+
case ISD::MUL:
1037+
case ISD::SETCC:
1038+
case ISD::SELECT:
1039+
if (Subtarget->has16BitInsts() &&
1040+
(DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
1041+
// Don't narrow back down to i16 if promoted to i32 already.
1042+
if (!N->isDivergent() && DestVT.isInteger() &&
1043+
DestVT.getScalarSizeInBits() > 1 &&
1044+
DestVT.getScalarSizeInBits() <= 16 &&
1045+
SrcVT.getScalarSizeInBits() > 16) {
1046+
return false;
1047+
}
1048+
}
1049+
return true;
1050+
default:
1051+
break;
1052+
}
1053+
10261054
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
10271055
// limited number of native 64-bit operations. Shrinking an operation to fit
10281056
// in a single 32-bit register should always be helpful. As currently used,
10291057
// this is much less general than the name suggests, and is only used in
10301058
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
10311059
// not profitable, and may actually be harmful.
1032-
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1060+
if (isa<LoadSDNode>(N))
1061+
return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1062+
1063+
return true;
10331064
}
10341065

10351066
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
201201
NegatibleCost &Cost,
202202
unsigned Depth) const override;
203203

204-
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
204+
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
205205

206206
bool isDesirableToCommuteWithShift(const SDNode *N,
207207
CombineLevel Level) const override;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 149 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
894894
ISD::UADDO_CARRY,
895895
ISD::SUB,
896896
ISD::USUBO_CARRY,
897+
ISD::MUL,
897898
ISD::FADD,
898899
ISD::FSUB,
899900
ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
909910
ISD::UMIN,
910911
ISD::UMAX,
911912
ISD::SETCC,
913+
ISD::SELECT,
914+
ISD::SMIN,
915+
ISD::SMAX,
916+
ISD::UMIN,
917+
ISD::UMAX,
912918
ISD::AND,
913919
ISD::OR,
914920
ISD::XOR,
921+
ISD::SHL,
922+
ISD::SRL,
923+
ISD::SRA,
915924
ISD::FSHR,
916925
ISD::SINT_TO_FP,
917926
ISD::UINT_TO_FP,
@@ -1948,13 +1957,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
19481957
switch (Op) {
19491958
case ISD::LOAD:
19501959
case ISD::STORE:
1951-
1952-
// These operations are done with 32-bit instructions anyway.
1953-
case ISD::AND:
1954-
case ISD::OR:
1955-
case ISD::XOR:
1956-
case ISD::SELECT:
1957-
// TODO: Extensions?
19581960
return true;
19591961
default:
19601962
return false;
@@ -6733,6 +6735,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
67336735
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
67346736
}
67356737

6738+
static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
6739+
switch (Op->getOpcode()) {
6740+
case ISD::SRA:
6741+
case ISD::SMIN:
6742+
case ISD::SMAX:
6743+
return ISD::SIGN_EXTEND;
6744+
case ISD::ADD:
6745+
case ISD::SUB:
6746+
case ISD::SRL:
6747+
case ISD::UMIN:
6748+
case ISD::UMAX:
6749+
return ISD::ZERO_EXTEND;
6750+
case ISD::AND:
6751+
case ISD::OR:
6752+
case ISD::XOR:
6753+
case ISD::SHL:
6754+
case ISD::SELECT:
6755+
case ISD::MUL:
6756+
// operation result won't be influenced by garbage high bits.
6757+
// TODO: are all of those cases correct, and are there more?
6758+
return ISD::ANY_EXTEND;
6759+
case ISD::SETCC: {
6760+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6761+
return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6762+
}
6763+
default:
6764+
llvm_unreachable("unexpected opcode!");
6765+
}
6766+
}
6767+
6768+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6769+
DAGCombinerInfo &DCI) const {
6770+
const unsigned Opc = Op.getOpcode();
6771+
assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6772+
Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6773+
Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6774+
Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6775+
Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6776+
6777+
EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6778+
: Op->getOperand(0).getValueType();
6779+
6780+
if (DCI.isBeforeLegalizeOps())
6781+
return SDValue();
6782+
6783+
// Promote only if:
6784+
// - We have 16 bit insts (not true 16 bit insts).
6785+
// - We don't have packed instructions (for vector types only).
6786+
// TODO: For vector types, the set of packed operations is more limited, so
6787+
// may want to promote some anyway.
6788+
if (!Subtarget->has16BitInsts() ||
6789+
(OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
6790+
return SDValue();
6791+
6792+
// Promote uniform scalar and vector integers between 2 and 16 bits.
6793+
if (Op->isDivergent() || !OpTy.isInteger() ||
6794+
OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
6795+
return SDValue();
6796+
6797+
auto &DAG = DCI.DAG;
6798+
6799+
SDLoc DL(Op);
6800+
SDValue LHS;
6801+
SDValue RHS;
6802+
if (Opc == ISD::SELECT) {
6803+
LHS = Op->getOperand(1);
6804+
RHS = Op->getOperand(2);
6805+
} else {
6806+
LHS = Op->getOperand(0);
6807+
RHS = Op->getOperand(1);
6808+
}
6809+
6810+
auto ExtTy = OpTy.changeElementType(MVT::i32);
6811+
6812+
const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6813+
LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6814+
RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6815+
6816+
// setcc always return i1/i1 vec so no need to truncate after.
6817+
if (Opc == ISD::SETCC) {
6818+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6819+
return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6820+
}
6821+
6822+
SDNodeFlags Flags;
6823+
switch (Op->getOpcode()) {
6824+
case ISD::ADD:
6825+
case ISD::SHL:
6826+
Flags.setNoUnsignedWrap(true);
6827+
Flags.setNoSignedWrap(true);
6828+
break;
6829+
case ISD::SUB:
6830+
Flags.setNoUnsignedWrap(Op->getFlags().hasNoUnsignedWrap());
6831+
Flags.setNoSignedWrap(true);
6832+
break;
6833+
case ISD::MUL:
6834+
Flags.setNoUnsignedWrap(true);
6835+
Flags.setNoSignedWrap(Op->getFlags().hasNoUnsignedWrap());
6836+
break;
6837+
default:
6838+
break;
6839+
}
6840+
6841+
Flags.setExact(Op->getFlags().hasExact());
6842+
6843+
// For other ops, we extend the operation's return type as well so we need to
6844+
// truncate back to the original type.
6845+
SDValue NewVal;
6846+
if (Opc == ISD::SELECT)
6847+
NewVal = DAG.getSelect(DL, ExtTy, Op->getOperand(0), LHS, RHS);
6848+
else
6849+
NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}, Flags);
6850+
6851+
return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6852+
}
6853+
67366854
// Custom lowering for vector multiplications and s_mul_u64.
67376855
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
67386856
EVT VT = Op.getValueType();
@@ -14687,8 +14805,32 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
1468714805

1468814806
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1468914807
DAGCombinerInfo &DCI) const {
14808+
switch (N->getOpcode()) {
14809+
case ISD::ADD:
14810+
case ISD::SUB:
14811+
case ISD::SHL:
14812+
case ISD::SRL:
14813+
case ISD::SRA:
14814+
case ISD::AND:
14815+
case ISD::OR:
14816+
case ISD::XOR:
14817+
case ISD::MUL:
14818+
case ISD::SETCC:
14819+
case ISD::SELECT:
14820+
case ISD::SMIN:
14821+
case ISD::SMAX:
14822+
case ISD::UMIN:
14823+
case ISD::UMAX:
14824+
if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14825+
return Res;
14826+
break;
14827+
default:
14828+
break;
14829+
}
14830+
1469014831
if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
1469114832
return SDValue();
14833+
1469214834
switch (N->getOpcode()) {
1469314835
case ISD::ADD:
1469414836
return performAddCombine(N, DCI);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
147147
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
148148
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
149149
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
150+
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
150151
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
151152
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
152153
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
@@ -463,7 +464,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
463464
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
464465
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
465466
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
466-
467467
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
468468
SelectionDAG &DAG) const override;
469469

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34533,7 +34533,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
3453334533
return false;
3453434534
}
3453534535

34536-
bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34536+
bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
34537+
EVT DestVT) const {
3453734538
// i16 instructions are longer (0x66 prefix) and potentially slower.
3453834539
return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
3453934540
}

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1429,7 +1429,7 @@ namespace llvm {
14291429
/// Return true if it's profitable to narrow operations of type SrcVT to
14301430
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
14311431
/// from i32 to i16.
1432-
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
1432+
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
14331433

14341434
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
14351435
EVT VT) const override;

0 commit comments

Comments
 (0)