[AMDGPU] Promote uniform ops to I32 in DAGISel #106383
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag
Author: Pierre van Houtryve (Pierre-vh)
Changes: See #106382 for NFC test updates. Promote uniform binops, selects and setcc in Global & DAGISel instead of CGP. Solves #64591.
Patch is 1.35 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106383.diff 88 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..85310a4911b8ed 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
/// Return true if it's profitable to narrow operations of type SrcVT to
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
/// i32 to i16.
- virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+ virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..513ad392cb360a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
- TLI.isNarrowingProfitable(VT, SrcVT))
+ TLI.isNarrowingProfitable(N, VT, SrcVT))
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
// ShLeftAmt will indicate how much a narrowed load should be shifted left.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
- ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+ ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShLeftAmt = N01->getZExtValue();
N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
- if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
- if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
- TLI.isTruncateFree(SrcVT, VT)) {
+ if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+ TLI.isTruncateFree(SrcVT, VT)) {
+ if (!LegalOperations ||
+ (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+ TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
SDLoc SL(N0);
SDValue Cond = N0.getOperand(0);
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
- while (NewBW < BitWidth &&
- (NewVT.getStoreSizeInBits() != NewBW ||
- !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
- !TLI.isNarrowingProfitable(VT, NewVT))) {
+ while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !TLI.isNarrowingProfitable(N, VT, NewVT))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e796289cff0a1..97e10b3551db1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
- if (isNarrowingProfitable(VT, SmallVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
DemandedBits.countLeadingOnes() >= HalfWidth) {
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector()) {
APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SRL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
case ISD::SETULT:
case ISD::SETULE: {
EVT newVT = N0.getOperand(0).getValueType();
+ // FIXME: Should use isNarrowingProfitable.
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
- isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+ isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+ isTypeDesirableForOp(ISD::SETCC, newVT))) {
EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..f689fcf62fe8eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
cl::init(false));
static cl::opt<bool> Widen16BitOps(
- "amdgpu-codegenprepare-widen-16-bit-ops",
- cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
- cl::ReallyHidden,
- cl::init(true));
+ "amdgpu-codegenprepare-widen-16-bit-ops",
+ cl::desc(
+ "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(false));
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..01e96159babd03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+ (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ICMP, G_SELECT],
+ (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96143d688801aa..1a596cc80c0c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+ EVT DestVT) const {
+ switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::MUL:
+ case ISD::SETCC:
+ case ISD::SELECT:
+ if (Subtarget->has16BitInsts() &&
+ (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+ // Don't narrow back down to i16 if promoted to i32 already.
+ if (!N->isDivergent() && DestVT.isInteger() &&
+ DestVT.getScalarSizeInBits() > 1 &&
+ DestVT.getScalarSizeInBits() <= 16 &&
+ SrcVT.getScalarSizeInBits() > 16) {
+ return false;
+ }
+ }
+ return true;
+ default:
+ break;
+ }
+
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
// in a single 32-bit register should always be helpful. As currently used,
// this is much less general than the name suggests, and is only used in
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
// not profitable, and may actually be harmful.
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+ if (isa<LoadSDNode>(N))
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+ return true;
}
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..4dfa7ac052a5ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
NegatibleCost &Cost,
unsigned Depth) const override;
- bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+ bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..3b4faa35b93738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ bool matchPromote16to32(MachineInstr &MI) const;
+ void applyPromote16to32(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
return false;
}
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ const auto *RB = MRI.getRegBankOrNull(Dst);
+
+ // Only promote uniform instructions.
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ return STI.has16BitInsts() &&
+ (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_ASHR:
+ return AMDGPU::G_SEXT;
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_FSHR:
+ return AMDGPU::G_ZEXT;
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return AMDGPU::G_ANYEXT;
+ case AMDGPU::G_ICMP: {
+ return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+ : AMDGPU::G_ZEXT;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+ Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+ Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+ Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+ Opc == AMDGPU::G_ICMP);
+
+ Register Dst = MI.getOperand(0).getReg();
+
+ bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+ Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+ Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+ assert(MRI.getType(Dst) == LLT::scalar(16));
+ assert(MRI.getType(LHS) == LLT::scalar(16));
+ assert(MRI.getType(RHS) == LLT::scalar(16));
+
+ assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+ const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+ LLT S32 = LLT::scalar(32);
+
+ B.setInstrAndDebugLoc(MI);
+ const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+ LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+ RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+ MRI.setRegBank(LHS, RB);
+ MRI.setRegBank(RHS, RB);
+
+ MachineInstr *NewInst;
+ if (IsSelectOrCmp)
+ NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+ else
+ NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+ if (Opc != AMDGPU::G_ICMP) {
+ Register Dst32 = NewInst->getOperand(0).getReg();
+ MRI.setRegBank(Dst32, RB);
+ B.buildTrunc(Dst, Dst32);
+ }
+
+ switch (Opc) {
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SHL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_SUB:
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_MUL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ break;
+ }
+
+ MI.eraseFromParent();
+}
+
void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
Register &Reg) const {
B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..96a59acd751a62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
+ ISD::MUL,
ISD::FADD,
ISD::FSUB,
ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UMIN,
ISD::UMAX,
ISD::SETCC,
+ ISD::SELECT,
+ ISD::SMIN,
+ ISD::SMAX,
+ ISD::UMIN,
+ ISD::UMAX,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::SHL,
+ ISD::SRL,
+ ISD::SRA,
ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
switch (Op) {
case ISD::LOAD:
case ISD::STORE:
-
- // These operations are done with 32-bit instructions anyway.
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case ISD::SELECT:
- // TODO: Extensions?
return true;
default:
return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}
+static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
+ switch (Op->getOpcode()) {
+ case ISD::SRA:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ return ISD::SIGN_EXTEND;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SRL:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ return ISD::ZERO_EXTEND;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SELECT:
+ case ISD::MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return ISD::ANY_EXTEND;
+ case ISD::SETCC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
+ Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
+ Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
+ Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
+ Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ if (!Subtarget->has16BitInsts() ||
+ (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
+ return SDValue();
+
+ // Promote uniform scalar and vector integers between 2 and 16 bits.
+ if (Op->isDivergent() || !OpTy.isInteger() ||
+ OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
+ return SDValue();
+
+ auto &DAG = DCI.DAG;
+
+ SDLoc DL(Op);
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0)...
[truncated]
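For readers skimming the truncated diff above, the core of the change is the extension choice made when widening a uniform 16-bit operation to 32 bits (see getExtOpcodeForPromotedOp in the patch): sign-extend for arithmetic shift right and signed min/max, zero-extend for add/sub, logical shift right and unsigned min/max, and any-extend where garbage high bits cannot affect the low 16 bits of the result. The following standalone sketch only models that reasoning on plain integers; it is not the LLVM code, and the helper names (add16_via_i32, ashr16_via_i32) are made up for illustration.

```cpp
// Standalone model of promoting 16-bit ops to 32 bits and truncating back.
// Illustrative only; mirrors the idea behind getExtOpcodeForPromotedOp.
#include <cassert>
#include <cstdint>

// Add: the low 16 bits of a 32-bit add depend only on the low 16 bits of the
// operands, so the extension kind (here zero-extension) does not change the
// truncated result compared to a native 16-bit add.
static uint16_t add16_via_i32(uint16_t a, uint16_t b) {
  uint32_t wa = a, wb = b;               // extend to 32 bits
  return static_cast<uint16_t>(wa + wb); // operate in i32, truncate back
}

// Arithmetic shift right: this one needs sign-extension, because the bits
// shifted into the top of the 16-bit result come from the sign bit.
static int16_t ashr16_via_i32(int16_t a, unsigned amt) {
  int32_t wa = a;                                // sign-extend to 32 bits
  return static_cast<int16_t>(wa >> (amt & 15)); // operate in i32, truncate
}

int main() {
  assert(add16_via_i32(0xFFFF, 1) == static_cast<uint16_t>(0xFFFFu + 1u));
  assert(ashr16_via_i32(static_cast<int16_t>(-32768), 3) ==
         static_cast<int16_t>(static_cast<int16_t>(-32768) >> 3));
  return 0;
}
```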
@llvm/pr-subscribers-llvm-globalisel
Author: Pierre van Houtryve (Pierre-vh)
Changes: See #106382 for NFC test updates. Promote uniform binops, selects and setcc in Global & DAGISel instead of CGP. Solves #64591. (The bot posted the same truncated diff as above.)
Force-pushed from 3f23227 to dc9f16d.
Please only review the last commit; see llvm#106383 for the DAGISel changes.
GlobalISel counterpart of llvm#106383. See llvm#64591.
Some of my comments on #106557 also apply here.
Promote uniform binops, selects and setcc in Global & DAGISel instead of CGP. Solves llvm#64591
Force-pushed from 6aa6e20 to f9feb66.
LLVM Buildbot has detected new failures on several builders. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/140/builds/6913
https://lab.llvm.org/buildbot/#/builders/123/builds/5908
https://lab.llvm.org/buildbot/#/builders/174/builds/5380
https://lab.llvm.org/buildbot/#/builders/175/builds/5468
https://lab.llvm.org/buildbot/#/builders/185/builds/5450
https://lab.llvm.org/buildbot/#/builders/137/builds/5507
https://lab.llvm.org/buildbot/#/builders/73/builds/5762
Fix failure caused by #106383
LLVM Buildbot has detected new failures on several builders. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/153/builds/9376
https://lab.llvm.org/buildbot/#/builders/95/builds/4003
https://lab.llvm.org/buildbot/#/builders/108/builds/3894
https://lab.llvm.org/buildbot/#/builders/168/builds/3495
https://lab.llvm.org/buildbot/#/builders/187/builds/1287
https://lab.llvm.org/buildbot/#/builders/33/builds/3284
https://lab.llvm.org/buildbot/#/builders/4/builds/2253
https://lab.llvm.org/buildbot/#/builders/25/builds/2594
https://lab.llvm.org/buildbot/#/builders/56/builds/7778
https://lab.llvm.org/buildbot/#/builders/16/builds/5621
Promote uniform binops, selects and setcc between 2 and 16 bits to 32 bits in DAGISel. Solves llvm#64591.
Fix failure caused by llvm#106383
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/8010
@@ -18,189 +18,33 @@ declare hidden half @_Z4pownDhi(half, i32)
; --------------------------------------------------------------------

define half @test_pow_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_pow_fast_f16:
; CHECK: ; %bb.0:
This file lost all the test checks
I fixed this in 528bcf3
GlobalISel counterpart of llvm#106383. See llvm#64591.
if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL ||
    Op.getOpcode() == ISD::SRA)
Suggested change:
-  if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL ||
-      Op.getOpcode() == ISD::SRA)
+  if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL ||
+      Op.getOpcode() == ISD::SRA)
@Pierre-vh ping - this looks like it was a simple typo?
Oops sorry, I'll fix it right now
Promote uniform binops, selects and setcc between 2 and 16 bits to 32 bits in DAGISel
Solves #64591
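The commit above also covers setcc, and the same extension reasoning applies to the promoted compares: getExtOpcodeForPromotedOp sign-extends the operands of signed conditions and zero-extends unsigned ones. A minimal standalone illustration of why that choice matters (again not the LLVM code; the helper names are hypothetical):

```cpp
// Standalone model of widening 16-bit compares to 32 bits.
#include <cassert>
#include <cstdint>

// Signed "less than": operands must be sign-extended before comparing in i32.
static bool slt16_via_i32(int16_t a, int16_t b) {
  int32_t wa = a, wb = b; // sign-extend
  return wa < wb;
}

// Unsigned "less than": operands must be zero-extended.
static bool ult16_via_i32(uint16_t a, uint16_t b) {
  uint32_t wa = a, wb = b; // zero-extend
  return wa < wb;
}

int main() {
  assert(slt16_via_i32(-1, 0));      // -1 < 0 when treated as signed i16
  assert(!ult16_via_i32(0xFFFF, 0)); // 0xFFFF is the largest unsigned i16
  assert(ult16_via_i32(0, 0xFFFF));
  return 0;
}
```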