[AMDGPU] Promote uniform ops to i32 in GISel #106557
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes: Please only review the last commit; see #106383 for the DAGISel changes.

GlobalISel counterpart of #106383. See #64591.

Patch is 1.14 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106557.diff

65 Files Affected:
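For readers skimming the diff, here is a small standalone C++ sketch (illustration only, not code from the patch; the values are made up) of the value-level identity the promotion relies on: a uniform 16-bit op is evaluated in 32 bits on extended operands and the result is truncated back, with the extension kind chosen per opcode as in the patch's getExtOpcodeForPromotedOp (sign-extend for arithmetic shifts and signed compares, zero-extend for add/sub and logical shift right, any-extend where the patch judges that garbage high bits cannot leak into the low 16 bits).

// Standalone illustration only; not part of the patch.
#include <cassert>
#include <cstdint>

int main() {
  int16_t a = -100, b = 7;

  // G_ADD: promote operands with zext, add in 32 bits, truncate the result.
  uint16_t Add16 = uint16_t(uint16_t(a) + uint16_t(b));
  uint16_t Add32 = uint16_t(uint32_t(uint16_t(a)) + uint32_t(uint16_t(b)));
  assert(Add16 == Add32);

  // G_ASHR: needs sext so the bits shifted in match the 16-bit result.
  uint16_t Shr16 = uint16_t(int16_t(a >> 2));
  uint16_t Shr32 = uint16_t(int32_t(a) >> 2);
  assert(Shr16 == Shr32);

  return 0;
}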
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..85310a4911b8ed 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
/// Return true if it's profitable to narrow operations of type SrcVT to
/// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
/// i32 to i16.
- virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+ virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..513ad392cb360a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
- TLI.isNarrowingProfitable(VT, SrcVT))
+ TLI.isNarrowingProfitable(N, VT, SrcVT))
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
// ShLeftAmt will indicate how much a narrowed load should be shifted left.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
- ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+ ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShLeftAmt = N01->getZExtValue();
N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
- if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
- if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
- TLI.isTruncateFree(SrcVT, VT)) {
+ if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+ TLI.isTruncateFree(SrcVT, VT)) {
+ if (!LegalOperations ||
+ (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+ TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
SDLoc SL(N0);
SDValue Cond = N0.getOperand(0);
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
- while (NewBW < BitWidth &&
- (NewVT.getStoreSizeInBits() != NewBW ||
- !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
- !TLI.isNarrowingProfitable(VT, NewVT))) {
+ while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !TLI.isNarrowingProfitable(N, VT, NewVT))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e796289cff0a1..97e10b3551db1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
- if (isNarrowingProfitable(VT, SmallVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
isTypeDesirableForOp(ISD::SHL, SmallVT) &&
isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
DemandedBits.countLeadingOnes() >= HalfWidth) {
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SHL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
if ((BitWidth % 2) == 0 && !VT.isVector()) {
APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
- if (isNarrowingProfitable(VT, HalfVT) &&
+ if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
isTypeDesirableForOp(ISD::SRL, HalfVT) &&
isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
(!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
case ISD::SETULT:
case ISD::SETULE: {
EVT newVT = N0.getOperand(0).getValueType();
+ // FIXME: Should use isNarrowingProfitable.
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
- isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+ isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+ isTypeDesirableForOp(ISD::SETCC, newVT))) {
EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..01e96159babd03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+ (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ICMP, G_SELECT],
+ (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d24836b7eeb095..1f3110c2294444 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1022,14 +1022,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+ EVT DestVT) const {
+ switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::MUL:
+ case ISD::SETCC:
+ case ISD::SELECT:
+ if (Subtarget->has16BitInsts() &&
+ (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+ // Don't narrow back down to i16 if promoted to i32 already.
+ if (!N->isDivergent() && DestVT.isInteger() &&
+ DestVT.getScalarSizeInBits() > 1 &&
+ DestVT.getScalarSizeInBits() <= 16 &&
+ SrcVT.getScalarSizeInBits() > 16) {
+ return false;
+ }
+ }
+ return true;
+ default:
+ break;
+ }
+
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
// in a single 32-bit register should always be helpful. As currently used,
// this is much less general than the name suggests, and is only used in
// places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
// not profitable, and may actually be harmful.
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+ if (isa<LoadSDNode>(N))
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+ return true;
}
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..4dfa7ac052a5ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
NegatibleCost &Cost,
unsigned Depth) const override;
- bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+ bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..3b4faa35b93738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ bool matchPromote16to32(MachineInstr &MI) const;
+ void applyPromote16to32(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
return false;
}
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ const auto *RB = MRI.getRegBankOrNull(Dst);
+
+ // Only promote uniform instructions.
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ return STI.has16BitInsts() &&
+ (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_ASHR:
+ return AMDGPU::G_SEXT;
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_FSHR:
+ return AMDGPU::G_ZEXT;
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return AMDGPU::G_ANYEXT;
+ case AMDGPU::G_ICMP: {
+ return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+ : AMDGPU::G_ZEXT;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+ Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+ Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+ Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+ Opc == AMDGPU::G_ICMP);
+
+ Register Dst = MI.getOperand(0).getReg();
+
+ bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+ Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+ Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+ assert(MRI.getType(Dst) == LLT::scalar(16));
+ assert(MRI.getType(LHS) == LLT::scalar(16));
+ assert(MRI.getType(RHS) == LLT::scalar(16));
+
+ assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+ assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+ const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+ LLT S32 = LLT::scalar(32);
+
+ B.setInstrAndDebugLoc(MI);
+ const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+ LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+ RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+ MRI.setRegBank(LHS, RB);
+ MRI.setRegBank(RHS, RB);
+
+ MachineInstr *NewInst;
+ if (IsSelectOrCmp)
+ NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+ else
+ NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+ if (Opc != AMDGPU::G_ICMP) {
+ Register Dst32 = NewInst->getOperand(0).getReg();
+ MRI.setRegBank(Dst32, RB);
+ B.buildTrunc(Dst, Dst32);
+ }
+
+ switch (Opc) {
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SHL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_SUB:
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ NewInst->setFlag(MachineInstr::NoSWrap);
+ break;
+ case AMDGPU::G_MUL:
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ NewInst->setFlag(MachineInstr::NoUWrap);
+ break;
+ }
+
+ MI.eraseFromParent();
+}
+
void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
Register &Reg) const {
B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..96a59acd751a62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
+ ISD::MUL,
ISD::FADD,
ISD::FSUB,
ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::UMIN,
ISD::UMAX,
ISD::SETCC,
+ ISD::SELECT,
+ ISD::SMIN,
+ ISD::SMAX,
+ ISD::UMIN,
+ ISD::UMAX,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::SHL,
+ ISD::SRL,
+ ISD::SRA,
ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
switch (Op) {
case ISD::LOAD:
case ISD::STORE:
-
- // These operations are done with 32-bit instructions anyway.
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case ISD::SELECT:
- // TODO: Extensions?
return true;
default:
return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
}
+static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
+ switch (Op->getOpcode()) {
+ case ISD::SRA:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ return ISD::SIGN_EXTEND;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::SRL:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ return ISD::ZERO_EXTEND;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SELECT:
+ case ISD::MUL:
+ // operation result won't be influenced by garbage high bits.
+ // TODO: are all of those cases correct, and are there more?
+ return ISD::ANY_EXTEND;
+ case ISD::SETCC: {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
+ Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
+ Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
+ Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
+ Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ if (!Subtarget->has16BitInsts() ||
+ (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
+ return SDValue();
+
+ // Promote uniform scalar and vector integers between 2 and 16 bits.
+ if (Op->isDivergent() || !OpTy.isInteger() ||
+ OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
+ return SDValue();
+
+ auto &DAG = DCI.DAG;
+
+ SDLoc DL(Op);
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ }
+
+ auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+ const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+ LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+ RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+ // setcc always return i1/i1 vec so no need to truncate after.
+ if (Opc == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+ }
+
+ SDNodeFlags Flags;
+ switch (Op->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SHL:
+ Flags.setNoUnsignedWrap(true);
+ Flags.setNoSignedWrap(true);
+ break;
+ case ISD::SUB:
+ Flags.setNoUnsignedWrap(Op->getFlags().hasNoUnsignedWrap());
+ Flags.setNoSignedWrap(true);
+ break;
+ case ISD::MUL:
+ Flags.setNoUns...
[truncated]
    return false;

  // Promote only if:
  // - We have 16 bit insts (not true 16 bit insts).
I don't understand the rationale here. Why would you only promote uniform 16-bit ops if you do have 16-bit VALU instructions? (I guess if you do not have 16-bit VALU instructions then these 16-bit ops would already have been legalized to 32-bit. But that does not really explain why it makes sense to check has16BitInsts again here.)
I borrowed this from AMDGPUCodeGenPrepare, @arsenm wrote it I think?
I also don't think all of the operations are legalized to 32 bits, at least not in the DAG part. I need to look a bit longer at the GlobalISel case (my focus has been on the DAG and I want to land that first).
> I guess if you do not have 16-bit VALU instructions then these 16-bit ops would already have been legalized to 32-bit
Yes, but that should already be obvious from what types are legal at this point
  case AMDGPU::G_SELECT:
  case AMDGPU::G_MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
ADD and SUB could also use ANYEXT if you give up on trying to set nsw/nuw on the promoted instructions.
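To make that suggestion concrete, here is a small standalone check (illustration only, not patch code): the low 16 bits of a 32-bit add or sub do not depend on the operands' high bits, so any-extension preserves the truncated value and only the ability to assert nuw/nsw on the wider op is lost.

#include <cassert>
#include <cstdint>

int main() {
  uint16_t a = 0xBEEF, b = 0x1234;
  // Stand-ins for the unknown high bits a G_ANYEXT may produce.
  uint32_t AnyA = 0xABCD0000u | a;
  uint32_t AnyB = 0x00990000u | b;
  assert(uint16_t(AnyA + AnyB) == uint16_t(a + b));
  assert(uint16_t(AnyA - AnyB) == uint16_t(a - b));
  return 0;
}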
    return AMDGPU::G_SEXT;
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_FSHR:
Typo for LSHR?
  switch (Opc) {
  case AMDGPU::G_ADD:
  case AMDGPU::G_SHL:
I don't think you can set these flags for SHL since you're using ANYEXT. Same for MUL below.
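A quick numeric illustration of that point (again standalone, not patch code): with any-extended operands the widened shift can overflow i32 even when the original i16 shift did not, so unconditionally setting nuw/nsw on the promoted instruction would be unsound.

#include <cassert>
#include <cstdint>

int main() {
  uint16_t a = 0x0001;              // i16: 1 << 4 == 16, no wrap at all
  uint32_t AnyA = 0xF0000000u | a;  // one possible any-extension of 'a'
  uint32_t Wide = AnyA << 4;        // 0x00000010: the high bits were shifted out
  // The i32 result is smaller than the mathematical value, i.e. the widened
  // shift wrapped, so nuw/nsw on the promoted shift would be a false promise.
  assert(uint64_t(Wide) != uint64_t(AnyA) * 16);
  return 0;
}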
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR:
  case AMDGPU::G_SHL:
Strictly speaking for all shifts the RHS should use ZEXT.
They should be able to use G_ANYEXT, only the bottom 5 bits are read (but I suppose the generic shifts don't know that)
      (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
    return SDValue();

  // Promote uniform scalar and vector integers between 2 and 16 bits.
With true16, I think this should promote uniform scalars and vectors, but not divergent scalars and vectors. Because we don't have true16 support for 16 bit SGPRs, but do for VGPRs. You can use Subtarget->useRealTrue16Insts()
I don't think there is enough test coverage to show the difference on true16 targets right now, but there should be within a few weeks.
Same logic in DAG isel
> With true16, I think this should promote uniform scalars and vectors, but not divergent scalars and vectors.
I don't understand, this only promotes uniform operations anyway (see SGPR check above)
The code looks fine then. My confusion was from this comment:
> - We have 16 bit insts (not true 16 bit insts)
I read it as "We don't want to promote if we have true16 bit insts on this subtarget." That would be incorrect imo.
Force-pushed from 14e3a98 to bf5070f ("GlobalISel counterpart of llvm#106383. See llvm#64591").
I updated the diff; I need some help with the vector case because it can generate illegal ops.
Should add a MIR test that ensures this doesn't crash on a preset register class.
The codegen changes look worse to me, and I don't see any improvements. I think the issue is this is trying to promote after regbankselect, when most of the legalization up to scalar operations already occurred
  // Only promote uniform instructions.
  if (RB->getID() != AMDGPU::SGPRRegBankID)
    return false;
Check this first, up when RB is assigned. Also this is a case where really what we care about is scalar, not uniformity
bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  const auto *RB = MRI.getRegBankOrNull(Dst);
Ideally would use RBI.getRegBank to handle the case where the register has a concrete register class. Failing that, this needs to return false on null
  // TODO: For vector types, the set of packed operations is more limited, so
  // may want to promote some anyway.
  assert(STI.has16BitInsts());
  return (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
This is an and and doesn't need parentheses. Shouldn't need the assert, it's implied by legality
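Folding the last few suggestions together, the matcher could end up looking roughly like the sketch below. This is an interpretation of the review comments, not the committed code; it assumes RBI, MRI and TRI are reachable from the Impl class, as other code in this combiner appears to assume.

bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();

  // Resolve the bank first, handling registers that already carry a concrete
  // register class, and bail out if no bank can be determined.
  const RegisterBank *RB = RBI.getRegBank(Dst, MRI, TRI);
  if (!RB || RB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  // has16BitInsts() is implied by s16 being a legal type at this point.
  LLT DstTy = MRI.getType(Dst);
  return !DstTy.isVector() || !STI.hasVOP3PInsts();
}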
; GFX9-LABEL: s_orn2_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s0, s3, -1
; GFX9-NEXT:    s_or_b32 s0, s2, s0
; GFX9-NEXT:    ; return to shader part epilog
This is worse, the pattern is broken
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s1, s4, -1
; GFX9-NEXT:    s_or_b32 s0, s2, s1
; GFX9-NEXT:    s_or_b32 s1, s3, s1
; GFX9-NEXT:    ; return to shader part epilog
This is worse
; GFX8-NEXT:    s_xor_b32 s3, s2, -1
; GFX8-NEXT:    s_and_b32 s3, s3, 7
; GFX8-NEXT:    s_and_b32 s2, s2, 7
This is worse
Should we perhaps leave the CGP expansion in for the time being, and just disable it for DAGISel?
We will keep the CGP expansion for GlobalISel; the right solution needs a bit more experimentation and probably some infrastructure improvements.