Commit ed55fef
[AMDGPU] Promote uniform ops to I32 in ISel
Promote uniform binops, selects, and setcc in GlobalISel and DAGISel instead of in AMDGPUCodeGenPrepare (CGP). Solves #64591
1 parent 9d14c13 commit ed55fef
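For context, the net effect on a uniform 16-bit operation is roughly the sketch below: the operands are extended to i32, the arithmetic happens at 32 bits, and the result is truncated back to i16. This is an illustrative SelectionDAG-style sketch only; the helper name is hypothetical, and the commit performs the promotion inside the selectors (RegBank combine rules and the isNarrowingProfitable hook), not through a helper like this.

// Hypothetical illustration of the i16 -> i32 promotion shape for a uniform
// add; not code from this commit.
static SDValue promoteUniformI16Add(SelectionDAG &DAG, const SDLoc &DL,
                                    SDValue LHS, SDValue RHS) {
  // Zero-extend the 16-bit operands to 32 bits (the GlobalISel apply below
  // also uses G_ZEXT for G_ADD).
  SDValue L32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, LHS);
  SDValue R32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RHS);
  // Do the arithmetic at 32 bits, where uniform values live on the scalar ALU.
  SDValue Add32 = DAG.getNode(ISD::ADD, DL, MVT::i32, L32, R32);
  // Truncate back to the original 16-bit type for the existing users.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Add32);
}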

86 files changed (+10,437 / -11,559 lines)


llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 1 addition & 1 deletion
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
   /// Return true if it's profitable to narrow operations of type SrcVT to
   /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
   /// i32 to i16.
-  virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
     return false;
   }

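The new SDNode * parameter lets a target base the narrowing decision on the specific node (opcode, divergence) rather than on the types alone. As a sketch of how an override can use it (the target class and the policy shown are hypothetical, not part of this commit):

// Hypothetical override of the updated hook; MyTargetLowering is an assumed
// example class, not from this commit.
bool MyTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                             EVT DestVT) const {
  // Keep uniform adds at 32 bits instead of narrowing them back below i32.
  if (N->getOpcode() == ISD::ADD && !N->isDivergent() &&
      DestVT.getScalarSizeInBits() < 32)
    return false;
  // Otherwise fall back to a type-only rule.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
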
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 9 deletions
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT))
+        TLI.isNarrowingProfitable(N, VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
   // ShLeftAmt will indicate how much a narrowed load should be shifted left.
   unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+      ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       ShLeftAmt = N01->getZExtValue();
       N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   }

   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
-  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
-    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
-        TLI.isTruncateFree(SrcVT, VT)) {
+  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+      TLI.isTruncateFree(SrcVT, VT)) {
+    if (!LegalOperations ||
+        (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+         TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
       SDLoc SL(N0);
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth &&
-           (NewVT.getStoreSizeInBits() != NewBW ||
-            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-            !TLI.isNarrowingProfitable(VT, NewVT))) {
+    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 6 additions & 4 deletions
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
       for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
            SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
         EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
-        if (isNarrowingProfitable(VT, SmallVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
             isTypeDesirableForOp(ISD::SHL, SmallVT) &&
             isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
         DemandedBits.countLeadingOnes() >= HalfWidth) {
       EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
-      if (isNarrowingProfitable(VT, HalfVT) &&
+      if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
          isTypeDesirableForOp(ISD::SHL, HalfVT) &&
          isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
          (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
     if ((BitWidth % 2) == 0 && !VT.isVector()) {
       APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
       EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
-      if (isNarrowingProfitable(VT, HalfVT) &&
+      if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
          isTypeDesirableForOp(ISD::SRL, HalfVT) &&
          isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
          (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
     case ISD::SETULT:
     case ISD::SETULE: {
       EVT newVT = N0.getOperand(0).getValueType();
+      // FIXME: Should use isNarrowingProfitable.
       if (DCI.isBeforeLegalizeOps() ||
           (isOperationLegal(ISD::SETCC, newVT) &&
-           isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+           isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+           isTypeDesirableForOp(ISD::SETCC, newVT))) {
         EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
         SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 4 additions & 4 deletions
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
     cl::init(false));

 static cl::opt<bool> Widen16BitOps(
-    "amdgpu-codegenprepare-widen-16-bit-ops",
-    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-    cl::ReallyHidden,
-    cl::init(true));
+    "amdgpu-codegenprepare-widen-16-bit-ops",
+    cl::desc(
+        "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+    cl::ReallyHidden, cl::init(false));

 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 27 additions & 1 deletion
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<

 } // End Predicates = [NotHasMed3_16]

+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+           (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+         [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ICMP, G_SELECT],
+           (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+         [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
 // Combines which should only apply on SI/CI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
 }

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 2 deletions
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
   return Src == MVT::i32 && Dest == MVT::i64;
 }

-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+                                                 EVT DestVT) const {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::MUL:
+  case ISD::SETCC:
+  case ISD::SELECT:
+    if (Subtarget->has16BitInsts() &&
+        (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+      // Don't narrow back down to i16 if promoted to i32 already.
+      if (!N->isDivergent() && DestVT.isInteger() &&
+          DestVT.getScalarSizeInBits() > 1 &&
+          DestVT.getScalarSizeInBits() <= 16 &&
+          SrcVT.getScalarSizeInBits() > 16) {
+        return false;
+      }
+    }
+    return true;
+  default:
+    break;
+  }
+
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
   // in a single 32-bit register should always be helpful. As currently used,
   // this is much less general than the name suggests, and is only used in
   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
   // not profitable, and may actually be harmful.
-  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+  if (isa<LoadSDNode>(N))
+    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+  return true;
 }

 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
                                        NegatibleCost &Cost,
                                        unsigned Depth) const override;

-  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;

   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;

llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp

Lines changed: 113 additions & 0 deletions
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
   void applyClamp(MachineInstr &MI, Register &Reg) const;

+  bool matchPromote16to32(MachineInstr &MI) const;
+  void applyPromote16to32(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
   return false;
 }

+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+  Register Dst = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  const auto *RB = MRI.getRegBankOrNull(Dst);
+
+  // Only promote uniform instructions.
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  return STI.has16BitInsts() &&
+         (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_ASHR:
+    return AMDGPU::G_SEXT;
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SUB:
+  case AMDGPU::G_FSHR:
+    return AMDGPU::G_ZEXT;
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR:
+  case AMDGPU::G_SHL:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return AMDGPU::G_ANYEXT;
+  case AMDGPU::G_ICMP: {
+    return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+                                                        : AMDGPU::G_ZEXT;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+         Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+         Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+         Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+         Opc == AMDGPU::G_ICMP);
+
+  Register Dst = MI.getOperand(0).getReg();
+
+  bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+  Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+  Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+  assert(MRI.getType(Dst) == LLT::scalar(16));
+  assert(MRI.getType(LHS) == LLT::scalar(16));
+  assert(MRI.getType(RHS) == LLT::scalar(16));
+
+  assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+  const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInstrAndDebugLoc(MI);
+  const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+  LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+  RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+  MRI.setRegBank(LHS, RB);
+  MRI.setRegBank(RHS, RB);
+
+  MachineInstr *NewInst;
+  if (IsSelectOrCmp)
+    NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+  else
+    NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+  if (Opc != AMDGPU::G_ICMP) {
+    Register Dst32 = NewInst->getOperand(0).getReg();
+    MRI.setRegBank(Dst32, RB);
+    B.buildTrunc(Dst, Dst32);
+  }
+
+  switch (Opc) {
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SHL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_SUB:
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_MUL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    break;
+  }
+
+  MI.eraseFromParent();
+}
+
 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
                                            Register &Reg) const {
   B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
