Skip to content

Commit 13ebb87

Browse files
committed
[AMDGPU] Improve selection of conditional branch on amdgcn.ballot!=0 in SelectionDAG.
1 parent 0b14138 commit 13ebb87

File tree

9 files changed

+146
-75
lines changed

9 files changed

+146
-75
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
666666
case ISD::FP_EXTEND:
667667
SelectFP_EXTEND(N);
668668
return;
669+
case AMDGPUISD::BRCONDZ:
670+
SelectBRCONDZ(N);
671+
return;
669672
case AMDGPUISD::CVT_PKRTZ_F16_F32:
670673
case AMDGPUISD::CVT_PKNORM_I16_F32:
671674
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -2306,6 +2309,34 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
23062309
VCC.getValue(0));
23072310
}
23082311

2312+
/// Select an AMDGPUISD::BRCONDZ node into a scalar conditional branch.
///
/// Operands of \p N as read here: (0) chain, (1) condition, (2) destination
/// basic block, (3) condition code, which must be SETEQ or SETNE. The
/// condition is copied into SCC or VCC and \p N is morphed into the matching
/// S_CBRANCH_* machine node.
void AMDGPUDAGToDAGISel::SelectBRCONDZ(SDNode *N) {
  const SDValue Chain = N->getOperand(0);
  const SDValue Mask = N->getOperand(1);
  const SDValue DestBB = N->getOperand(2);

  const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(3))->get();
  assert(CC == ISD::SETEQ || CC == ISD::SETNE);
  const bool BranchIfZero = CC == ISD::SETEQ;

  // The BRCONDZ condition is either an AMDGPUISD::SETCC, or an i1 value
  // produced by an ISD::SETCC node or a logical combination of ISD::SETCCs,
  // so there is no need to AND the condition with the exec mask.
  //
  // TODO: AMDGPUISD::SETCC is always selected as V_CMP, so the VCC form is
  // used for it. This might change later.
  const bool ViaSCC =
      !(Mask->getOpcode() == AMDGPUISD::SETCC || Mask->isDivergent());

  // SETEQ branches when the condition is zero, SETNE when it is non-zero.
  unsigned BranchOpc;
  if (ViaSCC)
    BranchOpc = BranchIfZero ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1;
  else
    BranchOpc = BranchIfZero ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ;

  const auto *RegInfo =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  // Put the condition into the register the chosen branch reads, then rewrite
  // N in place as the branch, chained on that copy.
  SDValue CondCopy = CurDAG->getCopyToReg(
      Chain, SDLoc(N), ViaSCC ? AMDGPU::SCC : RegInfo->getVCC(), Mask);
  CurDAG->SelectNodeTo(N, BranchOpc, MVT::Other, DestBB, CondCopy);
}
2339+
23092340
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
23102341
if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
23112342
!N->isDivergent()) {

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
272272
void SelectS_BFE(SDNode *N);
273273
bool isCBranchSCC(const SDNode *N) const;
274274
void SelectBRCOND(SDNode *N);
275+
void SelectBRCONDZ(SDNode *N);
275276
void SelectFMAD_FMA(SDNode *N);
276277
void SelectFP_EXTEND(SDNode *N);
277278
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5317,6 +5317,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
53175317
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
53185318
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
53195319
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5320+
NODE_NAME_CASE(BRCONDZ)
53205321

53215322
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
53225323
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,20 @@ enum NodeType : unsigned {
422422
// This is SETCC with the full mask result which is used for a compare with a
423423
// result bit per item in the wavefront.
424424
SETCC,
425+
426+
// Conditional branch on comparison of CondWaveMask operand to zero.
427+
// BRCONDZ CondWaveMask, BB, CondCode
428+
// where:
429+
// - CondWaveMask - is either:
430+
// * the i32/i64 result of AMDGPUISD::SETCC node,
431+
// * i1 value that comes from ISD::SETCC node or logical combination of
432+
// ISD::SETCCs. For a divergent node this becomes an i32/i64 value after
433+
// selection.
434+
// - BB is the target basic block,
435+
// - CondCode is either SETEQ or SETNE meaning that the branch should happen
436+
// if the CondWaveMask is either equal to zero or not.
437+
BRCONDZ,
438+
425439
SETREG,
426440

427441
DENORM_MODE,

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,19 @@ def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
5858
[SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
5959
>;
6060

61+
def AMDGPUBrcondzOp : SDTypeProfile<0, 3, [
62+
// cond, bb, cc
63+
SDTCisInt<0>, SDTCisVT<1, OtherVT>, SDTCisVT<2, OtherVT>
64+
]>;
65+
6166
//===----------------------------------------------------------------------===//
6267
// AMDGPU DAG Nodes
6368
//
6469

6570
def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
6671
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
6772
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
73+
def AMDGPUbrcondz: SDNode<"AMDGPUISD::BRCONDZ", AMDGPUBrcondzOp, [SDNPHasChain]>;
6874

6975
def callseq_start : SDNode<"ISD::CALLSEQ_START",
7076
SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
798798
ISD::SIGN_EXTEND_INREG,
799799
ISD::EXTRACT_VECTOR_ELT,
800800
ISD::INSERT_VECTOR_ELT,
801-
ISD::FCOPYSIGN});
801+
ISD::FCOPYSIGN,
802+
ISD::BRCOND});
802803

803804
if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
804805
setTargetDAGCombine(ISD::FP_ROUND);
@@ -13584,6 +13585,56 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
1358413585
return SDValue(CSrc, 0);
1358513586
}
1358613587

13588+
/// Combine an ISD::BRCOND whose condition is "AMDGPUISD::SETCC result ==/!= 0"
/// into an AMDGPUISD::BRCONDZ node.
///
/// Only runs after DAG legalization. Returns the new BRCONDZ node on success,
/// or an empty SDValue when the pattern does not match. An empty value is the
/// "no change" signal for the DAG combiner; returning SDValue(N, 0) instead
/// would claim that N had already been replaced in place.
SDValue SITargetLowering::performBRCondCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() != ISD::SETCC ||
      Cond->getOperand(0)->getOpcode() != AMDGPUISD::SETCC)
    return SDValue();

  // %VCMP = i32/i64 AMDGPUISD::SETCC ...
  // %C = ISD::SETCC %VCMP, 0, setne/seteq
  // BRCOND %C, %BB
  // =>
  // %VCMP = i32/i64 AMDGPUISD::SETCC ...
  // BRCONDZ %VCMP, %BB, setne/seteq

  auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
  auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
  if ((CC != ISD::SETEQ && CC != ISD::SETNE) || !CRHS || !CRHS->isZero())
    return SDValue();

  auto VCMP = Cond->getOperand(0);
  auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
  auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));

  // By default branch on the wave mask produced by AMDGPUISD::SETCC itself.
  auto Src = VCMP;
  if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
    // Special case for amdgcn.ballot:
    // %VCMPSrc = ISD::SETCC or a logical combination of ISD::SETCCs
    // %VCMP = i32/i64 AMDGPUISD::SETCC (ext %VCMPSrc), 0, setne
    // %C = ISD::SETCC %VCMP, 0, setne/seteq
    // BRCOND %C, %BB
    // =>
    // BRCONDZ %VCMPSrc, %BB, setne/seteq

    auto VCMPSrc = VCMP.getOperand(0);
    if (ISD::isExtOpcode(VCMPSrc->getOpcode())) // Skip extension.
      VCMPSrc = VCMPSrc.getOperand(0);

    // Bypass the AMDGPUISD::SETCC only when its (un-extended) input passes
    // the isBoolSGPR check; otherwise keep branching on VCMP.
    if (isBoolSGPR(VCMPSrc))
      Src = VCMPSrc;
  }

  return DCI.DAG.getNode(AMDGPUISD::BRCONDZ, SDLoc(N), N->getVTList(),
                         N->getOperand(0), // Chain
                         Src,
                         N->getOperand(2),         // BB
                         DCI.DAG.getCondCode(CC)); // SETEQ|SETNE
}
1358713638

1358813639
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1358913640
DAGCombinerInfo &DCI) const {
@@ -13694,6 +13745,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1369413745
return performInsertVectorEltCombine(N, DCI);
1369513746
case ISD::FP_ROUND:
1369613747
return performFPRoundCombine(N, DCI);
13748+
case ISD::BRCOND:
13749+
return performBRCondCombine(N, DCI);
1369713750
case ISD::LOAD: {
1369813751
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
1369913752
return Widended;

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
220220
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
221221
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
222222
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
223+
SDValue performBRCondCombine(SDNode *N, DAGCombinerInfo &DCI) const;
223224

224225
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
225226
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll

Lines changed: 19 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8989
; CHECK: ; %bb.0:
9090
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9191
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
93-
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
92+
; CHECK-NEXT: s_cbranch_vccz .LBB7_2
9493
; CHECK-NEXT: ; %bb.1: ; %true
9594
; CHECK-NEXT: s_mov_b32 s0, 42
9695
; CHECK-NEXT: s_branch .LBB7_3
@@ -112,9 +111,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
112111
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
113112
; CHECK: ; %bb.0:
114113
; CHECK-NEXT: s_and_b32 s0, s0, 1
115-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
116-
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117-
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
114+
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
115+
; CHECK-NEXT: s_cbranch_vccz .LBB8_2
118116
; CHECK-NEXT: ; %bb.1: ; %true
119117
; CHECK-NEXT: s_mov_b32 s0, 42
120118
; CHECK-NEXT: s_branch .LBB8_3
@@ -137,8 +135,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137135
; CHECK: ; %bb.0:
138136
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139137
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
141-
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
138+
; CHECK-NEXT: s_cbranch_vccz .LBB9_2
142139
; CHECK-NEXT: ; %bb.1: ; %false
143140
; CHECK-NEXT: s_mov_b32 s0, 33
144141
; CHECK-NEXT: s_branch .LBB9_3
@@ -160,9 +157,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
160157
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
161158
; CHECK: ; %bb.0:
162159
; CHECK-NEXT: s_and_b32 s0, s0, 1
163-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
164-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165-
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
160+
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
161+
; CHECK-NEXT: s_cbranch_vccz .LBB10_2
166162
; CHECK-NEXT: ; %bb.1: ; %false
167163
; CHECK-NEXT: s_mov_b32 s0, 33
168164
; CHECK-NEXT: s_branch .LBB10_3
@@ -184,8 +180,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
184180
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
185181
; CHECK: ; %bb.0:
186182
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
187-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
188-
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
183+
; CHECK-NEXT: s_cbranch_vccz .LBB11_2
189184
; CHECK-NEXT: ; %bb.1: ; %true
190185
; CHECK-NEXT: s_mov_b32 s0, 42
191186
; CHECK-NEXT: s_branch .LBB11_3
@@ -206,9 +201,8 @@ false:
206201
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
207202
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
208203
; CHECK: ; %bb.0:
209-
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
210-
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
211-
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
204+
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
205+
; CHECK-NEXT: s_cbranch_vccz .LBB12_2
212206
; CHECK-NEXT: ; %bb.1: ; %true
213207
; CHECK-NEXT: s_mov_b32 s0, 42
214208
; CHECK-NEXT: s_branch .LBB12_3
@@ -230,8 +224,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
230224
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
231225
; CHECK: ; %bb.0:
232226
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
233-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
234-
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
227+
; CHECK-NEXT: s_cbranch_vccz .LBB13_2
235228
; CHECK-NEXT: ; %bb.1: ; %false
236229
; CHECK-NEXT: s_mov_b32 s0, 33
237230
; CHECK-NEXT: s_branch .LBB13_3
@@ -252,9 +245,8 @@ false:
252245
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
253246
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
254247
; CHECK: ; %bb.0:
255-
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
256-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
257-
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
248+
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
249+
; CHECK-NEXT: s_cbranch_vccz .LBB14_2
258250
; CHECK-NEXT: ; %bb.1: ; %false
259251
; CHECK-NEXT: s_mov_b32 s0, 33
260252
; CHECK-NEXT: s_branch .LBB14_3
@@ -277,11 +269,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
277269
; CHECK: ; %bb.0:
278270
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
279271
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
280-
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
281-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
282-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
283-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
284-
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
272+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
273+
; CHECK-NEXT: s_cbranch_vccz .LBB15_2
285274
; CHECK-NEXT: ; %bb.1: ; %true
286275
; CHECK-NEXT: s_mov_b32 s0, 42
287276
; CHECK-NEXT: s_branch .LBB15_3
@@ -309,10 +298,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
309298
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
310299
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
311300
; CHECK-NEXT: s_and_b32 s0, s0, s1
312-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
313-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
314-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
315-
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
301+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
302+
; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
316303
; CHECK-NEXT: ; %bb.1: ; %true
317304
; CHECK-NEXT: s_mov_b32 s0, 42
318305
; CHECK-NEXT: s_branch .LBB16_3
@@ -337,11 +324,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
337324
; CHECK: ; %bb.0:
338325
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
339326
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
340-
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
341-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
342-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
343-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
344-
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
327+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
328+
; CHECK-NEXT: s_cbranch_vccz .LBB17_2
345329
; CHECK-NEXT: ; %bb.1: ; %false
346330
; CHECK-NEXT: s_mov_b32 s0, 33
347331
; CHECK-NEXT: s_branch .LBB17_3
@@ -369,9 +353,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
369353
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
370354
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
371355
; CHECK-NEXT: s_and_b32 s0, s0, s1
372-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
373-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
374-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
356+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
375357
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
376358
; CHECK-NEXT: ; %bb.1: ; %false
377359
; CHECK-NEXT: s_mov_b32 s0, 33

0 commit comments

Comments
 (0)