Skip to content

Commit 8b836e2

Browse files
committed
Simplified version without using DAG combiner.
1 parent a17e48f commit 8b836e2

File tree

6 files changed

+196
-77
lines changed

6 files changed

+196
-77
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2259,6 +2259,30 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
22592259
return false;
22602260
}
22612261

2262+
// Forward declaration; the definition lives in SIISelLowering.cpp.
bool isBoolSGPR(SDValue V);
2263+
2264+
// Tries to recover the i1 condition behind a lowered amdgcn.ballot.
// VCMP must be an AMDGPUISD::SETCC node (asserted). Returns the i1 condition
// when VCMP compares it (optionally through an extension) against zero with
// setne and isBoolSGPR accepts it; otherwise returns an empty SDValue.
static SDValue combineBallotPattern(SDValue VCMP) {
2265+
assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2266+
// Special case for amdgcn.ballot:
2267+
// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2268+
// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne ; lowered ballot
2269+
// =>
2270+
// Use i1 %Cond value instead of i(WaveSize) %VCMP.
2271+
// This is possible because divergent ISD::SETCC is selected as V_CMP and
2272+
// Cond becomes an i(WaveSize) full mask value.
2273+
auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2274+
auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
2275+
if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
2276+
auto Cond = VCMP.getOperand(0);
2277+
if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2278+
Cond = Cond.getOperand(0);
2279+
2280+
if (isBoolSGPR(Cond))
2281+
return Cond;
2282+
}
2283+
return SDValue(); // Not the ballot pattern; the caller keeps using VCMP.
2284+
}
2285+
22622286
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
22632287
SDValue Cond = N->getOperand(1);
22642288

@@ -2272,11 +2296,44 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
22722296
const SIRegisterInfo *TRI = ST->getRegisterInfo();
22732297

22742298
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2275-
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2299+
bool AndExec = !UseSCCBr;
2300+
bool Negate = false;
2301+
2302+
if (Cond.getOpcode() == ISD::SETCC &&
2303+
Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2304+
auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2305+
auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
2306+
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
2307+
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2308+
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2309+
// =>
2310+
// Use "i(WaveSize) %VCMP value in VCC register ne/eq zero" as the branch
2311+
// condition.
2312+
Negate = CC == ISD::SETEQ;
2313+
auto VCMP = Cond->getOperand(0);
2314+
if (auto BallotCond = combineBallotPattern(VCMP)) {
2315+
Cond = BallotCond;
2316+
UseSCCBr = !BallotCond->isDivergent();
2317+
} else {
2318+
// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2319+
// selected as V_CMP, but this may change for uniform condition.
2320+
Cond = VCMP;
2321+
UseSCCBr = false;
2322+
}
2323+
}
2324+
// Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2325+
// V_CMPs resulting from a ballot, or a ballot with a uniform condition, in
2326+
// which case SCC is used.
2327+
AndExec = false;
2328+
}
2329+
2330+
unsigned BrOp =
2331+
UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2332+
: (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
22762333
Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
22772334
SDLoc SL(N);
22782335

2279-
if (!UseSCCBr) {
2336+
if (AndExec) {
22802337
// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
22812338
// analyzed what generates the vcc value, so we do not know whether vcc
22822339
// bits for disabled lanes are 0. Thus we need to mask out bits for

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10501,7 +10501,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
1050110501

1050210502
// Returns true if argument is a boolean value which is not serialized into
1050310503
// memory or argument and does not require v_cndmask_b32 to be deserialized.
10504-
static bool isBoolSGPR(SDValue V) {
10504+
bool isBoolSGPR(SDValue V) {
1050510505
if (V.getValueType() != MVT::i1)
1050610506
return false;
1050710507
switch (V.getOpcode()) {

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,3 +393,29 @@ true:
393393
false:
394394
ret i32 33
395395
}
396+
397+
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
398+
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
399+
; CHECK: ; %bb.0:
400+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
401+
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
402+
; CHECK-NEXT: s_and_b32 s0, 1, s0
403+
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
404+
; CHECK-NEXT: s_cmp_le_i32 s0, 22
405+
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
406+
; CHECK-NEXT: ; %bb.1: ; %true
407+
; CHECK-NEXT: s_mov_b32 s0, 42
408+
; CHECK-NEXT: s_branch .LBB19_3
409+
; CHECK-NEXT: .LBB19_2: ; %false
410+
; CHECK-NEXT: s_mov_b32 s0, 33
411+
; CHECK-NEXT: s_branch .LBB19_3
412+
; CHECK-NEXT: .LBB19_3:
413+
%c = icmp ult i32 %v, 12
414+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
415+
%bc = icmp sgt i32 %ballot, 22
416+
br i1 %bc, label %true, label %false
417+
true:
418+
ret i32 42
419+
false:
420+
ret i32 33
421+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,3 +396,29 @@ true:
396396
false:
397397
ret i32 33
398398
}
399+
400+
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
401+
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
402+
; CHECK: ; %bb.0:
403+
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
404+
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
405+
; CHECK-NEXT: s_and_b32 s0, 1, s0
406+
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
407+
; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22
408+
; CHECK-NEXT: s_cbranch_vccnz .LBB19_2
409+
; CHECK-NEXT: ; %bb.1: ; %true
410+
; CHECK-NEXT: s_mov_b32 s0, 42
411+
; CHECK-NEXT: s_branch .LBB19_3
412+
; CHECK-NEXT: .LBB19_2: ; %false
413+
; CHECK-NEXT: s_mov_b32 s0, 33
414+
; CHECK-NEXT: s_branch .LBB19_3
415+
; CHECK-NEXT: .LBB19_3:
416+
%c = icmp ult i32 %v, 12
417+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
418+
%bc = icmp sgt i64 %ballot, 22
419+
br i1 %bc, label %true, label %false
420+
true:
421+
ret i32 42
422+
false:
423+
ret i32 33
424+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
8989
; CHECK: ; %bb.0:
9090
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9191
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
92-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
93-
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
92+
; CHECK-NEXT: s_cbranch_vccz .LBB7_2
9493
; CHECK-NEXT: ; %bb.1: ; %true
9594
; CHECK-NEXT: s_mov_b32 s0, 42
9695
; CHECK-NEXT: s_branch .LBB7_3
@@ -112,9 +111,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
112111
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
113112
; CHECK: ; %bb.0:
114113
; CHECK-NEXT: s_and_b32 s0, s0, 1
115-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
116-
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
117-
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
114+
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
115+
; CHECK-NEXT: s_cbranch_vccz .LBB8_2
118116
; CHECK-NEXT: ; %bb.1: ; %true
119117
; CHECK-NEXT: s_mov_b32 s0, 42
120118
; CHECK-NEXT: s_branch .LBB8_3
@@ -137,8 +135,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
137135
; CHECK: ; %bb.0:
138136
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
139137
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
140-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
141-
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
138+
; CHECK-NEXT: s_cbranch_vccz .LBB9_2
142139
; CHECK-NEXT: ; %bb.1: ; %false
143140
; CHECK-NEXT: s_mov_b32 s0, 33
144141
; CHECK-NEXT: s_branch .LBB9_3
@@ -160,9 +157,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
160157
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
161158
; CHECK: ; %bb.0:
162159
; CHECK-NEXT: s_and_b32 s0, s0, 1
163-
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, s0, 0
164-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
165-
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
160+
; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
161+
; CHECK-NEXT: s_cbranch_vccz .LBB10_2
166162
; CHECK-NEXT: ; %bb.1: ; %false
167163
; CHECK-NEXT: s_mov_b32 s0, 33
168164
; CHECK-NEXT: s_branch .LBB10_3
@@ -184,8 +180,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
184180
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
185181
; CHECK: ; %bb.0:
186182
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
187-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
188-
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
183+
; CHECK-NEXT: s_cbranch_vccz .LBB11_2
189184
; CHECK-NEXT: ; %bb.1: ; %true
190185
; CHECK-NEXT: s_mov_b32 s0, 42
191186
; CHECK-NEXT: s_branch .LBB11_3
@@ -206,9 +201,8 @@ false:
206201
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
207202
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
208203
; CHECK: ; %bb.0:
209-
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
210-
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
211-
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
204+
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
205+
; CHECK-NEXT: s_cbranch_vccz .LBB12_2
212206
; CHECK-NEXT: ; %bb.1: ; %true
213207
; CHECK-NEXT: s_mov_b32 s0, 42
214208
; CHECK-NEXT: s_branch .LBB12_3
@@ -230,8 +224,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
230224
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
231225
; CHECK: ; %bb.0:
232226
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
233-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
234-
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
227+
; CHECK-NEXT: s_cbranch_vccz .LBB13_2
235228
; CHECK-NEXT: ; %bb.1: ; %false
236229
; CHECK-NEXT: s_mov_b32 s0, 33
237230
; CHECK-NEXT: s_branch .LBB13_3
@@ -252,9 +245,8 @@ false:
252245
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
253246
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
254247
; CHECK: ; %bb.0:
255-
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
256-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
257-
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
248+
; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
249+
; CHECK-NEXT: s_cbranch_vccz .LBB14_2
258250
; CHECK-NEXT: ; %bb.1: ; %false
259251
; CHECK-NEXT: s_mov_b32 s0, 33
260252
; CHECK-NEXT: s_branch .LBB14_3
@@ -277,11 +269,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
277269
; CHECK: ; %bb.0:
278270
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
279271
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
280-
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
281-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
282-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
283-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
284-
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
272+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
273+
; CHECK-NEXT: s_cbranch_vccz .LBB15_2
285274
; CHECK-NEXT: ; %bb.1: ; %true
286275
; CHECK-NEXT: s_mov_b32 s0, 42
287276
; CHECK-NEXT: s_branch .LBB15_3
@@ -309,10 +298,8 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
309298
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
310299
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
311300
; CHECK-NEXT: s_and_b32 s0, s0, s1
312-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
313-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
314-
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
315-
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
301+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
302+
; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
316303
; CHECK-NEXT: ; %bb.1: ; %true
317304
; CHECK-NEXT: s_mov_b32 s0, 42
318305
; CHECK-NEXT: s_branch .LBB16_3
@@ -337,11 +324,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
337324
; CHECK: ; %bb.0:
338325
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
339326
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
340-
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
341-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
342-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
343-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
344-
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
327+
; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
328+
; CHECK-NEXT: s_cbranch_vccz .LBB17_2
345329
; CHECK-NEXT: ; %bb.1: ; %false
346330
; CHECK-NEXT: s_mov_b32 s0, 33
347331
; CHECK-NEXT: s_branch .LBB17_3
@@ -369,9 +353,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
369353
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
370354
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
371355
; CHECK-NEXT: s_and_b32 s0, s0, s1
372-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
373-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
374-
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
356+
; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
375357
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
376358
; CHECK-NEXT: ; %bb.1: ; %false
377359
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -391,3 +373,26 @@ true:
391373
false:
392374
ret i32 33
393375
}
376+
377+
define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
378+
; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare:
379+
; CHECK: ; %bb.0:
380+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, s0, 12
381+
; CHECK-NEXT: s_cmp_lt_i32 s0, 23
382+
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
383+
; CHECK-NEXT: ; %bb.1: ; %true
384+
; CHECK-NEXT: s_mov_b32 s0, 42
385+
; CHECK-NEXT: s_branch .LBB19_3
386+
; CHECK-NEXT: .LBB19_2: ; %false
387+
; CHECK-NEXT: s_mov_b32 s0, 33
388+
; CHECK-NEXT: s_branch .LBB19_3
389+
; CHECK-NEXT: .LBB19_3:
390+
%c = icmp ult i32 %v, 12
391+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
392+
%bc = icmp sgt i32 %ballot, 22
393+
br i1 %bc, label %true, label %false
394+
true:
395+
ret i32 42
396+
false:
397+
ret i32 33
398+
}

0 commit comments

Comments
 (0)