Skip to content

Commit 6e865d1

Browse files
committed
* skip optimization when ballot bitwidth doesn't match wavefront size.
* added the TODO per Nicolai's suggestion.
1 parent 73ba94d commit 6e865d1

File tree

3 files changed

+226
-102
lines changed

3 files changed

+226
-102
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2306,9 +2306,12 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
23062306

23072307
if (Cond.getOpcode() == ISD::SETCC &&
23082308
Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2309+
SDValue VCMP = Cond->getOperand(0);
23092310
auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
23102311
auto *CRHS = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
2311-
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
2312+
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() &&
2313+
// TODO: make condition below an assert after fixing ballot bitwidth.
2314+
VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
23122315
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
23132316
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
23142317
// BRCOND i1 %C, %BB
@@ -2317,7 +2320,6 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
23172320
// VCC = COPY i(WaveSize) %VCMP
23182321
// S_CBRANCH_VCCNZ/VCCZ %BB
23192322
Negate = CC == ISD::SETEQ;
2320-
auto VCMP = Cond->getOperand(0);
23212323
bool NegatedBallot = false;
23222324
if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
23232325
Cond = BallotCond;

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 93 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
44

55
declare i32 @llvm.amdgcn.ballot.i32(i1)
6+
declare i64 @llvm.amdgcn.ballot.i64(i1)
67
declare i32 @llvm.ctpop.i32(i32)
78

89
; Test ballot(0)
@@ -203,6 +204,30 @@ false:
203204
ret i32 33
204205
}
205206

207+
define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) {
208+
; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare:
209+
; CHECK: ; %bb.0:
210+
; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0
211+
; CHECK-NEXT: s_mov_b32 s1, 0
212+
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
213+
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
214+
; CHECK-NEXT: ; %bb.1: ; %true
215+
; CHECK-NEXT: s_mov_b32 s0, 42
216+
; CHECK-NEXT: s_branch .LBB12_3
217+
; CHECK-NEXT: .LBB12_2: ; %false
218+
; CHECK-NEXT: s_mov_b32 s0, 33
219+
; CHECK-NEXT: s_branch .LBB12_3
220+
; CHECK-NEXT: .LBB12_3:
221+
%c = icmp ult i32 %v, 12
222+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
223+
%ballot_ne_zero = icmp ne i64 %ballot, 0
224+
br i1 %ballot_ne_zero, label %true, label %false
225+
true:
226+
ret i32 42
227+
false:
228+
ret i32 33
229+
}
230+
206231
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
207232
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
208233
; CHECK: ; %bb.0:
@@ -211,14 +236,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
211236
; CHECK-NEXT: s_and_b32 s0, 1, s0
212237
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
213238
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
214-
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
239+
; CHECK-NEXT: s_cbranch_scc1 .LBB13_2
215240
; CHECK-NEXT: ; %bb.1: ; %true
216241
; CHECK-NEXT: s_mov_b32 s0, 42
217-
; CHECK-NEXT: s_branch .LBB12_3
218-
; CHECK-NEXT: .LBB12_2: ; %false
242+
; CHECK-NEXT: s_branch .LBB13_3
243+
; CHECK-NEXT: .LBB13_2: ; %false
219244
; CHECK-NEXT: s_mov_b32 s0, 33
220-
; CHECK-NEXT: s_branch .LBB12_3
221-
; CHECK-NEXT: .LBB12_3:
245+
; CHECK-NEXT: s_branch .LBB13_3
246+
; CHECK-NEXT: .LBB13_3:
222247
%c = icmp ult i32 %v, 12
223248
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
224249
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -234,14 +259,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
234259
; CHECK: ; %bb.0:
235260
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
236261
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
237-
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
262+
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
238263
; CHECK-NEXT: ; %bb.1: ; %false
239264
; CHECK-NEXT: s_mov_b32 s0, 33
240-
; CHECK-NEXT: s_branch .LBB13_3
241-
; CHECK-NEXT: .LBB13_2: ; %true
265+
; CHECK-NEXT: s_branch .LBB14_3
266+
; CHECK-NEXT: .LBB14_2: ; %true
242267
; CHECK-NEXT: s_mov_b32 s0, 42
243-
; CHECK-NEXT: s_branch .LBB13_3
244-
; CHECK-NEXT: .LBB13_3:
268+
; CHECK-NEXT: s_branch .LBB14_3
269+
; CHECK-NEXT: .LBB14_3:
245270
%c = icmp ult i32 %v, 12
246271
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
247272
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -260,14 +285,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
260285
; CHECK-NEXT: s_and_b32 s0, 1, s0
261286
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
262287
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
263-
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
288+
; CHECK-NEXT: s_cbranch_scc0 .LBB15_2
264289
; CHECK-NEXT: ; %bb.1: ; %false
265290
; CHECK-NEXT: s_mov_b32 s0, 33
266-
; CHECK-NEXT: s_branch .LBB14_3
267-
; CHECK-NEXT: .LBB14_2: ; %true
291+
; CHECK-NEXT: s_branch .LBB15_3
292+
; CHECK-NEXT: .LBB15_2: ; %true
268293
; CHECK-NEXT: s_mov_b32 s0, 42
269-
; CHECK-NEXT: s_branch .LBB14_3
270-
; CHECK-NEXT: .LBB14_3:
294+
; CHECK-NEXT: s_branch .LBB15_3
295+
; CHECK-NEXT: .LBB15_3:
271296
%c = icmp ult i32 %v, 12
272297
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
273298
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -285,14 +310,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
285310
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
286311
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
287312
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
288-
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
313+
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
289314
; CHECK-NEXT: ; %bb.1: ; %true
290315
; CHECK-NEXT: s_mov_b32 s0, 42
291-
; CHECK-NEXT: s_branch .LBB15_3
292-
; CHECK-NEXT: .LBB15_2: ; %false
316+
; CHECK-NEXT: s_branch .LBB16_3
317+
; CHECK-NEXT: .LBB16_2: ; %false
293318
; CHECK-NEXT: s_mov_b32 s0, 33
294-
; CHECK-NEXT: s_branch .LBB15_3
295-
; CHECK-NEXT: .LBB15_3:
319+
; CHECK-NEXT: s_branch .LBB16_3
320+
; CHECK-NEXT: .LBB16_3:
296321
%v1c = icmp ult i32 %v1, 12
297322
%v2c = icmp ugt i32 %v2, 34
298323
%c = and i1 %v1c, %v2c
@@ -305,6 +330,34 @@ false:
305330
ret i32 33
306331
}
307332

333+
define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) {
334+
; CHECK-LABEL: branch_divergent_ballot64_ne_zero_and:
335+
; CHECK: ; %bb.0:
336+
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
337+
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
338+
; CHECK-NEXT: s_mov_b32 s1, 0
339+
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
340+
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
341+
; CHECK-NEXT: s_cbranch_scc1 .LBB17_2
342+
; CHECK-NEXT: ; %bb.1: ; %true
343+
; CHECK-NEXT: s_mov_b32 s0, 42
344+
; CHECK-NEXT: s_branch .LBB17_3
345+
; CHECK-NEXT: .LBB17_2: ; %false
346+
; CHECK-NEXT: s_mov_b32 s0, 33
347+
; CHECK-NEXT: s_branch .LBB17_3
348+
; CHECK-NEXT: .LBB17_3:
349+
%v1c = icmp ult i32 %v1, 12
350+
%v2c = icmp ugt i32 %v2, 34
351+
%c = and i1 %v1c, %v2c
352+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
353+
%ballot_ne_zero = icmp ne i64 %ballot, 0
354+
br i1 %ballot_ne_zero, label %true, label %false
355+
true:
356+
ret i32 42
357+
false:
358+
ret i32 33
359+
}
360+
308361
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
309362
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
310363
; CHECK: ; %bb.0:
@@ -316,14 +369,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
316369
; CHECK-NEXT: s_and_b32 s0, 1, s0
317370
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
318371
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
319-
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
372+
; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
320373
; CHECK-NEXT: ; %bb.1: ; %true
321374
; CHECK-NEXT: s_mov_b32 s0, 42
322-
; CHECK-NEXT: s_branch .LBB16_3
323-
; CHECK-NEXT: .LBB16_2: ; %false
375+
; CHECK-NEXT: s_branch .LBB18_3
376+
; CHECK-NEXT: .LBB18_2: ; %false
324377
; CHECK-NEXT: s_mov_b32 s0, 33
325-
; CHECK-NEXT: s_branch .LBB16_3
326-
; CHECK-NEXT: .LBB16_3:
378+
; CHECK-NEXT: s_branch .LBB18_3
379+
; CHECK-NEXT: .LBB18_3:
327380
%v1c = icmp ult i32 %v1, 12
328381
%v2c = icmp ugt i32 %v2, 34
329382
%c = and i1 %v1c, %v2c
@@ -343,14 +396,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
343396
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
344397
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
345398
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
346-
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
399+
; CHECK-NEXT: s_cbranch_scc0 .LBB19_2
347400
; CHECK-NEXT: ; %bb.1: ; %false
348401
; CHECK-NEXT: s_mov_b32 s0, 33
349-
; CHECK-NEXT: s_branch .LBB17_3
350-
; CHECK-NEXT: .LBB17_2: ; %true
402+
; CHECK-NEXT: s_branch .LBB19_3
403+
; CHECK-NEXT: .LBB19_2: ; %true
351404
; CHECK-NEXT: s_mov_b32 s0, 42
352-
; CHECK-NEXT: s_branch .LBB17_3
353-
; CHECK-NEXT: .LBB17_3:
405+
; CHECK-NEXT: s_branch .LBB19_3
406+
; CHECK-NEXT: .LBB19_3:
354407
%v1c = icmp ult i32 %v1, 12
355408
%v2c = icmp ugt i32 %v2, 34
356409
%c = and i1 %v1c, %v2c
@@ -374,14 +427,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
374427
; CHECK-NEXT: s_and_b32 s0, 1, s0
375428
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
376429
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
377-
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
430+
; CHECK-NEXT: s_cbranch_scc0 .LBB20_2
378431
; CHECK-NEXT: ; %bb.1: ; %false
379432
; CHECK-NEXT: s_mov_b32 s0, 33
380-
; CHECK-NEXT: s_branch .LBB18_3
381-
; CHECK-NEXT: .LBB18_2: ; %true
433+
; CHECK-NEXT: s_branch .LBB20_3
434+
; CHECK-NEXT: .LBB20_2: ; %true
382435
; CHECK-NEXT: s_mov_b32 s0, 42
383-
; CHECK-NEXT: s_branch .LBB18_3
384-
; CHECK-NEXT: .LBB18_3:
436+
; CHECK-NEXT: s_branch .LBB20_3
437+
; CHECK-NEXT: .LBB20_3:
385438
%v1c = icmp ult i32 %v1, 12
386439
%v2c = icmp ugt i32 %v2, 34
387440
%c = and i1 %v1c, %v2c
@@ -402,14 +455,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
402455
; CHECK-NEXT: s_and_b32 s0, 1, s0
403456
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
404457
; CHECK-NEXT: s_cmp_le_i32 s0, 22
405-
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
458+
; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
406459
; CHECK-NEXT: ; %bb.1: ; %true
407460
; CHECK-NEXT: s_mov_b32 s0, 42
408-
; CHECK-NEXT: s_branch .LBB19_3
409-
; CHECK-NEXT: .LBB19_2: ; %false
461+
; CHECK-NEXT: s_branch .LBB21_3
462+
; CHECK-NEXT: .LBB21_2: ; %false
410463
; CHECK-NEXT: s_mov_b32 s0, 33
411-
; CHECK-NEXT: s_branch .LBB19_3
412-
; CHECK-NEXT: .LBB19_3:
464+
; CHECK-NEXT: s_branch .LBB21_3
465+
; CHECK-NEXT: .LBB21_3:
413466
%c = icmp ult i32 %v, 12
414467
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
415468
%bc = icmp sgt i32 %ballot, 22

0 commit comments

Comments
 (0)