3
3
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
4
4
5
5
declare i32 @llvm.amdgcn.ballot.i32 (i1 )
6
+ declare i64 @llvm.amdgcn.ballot.i64 (i1 )
6
7
declare i32 @llvm.ctpop.i32 (i32 )
7
8
8
9
; Test ballot(0)
@@ -203,6 +204,30 @@ false:
203
204
ret i32 33
204
205
}
205
206
207
; Branch on "i64 ballot != 0", where the ballot input is (%v < 12) and %v is a
; plain (non-inreg) i32 argument. Returns 42 on the %true path, 33 on %false.
; The RUN line selects wavefrontsize32; the expected code below zeroes s1 and
; compares the s[0:1] pair as a 64-bit value against 0.
; NOTE(review): "divergent" is taken from the function name - confirm against
; how the non-inreg argument is classified for amdgpu_cs.
+ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare (i32 %v ) {
208
+ ; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare:
209
+ ; CHECK: ; %bb.0:
210
+ ; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0
211
+ ; CHECK-NEXT: s_mov_b32 s1, 0
212
+ ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
213
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
214
+ ; CHECK-NEXT: ; %bb.1: ; %true
215
+ ; CHECK-NEXT: s_mov_b32 s0, 42
216
+ ; CHECK-NEXT: s_branch .LBB12_3
217
+ ; CHECK-NEXT: .LBB12_2: ; %false
218
+ ; CHECK-NEXT: s_mov_b32 s0, 33
219
+ ; CHECK-NEXT: s_branch .LBB12_3
220
+ ; CHECK-NEXT: .LBB12_3:
221
+ %c = icmp ult i32 %v , 12
222
; The i64 form of the ballot intrinsic is used here; the other tests visible
; in this file call the i32 form.
+ %ballot = call i64 @llvm.amdgcn.ballot.i64 (i1 %c )
223
+ %ballot_ne_zero = icmp ne i64 %ballot , 0
224
+ br i1 %ballot_ne_zero , label %true , label %false
225
+ true:
226
+ ret i32 42
227
+ false:
228
+ ret i32 33
229
+ }
230
+
206
231
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare (i32 inreg %v ) {
207
232
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
208
233
; CHECK: ; %bb.0:
@@ -211,14 +236,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
211
236
; CHECK-NEXT: s_and_b32 s0, 1, s0
212
237
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
213
238
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
214
- ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
239
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB13_2
215
240
; CHECK-NEXT: ; %bb.1: ; %true
216
241
; CHECK-NEXT: s_mov_b32 s0, 42
217
- ; CHECK-NEXT: s_branch .LBB12_3
218
- ; CHECK-NEXT: .LBB12_2 : ; %false
242
+ ; CHECK-NEXT: s_branch .LBB13_3
243
+ ; CHECK-NEXT: .LBB13_2 : ; %false
219
244
; CHECK-NEXT: s_mov_b32 s0, 33
220
- ; CHECK-NEXT: s_branch .LBB12_3
221
- ; CHECK-NEXT: .LBB12_3 :
245
+ ; CHECK-NEXT: s_branch .LBB13_3
246
+ ; CHECK-NEXT: .LBB13_3 :
222
247
%c = icmp ult i32 %v , 12
223
248
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
224
249
%ballot_ne_zero = icmp ne i32 %ballot , 0
@@ -234,14 +259,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
234
259
; CHECK: ; %bb.0:
235
260
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
236
261
; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
237
- ; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
262
+ ; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
238
263
; CHECK-NEXT: ; %bb.1: ; %false
239
264
; CHECK-NEXT: s_mov_b32 s0, 33
240
- ; CHECK-NEXT: s_branch .LBB13_3
241
- ; CHECK-NEXT: .LBB13_2 : ; %true
265
+ ; CHECK-NEXT: s_branch .LBB14_3
266
+ ; CHECK-NEXT: .LBB14_2 : ; %true
242
267
; CHECK-NEXT: s_mov_b32 s0, 42
243
- ; CHECK-NEXT: s_branch .LBB13_3
244
- ; CHECK-NEXT: .LBB13_3 :
268
+ ; CHECK-NEXT: s_branch .LBB14_3
269
+ ; CHECK-NEXT: .LBB14_3 :
245
270
%c = icmp ult i32 %v , 12
246
271
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
247
272
%ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -260,14 +285,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
260
285
; CHECK-NEXT: s_and_b32 s0, 1, s0
261
286
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
262
287
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
263
- ; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
288
+ ; CHECK-NEXT: s_cbranch_scc0 .LBB15_2
264
289
; CHECK-NEXT: ; %bb.1: ; %false
265
290
; CHECK-NEXT: s_mov_b32 s0, 33
266
- ; CHECK-NEXT: s_branch .LBB14_3
267
- ; CHECK-NEXT: .LBB14_2 : ; %true
291
+ ; CHECK-NEXT: s_branch .LBB15_3
292
+ ; CHECK-NEXT: .LBB15_2 : ; %true
268
293
; CHECK-NEXT: s_mov_b32 s0, 42
269
- ; CHECK-NEXT: s_branch .LBB14_3
270
- ; CHECK-NEXT: .LBB14_3 :
294
+ ; CHECK-NEXT: s_branch .LBB15_3
295
+ ; CHECK-NEXT: .LBB15_3 :
271
296
%c = icmp ult i32 %v , 12
272
297
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
273
298
%ballot_eq_zero = icmp eq i32 %ballot , 0
@@ -285,14 +310,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
285
310
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
286
311
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
287
312
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
288
- ; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
313
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
289
314
; CHECK-NEXT: ; %bb.1: ; %true
290
315
; CHECK-NEXT: s_mov_b32 s0, 42
291
- ; CHECK-NEXT: s_branch .LBB15_3
292
- ; CHECK-NEXT: .LBB15_2 : ; %false
316
+ ; CHECK-NEXT: s_branch .LBB16_3
317
+ ; CHECK-NEXT: .LBB16_2 : ; %false
293
318
; CHECK-NEXT: s_mov_b32 s0, 33
294
- ; CHECK-NEXT: s_branch .LBB15_3
295
- ; CHECK-NEXT: .LBB15_3 :
319
+ ; CHECK-NEXT: s_branch .LBB16_3
320
+ ; CHECK-NEXT: .LBB16_3 :
296
321
%v1c = icmp ult i32 %v1 , 12
297
322
%v2c = icmp ugt i32 %v2 , 34
298
323
%c = and i1 %v1c , %v2c
@@ -305,6 +330,34 @@ false:
305
330
ret i32 33
306
331
}
307
332
333
; Branch on "i64 ballot != 0", where the ballot input is the AND of two
; compares: (%v1 < 12) and (%v2 > 34), both unsigned, on plain (non-inreg) i32
; arguments. Returns 42 on the %true path, 33 on %false.
; The RUN line selects wavefrontsize32; the expected code below ANDs the two
; 32-bit compare results into s0, zeroes s1, and compares the s[0:1] pair as a
; 64-bit value against 0.
; NOTE(review): "divergent" is taken from the function name - confirm against
; how the non-inreg arguments are classified for amdgpu_cs.
+ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and (i32 %v1 , i32 %v2 ) {
334
+ ; CHECK-LABEL: branch_divergent_ballot64_ne_zero_and:
335
+ ; CHECK: ; %bb.0:
336
+ ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
337
+ ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
338
+ ; CHECK-NEXT: s_mov_b32 s1, 0
339
+ ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
340
+ ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
341
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB17_2
342
+ ; CHECK-NEXT: ; %bb.1: ; %true
343
+ ; CHECK-NEXT: s_mov_b32 s0, 42
344
+ ; CHECK-NEXT: s_branch .LBB17_3
345
+ ; CHECK-NEXT: .LBB17_2: ; %false
346
+ ; CHECK-NEXT: s_mov_b32 s0, 33
347
+ ; CHECK-NEXT: s_branch .LBB17_3
348
+ ; CHECK-NEXT: .LBB17_3:
349
+ %v1c = icmp ult i32 %v1 , 12
350
+ %v2c = icmp ugt i32 %v2 , 34
351
+ %c = and i1 %v1c , %v2c
352
; The i64 form of the ballot intrinsic is used here; the other tests visible
; in this file call the i32 form.
+ %ballot = call i64 @llvm.amdgcn.ballot.i64 (i1 %c )
353
+ %ballot_ne_zero = icmp ne i64 %ballot , 0
354
+ br i1 %ballot_ne_zero , label %true , label %false
355
+ true:
356
+ ret i32 42
357
+ false:
358
+ ret i32 33
359
+ }
360
+
308
361
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and (i32 inreg %v1 , i32 inreg %v2 ) {
309
362
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
310
363
; CHECK: ; %bb.0:
@@ -316,14 +369,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
316
369
; CHECK-NEXT: s_and_b32 s0, 1, s0
317
370
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
318
371
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
319
- ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
372
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
320
373
; CHECK-NEXT: ; %bb.1: ; %true
321
374
; CHECK-NEXT: s_mov_b32 s0, 42
322
- ; CHECK-NEXT: s_branch .LBB16_3
323
- ; CHECK-NEXT: .LBB16_2 : ; %false
375
+ ; CHECK-NEXT: s_branch .LBB18_3
376
+ ; CHECK-NEXT: .LBB18_2 : ; %false
324
377
; CHECK-NEXT: s_mov_b32 s0, 33
325
- ; CHECK-NEXT: s_branch .LBB16_3
326
- ; CHECK-NEXT: .LBB16_3 :
378
+ ; CHECK-NEXT: s_branch .LBB18_3
379
+ ; CHECK-NEXT: .LBB18_3 :
327
380
%v1c = icmp ult i32 %v1 , 12
328
381
%v2c = icmp ugt i32 %v2 , 34
329
382
%c = and i1 %v1c , %v2c
@@ -343,14 +396,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
343
396
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
344
397
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
345
398
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
346
- ; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
399
+ ; CHECK-NEXT: s_cbranch_scc0 .LBB19_2
347
400
; CHECK-NEXT: ; %bb.1: ; %false
348
401
; CHECK-NEXT: s_mov_b32 s0, 33
349
- ; CHECK-NEXT: s_branch .LBB17_3
350
- ; CHECK-NEXT: .LBB17_2 : ; %true
402
+ ; CHECK-NEXT: s_branch .LBB19_3
403
+ ; CHECK-NEXT: .LBB19_2 : ; %true
351
404
; CHECK-NEXT: s_mov_b32 s0, 42
352
- ; CHECK-NEXT: s_branch .LBB17_3
353
- ; CHECK-NEXT: .LBB17_3 :
405
+ ; CHECK-NEXT: s_branch .LBB19_3
406
+ ; CHECK-NEXT: .LBB19_3 :
354
407
%v1c = icmp ult i32 %v1 , 12
355
408
%v2c = icmp ugt i32 %v2 , 34
356
409
%c = and i1 %v1c , %v2c
@@ -374,14 +427,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
374
427
; CHECK-NEXT: s_and_b32 s0, 1, s0
375
428
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
376
429
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
377
- ; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
430
+ ; CHECK-NEXT: s_cbranch_scc0 .LBB20_2
378
431
; CHECK-NEXT: ; %bb.1: ; %false
379
432
; CHECK-NEXT: s_mov_b32 s0, 33
380
- ; CHECK-NEXT: s_branch .LBB18_3
381
- ; CHECK-NEXT: .LBB18_2 : ; %true
433
+ ; CHECK-NEXT: s_branch .LBB20_3
434
+ ; CHECK-NEXT: .LBB20_2 : ; %true
382
435
; CHECK-NEXT: s_mov_b32 s0, 42
383
- ; CHECK-NEXT: s_branch .LBB18_3
384
- ; CHECK-NEXT: .LBB18_3 :
436
+ ; CHECK-NEXT: s_branch .LBB20_3
437
+ ; CHECK-NEXT: .LBB20_3 :
385
438
%v1c = icmp ult i32 %v1 , 12
386
439
%v2c = icmp ugt i32 %v2 , 34
387
440
%c = and i1 %v1c , %v2c
@@ -402,14 +455,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
402
455
; CHECK-NEXT: s_and_b32 s0, 1, s0
403
456
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
404
457
; CHECK-NEXT: s_cmp_le_i32 s0, 22
405
- ; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
458
+ ; CHECK-NEXT: s_cbranch_scc1 .LBB21_2
406
459
; CHECK-NEXT: ; %bb.1: ; %true
407
460
; CHECK-NEXT: s_mov_b32 s0, 42
408
- ; CHECK-NEXT: s_branch .LBB19_3
409
- ; CHECK-NEXT: .LBB19_2 : ; %false
461
+ ; CHECK-NEXT: s_branch .LBB21_3
462
+ ; CHECK-NEXT: .LBB21_2 : ; %false
410
463
; CHECK-NEXT: s_mov_b32 s0, 33
411
- ; CHECK-NEXT: s_branch .LBB19_3
412
- ; CHECK-NEXT: .LBB19_3 :
464
+ ; CHECK-NEXT: s_branch .LBB21_3
465
+ ; CHECK-NEXT: .LBB21_3 :
413
466
%c = icmp ult i32 %v , 12
414
467
%ballot = call i32 @llvm.amdgcn.ballot.i32 (i1 %c )
415
468
%bc = icmp sgt i32 %ballot , 22
0 commit comments