Skip to content

Commit 6656481

Browse files
committed
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: accept memory instructions in the "then" block
1 parent 88945db commit 6656481

25 files changed

+133
-269
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -326,9 +326,8 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
326326
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327327
return true;
328328

329-
// These instructions are potentially expensive even if EXEC = 0.
330-
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331-
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
329+
// Waitcnt instructions are potentially expensive even if EXEC = 0.
330+
if (TII->isWaitcnt(MI.getOpcode()))
332331
return true;
333332

334333
++NumInstr;

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll

Lines changed: 12 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -41,13 +41,12 @@ define amdgpu_cs void @atomic_add(<4 x i32> inreg %arg) {
4141
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
4242
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4343
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
44-
; GCN-NEXT: s_cbranch_execz .LBB0_2
4544
; GCN-NEXT: ; %bb.1:
4645
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
4746
; GCN-NEXT: v_mov_b32_e32 v0, 0
4847
; GCN-NEXT: v_mov_b32_e32 v1, s4
4948
; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen
50-
; GCN-NEXT: .LBB0_2:
49+
; GCN-NEXT: ; %bb.2:
5150
; GCN-NEXT: s_endpgm
5251
.entry:
5352
call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0)
@@ -87,13 +86,12 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
8786
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8887
; GCN-NEXT: ; implicit-def: $vgpr1
8988
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
90-
; GCN-NEXT: s_cbranch_execz .LBB1_2
9189
; GCN-NEXT: ; %bb.1:
9290
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
9391
; GCN-NEXT: v_mov_b32_e32 v1, s6
9492
; GCN-NEXT: v_mov_b32_e32 v2, 0
9593
; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc
96-
; GCN-NEXT: .LBB1_2:
94+
; GCN-NEXT: ; %bb.2:
9795
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
9896
; GCN-NEXT: s_waitcnt vmcnt(0)
9997
; GCN-NEXT: v_readfirstlane_b32 s4, v1
@@ -139,13 +137,12 @@ define amdgpu_cs void @atomic_sub(<4 x i32> inreg %arg) {
139137
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
140138
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
141139
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
142-
; GCN-NEXT: s_cbranch_execz .LBB2_2
143140
; GCN-NEXT: ; %bb.1:
144141
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
145142
; GCN-NEXT: v_mov_b32_e32 v0, 0
146143
; GCN-NEXT: v_mov_b32_e32 v1, s4
147144
; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen
148-
; GCN-NEXT: .LBB2_2:
145+
; GCN-NEXT: ; %bb.2:
149146
; GCN-NEXT: s_endpgm
150147
.entry:
151148
call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0)
@@ -185,13 +182,12 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
185182
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
186183
; GCN-NEXT: ; implicit-def: $vgpr1
187184
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
188-
; GCN-NEXT: s_cbranch_execz .LBB3_2
189185
; GCN-NEXT: ; %bb.1:
190186
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
191187
; GCN-NEXT: v_mov_b32_e32 v1, s6
192188
; GCN-NEXT: v_mov_b32_e32 v2, 0
193189
; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc
194-
; GCN-NEXT: .LBB3_2:
190+
; GCN-NEXT: ; %bb.2:
195191
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
196192
; GCN-NEXT: s_waitcnt vmcnt(0)
197193
; GCN-NEXT: v_readfirstlane_b32 s4, v1
@@ -238,14 +234,13 @@ define amdgpu_cs void @atomic_xor(<4 x i32> inreg %arg) {
238234
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
239235
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
240236
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
241-
; GCN-NEXT: s_cbranch_execz .LBB4_2
242237
; GCN-NEXT: ; %bb.1:
243238
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
244239
; GCN-NEXT: s_and_b32 s4, s4, 1
245240
; GCN-NEXT: v_mov_b32_e32 v0, 0
246241
; GCN-NEXT: v_mov_b32_e32 v1, s4
247242
; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen
248-
; GCN-NEXT: .LBB4_2:
243+
; GCN-NEXT: ; %bb.2:
249244
; GCN-NEXT: s_endpgm
250245
.entry:
251246
call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 1, <4 x i32> %arg, i32 0, i32 0, i32 0, i32 0)
@@ -287,14 +282,13 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
287282
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
288283
; GCN-NEXT: ; implicit-def: $vgpr1
289284
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
290-
; GCN-NEXT: s_cbranch_execz .LBB5_2
291285
; GCN-NEXT: ; %bb.1:
292286
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
293287
; GCN-NEXT: s_and_b32 s6, s6, 1
294288
; GCN-NEXT: v_mov_b32_e32 v1, s6
295289
; GCN-NEXT: v_mov_b32_e32 v2, 0
296290
; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc
297-
; GCN-NEXT: .LBB5_2:
291+
; GCN-NEXT: ; %bb.2:
298292
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
299293
; GCN-NEXT: s_waitcnt vmcnt(0)
300294
; GCN-NEXT: v_readfirstlane_b32 s4, v1
@@ -341,13 +335,12 @@ define amdgpu_cs void @atomic_ptr_add(ptr addrspace(8) inreg %arg) {
341335
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
342336
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
343337
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
344-
; GCN-NEXT: s_cbranch_execz .LBB6_2
345338
; GCN-NEXT: ; %bb.1:
346339
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
347340
; GCN-NEXT: v_mov_b32_e32 v0, 0
348341
; GCN-NEXT: v_mov_b32_e32 v1, s4
349342
; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen
350-
; GCN-NEXT: .LBB6_2:
343+
; GCN-NEXT: ; %bb.2:
351344
; GCN-NEXT: s_endpgm
352345
.entry:
353346
call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0)
@@ -389,13 +382,12 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
389382
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
390383
; GCN-NEXT: ; implicit-def: $vgpr1
391384
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
392-
; GCN-NEXT: s_cbranch_execz .LBB7_2
393385
; GCN-NEXT: ; %bb.1:
394386
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
395387
; GCN-NEXT: v_mov_b32_e32 v1, s6
396388
; GCN-NEXT: v_mov_b32_e32 v2, 0
397389
; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc
398-
; GCN-NEXT: .LBB7_2:
390+
; GCN-NEXT: ; %bb.2:
399391
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
400392
; GCN-NEXT: s_waitcnt vmcnt(0)
401393
; GCN-NEXT: v_readfirstlane_b32 s4, v1
@@ -443,13 +435,12 @@ define amdgpu_cs void @atomic_ptr_sub(ptr addrspace(8) inreg %arg) {
443435
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
444436
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
445437
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
446-
; GCN-NEXT: s_cbranch_execz .LBB8_2
447438
; GCN-NEXT: ; %bb.1:
448439
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
449440
; GCN-NEXT: v_mov_b32_e32 v0, 0
450441
; GCN-NEXT: v_mov_b32_e32 v1, s4
451442
; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen
452-
; GCN-NEXT: .LBB8_2:
443+
; GCN-NEXT: ; %bb.2:
453444
; GCN-NEXT: s_endpgm
454445
.entry:
455446
call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0)
@@ -491,13 +482,12 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
491482
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
492483
; GCN-NEXT: ; implicit-def: $vgpr1
493484
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
494-
; GCN-NEXT: s_cbranch_execz .LBB9_2
495485
; GCN-NEXT: ; %bb.1:
496486
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
497487
; GCN-NEXT: v_mov_b32_e32 v1, s6
498488
; GCN-NEXT: v_mov_b32_e32 v2, 0
499489
; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc
500-
; GCN-NEXT: .LBB9_2:
490+
; GCN-NEXT: ; %bb.2:
501491
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
502492
; GCN-NEXT: s_waitcnt vmcnt(0)
503493
; GCN-NEXT: v_readfirstlane_b32 s4, v1
@@ -546,14 +536,13 @@ define amdgpu_cs void @atomic_ptr_xor(ptr addrspace(8) inreg %arg) {
546536
; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s6, v0
547537
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
548538
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
549-
; GCN-NEXT: s_cbranch_execz .LBB10_2
550539
; GCN-NEXT: ; %bb.1:
551540
; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
552541
; GCN-NEXT: s_and_b32 s4, s4, 1
553542
; GCN-NEXT: v_mov_b32_e32 v0, 0
554543
; GCN-NEXT: v_mov_b32_e32 v1, s4
555544
; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen
556-
; GCN-NEXT: .LBB10_2:
545+
; GCN-NEXT: ; %bb.2:
557546
; GCN-NEXT: s_endpgm
558547
.entry:
559548
call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %arg, i32 0, i32 0, i32 0, i32 0)
@@ -597,14 +586,13 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) {
597586
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
598587
; GCN-NEXT: ; implicit-def: $vgpr1
599588
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
600-
; GCN-NEXT: s_cbranch_execz .LBB11_2
601589
; GCN-NEXT: ; %bb.1:
602590
; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
603591
; GCN-NEXT: s_and_b32 s6, s6, 1
604592
; GCN-NEXT: v_mov_b32_e32 v1, s6
605593
; GCN-NEXT: v_mov_b32_e32 v2, 0
606594
; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc
607-
; GCN-NEXT: .LBB11_2:
595+
; GCN-NEXT: ; %bb.2:
608596
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
609597
; GCN-NEXT: s_waitcnt vmcnt(0)
610598
; GCN-NEXT: v_readfirstlane_b32 s4, v1

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -249,11 +249,10 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
249249
; GFX10-NEXT: .LBB3_6: ; %Flow1
250250
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251251
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252-
; GFX10-NEXT: s_cbranch_execz .LBB3_8
253252
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254253
; GFX10-NEXT: v_mov_b32_e32 v0, 5
255254
; GFX10-NEXT: flat_store_dword v[3:4], v0
256-
; GFX10-NEXT: .LBB3_8: ; %exit
255+
; GFX10-NEXT: ; %bb.8: ; %exit
257256
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258257
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259258
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -315,15 +314,14 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
315314
; GFX10-NEXT: v_mov_b32_e32 v4, v5
316315
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317316
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318-
; GFX10-NEXT: s_cbranch_execz .LBB4_4
319317
; GFX10-NEXT: ; %bb.3: ; %if.block.0
320318
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
321319
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
322320
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
323321
; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
324322
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
325323
; GFX10-NEXT: global_store_dword v[8:9], v4, off
326-
; GFX10-NEXT: .LBB4_4: ; %loop.break.block
324+
; GFX10-NEXT: ; %bb.4: ; %loop.break.block
327325
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
328326
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
329327
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
@@ -342,10 +340,9 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
342340
; GFX10-NEXT: .LBB4_6: ; %cond.block.1
343341
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
344342
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
345-
; GFX10-NEXT: s_cbranch_execz .LBB4_8
346343
; GFX10-NEXT: ; %bb.7: ; %if.block.1
347344
; GFX10-NEXT: global_store_dword v[6:7], v4, off
348-
; GFX10-NEXT: .LBB4_8: ; %exit
345+
; GFX10-NEXT: ; %bb.8: ; %exit
349346
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
350347
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
351348
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -536,11 +533,10 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
536533
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
537534
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
538535
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
539-
; GFX10-NEXT: s_cbranch_execz .LBB6_6
540536
; GFX10-NEXT: ; %bb.5: ; %break.body
541537
; GFX10-NEXT: v_mov_b32_e32 v0, 10
542538
; GFX10-NEXT: global_store_dword v[4:5], v0, off
543-
; GFX10-NEXT: .LBB6_6: ; %exit
539+
; GFX10-NEXT: ; %bb.6: ; %exit
544540
; GFX10-NEXT: s_endpgm
545541
entry:
546542
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -437,11 +437,10 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
437437
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
438438
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
439439
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
440-
; GFX10-NEXT: s_cbranch_execz .LBB5_6
441440
; GFX10-NEXT: ; %bb.5: ; %break.body
442441
; GFX10-NEXT: v_mov_b32_e32 v0, 10
443442
; GFX10-NEXT: global_store_dword v[4:5], v0, off
444-
; GFX10-NEXT: .LBB5_6: ; %exit
443+
; GFX10-NEXT: ; %bb.6: ; %exit
445444
; GFX10-NEXT: s_endpgm
446445
entry:
447446
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,12 +152,11 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
152152
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
153153
; GFX10-NEXT: s_and_saveexec_b32 s1, s0
154154
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
155-
; GFX10-NEXT: s_cbranch_execz .LBB2_7
156155
; GFX10-NEXT: ; %bb.6: ; %break.body
157156
; GFX10-NEXT: v_mov_b32_e32 v0, 10
158157
; GFX10-NEXT: v_mov_b32_e32 v1, 0
159158
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
160-
; GFX10-NEXT: .LBB2_7: ; %exit
159+
; GFX10-NEXT: ; %bb.7: ; %exit
161160
; GFX10-NEXT: s_endpgm
162161
entry:
163162
br label %A

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
6868
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
6969
; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
7070
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
71-
; GFX906-NEXT: s_cbranch_execz .LBB1_2
7271
; GFX906-NEXT: ; %bb.1: ; %bb.1
7372
; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
74-
; GFX906-NEXT: .LBB1_2: ; %bb.2
73+
; GFX906-NEXT: ; %bb.2: ; %bb.2
7574
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
7675
; GFX906-NEXT: v_mov_b32_e32 v0, 0
7776
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
149148
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
150149
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
151150
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
152-
; GFX906-NEXT: s_cbranch_execz .LBB3_2
153151
; GFX906-NEXT: ; %bb.1: ; %bb.1
154152
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
155-
; GFX906-NEXT: .LBB3_2: ; %bb.2
153+
; GFX906-NEXT: ; %bb.2: ; %bb.2
156154
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
157155
; GFX906-NEXT: v_mov_b32_e32 v0, 0
158156
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
185183
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
186184
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
187185
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
188-
; GFX906-NEXT: s_cbranch_execz .LBB4_2
189186
; GFX906-NEXT: ; %bb.1: ; %bb.1
190187
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
191-
; GFX906-NEXT: .LBB4_2: ; %bb.2
188+
; GFX906-NEXT: ; %bb.2: ; %bb.2
192189
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
193190
; GFX906-NEXT: v_mov_b32_e32 v0, 0
194191
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
222219
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
223220
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
224221
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
225-
; GFX906-NEXT: s_cbranch_execz .LBB5_2
226222
; GFX906-NEXT: ; %bb.1: ; %bb.1
227223
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
228224
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
229-
; GFX906-NEXT: .LBB5_2: ; %bb.2
225+
; GFX906-NEXT: ; %bb.2: ; %bb.2
230226
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
231227
; GFX906-NEXT: v_mov_b32_e32 v0, 0
232228
; GFX906-NEXT: s_waitcnt vmcnt(1)
@@ -486,14 +482,13 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
486482
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
487483
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
488484
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
489-
; GFX906-NEXT: s_cbranch_execz .LBB8_2
490485
; GFX906-NEXT: ; %bb.1: ; %bb.1
491486
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
492487
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
493488
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
494489
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
495490
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
496-
; GFX906-NEXT: .LBB8_2: ; %Flow
491+
; GFX906-NEXT: ; %bb.2: ; %Flow
497492
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
498493
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
499494
; GFX906-NEXT: s_cbranch_execz .LBB8_4
@@ -547,11 +542,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
547542
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7]
548543
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549544
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550-
; GFX906-NEXT: s_cbranch_execz .LBB9_3
551545
; GFX906-NEXT: ; %bb.2: ; %bb.2
552546
; GFX906-NEXT: v_mov_b32_e32 v0, 0
553547
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9]
554-
; GFX906-NEXT: .LBB9_3: ; %Flow
548+
; GFX906-NEXT: ; %bb.3: ; %Flow
555549
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556550
; GFX906-NEXT: .LBB9_4: ; %bb.3
557551
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]

0 commit comments

Comments
 (0)