Skip to content

Commit de830ef

Browse files
cdevadaszhang2amd
authored and committed
[AMDGPU] Try to fix the block prologs broken by RA inserted instructions (llvm#69924)
The insertion point determined by RA while attempting spills and live-range splits at the beginning of a block is sometimes wrong, and the newly inserted vector instructions are placed before the exec-mask restore instruction, which is incorrect. This occurs mainly because of the dependency on isBasicBlockPrologue, which doesn't account for instructions inserted early (spills and splits) during RA, and this causes the block prolog to break. A better approach for deciding the insertion point should be worked out. For now, improve the helper function to consider all possible early insertions. This patch includes the spill instructions. The copies associated with live-range splits should also be included in the block prolog. Change-Id: I84ce856add0028b5dadf5a518f750a35bb27ecf3
1 parent 5567f46 commit de830ef

11 files changed

+394
-362
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7988,8 +7988,16 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
79887988
}
79897989

79907990
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
7991-
return !MI.isTerminator() && !MI.isCopy() &&
7992-
MI.modifiesRegister(AMDGPU::EXEC, &RI);
7991+
// We need to handle instructions which may be inserted during register
7992+
// allocation to handle the prolog. The initial prolog instruction may have
7993+
// been separated from the start of the block by spills and copies inserted
7994+
// needed by the prolog.
7995+
uint16_t Opc = MI.getOpcode();
7996+
7997+
// FIXME: Copies inserted in the block prolog for live-range split should also
7998+
// be included.
7999+
return (isSpillOpcode(Opc) || (!MI.isTerminator() && !MI.isCopy() &&
8000+
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
79938001
}
79948002

79958003
MachineInstrBuilder

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -641,6 +641,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
641641
return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
642642
}
643643

644+
bool isSpillOpcode(uint16_t Opcode) const {
645+
return get(Opcode).TSFlags &
646+
(SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill);
647+
}
648+
644649
static bool isWWMRegSpillOpcode(uint16_t Opcode) {
645650
return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
646651
Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE;

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -145,8 +145,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
145145
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
146146
; CHECK-NEXT: s_mov_b32 exec_lo, s21
147147
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
148-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
149-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
150148
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
151149
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
152150
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -164,6 +162,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
164162
; CHECK-NEXT: v_readlane_b32 s17, v2, 1
165163
; CHECK-NEXT: v_readlane_b32 s18, v2, 2
166164
; CHECK-NEXT: v_readlane_b32 s19, v2, 3
165+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
166+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
167+
; CHECK-NEXT: s_waitcnt vmcnt(0)
167168
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
168169
; CHECK-NEXT: s_waitcnt vmcnt(0)
169170
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 41 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -75,10 +75,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
7575
; GCN-O0-NEXT: s_waitcnt expcnt(0)
7676
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
7777
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
78-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
79-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
78+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
8079
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
8180
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
81+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
8282
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
8383
; GCN-O0-NEXT: s_mov_b32 s0, 0
8484
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -104,15 +104,16 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
104104
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
105105
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
106106
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
107-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
108107
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
109108
; GCN-O0-NEXT: s_waitcnt expcnt(0)
110109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
111110
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
112111
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
113112
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
114113
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
114+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
115115
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
116+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
116117
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
117118
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
118119
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -248,10 +249,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
248249
; GCN-O0-NEXT: s_waitcnt expcnt(0)
249250
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
250251
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
251-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
252-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
252+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
253253
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
254254
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
255+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
255256
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
256257
; GCN-O0-NEXT: s_mov_b32 s0, 0
257258
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -277,15 +278,16 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
277278
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
278279
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
279280
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
280-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
281281
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
282282
; GCN-O0-NEXT: s_waitcnt expcnt(0)
283283
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
284284
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
285285
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
286286
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
287287
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
288+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
288289
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
290+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
289291
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
290292
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
291293
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -311,7 +313,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
311313
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
312314
; GCN-O0-NEXT: s_branch .LBB1_5
313315
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
314-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
315316
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
316317
; GCN-O0-NEXT: s_waitcnt expcnt(0)
317318
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -322,7 +323,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
322323
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
323324
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
324325
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
326+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
325327
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
328+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
326329
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
327330
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
328331
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -508,15 +511,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
508511
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
509512
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
510513
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
511-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
512514
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
513515
; GCN-O0-NEXT: s_waitcnt expcnt(0)
514516
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
515517
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
516518
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
517519
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
518520
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
521+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
519522
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
523+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
520524
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
521525
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
522526
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -532,15 +536,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
532536
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
533537
; GCN-O0-NEXT: s_branch .LBB2_5
534538
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
535-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
536539
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
537540
; GCN-O0-NEXT: s_waitcnt expcnt(0)
538541
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
539542
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
540543
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
541544
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
542545
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
546+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
543547
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
548+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
544549
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
545550
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
546551
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -943,20 +948,21 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
943948
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
944949
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
945950
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
946-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
947-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
948951
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
949952
; GCN-O0-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
950953
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
951954
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
952955
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
953956
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
957+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
958+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
954959
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
955960
; GCN-O0-NEXT: s_mov_b32 s4, 0
956961
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
957962
; GCN-O0-NEXT: s_mov_b32 s5, s2
958963
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
959964
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
965+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
960966
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
961967
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
962968
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -1092,14 +1098,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
10921098
; GCN-O0-NEXT: s_waitcnt expcnt(0)
10931099
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
10941100
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1095-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1096-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1101+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
10971102
; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2
10981103
; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3
10991104
; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0
11001105
; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1
11011106
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4
11021107
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5
1108+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
11031109
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
11041110
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11051111
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4
@@ -1122,11 +1128,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11221128
; GCN-O0-NEXT: s_waitcnt expcnt(0)
11231129
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
11241130
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1125-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1126-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1131+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11271132
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6
11281133
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7
11291134
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1135+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
11301136
; GCN-O0-NEXT: s_mov_b32 s6, 0
11311137
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11321138
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6
@@ -1216,18 +1222,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12161222
; GCN-O0-NEXT: s_branch .LBB5_6
12171223
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
12181224
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1219-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1220-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1221-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1222-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1223-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
12241225
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1226+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
12251227
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
12261228
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
12271229
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12281230
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10
12291231
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11
12301232
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1233+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1234+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1235+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1236+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1237+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1238+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12311239
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
12321240
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12331241
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -1236,18 +1244,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12361244
; GCN-O0-NEXT: s_branch .LBB5_7
12371245
; GCN-O0-NEXT: .LBB5_6: ; %Flow
12381246
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1239-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1240-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1241-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1242-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1243-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
12441247
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1248+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
12451249
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
12461250
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
12471251
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12481252
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12
12491253
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13
12501254
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1255+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1256+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1257+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1258+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1259+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1260+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12511261
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
12521262
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12531263
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1291,11 +1301,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12911301
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
12921302
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
12931303
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1294-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1295-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1296-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1297-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1298-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
12991304
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13001305
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
13011306
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
@@ -1307,6 +1312,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13071312
; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5
13081313
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14
13091314
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15
1315+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1316+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1317+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1318+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1319+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
13101320
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
13111321
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
13121322
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
@@ -1321,6 +1331,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13211331
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13221332
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
13231333
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1334+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
13241335
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
13251336
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
13261337
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill

0 commit comments

Comments
 (0)