Skip to content

Commit 40ac47d

Browse files
committed
Unrevert "[AMDGPU] Try to fix the block prologs broken by RA inserted instructions (llvm#69924)"
This reverts commit d648e11 and unreverts commit a0eb6b8.
1 parent bedf99a commit 40ac47d

16 files changed

+626
-519
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8469,8 +8469,16 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
84698469
}
84708470

84718471
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
8472-
return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
8473-
MI.modifiesRegister(AMDGPU::EXEC, &RI);
8472+
// We need to handle instructions which may be inserted during register
8473+
// allocation to handle the prolog. The initial prolog instruction may have
8474+
// been separated from the start of the block by spills and copies that are
8475+
// needed by the prolog.
8476+
uint16_t Opc = MI.getOpcode();
8477+
8478+
// FIXME: Copies inserted in the block prolog for live-range split should also
8479+
// be included.
8480+
return (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
8481+
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
84748482
}
84758483

84768484
MachineInstrBuilder

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
675675
return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
676676
}
677677

678+
bool isSpillOpcode(uint16_t Opcode) const {
679+
return get(Opcode).TSFlags &
680+
(SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill);
681+
}
682+
678683
static bool isWWMRegSpillOpcode(uint16_t Opcode) {
679684
return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
680685
Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
144144
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
145145
; CHECK-NEXT: s_mov_b32 exec_lo, s21
146146
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
147-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
148-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
149147
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
150148
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
151149
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -163,6 +161,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
163161
; CHECK-NEXT: v_readlane_b32 s17, v2, 1
164162
; CHECK-NEXT: v_readlane_b32 s18, v2, 2
165163
; CHECK-NEXT: v_readlane_b32 s19, v2, 3
164+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
165+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
166+
; CHECK-NEXT: s_waitcnt vmcnt(0)
166167
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
167168
; CHECK-NEXT: s_waitcnt vmcnt(0)
168169
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
3333
; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
3434
; REGALLOC-NEXT: {{ $}}
3535
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
36-
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
3736
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $sgpr4_sgpr5
3837
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 1
3938
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec
39+
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
4040
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
4141
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
4242
; REGALLOC-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
@@ -66,10 +66,10 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
6666
; REGALLOC-NEXT: {{ $}}
6767
; REGALLOC-NEXT: bb.4.bb.3:
6868
; REGALLOC-NEXT: $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
69-
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
7069
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2, implicit-def $sgpr4_sgpr5
7170
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3
7271
; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
72+
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
7373
; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec
7474
; REGALLOC-NEXT: KILL killed renamable $vgpr1
7575
; REGALLOC-NEXT: SI_RETURN implicit killed $vgpr0

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
7575
; GCN-O0-NEXT: s_waitcnt expcnt(0)
7676
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
7777
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
78-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
79-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
78+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
8079
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
8180
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
81+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
8282
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
8383
; GCN-O0-NEXT: s_mov_b32 s0, 0
8484
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -104,15 +104,16 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
104104
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
105105
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
106106
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
107-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
108107
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
109108
; GCN-O0-NEXT: s_waitcnt expcnt(0)
110109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
111110
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
112111
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
113112
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
114113
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
114+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
115115
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
116+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
116117
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
117118
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
118119
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -248,10 +249,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
248249
; GCN-O0-NEXT: s_waitcnt expcnt(0)
249250
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
250251
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
251-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
252-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
252+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
253253
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
254254
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
255+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
255256
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
256257
; GCN-O0-NEXT: s_mov_b32 s0, 0
257258
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -277,15 +278,16 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
277278
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
278279
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
279280
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
280-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
281281
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
282282
; GCN-O0-NEXT: s_waitcnt expcnt(0)
283283
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
284284
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
285285
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
286286
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
287287
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
288+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
288289
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
290+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
289291
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
290292
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
291293
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -311,7 +313,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
311313
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
312314
; GCN-O0-NEXT: s_branch .LBB1_5
313315
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
314-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
315316
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
316317
; GCN-O0-NEXT: s_waitcnt expcnt(0)
317318
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -322,7 +323,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
322323
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
323324
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
324325
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
326+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
325327
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
328+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
326329
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
327330
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
328331
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -508,15 +511,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
508511
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
509512
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
510513
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
511-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
512514
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
513515
; GCN-O0-NEXT: s_waitcnt expcnt(0)
514516
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
515517
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
516518
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
517519
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
518520
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
521+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
519522
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
523+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
520524
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
521525
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
522526
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -532,15 +536,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
532536
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
533537
; GCN-O0-NEXT: s_branch .LBB2_5
534538
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
535-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
536539
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
537540
; GCN-O0-NEXT: s_waitcnt expcnt(0)
538541
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
539542
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
540543
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
541544
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
542545
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
546+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
543547
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
548+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
544549
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
545550
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
546551
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -953,20 +958,21 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
953958
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
954959
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
955960
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
956-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
957-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
958961
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
959962
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
960963
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
961964
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
962965
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
963966
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
967+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
968+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
964969
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
965970
; GCN-O0-NEXT: s_mov_b32 s4, 0
966971
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
967972
; GCN-O0-NEXT: s_mov_b32 s5, s2
968973
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
969974
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
975+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
970976
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
971977
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
972978
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -1102,14 +1108,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11021108
; GCN-O0-NEXT: s_waitcnt expcnt(0)
11031109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
11041110
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1105-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1106-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1111+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11071112
; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2
11081113
; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3
11091114
; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0
11101115
; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1
11111116
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4
11121117
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5
1118+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
11131119
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
11141120
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11151121
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4
@@ -1132,11 +1138,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11321138
; GCN-O0-NEXT: s_waitcnt expcnt(0)
11331139
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
11341140
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1135-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1136-
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1141+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11371142
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6
11381143
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7
11391144
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1145+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
11401146
; GCN-O0-NEXT: s_mov_b32 s6, 0
11411147
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11421148
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6
@@ -1226,18 +1232,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12261232
; GCN-O0-NEXT: s_branch .LBB5_6
12271233
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
12281234
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1229-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1230-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1231-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1232-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1233-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
12341235
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1236+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
12351237
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
12361238
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
12371239
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12381240
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10
12391241
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11
12401242
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1243+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1244+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1245+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1246+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1247+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1248+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12411249
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
12421250
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12431251
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -1246,18 +1254,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12461254
; GCN-O0-NEXT: s_branch .LBB5_7
12471255
; GCN-O0-NEXT: .LBB5_6: ; %Flow
12481256
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1249-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1250-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1251-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1252-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1253-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
12541257
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1258+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
12551259
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
12561260
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
12571261
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12581262
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12
12591263
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13
12601264
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1265+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1266+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1267+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1268+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1269+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1270+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12611271
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
12621272
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12631273
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1301,11 +1311,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13011311
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
13021312
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
13031313
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1304-
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1305-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1306-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1307-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1308-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
13091314
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13101315
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
13111316
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
@@ -1317,6 +1322,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13171322
; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5
13181323
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14
13191324
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15
1325+
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1326+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1327+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1328+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1329+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
13201330
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
13211331
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
13221332
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
@@ -1331,6 +1341,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13311341
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
13321342
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
13331343
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1344+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
13341345
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
13351346
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
13361347
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill

0 commit comments

Comments
 (0)