@@ -75,10 +75,10 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
75
75
; GCN-O0-NEXT: s_waitcnt expcnt(0)
76
76
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
77
77
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
78
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
79
- ; GCN-O0-NEXT: s_waitcnt vmcnt(1)
78
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
80
79
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
81
80
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
81
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
82
82
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
83
83
; GCN-O0-NEXT: s_mov_b32 s0, 0
84
84
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -104,15 +104,16 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
104
104
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
105
105
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
106
106
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
107
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
108
107
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
109
108
; GCN-O0-NEXT: s_waitcnt expcnt(0)
110
109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
111
110
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
112
111
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
113
112
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
114
113
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
114
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
115
115
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
116
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
116
117
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
117
118
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
118
119
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -248,10 +249,10 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
248
249
; GCN-O0-NEXT: s_waitcnt expcnt(0)
249
250
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
250
251
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
251
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
252
- ; GCN-O0-NEXT: s_waitcnt vmcnt(1)
252
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
253
253
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0
254
254
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1
255
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
255
256
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
256
257
; GCN-O0-NEXT: s_mov_b32 s0, 0
257
258
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
@@ -277,15 +278,16 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
277
278
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
278
279
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
279
280
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
280
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
281
281
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
282
282
; GCN-O0-NEXT: s_waitcnt expcnt(0)
283
283
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
284
284
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
285
285
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
286
286
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
287
287
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
288
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
288
289
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
290
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
289
291
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
290
292
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
291
293
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -311,7 +313,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
311
313
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
312
314
; GCN-O0-NEXT: s_branch .LBB1_5
313
315
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
314
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
315
316
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
316
317
; GCN-O0-NEXT: s_waitcnt expcnt(0)
317
318
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
@@ -322,7 +323,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
322
323
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
323
324
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
324
325
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
326
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
325
327
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
328
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
326
329
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
327
330
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
328
331
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -508,15 +511,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
508
511
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
509
512
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
510
513
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
511
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
512
514
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
513
515
; GCN-O0-NEXT: s_waitcnt expcnt(0)
514
516
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
515
517
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
516
518
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
517
519
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
518
520
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
521
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
519
522
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
523
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
520
524
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
521
525
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
522
526
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -532,15 +536,16 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
532
536
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
533
537
; GCN-O0-NEXT: s_branch .LBB2_5
534
538
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
535
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
536
539
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
537
540
; GCN-O0-NEXT: s_waitcnt expcnt(0)
538
541
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
539
542
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
540
543
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
541
544
; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0
542
545
; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1
546
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
543
547
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
548
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
544
549
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
545
550
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
546
551
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -953,20 +958,21 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
953
958
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
954
959
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
955
960
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
956
- ; GCN-O0-NEXT: s_waitcnt expcnt(0)
957
- ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
958
961
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
959
962
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
960
963
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
961
964
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
962
965
; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0
963
966
; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1
967
+ ; GCN-O0-NEXT: s_waitcnt expcnt(0)
968
+ ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
964
969
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
965
970
; GCN-O0-NEXT: s_mov_b32 s4, 0
966
971
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
967
972
; GCN-O0-NEXT: s_mov_b32 s5, s2
968
973
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
969
974
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
975
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
970
976
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
971
977
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
972
978
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -1102,14 +1108,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1102
1108
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1103
1109
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1104
1110
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1105
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1106
- ; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1111
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1107
1112
; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2
1108
1113
; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3
1109
1114
; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0
1110
1115
; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1
1111
1116
; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4
1112
1117
; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5
1118
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1113
1119
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
1114
1120
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1115
1121
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4
@@ -1132,11 +1138,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1132
1138
; GCN-O0-NEXT: s_waitcnt expcnt(0)
1133
1139
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1134
1140
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1135
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1136
- ; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1141
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1137
1142
; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6
1138
1143
; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7
1139
1144
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1145
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1140
1146
; GCN-O0-NEXT: s_mov_b32 s6, 0
1141
1147
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1142
1148
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6
@@ -1226,18 +1232,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1226
1232
; GCN-O0-NEXT: s_branch .LBB5_6
1227
1233
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
1228
1234
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1229
- ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1230
- ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1231
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1232
- ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1233
- ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1234
1235
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1236
+ ; GCN-O0-NEXT: s_waitcnt expcnt(1)
1235
1237
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
1236
1238
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1237
1239
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1238
1240
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10
1239
1241
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11
1240
1242
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1243
+ ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1244
+ ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1245
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1246
+ ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1247
+ ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1248
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1241
1249
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1242
1250
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1243
1251
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
@@ -1246,18 +1254,20 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1246
1254
; GCN-O0-NEXT: s_branch .LBB5_7
1247
1255
; GCN-O0-NEXT: .LBB5_6: ; %Flow
1248
1256
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1249
- ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1250
- ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1251
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1252
- ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1253
- ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1254
1257
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1258
+ ; GCN-O0-NEXT: s_waitcnt expcnt(1)
1255
1259
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
1256
1260
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1257
1261
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1258
1262
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12
1259
1263
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13
1260
1264
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
1265
+ ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1266
+ ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1267
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1268
+ ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1269
+ ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1270
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1261
1271
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1262
1272
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1263
1273
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1301,11 +1311,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1301
1311
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1302
1312
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
1303
1313
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
1304
- ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1305
- ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1306
- ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1307
- ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1308
- ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
1309
1314
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1310
1315
; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
1311
1316
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
@@ -1317,6 +1322,11 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1317
1322
; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5
1318
1323
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14
1319
1324
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15
1325
+ ; GCN-O0-NEXT: s_waitcnt expcnt(0)
1326
+ ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1327
+ ; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
1328
+ ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1329
+ ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
1320
1330
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
1321
1331
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
1322
1332
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0
@@ -1331,6 +1341,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
1331
1341
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
1332
1342
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
1333
1343
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
1344
+ ; GCN-O0-NEXT: s_waitcnt vmcnt(1)
1334
1345
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
1335
1346
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
1336
1347
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
0 commit comments