@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
30
30
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
31
31
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
32
32
; MUBUF-NEXT: ; %bb.2: ; %bb.1
33
- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
34
- ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
35
- ; MUBUF-NEXT: s_mov_b32 s32, s6
33
+ ; MUBUF-NEXT: s_mov_b32 s6, s32
36
34
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
37
- ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
38
- ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
35
+ ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
36
+ ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
37
+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
38
+ ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
39
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
39
40
; MUBUF-NEXT: s_add_i32 s6, s6, s7
40
- ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
41
- ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
42
41
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
43
42
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
44
43
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
66
65
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
67
66
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
68
67
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
69
- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
68
+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
70
69
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
71
70
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
72
71
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
73
- ; FLATSCR-NEXT: s_mov_b32 s32, s2
72
+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
74
73
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
75
74
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
76
75
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
131
130
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
132
131
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
133
132
; MUBUF-NEXT: ; %bb.1: ; %bb.0
134
- ; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
135
- ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
136
- ; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137
- ; MUBUF-NEXT: s_mov_b32 s32, s4
133
+ ; MUBUF-NEXT: s_mov_b32 s4, s32
138
134
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
139
- ; MUBUF-NEXT: v_mov_b32_e32 v2, s4
140
- ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
135
+ ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
136
+ ; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137
+ ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
138
+ ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
139
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
141
140
; MUBUF-NEXT: s_add_i32 s4, s4, s5
142
- ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
143
- ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
144
141
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
145
142
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
146
143
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
165
162
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
166
163
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
167
164
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
168
- ; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
169
165
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
170
- ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
166
+ ; FLATSCR-NEXT: s_mov_b32 s0, s32
171
167
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
172
168
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
173
- ; FLATSCR-NEXT: s_mov_b32 s32, s0
169
+ ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
174
170
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
175
171
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
176
172
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
230
226
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
231
227
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
232
228
; MUBUF-NEXT: ; %bb.2: ; %bb.1
233
- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
229
+ ; MUBUF-NEXT: s_mov_b32 s6, s32
234
230
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
235
- ; MUBUF-NEXT: v_mov_b32_e32 v3, s6
236
- ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
231
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
237
232
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
238
- ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
233
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
239
234
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
240
235
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
241
236
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
242
- ; MUBUF-NEXT: s_mov_b32 s32, s6
237
+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
243
238
; MUBUF-NEXT: s_waitcnt vmcnt(0)
244
239
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
245
240
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
266
261
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
267
262
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
268
263
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
269
- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
264
+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
270
265
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
271
266
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
272
267
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
273
268
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
274
269
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
275
270
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
276
- ; FLATSCR-NEXT: s_mov_b32 s32, s2
271
+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
277
272
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
278
273
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
279
274
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
324
319
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
325
320
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
326
321
; MUBUF-NEXT: ; %bb.1: ; %bb.0
327
- ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
328
- ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
322
+ ; MUBUF-NEXT: s_mov_b32 s6, s32
329
323
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
330
- ; MUBUF-NEXT: v_mov_b32_e32 v4, s6
331
- ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
324
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
332
325
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
333
- ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
326
+ ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
334
327
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
335
328
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
336
329
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
337
- ; MUBUF-NEXT: s_mov_b32 s32, s6
330
+ ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
338
331
; MUBUF-NEXT: s_waitcnt vmcnt(0)
339
332
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
340
333
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
358
351
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
359
352
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
360
353
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
361
- ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
362
- ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
354
+ ; FLATSCR-NEXT: s_mov_b32 s2, s32
363
355
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
364
356
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
365
357
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
366
358
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
367
359
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
368
360
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
369
- ; FLATSCR-NEXT: s_mov_b32 s32, s2
361
+ ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
370
362
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
371
363
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
372
364
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
0 commit comments