@@ -14,7 +14,7 @@ define i32 @static_alloca() {
14
14
; ISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
15
15
; ISEL-NEXT: s_mov_b64 exec, s[18:19]
16
16
; ISEL-NEXT: s_addk_i32 s32, 0x400
17
- ; ISEL-NEXT: v_writelane_b32 v40, s16, 4
17
+ ; ISEL-NEXT: v_writelane_b32 v40, s16, 3
18
18
; ISEL-NEXT: s_getpc_b64 s[16:17]
19
19
; ISEL-NEXT: s_add_u32 s16, s16, bar@rel32@lo+4
20
20
; ISEL-NEXT: s_addc_u32 s17, s17, bar@rel32@hi+12
@@ -27,25 +27,22 @@ define i32 @static_alloca() {
27
27
; ISEL-NEXT: v_writelane_b32 v40, s34, 2
28
28
; ISEL-NEXT: s_cselect_b32 s34, s18, 0
29
29
; ISEL-NEXT: s_mov_b64 s[18:19], src_private_base
30
- ; ISEL-NEXT: v_writelane_b32 v40, s35, 3
31
- ; ISEL-NEXT: s_cselect_b32 s35, s19, 0
30
+ ; ISEL-NEXT: s_cselect_b32 s18, s19, 0
32
31
; ISEL-NEXT: v_mov_b32_e32 v0, s34
33
- ; ISEL-NEXT: v_mov_b32_e32 v1, s35
32
+ ; ISEL-NEXT: v_mov_b32_e32 v1, s18
34
33
; ISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
35
34
; ISEL-NEXT: v_mov_b32_e32 v0, s34
36
- ; ISEL-NEXT: v_mov_b32_e32 v1, s35
37
- ; ISEL-NEXT: flat_load_dword v0, v[0:1]
38
- ; ISEL-NEXT: v_readlane_b32 s35, v40, 3
35
+ ; ISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
39
36
; ISEL-NEXT: v_readlane_b32 s34, v40, 2
40
37
; ISEL-NEXT: v_readlane_b32 s31, v40, 1
41
38
; ISEL-NEXT: v_readlane_b32 s30, v40, 0
42
39
; ISEL-NEXT: s_mov_b32 s32, s33
43
- ; ISEL-NEXT: v_readlane_b32 s4, v40, 4
40
+ ; ISEL-NEXT: v_readlane_b32 s4, v40, 3
44
41
; ISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
45
42
; ISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
46
43
; ISEL-NEXT: s_mov_b64 exec, s[6:7]
47
44
; ISEL-NEXT: s_mov_b32 s33, s4
48
- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45
+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
49
46
; ISEL-NEXT: s_setpc_b64 s[30:31]
50
47
;
51
48
; GI-LABEL: static_alloca:
@@ -56,35 +53,27 @@ define i32 @static_alloca() {
56
53
; GI-NEXT: s_or_saveexec_b64 s[18:19], -1
57
54
; GI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
58
55
; GI-NEXT: s_mov_b64 exec, s[18:19]
59
- ; GI-NEXT: v_writelane_b32 v40, s16, 4
60
- ; GI-NEXT: v_writelane_b32 v40, s30, 0
61
- ; GI-NEXT: v_writelane_b32 v40, s31, 1
56
+ ; GI-NEXT: v_writelane_b32 v40, s16, 2
62
57
; GI-NEXT: s_addk_i32 s32, 0x400
63
- ; GI-NEXT: v_writelane_b32 v40, s34, 2
64
- ; GI-NEXT: s_lshr_b32 s34, s33, 6
65
58
; GI-NEXT: s_mov_b64 s[16:17], src_private_base
59
+ ; GI-NEXT: v_writelane_b32 v40, s30, 0
66
60
; GI-NEXT: s_getpc_b64 s[18:19]
67
61
; GI-NEXT: s_add_u32 s18, s18, bar@rel32@lo+4
68
62
; GI-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
69
63
; GI-NEXT: v_lshrrev_b32_e64 v0, 6, s33
70
64
; GI-NEXT: v_mov_b32_e32 v1, s17
71
- ; GI-NEXT: v_writelane_b32 v40, s35, 3
72
- ; GI-NEXT: s_mov_b32 s35, s17
65
+ ; GI-NEXT: v_writelane_b32 v40, s31, 1
73
66
; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
74
- ; GI-NEXT: v_mov_b32_e32 v0, s34
75
- ; GI-NEXT: v_mov_b32_e32 v1, s35
76
- ; GI-NEXT: flat_load_dword v0, v[0:1]
77
- ; GI-NEXT: v_readlane_b32 s35, v40, 3
78
- ; GI-NEXT: v_readlane_b32 s34, v40, 2
67
+ ; GI-NEXT: buffer_load_dword v0, off, s[0:3], s33
79
68
; GI-NEXT: v_readlane_b32 s31, v40, 1
80
69
; GI-NEXT: v_readlane_b32 s30, v40, 0
81
70
; GI-NEXT: s_mov_b32 s32, s33
82
- ; GI-NEXT: v_readlane_b32 s4, v40, 4
71
+ ; GI-NEXT: v_readlane_b32 s4, v40, 2
83
72
; GI-NEXT: s_or_saveexec_b64 s[6:7], -1
84
73
; GI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
85
74
; GI-NEXT: s_mov_b64 exec, s[6:7]
86
75
; GI-NEXT: s_mov_b32 s33, s4
87
- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
76
+ ; GI-NEXT: s_waitcnt vmcnt(0)
88
77
; GI-NEXT: s_setpc_b64 s[30:31]
89
78
%alloca = alloca i32 , align 4
90
79
call void @bar (ptr %alloca )
@@ -112,19 +101,18 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
112
101
; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
113
102
; ISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
114
103
; ISEL-NEXT: s_cselect_b32 s33, 0, 0
115
- ; ISEL-NEXT: s_cselect_b32 s36 , s15, 0
104
+ ; ISEL-NEXT: s_cselect_b32 s15 , s15, 0
116
105
; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
117
106
; ISEL-NEXT: s_mov_b32 s14, s16
118
107
; ISEL-NEXT: v_mov_b32_e32 v0, s33
119
- ; ISEL-NEXT: v_mov_b32_e32 v1, s36
108
+ ; ISEL-NEXT: v_mov_b32_e32 v1, s15
120
109
; ISEL-NEXT: s_movk_i32 s32, 0x400
121
110
; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
122
111
; ISEL-NEXT: v_mov_b32_e32 v0, s33
123
- ; ISEL-NEXT: v_mov_b32_e32 v1, s36
124
- ; ISEL-NEXT: flat_load_dword v2, v[0:1]
112
+ ; ISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
125
113
; ISEL-NEXT: v_mov_b32_e32 v0, s34
126
114
; ISEL-NEXT: v_mov_b32_e32 v1, s35
127
- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
115
+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
128
116
; ISEL-NEXT: flat_store_dword v[0:1], v2
129
117
; ISEL-NEXT: s_endpgm
130
118
;
@@ -138,10 +126,10 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
138
126
; GI-NEXT: s_add_u32 s8, s8, 8
139
127
; GI-NEXT: s_mov_b32 s13, s15
140
128
; GI-NEXT: s_mov_b32 s12, s14
129
+ ; GI-NEXT: s_mov_b64 s[14:15], src_private_base
141
130
; GI-NEXT: s_addc_u32 s9, s9, 0
142
131
; GI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
143
132
; GI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
144
- ; GI-NEXT: s_mov_b64 s[14:15], src_private_base
145
133
; GI-NEXT: v_or3_b32 v31, v0, v1, v2
146
134
; GI-NEXT: s_getpc_b64 s[18:19]
147
135
; GI-NEXT: s_add_u32 s18, s18, bar@rel32@lo+4
@@ -150,15 +138,11 @@ define amdgpu_kernel void @static_alloca_kernel(ptr %p) {
150
138
; GI-NEXT: v_mov_b32_e32 v1, s15
151
139
; GI-NEXT: s_mov_b32 s14, s16
152
140
; GI-NEXT: s_movk_i32 s32, 0x400
153
- ; GI-NEXT: s_mov_b32 s36, 0
154
- ; GI-NEXT: s_mov_b32 s37, s15
155
141
; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
156
- ; GI-NEXT: v_mov_b32_e32 v0, s36
157
- ; GI-NEXT: v_mov_b32_e32 v1, s37
158
- ; GI-NEXT: flat_load_dword v2, v[0:1]
142
+ ; GI-NEXT: buffer_load_dword v2, off, s[0:3], 0
159
143
; GI-NEXT: v_mov_b32_e32 v0, s34
160
144
; GI-NEXT: v_mov_b32_e32 v1, s35
161
- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
145
+ ; GI-NEXT: s_waitcnt vmcnt(0)
162
146
; GI-NEXT: flat_store_dword v[0:1], v2
163
147
; GI-NEXT: s_endpgm
164
148
%alloca = alloca i32 , align 4
@@ -279,24 +263,24 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
279
263
; ISEL-LABEL: dynamic_alloca_i32_kernel:
280
264
; ISEL: ; %bb.0:
281
265
; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
266
+ ; ISEL-NEXT: s_mov_b32 s12, s14
267
+ ; ISEL-NEXT: s_load_dword s14, s[8:9], 0x0
268
+ ; ISEL-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x8
282
269
; ISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
283
270
; ISEL-NEXT: s_add_u32 s0, s0, s17
284
- ; ISEL-NEXT: s_load_dword s17, s[8:9], 0x0
285
- ; ISEL-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x8
286
- ; ISEL-NEXT: s_movk_i32 s32, 0x400
287
271
; ISEL-NEXT: s_addc_u32 s1, s1, 0
288
- ; ISEL-NEXT: s_mov_b32 s13, s15
289
- ; ISEL-NEXT: s_mov_b32 s12, s14
290
- ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
291
- ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
292
- ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
293
- ; ISEL-NEXT: s_cselect_b32 s20, s32, 0
294
272
; ISEL-NEXT: s_waitcnt lgkmcnt(0)
295
- ; ISEL-NEXT: s_lshl_b32 s14, s17 , 2
273
+ ; ISEL-NEXT: s_lshl_b32 s14, s14 , 2
296
274
; ISEL-NEXT: s_add_i32 s14, s14, 15
297
275
; ISEL-NEXT: s_and_b32 s14, s14, -16
276
+ ; ISEL-NEXT: s_movk_i32 s32, 0x400
298
277
; ISEL-NEXT: s_lshl_b32 s14, s14, 6
299
- ; ISEL-NEXT: s_add_i32 s32, s32, s14
278
+ ; ISEL-NEXT: s_add_i32 s17, s32, s14
279
+ ; ISEL-NEXT: s_mov_b32 s13, s15
280
+ ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
281
+ ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
282
+ ; ISEL-NEXT: s_cselect_b32 s36, s32, 0
283
+ ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
300
284
; ISEL-NEXT: s_add_u32 s8, s8, 16
301
285
; ISEL-NEXT: s_addc_u32 s9, s9, 0
302
286
; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
@@ -306,16 +290,16 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
306
290
; ISEL-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
307
291
; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
308
292
; ISEL-NEXT: s_mov_b32 s14, s16
309
- ; ISEL-NEXT: v_mov_b32_e32 v0, s20
293
+ ; ISEL-NEXT: v_mov_b32_e32 v0, s36
310
294
; ISEL-NEXT: v_mov_b32_e32 v1, s15
311
295
; ISEL-NEXT: s_mov_b32 s33, 0
312
- ; ISEL-NEXT: v_mov_b32_e32 v40, s20
313
- ; ISEL-NEXT: v_mov_b32_e32 v41, s15
296
+ ; ISEL-NEXT: s_mov_b32 s32, s17
314
297
; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
315
- ; ISEL-NEXT: flat_load_dword v2, v[40:41]
298
+ ; ISEL-NEXT: v_mov_b32_e32 v0, s36
299
+ ; ISEL-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
316
300
; ISEL-NEXT: v_mov_b32_e32 v0, s34
317
301
; ISEL-NEXT: v_mov_b32_e32 v1, s35
318
- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
302
+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
319
303
; ISEL-NEXT: flat_store_dword v[0:1], v2
320
304
; ISEL-NEXT: s_endpgm
321
305
;
@@ -356,11 +340,10 @@ define amdgpu_kernel void @dynamic_alloca_i32_kernel(i32 %n, ptr %p) {
356
340
; GI-NEXT: s_mov_b32 s33, 0
357
341
; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
358
342
; GI-NEXT: v_mov_b32_e32 v0, s36
359
- ; GI-NEXT: v_mov_b32_e32 v1, s37
360
- ; GI-NEXT: flat_load_dword v2, v[0:1]
343
+ ; GI-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
361
344
; GI-NEXT: v_mov_b32_e32 v0, s34
362
345
; GI-NEXT: v_mov_b32_e32 v1, s35
363
- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
346
+ ; GI-NEXT: s_waitcnt vmcnt(0)
364
347
; GI-NEXT: flat_store_dword v[0:1], v2
365
348
; GI-NEXT: s_endpgm
366
349
%alloca = alloca i32 , i32 %n , align 4
@@ -478,24 +461,24 @@ define i32 @dynamic_alloca_i64(i64 %n) {
478
461
define amdgpu_kernel void @dynamic_alloca_i64_kernel (i64 %n , ptr %p ) {
479
462
; ISEL-LABEL: dynamic_alloca_i64_kernel:
480
463
; ISEL: ; %bb.0:
481
- ; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
482
464
; ISEL-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
465
+ ; ISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17
483
466
; ISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
484
467
; ISEL-NEXT: s_add_u32 s0, s0, s17
485
- ; ISEL-NEXT: s_movk_i32 s32, 0x400
486
468
; ISEL-NEXT: s_addc_u32 s1, s1, 0
487
- ; ISEL-NEXT: s_mov_b32 s13, s15
488
469
; ISEL-NEXT: s_mov_b32 s12, s14
489
- ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
490
- ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
491
- ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
492
- ; ISEL-NEXT: s_cselect_b32 s17, s32, 0
493
470
; ISEL-NEXT: s_waitcnt lgkmcnt(0)
494
471
; ISEL-NEXT: s_lshl_b32 s14, s20, 2
495
472
; ISEL-NEXT: s_add_i32 s14, s14, 15
496
473
; ISEL-NEXT: s_and_b32 s14, s14, -16
474
+ ; ISEL-NEXT: s_movk_i32 s32, 0x400
497
475
; ISEL-NEXT: s_lshl_b32 s14, s14, 6
498
- ; ISEL-NEXT: s_add_i32 s32, s32, s14
476
+ ; ISEL-NEXT: s_add_i32 s17, s32, s14
477
+ ; ISEL-NEXT: s_mov_b32 s13, s15
478
+ ; ISEL-NEXT: s_cmp_lg_u32 s32, -1
479
+ ; ISEL-NEXT: s_mov_b64 s[14:15], src_private_base
480
+ ; ISEL-NEXT: s_cselect_b32 s34, s32, 0
481
+ ; ISEL-NEXT: s_cselect_b32 s15, s15, 0
499
482
; ISEL-NEXT: s_add_u32 s8, s8, 16
500
483
; ISEL-NEXT: s_addc_u32 s9, s9, 0
501
484
; ISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
@@ -505,16 +488,16 @@ define amdgpu_kernel void @dynamic_alloca_i64_kernel(i64 %n, ptr %p) {
505
488
; ISEL-NEXT: s_addc_u32 s19, s19, bar@rel32@hi+12
506
489
; ISEL-NEXT: v_or3_b32 v31, v0, v1, v2
507
490
; ISEL-NEXT: s_mov_b32 s14, s16
508
- ; ISEL-NEXT: v_mov_b32_e32 v0, s17
491
+ ; ISEL-NEXT: v_mov_b32_e32 v0, s34
509
492
; ISEL-NEXT: v_mov_b32_e32 v1, s15
510
493
; ISEL-NEXT: s_mov_b32 s33, 0
511
494
; ISEL-NEXT: v_mov_b32_e32 v40, s22
512
495
; ISEL-NEXT: v_mov_b32_e32 v41, s23
513
- ; ISEL-NEXT: v_mov_b32_e32 v42, s17
514
- ; ISEL-NEXT: v_mov_b32_e32 v43, s15
496
+ ; ISEL-NEXT: s_mov_b32 s32, s17
515
497
; ISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
516
- ; ISEL-NEXT: flat_load_dword v0, v[42:43]
517
- ; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
498
+ ; ISEL-NEXT: v_mov_b32_e32 v0, s34
499
+ ; ISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
500
+ ; ISEL-NEXT: s_waitcnt vmcnt(0)
518
501
; ISEL-NEXT: flat_store_dword v[40:41], v0
519
502
; ISEL-NEXT: s_endpgm
520
503
;
@@ -553,11 +536,10 @@ define amdgpu_kernel void @dynamic_alloca_i64_kernel(i64 %n, ptr %p) {
553
536
; GI-NEXT: s_mov_b32 s33, 0
554
537
; GI-NEXT: s_swappc_b64 s[30:31], s[18:19]
555
538
; GI-NEXT: v_mov_b32_e32 v0, s34
556
- ; GI-NEXT: v_mov_b32_e32 v1, s35
557
- ; GI-NEXT: flat_load_dword v2, v[0:1]
539
+ ; GI-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
558
540
; GI-NEXT: v_mov_b32_e32 v0, s38
559
541
; GI-NEXT: v_mov_b32_e32 v1, s39
560
- ; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
542
+ ; GI-NEXT: s_waitcnt vmcnt(0)
561
543
; GI-NEXT: flat_store_dword v[0:1], v2
562
544
; GI-NEXT: s_endpgm
563
545
%alloca = alloca i32 , i64 %n , align 4
0 commit comments