@@ -5,22 +5,11 @@ target triple = "amdgcn-amd-amdhsa"

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:

- ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
- ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
- ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
- ; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
- ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
-
- ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
-
+ ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
+ ; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
+ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
-
- ; GFX9: s_cmp_lg_u32 [[PTR]], -1
- ; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
- ; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
-
- ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+ ; HSA-DAG: ds_write_b32 [[PTR]], [[K]]

; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -39,22 +28,8 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr

; Test handling inside a non-kernel
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
- ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
- ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
- ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
- ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
- ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
-
- ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
-
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-
- ; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]]
- ; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
- ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
- ; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc
-
- ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+ ; HSA-DAG: ds_write_b32 v0, [[K]]
define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
  store volatile i32 7, ptr %stof
@@ -63,23 +38,16 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:

- ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
- ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
-
- ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
- ; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
- ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
-
- ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
- ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
-
- ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; GFX9: s_cmp_lg_u32 [[PTR]], -1
- ; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
- ; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
-
- ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+ ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
+ ; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
+ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+ ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+ ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+ ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+ ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+ ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+ ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+ ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen

; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -97,10 +65,12 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
- ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
- ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+ ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+ ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
+ ; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
+ ; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
+ ; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]]

; HSA: .amdhsa_user_sgpr_queue_ptr 0
define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 {
@@ -112,9 +82,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
- ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
- ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
- ; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
+ ; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 {
  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
  %ld = load volatile i32, ptr %stof
@@ -215,14 +183,9 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
- ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
- ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
-
- ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base
-
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
+ ; HSA: ds_write_b32 v[[LO]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(3) null to ptr
  store volatile i32 7, ptr %cast
@@ -240,10 +203,9 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
- ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
- ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
+ ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1
+ ; HSA: ds_write_b32 v[[LO]], v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr
  store volatile i32 7, ptr %cast
@@ -262,14 +224,13 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
- ; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
- ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
-
- ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base
-
- ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+ ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+ ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+ ; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+ ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
+ ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
+ ; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(5) null to ptr
  store volatile i32 7, ptr %cast
@@ -286,13 +247,16 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  ret void
}

-
; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:

- ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+ ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
+ ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
+ ; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+ ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
+ ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+ ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
- ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
+ ; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen

; CI: .amdhsa_user_sgpr_queue_ptr 1
; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -342,16 +306,18 @@ end:

; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
- ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
-
- ; GFX9: s_add_u32 flat_scratch_lo, s6, s9
- ; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
-
- ; HSA: {{flat|global}}_store_dword
- ; HSA: s_barrier
- ; HSA: {{flat|global}}_load_dword
+ ; HSA: buffer_store_dword
+ ; HSA: s_barrier
+ ; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc
+ ; HSA-DAG: s_load_dwordx2
+ ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
+ ; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+ ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5
+ ; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0
+ ; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
+ ; GFX9: global_store_dword [[PTR]], [[K]]
define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2