1
- ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2
- ; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2
+ ; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
3
+ ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
3
4
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
4
5
5
6
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
@@ -12,12 +13,15 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4
12
13
declare <4 x i32 > @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32 (i64 , float , <4 x float >, <4 x float >, <4 x float >, <4 x i32 >)
13
14
declare <4 x i32 > @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16 (i64 , float , <4 x float >, <4 x half >, <4 x half >, <4 x i32 >)
14
15
15
- ; GCN-LABEL: {{^}}image_bvh_intersect_ray:
16
- ; GCN: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
17
16
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
18
17
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
19
18
; extra moves in the generated kernel.
20
19
define amdgpu_ps <4 x float > @image_bvh_intersect_ray (i32 %node_ptr , float %ray_extent , float %ray_origin_x , float %ray_origin_y , float %ray_origin_z , float %ray_dir_x , float %ray_dir_y , float %ray_dir_z , float %ray_inv_dir_x , float %ray_inv_dir_y , float %ray_inv_dir_z , <4 x i32 > inreg %tdescr ) {
20
+ ; GCN-LABEL: image_bvh_intersect_ray:
21
+ ; GCN: ; %bb.0: ; %main_body
22
+ ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
23
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
24
+ ; GCN-NEXT: ; return to shader part epilog
21
25
main_body:
22
26
%ray_origin0 = insertelement <4 x float > undef , float %ray_origin_x , i32 0
23
27
%ray_origin1 = insertelement <4 x float > %ray_origin0 , float %ray_origin_y , i32 1
@@ -33,20 +37,41 @@ main_body:
33
37
ret <4 x float > %r
34
38
}
35
39
36
- ; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16:
37
- ; GCN: image_bvh_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
38
40
; A16 variant of the 32-bit node pointer test: every argument is marked inreg,
; and the ray direction and inverse direction are passed as <4 x half>. The body
; only forwards the arguments to llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16
; and returns the result.
; The autogenerated checks below expect the half operands to be repacked with
; s_lshr_b32 / s_pack_ll_b32_b16, eight address dwords to be copied from
; s-registers into v0-v7, s[10:13] to be shifted into s[12:15] for the resource
; operand, and then a single image_bvh_intersect_ray carrying the a16 modifier,
; followed by a wait on vmcnt(0).
define amdgpu_ps <4 x float > @image_bvh_intersect_ray_a16 (i32 inreg %node_ptr , float inreg %ray_extent , <4 x float > inreg %ray_origin , <4 x half > inreg %ray_dir , <4 x half > inreg %ray_inv_dir , <4 x i32 > inreg %tdescr ) {
41
+ ; GCN-LABEL: image_bvh_intersect_ray_a16:
42
+ ; GCN: ; %bb.0: ; %main_body
43
+ ; GCN-NEXT: s_lshr_b32 s5, s8, 16
44
+ ; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
45
+ ; GCN-NEXT: s_pack_ll_b32_b16 s5, s5, s9
46
+ ; GCN-NEXT: v_mov_b32_e32 v0, s0
47
+ ; GCN-NEXT: v_mov_b32_e32 v1, s1
48
+ ; GCN-NEXT: v_mov_b32_e32 v2, s2
49
+ ; GCN-NEXT: v_mov_b32_e32 v3, s3
50
+ ; GCN-NEXT: v_mov_b32_e32 v4, s4
51
+ ; GCN-NEXT: v_mov_b32_e32 v5, s6
52
+ ; GCN-NEXT: v_mov_b32_e32 v6, s7
53
+ ; GCN-NEXT: v_mov_b32_e32 v7, s5
54
+ ; GCN-NEXT: s_mov_b32 s15, s13
55
+ ; GCN-NEXT: s_mov_b32 s14, s12
56
+ ; GCN-NEXT: s_mov_b32 s13, s11
57
+ ; GCN-NEXT: s_mov_b32 s12, s10
58
+ ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
59
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
60
+ ; GCN-NEXT: ; return to shader part epilog
39
61
main_body:
40
62
%v = call <4 x i32 > @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16 (i32 %node_ptr , float %ray_extent , <4 x float > %ray_origin , <4 x half > %ray_dir , <4 x half > %ray_inv_dir , <4 x i32 > %tdescr )
41
63
; The intrinsic yields <4 x i32>; reinterpret it as <4 x float> to match the
; declared return type of this function.
%r = bitcast <4 x i32 > %v to <4 x float >
42
64
ret <4 x float > %r
43
65
}
44
66
45
- ; GCN-LABEL: {{^}}image_bvh64_intersect_ray:
46
- ; GCN: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]{{$}}
47
67
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
48
68
; extra moves in the generated kernel.
49
69
define amdgpu_ps <4 x float > @image_bvh64_intersect_ray (<2 x i32 > %node_ptr_vec , float %ray_extent , float %ray_origin_x , float %ray_origin_y , float %ray_origin_z , float %ray_dir_x , float %ray_dir_y , float %ray_dir_z , float %ray_inv_dir_x , float %ray_inv_dir_y , float %ray_inv_dir_z , <4 x i32 > inreg %tdescr ) {
70
+ ; GCN-LABEL: image_bvh64_intersect_ray:
71
+ ; GCN: ; %bb.0: ; %main_body
72
+ ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
73
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
74
+ ; GCN-NEXT: ; return to shader part epilog
50
75
main_body:
51
76
%node_ptr = bitcast <2 x i32 > %node_ptr_vec to i64
52
77
%ray_origin0 = insertelement <4 x float > undef , float %ray_origin_x , i32 0
@@ -63,9 +88,28 @@ main_body:
63
88
ret <4 x float > %r
64
89
}
65
90
66
- ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16:
67
- ; GCN: image_bvh64_intersect_ray v[0:3], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
68
91
define amdgpu_ps <4 x float > @image_bvh64_intersect_ray_a16 (i64 inreg %node_ptr , float inreg %ray_extent , <4 x float > inreg %ray_origin , <4 x half > inreg %ray_dir , <4 x half > inreg %ray_inv_dir , <4 x i32 > inreg %tdescr ) {
92
+ ; GCN-LABEL: image_bvh64_intersect_ray_a16:
93
+ ; GCN: ; %bb.0: ; %main_body
94
+ ; GCN-NEXT: s_lshr_b32 s6, s9, 16
95
+ ; GCN-NEXT: s_pack_ll_b32_b16 s8, s8, s9
96
+ ; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s10
97
+ ; GCN-NEXT: v_mov_b32_e32 v0, s0
98
+ ; GCN-NEXT: v_mov_b32_e32 v1, s1
99
+ ; GCN-NEXT: v_mov_b32_e32 v2, s2
100
+ ; GCN-NEXT: v_mov_b32_e32 v3, s3
101
+ ; GCN-NEXT: v_mov_b32_e32 v4, s4
102
+ ; GCN-NEXT: v_mov_b32_e32 v5, s5
103
+ ; GCN-NEXT: v_mov_b32_e32 v6, s7
104
+ ; GCN-NEXT: v_mov_b32_e32 v7, s8
105
+ ; GCN-NEXT: v_mov_b32_e32 v8, s6
106
+ ; GCN-NEXT: s_mov_b32 s15, s14
107
+ ; GCN-NEXT: s_mov_b32 s14, s13
108
+ ; GCN-NEXT: s_mov_b32 s13, s12
109
+ ; GCN-NEXT: s_mov_b32 s12, s11
110
+ ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
111
+ ; GCN-NEXT: s_waitcnt vmcnt(0)
112
+ ; GCN-NEXT: ; return to shader part epilog
69
113
main_body:
70
114
%v = call <4 x i32 > @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16 (i64 %node_ptr , float %ray_extent , <4 x float > %ray_origin , <4 x half > %ray_dir , <4 x half > %ray_inv_dir , <4 x i32 > %tdescr )
71
115
%r = bitcast <4 x i32 > %v to <4 x float >
@@ -74,9 +118,60 @@ main_body:
74
118
75
119
; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.
76
120
77
- ; GCN-LABEL: {{^}}image_bvh_intersect_ray_nsa_reassign:
78
- ; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
79
121
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign (i32* %p_node_ptr , float * %p_ray , <4 x i32 > inreg %tdescr ) {
122
+ ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
123
+ ; GFX1013: ; %bb.0: ; %main_body
124
+ ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125
+ ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
126
+ ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
127
+ ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0
128
+ ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
129
+ ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
130
+ ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
131
+ ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
132
+ ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
133
+ ; GFX1013-NEXT: v_add_co_u32 v2, s4, s4, v0
134
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
135
+ ; GFX1013-NEXT: v_add_co_u32 v4, s4, s6, v0
136
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
137
+ ; GFX1013-NEXT: flat_load_dword v0, v[2:3]
138
+ ; GFX1013-NEXT: flat_load_dword v1, v[4:5]
139
+ ; GFX1013-NEXT: v_mov_b32_e32 v2, 0
140
+ ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
141
+ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
142
+ ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
143
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
144
+ ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
145
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0)
146
+ ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
147
+ ; GFX1013-NEXT: s_endpgm
148
+ ;
149
+ ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
150
+ ; GFX1030: ; %bb.0: ; %main_body
151
+ ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
152
+ ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
153
+ ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
154
+ ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000
155
+ ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000
156
+ ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000
157
+ ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000
158
+ ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0
159
+ ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000
160
+ ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
161
+ ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
162
+ ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v2
163
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
164
+ ; GFX1030-NEXT: v_add_co_u32 v2, s4, s6, v2
165
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, s4, s7, 0, s4
166
+ ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
167
+ ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
168
+ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
169
+ ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
170
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171
+ ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
172
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0)
173
+ ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
174
+ ; GFX1030-NEXT: s_endpgm
80
175
main_body:
81
176
%lid = tail call i32 @llvm.amdgcn.workitem.id.x ()
82
177
%gep_node_ptr = getelementptr inbounds i32 , i32* %p_node_ptr , i32 %lid
@@ -97,9 +192,54 @@ main_body:
97
192
ret void
98
193
}
99
194
100
- ; GCN-LABEL: {{^}}image_bvh_intersect_ray_a16_nsa_reassign:
101
- ; GCN: image_bvh_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
102
195
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign (i32* %p_node_ptr , float * %p_ray , <4 x i32 > inreg %tdescr ) {
196
+ ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
197
+ ; GFX1013: ; %bb.0: ; %main_body
198
+ ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
199
+ ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
200
+ ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
201
+ ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500
202
+ ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
203
+ ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
204
+ ; GFX1013-NEXT: v_add_co_u32 v2, s4, s4, v0
205
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
206
+ ; GFX1013-NEXT: v_add_co_u32 v4, s4, s6, v0
207
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
208
+ ; GFX1013-NEXT: flat_load_dword v0, v[2:3]
209
+ ; GFX1013-NEXT: flat_load_dword v1, v[4:5]
210
+ ; GFX1013-NEXT: v_mov_b32_e32 v2, 0
211
+ ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
212
+ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
213
+ ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
214
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
215
+ ; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
216
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0)
217
+ ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
218
+ ; GFX1013-NEXT: s_endpgm
219
+ ;
220
+ ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
221
+ ; GFX1030: ; %bb.0: ; %main_body
222
+ ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
223
+ ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
224
+ ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
225
+ ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
226
+ ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200
227
+ ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500
228
+ ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700
229
+ ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
230
+ ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v2
231
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
232
+ ; GFX1030-NEXT: v_add_co_u32 v2, s4, s6, v2
233
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, s4, s7, 0, s4
234
+ ; GFX1030-NEXT: flat_load_dword v0, v[0:1]
235
+ ; GFX1030-NEXT: flat_load_dword v1, v[2:3]
236
+ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
237
+ ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
238
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
239
+ ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
240
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0)
241
+ ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
242
+ ; GFX1030-NEXT: s_endpgm
103
243
main_body:
104
244
%lid = tail call i32 @llvm.amdgcn.workitem.id.x ()
105
245
%gep_node_ptr = getelementptr inbounds i32 , i32* %p_node_ptr , i32 %lid
@@ -120,9 +260,58 @@ main_body:
120
260
ret void
121
261
}
122
262
123
- ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_nsa_reassign:
124
- ; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
125
263
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign (float * %p_ray , <4 x i32 > inreg %tdescr ) {
264
+ ; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
265
+ ; GFX1013: ; %bb.0: ; %main_body
266
+ ; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
267
+ ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
268
+ ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
269
+ ; GFX1013-NEXT: v_mov_b32_e32 v3, 0
270
+ ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
271
+ ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
272
+ ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
273
+ ; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
274
+ ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
275
+ ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
276
+ ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
277
+ ; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
278
+ ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
279
+ ; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
280
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
281
+ ; GFX1013-NEXT: flat_load_dword v2, v[0:1]
282
+ ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
283
+ ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
284
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
285
+ ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
286
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0)
287
+ ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
288
+ ; GFX1013-NEXT: s_endpgm
289
+ ;
290
+ ; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
291
+ ; GFX1030: ; %bb.0: ; %main_body
292
+ ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
293
+ ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
294
+ ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
295
+ ; GFX1030-NEXT: v_mov_b32_e32 v3, 0
296
+ ; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
297
+ ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
298
+ ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
299
+ ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
300
+ ; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
301
+ ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
302
+ ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
303
+ ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
304
+ ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
305
+ ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
306
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
307
+ ; GFX1030-NEXT: flat_load_dword v2, v[0:1]
308
+ ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
309
+ ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
310
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
311
+ ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
312
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0)
313
+ ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
314
+ ; GFX1030-NEXT: s_endpgm
126
315
main_body:
127
316
%lid = tail call i32 @llvm.amdgcn.workitem.id.x ()
128
317
%gep_ray = getelementptr inbounds float , float * %p_ray , i32 %lid
@@ -141,9 +330,52 @@ main_body:
141
330
ret void
142
331
}
143
332
144
- ; GCN-LABEL: {{^}}image_bvh64_intersect_ray_a16_nsa_reassign:
145
- ; GCN: image_bvh64_intersect_ray v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] a16{{$}}
146
333
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign (float * %p_ray , <4 x i32 > inreg %tdescr ) {
334
+ ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
335
+ ; GFX1013: ; %bb.0: ; %main_body
336
+ ; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
337
+ ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
338
+ ; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
339
+ ; GFX1013-NEXT: v_mov_b32_e32 v3, 0
340
+ ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
341
+ ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
342
+ ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
343
+ ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
344
+ ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
345
+ ; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
346
+ ; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
347
+ ; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
348
+ ; GFX1013-NEXT: flat_load_dword v2, v[0:1]
349
+ ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
350
+ ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
351
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
352
+ ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
353
+ ; GFX1013-NEXT: s_waitcnt vmcnt(0)
354
+ ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
355
+ ; GFX1013-NEXT: s_endpgm
356
+ ;
357
+ ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
358
+ ; GFX1030: ; %bb.0: ; %main_body
359
+ ; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
360
+ ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
361
+ ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
362
+ ; GFX1030-NEXT: v_mov_b32_e32 v3, 0
363
+ ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
364
+ ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
365
+ ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
366
+ ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
367
+ ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
368
+ ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
369
+ ; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
370
+ ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
371
+ ; GFX1030-NEXT: flat_load_dword v2, v[0:1]
372
+ ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
373
+ ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
374
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375
+ ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
376
+ ; GFX1030-NEXT: s_waitcnt vmcnt(0)
377
+ ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
378
+ ; GFX1030-NEXT: s_endpgm
147
379
main_body:
148
380
%lid = tail call i32 @llvm.amdgcn.workitem.id.x ()
149
381
%gep_ray = getelementptr inbounds float , float * %p_ray , i32 %lid
0 commit comments