3
3
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
4
4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
5
5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6
- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7
- ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
6
+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
7
+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
8
+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
9
+ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
8
10
9
11
define amdgpu_kernel void @cos_f16 (ptr addrspace (1 ) %r , ptr addrspace (1 ) %a ) {
10
12
; GFX6-LABEL: cos_f16:
@@ -69,31 +71,57 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
69
71
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
70
72
; GFX10-NEXT: s_endpgm
71
73
;
72
- ; GFX11-LABEL: cos_f16:
73
- ; GFX11: ; %bb.0:
74
- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
75
- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
76
- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
77
- ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
78
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
79
- ; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
80
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
81
- ; GFX11-NEXT: v_cos_f16_e32 v1, v1
82
- ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
83
- ; GFX11-NEXT: s_endpgm
74
+ ; GFX11-TRUE16-LABEL: cos_f16:
75
+ ; GFX11-TRUE16: ; %bb.0:
76
+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
77
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
78
+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
79
+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
80
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
81
+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
82
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83
+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
84
+ ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
85
+ ; GFX11-TRUE16-NEXT: s_endpgm
84
86
;
85
- ; GFX12-LABEL: cos_f16:
86
- ; GFX12: ; %bb.0:
87
- ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
88
- ; GFX12-NEXT: v_mov_b32_e32 v0, 0
89
- ; GFX12-NEXT: s_wait_kmcnt 0x0
90
- ; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
91
- ; GFX12-NEXT: s_wait_loadcnt 0x0
92
- ; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
93
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
94
- ; GFX12-NEXT: v_cos_f16_e32 v1, v1
95
- ; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
96
- ; GFX12-NEXT: s_endpgm
87
+ ; GFX11-FAKE16-LABEL: cos_f16:
88
+ ; GFX11-FAKE16: ; %bb.0:
89
+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
91
+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
92
+ ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
93
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
94
+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
95
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
96
+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
97
+ ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
98
+ ; GFX11-FAKE16-NEXT: s_endpgm
99
+ ;
100
+ ; GFX12-TRUE16-LABEL: cos_f16:
101
+ ; GFX12-TRUE16: ; %bb.0:
102
+ ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
103
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
104
+ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
105
+ ; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
106
+ ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
107
+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
108
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
109
+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
110
+ ; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
111
+ ; GFX12-TRUE16-NEXT: s_endpgm
112
+ ;
113
+ ; GFX12-FAKE16-LABEL: cos_f16:
114
+ ; GFX12-FAKE16: ; %bb.0:
115
+ ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
116
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
117
+ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
118
+ ; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
119
+ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
120
+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
121
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
122
+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
123
+ ; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
124
+ ; GFX12-FAKE16-NEXT: s_endpgm
97
125
%a.val = load half , ptr addrspace (1 ) %a
98
126
%r.val = call half @llvm.cos.f16 (half %a.val )
99
127
store half %r.val , ptr addrspace (1 ) %r
@@ -184,42 +212,79 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
184
212
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
185
213
; GFX10-NEXT: s_endpgm
186
214
;
187
- ; GFX11-LABEL: cos_v2f16:
188
- ; GFX11: ; %bb.0:
189
- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
190
- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
191
- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
192
- ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
193
- ; GFX11-NEXT: s_waitcnt vmcnt(0)
194
- ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
195
- ; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
196
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
197
- ; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
198
- ; GFX11-NEXT: v_cos_f16_e32 v1, v1
199
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
200
- ; GFX11-NEXT: v_cos_f16_e32 v2, v2
201
- ; GFX11-NEXT: s_waitcnt_depctr 0xfff
202
- ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
203
- ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
204
- ; GFX11-NEXT: s_endpgm
215
+ ; GFX11-TRUE16-LABEL: cos_v2f16:
216
+ ; GFX11-TRUE16: ; %bb.0:
217
+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
218
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
219
+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
220
+ ; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
221
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
222
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
223
+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
224
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
225
+ ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
226
+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
227
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
228
+ ; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
229
+ ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
230
+ ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
231
+ ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
232
+ ; GFX11-TRUE16-NEXT: s_endpgm
233
+ ;
234
+ ; GFX11-FAKE16-LABEL: cos_v2f16:
235
+ ; GFX11-FAKE16: ; %bb.0:
236
+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
237
+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
238
+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
239
+ ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
240
+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
241
+ ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
242
+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
243
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244
+ ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
245
+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
246
+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247
+ ; GFX11-FAKE16-NEXT: v_cos_f16_e32 v2, v2
248
+ ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
249
+ ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
250
+ ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
251
+ ; GFX11-FAKE16-NEXT: s_endpgm
252
+ ;
253
+ ; GFX12-TRUE16-LABEL: cos_v2f16:
254
+ ; GFX12-TRUE16: ; %bb.0:
255
+ ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257
+ ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258
+ ; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
259
+ ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260
+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
261
+ ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262
+ ; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264
+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265
+ ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267
+ ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268
+ ; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269
+ ; GFX12-TRUE16-NEXT: s_endpgm
205
270
;
206
- ; GFX12-LABEL: cos_v2f16:
207
- ; GFX12: ; %bb.0:
208
- ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
209
- ; GFX12-NEXT: v_mov_b32_e32 v0, 0
210
- ; GFX12-NEXT: s_wait_kmcnt 0x0
211
- ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
212
- ; GFX12-NEXT: s_wait_loadcnt 0x0
213
- ; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1
214
- ; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
215
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
216
- ; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
217
- ; GFX12-NEXT: v_cos_f16_e32 v1, v1
218
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
219
- ; GFX12-NEXT: v_cos_f16_e32 v2, v2
220
- ; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2
221
- ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
222
- ; GFX12-NEXT: s_endpgm
271
+ ; GFX12-FAKE16-LABEL: cos_v2f16:
272
+ ; GFX12-FAKE16: ; %bb.0:
273
+ ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
274
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
275
+ ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
276
+ ; GFX12-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
277
+ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
278
+ ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
279
+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
280
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
281
+ ; GFX12-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
282
+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
283
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
284
+ ; GFX12-FAKE16-NEXT: v_cos_f16_e32 v2, v2
285
+ ; GFX12-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
286
+ ; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
287
+ ; GFX12-FAKE16-NEXT: s_endpgm
223
288
%a.val = load <2 x half >, ptr addrspace (1 ) %a
224
289
%r.val = call <2 x half > @llvm.cos.v2f16 (<2 x half > %a.val )
225
290
store <2 x half > %r.val , ptr addrspace (1 ) %r
@@ -228,3 +293,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
228
293
229
294
declare half @llvm.cos.f16 (half %a )
230
295
declare <2 x half > @llvm.cos.v2f16 (<2 x half > %a )
296
+ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
297
+ ; GFX11: {{.*}}
298
+ ; GFX12: {{.*}}
0 commit comments