Skip to content

Commit b1fe7da

Browse files
authored
[AMDGPU][True16][CodeGen] enable true16 for more codegen test patch 2 (#131210)
This is an NFC patch. Enable true16 mode for more CodeGen tests.
1 parent 0b688f3 commit b1fe7da

23 files changed

+11960
-4457
lines changed

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

Lines changed: 129 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
44
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
55
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
6+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
8+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
9+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
810

911
define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
1012
; GFX6-LABEL: cos_f16:
@@ -69,31 +71,57 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
6971
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
7072
; GFX10-NEXT: s_endpgm
7173
;
72-
; GFX11-LABEL: cos_f16:
73-
; GFX11: ; %bb.0:
74-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
75-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
76-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
77-
; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
78-
; GFX11-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
80-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-NEXT: v_cos_f16_e32 v1, v1
82-
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
83-
; GFX11-NEXT: s_endpgm
74+
; GFX11-TRUE16-LABEL: cos_f16:
75+
; GFX11-TRUE16: ; %bb.0:
76+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
77+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
78+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
79+
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
80+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
81+
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
82+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83+
; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
84+
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
85+
; GFX11-TRUE16-NEXT: s_endpgm
8486
;
85-
; GFX12-LABEL: cos_f16:
86-
; GFX12: ; %bb.0:
87-
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
88-
; GFX12-NEXT: v_mov_b32_e32 v0, 0
89-
; GFX12-NEXT: s_wait_kmcnt 0x0
90-
; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
91-
; GFX12-NEXT: s_wait_loadcnt 0x0
92-
; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
93-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
94-
; GFX12-NEXT: v_cos_f16_e32 v1, v1
95-
; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
96-
; GFX12-NEXT: s_endpgm
87+
; GFX11-FAKE16-LABEL: cos_f16:
88+
; GFX11-FAKE16: ; %bb.0:
89+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
91+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
92+
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
93+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
94+
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
95+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
96+
; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
97+
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
98+
; GFX11-FAKE16-NEXT: s_endpgm
99+
;
100+
; GFX12-TRUE16-LABEL: cos_f16:
101+
; GFX12-TRUE16: ; %bb.0:
102+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
103+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
104+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
105+
; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
106+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
107+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
108+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
109+
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
110+
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
111+
; GFX12-TRUE16-NEXT: s_endpgm
112+
;
113+
; GFX12-FAKE16-LABEL: cos_f16:
114+
; GFX12-FAKE16: ; %bb.0:
115+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
116+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
117+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
118+
; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
119+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
120+
; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
121+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
122+
; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
123+
; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
124+
; GFX12-FAKE16-NEXT: s_endpgm
97125
%a.val = load half, ptr addrspace(1) %a
98126
%r.val = call half @llvm.cos.f16(half %a.val)
99127
store half %r.val, ptr addrspace(1) %r
@@ -184,42 +212,79 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
184212
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
185213
; GFX10-NEXT: s_endpgm
186214
;
187-
; GFX11-LABEL: cos_v2f16:
188-
; GFX11: ; %bb.0:
189-
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
190-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
191-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
192-
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
193-
; GFX11-NEXT: s_waitcnt vmcnt(0)
194-
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
195-
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
196-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
197-
; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
198-
; GFX11-NEXT: v_cos_f16_e32 v1, v1
199-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
200-
; GFX11-NEXT: v_cos_f16_e32 v2, v2
201-
; GFX11-NEXT: s_waitcnt_depctr 0xfff
202-
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
203-
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
204-
; GFX11-NEXT: s_endpgm
215+
; GFX11-TRUE16-LABEL: cos_v2f16:
216+
; GFX11-TRUE16: ; %bb.0:
217+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
218+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
219+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
220+
; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
221+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
222+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
223+
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
224+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
225+
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
226+
; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
227+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
228+
; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
229+
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
230+
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
231+
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
232+
; GFX11-TRUE16-NEXT: s_endpgm
233+
;
234+
; GFX11-FAKE16-LABEL: cos_v2f16:
235+
; GFX11-FAKE16: ; %bb.0:
236+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
237+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
238+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
239+
; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
240+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
241+
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
242+
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
243+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244+
; GFX11-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
245+
; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1
246+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247+
; GFX11-FAKE16-NEXT: v_cos_f16_e32 v2, v2
248+
; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
249+
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
250+
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
251+
; GFX11-FAKE16-NEXT: s_endpgm
252+
;
253+
; GFX12-TRUE16-LABEL: cos_v2f16:
254+
; GFX12-TRUE16: ; %bb.0:
255+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258+
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
259+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262+
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264+
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267+
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268+
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269+
; GFX12-TRUE16-NEXT: s_endpgm
205270
;
206-
; GFX12-LABEL: cos_v2f16:
207-
; GFX12: ; %bb.0:
208-
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
209-
; GFX12-NEXT: v_mov_b32_e32 v0, 0
210-
; GFX12-NEXT: s_wait_kmcnt 0x0
211-
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
212-
; GFX12-NEXT: s_wait_loadcnt 0x0
213-
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1
214-
; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
215-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
216-
; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
217-
; GFX12-NEXT: v_cos_f16_e32 v1, v1
218-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
219-
; GFX12-NEXT: v_cos_f16_e32 v2, v2
220-
; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2
221-
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
222-
; GFX12-NEXT: s_endpgm
271+
; GFX12-FAKE16-LABEL: cos_v2f16:
272+
; GFX12-FAKE16: ; %bb.0:
273+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
274+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
275+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
276+
; GFX12-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
277+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
278+
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
279+
; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
280+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
281+
; GFX12-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2
282+
; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1
283+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
284+
; GFX12-FAKE16-NEXT: v_cos_f16_e32 v2, v2
285+
; GFX12-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
286+
; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
287+
; GFX12-FAKE16-NEXT: s_endpgm
223288
%a.val = load <2 x half>, ptr addrspace(1) %a
224289
%r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
225290
store <2 x half> %r.val, ptr addrspace(1) %r
@@ -228,3 +293,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
228293

229294
declare half @llvm.cos.f16(half %a)
230295
declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
296+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
297+
; GFX11: {{.*}}
298+
; GFX12: {{.*}}

0 commit comments

Comments
 (0)