Skip to content

Commit 401658c

Browse files
committed
AMDGPU: Fix vector handling of fptrunc_round
1 parent a13ff06 commit 401658c

File tree

3 files changed

+324
-3
lines changed

3 files changed

+324
-3
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4764,6 +4764,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
47644764
return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
47654765
case G_BITCAST:
47664766
return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
4767+
case G_INTRINSIC_FPTRUNC_ROUND:
4768+
return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
47674769
default:
47684770
return UnableToLegalize;
47694771
}

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
420420
case ISD::FFLOOR:
421421
case ISD::FP_ROUND:
422422
case ISD::FP_EXTEND:
423+
case ISD::FPTRUNC_ROUND:
423424
case ISD::FMA:
424425
case ISD::SIGN_EXTEND_INREG:
425426
case ISD::ANY_EXTEND_VECTOR_INREG:

llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll

Lines changed: 321 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
4-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
4+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
55

66
define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
77
; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
@@ -98,3 +98,321 @@ define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float in
9898
store half %res5, ptr addrspace(1) %out, align 4
9999
ret void
100100
}
101+
102+
define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %a) {
103+
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
104+
; SDAG: ; %bb.0:
105+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
106+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
107+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
108+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
109+
; SDAG-NEXT: ; return to shader part epilog
110+
;
111+
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
112+
; GISEL: ; %bb.0:
113+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
114+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
115+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
116+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
117+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
118+
; GISEL-NEXT: ; return to shader part epilog
119+
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
120+
ret <2 x half> %res
121+
}
122+
123+
define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> %a) {
124+
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
125+
; SDAG: ; %bb.0:
126+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
127+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
128+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
129+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
130+
; SDAG-NEXT: ; return to shader part epilog
131+
;
132+
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
133+
; GISEL: ; %bb.0:
134+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
135+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
136+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
137+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
138+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
139+
; GISEL-NEXT: ; return to shader part epilog
140+
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
141+
ret <2 x half> %res
142+
}
143+
144+
define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> %a, <2 x float> %b, ptr addrspace(1) %out) {
145+
; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
146+
; SDAG: ; %bb.0:
147+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
148+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
149+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
150+
; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v2
151+
; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v3
152+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
153+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
154+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
155+
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v3
156+
; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
157+
; SDAG-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
158+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
159+
; SDAG-NEXT: v_pk_add_f16 v0, v0, v3
160+
; SDAG-NEXT: v_pk_add_f16 v0, v1, v0
161+
; SDAG-NEXT: global_store_dword v[4:5], v0, off
162+
; SDAG-NEXT: s_endpgm
163+
;
164+
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
165+
; GISEL: ; %bb.0:
166+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
167+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
168+
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
169+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
170+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
171+
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
172+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
173+
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3
174+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
175+
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
176+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
177+
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
178+
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
179+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
180+
; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6
181+
; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
182+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
183+
; GISEL-NEXT: v_pk_add_f16 v0, v0, v1
184+
; GISEL-NEXT: v_pk_add_f16 v0, v2, v0
185+
; GISEL-NEXT: global_store_dword v[4:5], v0, off
186+
; GISEL-NEXT: s_endpgm
187+
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
188+
%res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
189+
%res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
190+
%res4 = fadd <2 x half> %res1, %res2
191+
%res5 = fadd <2 x half> %res3, %res4
192+
store <2 x half> %res5, ptr addrspace(1) %out, align 4
193+
ret void
194+
}
195+
196+
define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
197+
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
198+
; CHECK: ; %bb.0:
199+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
200+
; CHECK-NEXT: v_mov_b32_e32 v1, s1
201+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
202+
; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
203+
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
204+
; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
205+
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
206+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
207+
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
208+
; CHECK-NEXT: ; return to shader part epilog
209+
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
210+
%bitcast = bitcast <2 x half> %res to <2 x i16>
211+
%ret = zext <2 x i16> %bitcast to <2 x i32>
212+
ret <2 x i32> %ret
213+
}
214+
215+
define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
216+
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
217+
; CHECK: ; %bb.0:
218+
; CHECK-NEXT: v_mov_b32_e32 v0, s0
219+
; CHECK-NEXT: v_mov_b32_e32 v1, s1
220+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
221+
; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
222+
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
223+
; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
224+
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
225+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
226+
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
227+
; CHECK-NEXT: ; return to shader part epilog
228+
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
229+
%bitcast = bitcast <2 x half> %res to <2 x i16>
230+
%ret = zext <2 x i16> %bitcast to <2 x i32>
231+
ret <2 x i32> %ret
232+
}
233+
234+
define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
235+
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
236+
; CHECK: ; %bb.0:
237+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
238+
; CHECK-NEXT: v_mov_b32_e32 v3, s2
239+
; CHECK-NEXT: v_mov_b32_e32 v4, s1
240+
; CHECK-NEXT: v_mov_b32_e32 v5, s3
241+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
242+
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
243+
; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3
244+
; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4
245+
; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5
246+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
247+
; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3
248+
; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
249+
; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6
250+
; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3
251+
; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2
252+
; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v5
253+
; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6
254+
; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
255+
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
256+
; CHECK-NEXT: v_pk_add_f16 v2, v2, v5
257+
; CHECK-NEXT: v_pk_add_f16 v2, v3, v2
258+
; CHECK-NEXT: global_store_dword v[0:1], v2, off
259+
; CHECK-NEXT: s_endpgm
260+
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
261+
%res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
262+
%res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
263+
%res4 = fadd <2 x half> %res1, %res2
264+
%res5 = fadd <2 x half> %res3, %res4
265+
store <2 x half> %res5, ptr addrspace(1) %out, align 4
266+
ret void
267+
}
268+
269+
; FIXME
270+
; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
271+
; %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
272+
; ret <3 x half> %res
273+
; }
274+
275+
; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
276+
; %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
277+
; ret <3 x half> %res
278+
; }
279+
280+
define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
281+
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
282+
; SDAG: ; %bb.0:
283+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
284+
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
285+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
286+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
287+
; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
288+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
289+
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
290+
; SDAG-NEXT: ; return to shader part epilog
291+
;
292+
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
293+
; GISEL: ; %bb.0:
294+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
295+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
296+
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
297+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
298+
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
299+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
300+
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
301+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
302+
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
303+
; GISEL-NEXT: ; return to shader part epilog
304+
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
305+
ret <4 x half> %res
306+
}
307+
308+
define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> %a) {
309+
; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
310+
; SDAG: ; %bb.0:
311+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
312+
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
313+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
314+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
315+
; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
316+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
317+
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
318+
; SDAG-NEXT: ; return to shader part epilog
319+
;
320+
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
321+
; GISEL: ; %bb.0:
322+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
323+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
324+
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
325+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
326+
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
327+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
328+
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
329+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
330+
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
331+
; GISEL-NEXT: ; return to shader part epilog
332+
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
333+
ret <4 x half> %res
334+
}
335+
336+
define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %a) {
337+
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
338+
; SDAG: ; %bb.0:
339+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
340+
; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6
341+
; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
342+
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
343+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
344+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
345+
; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
346+
; SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5
347+
; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7
348+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
349+
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
350+
; SDAG-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
351+
; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
352+
; SDAG-NEXT: ; return to shader part epilog
353+
;
354+
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
355+
; GISEL: ; %bb.0:
356+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
357+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
358+
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
359+
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
360+
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
361+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
362+
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
363+
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
364+
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
365+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
366+
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
367+
; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
368+
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
369+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
370+
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
371+
; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
372+
; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
373+
; GISEL-NEXT: ; return to shader part epilog
374+
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
375+
ret <8 x half> %res
376+
}
377+
378+
define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %a) {
379+
; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
380+
; SDAG: ; %bb.0:
381+
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
382+
; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v6
383+
; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
384+
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
385+
; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
386+
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
387+
; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
388+
; SDAG-NEXT: v_cvt_f16_f32_e32 v5, v5
389+
; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v7
390+
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
391+
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
392+
; SDAG-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
393+
; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
394+
; SDAG-NEXT: ; return to shader part epilog
395+
;
396+
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
397+
; GISEL: ; %bb.0:
398+
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
399+
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
400+
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
401+
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
402+
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
403+
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
404+
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
405+
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
406+
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
407+
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
408+
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
409+
; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
410+
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
411+
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
412+
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
413+
; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
414+
; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
415+
; GISEL-NEXT: ; return to shader part epilog
416+
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
417+
ret <8 x half> %res
418+
}

0 commit comments

Comments (0)