Skip to content

Commit 1e9acb5

Browse files
committed
[NVPTX] Add support for f16 fabs
1 parent 7b52549 commit 1e9acb5

File tree

2 files changed

+106
-2
lines changed

2 files changed

+106
-2
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -863,10 +863,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
863863
AddPromotedToType(Op, MVT::bf16, MVT::f32);
864864
}
865865
for (const auto &Op : {ISD::FABS}) {
866-
setOperationAction(Op, MVT::f16, Promote);
867866
setOperationAction(Op, MVT::f32, Legal);
868867
setOperationAction(Op, MVT::f64, Legal);
869-
setOperationAction(Op, MVT::v2f16, Expand);
868+
if (STI.getPTXVersion() >= 65) {
869+
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
870+
setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
871+
} else {
872+
setOperationAction(Op, MVT::f16, Promote);
873+
setOperationAction(Op, MVT::v2f16, Expand);
874+
}
870875
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
871876
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
872877
if (getOperationAction(Op, MVT::bf16) == Promote)

llvm/test/CodeGen/NVPTX/f16-abs.ll

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
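; ## FP16 math is available (sm_53), but native f16 abs is not: it is only enabled for PTX 6.5 or newer, so fabs on half is promoted to f32 here.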
3+
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
4+
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
5+
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
6+
; RUN: %if ptxas %{ \
7+
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
8+
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
9+
; RUN: | %ptxas-verify -arch=sm_53 \
10+
; RUN: %}
11+
12+
; ## FP16 support explicitly disabled.
13+
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
14+
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
15+
; RUN: -verify-machineinstrs \
16+
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
17+
; RUN: %if ptxas %{ \
18+
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
19+
; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
20+
; RUN: -verify-machineinstrs \
21+
; RUN: | %ptxas-verify -arch=sm_53 \
22+
; RUN: %}
23+
24+
; ## FP16 is not supported by hardware.
25+
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
26+
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
27+
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
28+
; RUN: %if ptxas %{ \
29+
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
30+
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
31+
; RUN: | %ptxas-verify -arch=sm_52 \
32+
; RUN: %}
33+
34+
; ## Full FP16 support, including native abs.f16 / abs.f16x2 (sm_53 with PTX 7.0).
35+
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
36+
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
37+
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-F16-ABS %s
38+
; RUN: %if ptxas %{ \
39+
; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
40+
; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
41+
; RUN: | %ptxas-verify -arch=sm_53 \
42+
; RUN: %}
43+
44+
target triple = "nvptx64-nvidia-cuda"
45+
46+
declare half @llvm.fabs.f16(half %a)
47+
declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
48+
49+
; Scalar half fabs via the llvm.fabs.f16 intrinsic.
; In the configurations without native f16 abs, the expected PTX converts the
; operand to f32, applies abs.f32, and rounds the result back to f16.
; In the configuration with native f16 abs, a single abs.f16 on the 16-bit
; register is expected, with no f32 registers declared.
define half @test_fabs(half %a) {
50+
; CHECK-NOF16-LABEL: test_fabs(
51+
; CHECK-NOF16: {
52+
; CHECK-NOF16-NEXT: .reg .b16 %rs<3>;
53+
; CHECK-NOF16-NEXT: .reg .f32 %f<3>;
54+
; CHECK-NOF16-EMPTY:
55+
; CHECK-NOF16-NEXT: // %bb.0:
56+
; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
57+
; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1;
58+
; CHECK-NOF16-NEXT: abs.f32 %f2, %f1;
59+
; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs2, %f2;
60+
; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs2;
61+
; CHECK-NOF16-NEXT: ret;
62+
;
63+
; CHECK-F16-ABS-LABEL: test_fabs(
64+
; CHECK-F16-ABS: {
65+
; CHECK-F16-ABS-NEXT: .reg .b16 %rs<3>;
66+
; CHECK-F16-ABS-EMPTY:
67+
; CHECK-F16-ABS-NEXT: // %bb.0:
68+
; CHECK-F16-ABS-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
69+
; CHECK-F16-ABS-NEXT: abs.f16 %rs2, %rs1;
70+
; CHECK-F16-ABS-NEXT: st.param.b16 [func_retval0], %rs2;
71+
; CHECK-F16-ABS-NEXT: ret;
72+
%r = call half @llvm.fabs.f16(half %a)
73+
ret half %r
74+
}
75+
76+
; Packed <2 x half> fabs via the llvm.fabs.v2f16 intrinsic.
; With native f16 abs, a single abs.f16x2 on the 32-bit register is expected.
; The other assertion group expects a bitwise and with 2147450879 (0x7FFF7FFF),
; i.e. the top bit of each 16-bit half of the 32-bit register cleared.
; NOTE(review): no RUN line in this file passes the "CHECK-F16" prefix to
; FileCheck (only the NOF16 and F16-ABS prefixes are used), so the and.b32
; assertions below are never checked, and this function has no assertions at
; all for the NOF16 runs. TODO: confirm which RUN line was meant to use it.
; NOTE(review): attribute group #0 is referenced on the define line but no
; matching "attributes #0" definition appears in this file -- confirm intent.
define <2 x half> @test_fabs_2(<2 x half> %a) #0 {
77+
; CHECK-F16-LABEL: test_fabs_2(
78+
; CHECK-F16: {
79+
; CHECK-F16-NEXT: .reg .b32 %r<5>;
80+
; CHECK-F16-EMPTY:
81+
; CHECK-F16-NEXT: // %bb.0:
82+
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
83+
; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879;
84+
; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
85+
; CHECK-F16-NEXT: ret;
86+
;
87+
; CHECK-F16-ABS-LABEL: test_fabs_2(
88+
; CHECK-F16-ABS: {
89+
; CHECK-F16-ABS-NEXT: .reg .b32 %r<3>;
90+
; CHECK-F16-ABS-EMPTY:
91+
; CHECK-F16-ABS-NEXT: // %bb.0:
92+
; CHECK-F16-ABS-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
93+
; CHECK-F16-ABS-NEXT: abs.f16x2 %r2, %r1;
94+
; CHECK-F16-ABS-NEXT: st.param.b32 [func_retval0], %r2;
95+
; CHECK-F16-ABS-NEXT: ret;
96+
%r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
97+
ret <2 x half> %r
98+
}
99+

0 commit comments

Comments
 (0)