[NVPTX] Add support for f16 fabs #116107
Conversation
@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Add support for f16 and f16x2 abs. See PTX ISA 9.7.4.6. Half Precision Floating Point Instructions: abs
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs

Full diff: https://github.com/llvm/llvm-project/pull/116107.diff

2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 18b05b23da220b..b57af4518b2fc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -864,10 +864,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
AddPromotedToType(Op, MVT::bf16, MVT::f32);
}
for (const auto &Op : {ISD::FABS}) {
- setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
- setOperationAction(Op, MVT::v2f16, Expand);
+ if (STI.getPTXVersion() >= 65) {
+ setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+ setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+ } else {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
if (getOperationAction(Op, MVT::bf16) == Promote)
diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll
new file mode 100644
index 00000000000000..38cb8ce7ae93aa
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Some FP16 support but not for abs
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_52 \
+; RUN: %}
+
+; ## Full FP16 support.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-F16-ABS %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN: -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_53 \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare half @llvm.fabs.f16(half %a)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+
+define half @test_fabs(half %a) {
+; CHECK-NOF16-LABEL: test_fabs(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT: abs.f32 %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs(
+; CHECK-F16-ABS: {
+; CHECK-F16-ABS-NEXT: .reg .b16 %rs<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT: // %bb.0:
+; CHECK-F16-ABS-NEXT: ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-F16-ABS-NEXT: abs.f16 %rs2, %rs1;
+; CHECK-F16-ABS-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-F16-ABS-NEXT: ret;
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+define <2 x half> @test_fabs_2(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fabs_2(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs_2(
+; CHECK-F16-ABS: {
+; CHECK-F16-ABS-NEXT: .reg .b32 %r<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT: // %bb.0:
+; CHECK-F16-ABS-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-ABS-NEXT: abs.f16x2 %r2, %r1;
+; CHECK-F16-ABS-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-F16-ABS-NEXT: ret;
+ %r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879;
Huh. This is interesting. If we are allowed to do fabs via sign masking, then instead of promoting to fp32, perhaps we should custom-lower it to a logical op on the fallback path for fp16 scalars where native fp16 abs is not supported. That's likely more efficient than abs.f32 plus two conversions.
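For illustration, a minimal sketch of what that custom lowering could look like in SelectionDAG terms; the helper name and its wiring into NVPTXISelLowering.cpp are assumptions for this example, not something this PR adds:

// Hypothetical sketch (not part of this PR): lower a scalar f16 FABS by
// clearing the sign bit rather than promoting to f32. Assumes the usual
// NVPTXISelLowering.cpp context (SelectionDAG, ISD, MVT in scope).
static SDValue lowerF16FAbsAsSignMask(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  // Reinterpret the half as its 16-bit pattern, clear bit 15, convert back.
  SDValue Bits = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Op.getOperand(0));
  SDValue Mask = DAG.getConstant(0x7FFF, DL, MVT::i16);
  SDValue Abs = DAG.getNode(ISD::AND, DL, MVT::i16, Bits, Mask);
  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Abs);
}

The quoted CHECK-F16 lines above show the vector Expand path already doing the equivalent: and.b32 with 2147450879 (0x7FFF7FFF) clears both sign bits of the f16x2 pair.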
To clarify -- it's an optimization opportunity. If it is easy to incorporate into this change -- great. If not, it can be done separately. This CL is good to go as is.
I think it is fine from the perspective of LLVM IR's semantics to implement f16 abs with an and. Actually, it's probably more conformant, because it will preserve NaN payloads while the conversions may not. That being said, I'm not sure about the perf implications of going this route. Maybe in some cases the abs could be strung together with other promoted operations and result in better codegen?
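As a quick standalone illustration of the NaN-payload point (hypothetical host-side C++, not part of the PR's code): masking the sign bit leaves every payload bit of the half intact, whereas the f16 -> f32 -> f16 round trip gives the conversion a chance to quiet or canonicalize the NaN.

#include <cstdint>
#include <cstdio>

// fabs on raw IEEE-754 binary16 bits: clear only the sign bit.
static uint16_t fabsHalfBits(uint16_t Bits) { return Bits & 0x7FFFu; }

int main() {
  // A negative f16 NaN: sign = 1, exponent all ones, mantissa payload 0x201.
  uint16_t NegNaN = 0xFE01;
  // Masking preserves the payload exactly: 0xFE01 -> 0x7E01.
  printf("masked fabs: 0x%04X\n", (unsigned)fabsHalfBits(NegNaN));
  return 0;
}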
I think the only case where it will matter is when we have a cluster of fp16 ops promoted to f32 with a few abs in the middle. If LLVM decides to do that abs in fp16, that may indeed be slower. I'm not sure how exactly this is handled. In theory it would boil down to a cost analysis between two casts + and vs. fp32 fabs, with fp32 fabs winning.
That may be something to teach instcombine about, if we do not already.
In either case, sm_60 and older GPUs are nearly obsolete these days and are not worth spending much effort on. We can just leave things as is.
Sounds good, will leave as is for older architectures for now.
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
abs.bf16 and abs.bf16x2 were introduced in PTX ISA 7.0.
We should add ISD::FABS to this set of cases:
llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp, lines 486 to 493 in be95e16:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
  IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
  break;
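Concretely, the suggestion amounts to one more case label gating bf16 abs on sm_80 plus PTX 7.0 (sketch only; as the reply below argues, the existing hasBF16Math() check may already make this redundant):

case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case ISD::FABS: // hypothetical addition suggested above
  IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
  break;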
I don't think this is necessary; in fact, I suspect this whole case can be removed. hasBF16Math already checks for sm_80, and since sm_80 is not supported until PTX 7.0, we can assume this requirement will be met as well.
bool hasBF16Math() const { return SmVersion >= 80; }
@Artem-B does the latest iteration of this change look good to you?
LGTM in general, plus a few test nits.
llvm/test/CodeGen/NVPTX/f16-abs.ll
Outdated
@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; ## Some FP16 support but not for abs
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
A few nits on the tests -- I would explicitly set the PTX version high enough on all test cases, to make sure the native abs instruction is disabled by the GPU type or by explicit attributes, and not unintentionally by a too-low default PTX version.
Okay, I've updated the cases to include each of the following:
- no f16 abs due to insufficient hardware
- no f16 abs due to insufficient ptx version
- no f16 abs due to explicit disable
- f16 abs supported
Did you push those changes? The last commit GitHub shows me is e382e8adafe8ccb8
Apologies, I did not; I've just pushed 0c7af950b7dbefcb3ee545ddd74670ffcb6d7869
0c7af95 to defd1a1