
[NVPTX] Add support for f16 fabs #116107

Merged

merged 3 commits into llvm:main from dev/amaclean/upstream-f16-abs on Nov 18, 2024

Conversation

AlexMaclean
Member

Add support for the f16 and f16x2 variants of abs. See PTX ISA 9.7.4.6, "Half Precision Floating Point Instructions: abs" (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs).

@llvmbot
Member

llvmbot commented Nov 13, 2024

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Add support for the f16 and f16x2 variants of abs. See PTX ISA 9.7.4.6, "Half Precision Floating Point Instructions: abs" (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs).


Full diff: https://github.com/llvm/llvm-project/pull/116107.diff

2 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+7-2)
  • (added) llvm/test/CodeGen/NVPTX/f16-abs.ll (+99)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 18b05b23da220b..b57af4518b2fc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -864,10 +864,15 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     AddPromotedToType(Op, MVT::bf16, MVT::f32);
   }
   for (const auto &Op : {ISD::FABS}) {
-    setOperationAction(Op, MVT::f16, Promote);
     setOperationAction(Op, MVT::f32, Legal);
     setOperationAction(Op, MVT::f64, Legal);
-    setOperationAction(Op, MVT::v2f16, Expand);
+    if (STI.getPTXVersion() >= 65) {
+      setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+      setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+    } else {
+      setOperationAction(Op, MVT::f16, Promote);
+      setOperationAction(Op, MVT::v2f16, Expand);
+    }
     setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
     setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
     if (getOperationAction(Op, MVT::bf16) == Promote)
diff --git a/llvm/test/CodeGen/NVPTX/f16-abs.ll b/llvm/test/CodeGen/NVPTX/f16-abs.ll
new file mode 100644
index 00000000000000..38cb8ce7ae93aa
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-abs.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Some FP16 support but not for abs
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53  \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53  \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
+
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53  \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
+; RUN:           -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53  \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math   \
+; RUN:           -verify-machineinstrs                                        \
+; RUN:   | %ptxas-verify -arch=sm_53                                          \
+; RUN: %}
+
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52  \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s
+; RUN: %if ptxas %{                                                               \
+; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52  \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs         \
+; RUN:   | %ptxas-verify -arch=sm_52                                              \
+; RUN: %}
+
+; ## Full FP16 support.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-F16-ABS %s
+; RUN: %if ptxas %{                                                               \
+; RUN:   llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx70 \
+; RUN:          -disable-post-ra -frame-pointer=all -verify-machineinstrs         \
+; RUN:   | %ptxas-verify -arch=sm_53                                              \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare half @llvm.fabs.f16(half %a)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+
+define half @test_fabs(half %a) {
+; CHECK-NOF16-LABEL: test_fabs(
+; CHECK-NOF16:       {
+; CHECK-NOF16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NOF16-NEXT:    .reg .f32 %f<3>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT:  // %bb.0:
+; CHECK-NOF16-NEXT:    ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs1;
+; CHECK-NOF16-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs2, %f2;
+; CHECK-NOF16-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NOF16-NEXT:    ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs(
+; CHECK-F16-ABS:       {
+; CHECK-F16-ABS-NEXT:    .reg .b16 %rs<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT:  // %bb.0:
+; CHECK-F16-ABS-NEXT:    ld.param.b16 %rs1, [test_fabs_param_0];
+; CHECK-F16-ABS-NEXT:    abs.f16 %rs2, %rs1;
+; CHECK-F16-ABS-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-F16-ABS-NEXT:    ret;
+  %r = call half @llvm.fabs.f16(half %a)
+  ret half %r
+}
+
+define <2 x half> @test_fabs_2(<2 x half> %a) #0 {
+; CHECK-F16-LABEL: test_fabs_2(
+; CHECK-F16:       {
+; CHECK-F16-NEXT:    .reg .b32 %r<5>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-NEXT:    and.b32 %r3, %r1, 2147450879;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-F16-NEXT:    ret;
+;
+; CHECK-F16-ABS-LABEL: test_fabs_2(
+; CHECK-F16-ABS:       {
+; CHECK-F16-ABS-NEXT:    .reg .b32 %r<3>;
+; CHECK-F16-ABS-EMPTY:
+; CHECK-F16-ABS-NEXT:  // %bb.0:
+; CHECK-F16-ABS-NEXT:    ld.param.b32 %r1, [test_fabs_2_param_0];
+; CHECK-F16-ABS-NEXT:    abs.f16x2 %r2, %r1;
+; CHECK-F16-ABS-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-F16-ABS-NEXT:    ret;
+  %r = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+

; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_fabs_2_param_0];
; CHECK-F16-NEXT: and.b32 %r3, %r1, 2147450879;
Member

Huh. This is interesting. If we are allowed to do fabs via sign masking, then perhaps, instead of promoting fp16 scalars to fp32, we should custom-lower it to a logical op for the fallback path where native fp16 abs is not supported. That's likely more efficient than abs.f32 plus two conversions.

Member

To clarify -- it's an optimization opportunity. If it is easy to incorporate into this change -- great. If not, it can be done separately. This CL is good to go as is.

Member Author

@AlexMaclean AlexMaclean Nov 13, 2024

I think it is fine from the perspective of LLVM IR's semantics to implement f16 abs with an and. Actually, it's probably more conformant, because it will preserve NaN payloads while the conversions may not. That being said, I'm not sure about the perf implications of going this route. Maybe in some cases the abs could be strung together with other promoted operations and result in better codegen?
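For illustration, here is a minimal LLVM IR sketch (hypothetical, not part of this patch) of the sign-masking approach being discussed: it clears the sign bit of the half's bit pattern, which matches fabs semantics and leaves NaN payloads untouched.

define half @fabs_via_mask(half %a) {
  %bits   = bitcast half %a to i16
  %masked = and i16 %bits, 32767     ; 0x7FFF clears the sign bit
  %res    = bitcast i16 %masked to half
  ret half %res
}

In PTX terms this corresponds to a single 16-bit and of the value's bits, the same trick the existing v2f16 path uses with and.b32 and 0x7FFF7FFF (2147450879).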

Member

I think the only case where it will matter is when we have a cluster of fp16 ops promoted to f32, with a few abs in the middle. If LLVM decided to do that abs in fp16, that indeed may be slower. I'm not sure how exactly this is handled. In theory it would boil down to a cost analysis between two casts plus an and vs. fp32 fabs, with fp32 fabs winning.

That may be something to teach instcombine about, if we don't already.

In either case, sm_60 and older GPUs are nearly obsolete these days and are not worth spending much effort on. We can just leave things as is.

Member Author

Sounds good, will leave as is for older architectures for now.

Comment on lines 876 to 877
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
Contributor

abs.bf16 and abs.bf16x2 were introduced in PTX ISA 7.0.

9.7.4.6. Half Precision Floating Point Instructions: abs

Contributor

We should add ISD::FABS to this set of cases:

case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
break;

Member Author

I don't think this is necessary; in fact, I suspect this whole case can be removed. hasBF16Math already checks for sm_80, and since sm_80 is not supported until PTX 7.0, we can assume this requirement will be met as well.

bool hasBF16Math() const { return SmVersion >= 80; }

@justinfargnoli
Contributor

Feel free to ignore these two comments; I'll file issues for them.

@justinfargnoli
Contributor

Overall, this LGTM. I'm holding off on approving in case @Artem-B wants you to address a comment before merging.

@AlexMaclean
Member Author

@Artem-B does the latest iteration of this change look good to you?

Member

@Artem-B Artem-B left a comment

LGTM in general, plus a few test nits.

@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; ## Some FP16 support but not for abs
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 \
Member

A few nits on the tests: I would explicitly set the PTX version high enough on all test cases to make sure that the native abs instruction is disabled by the GPU type or by explicit attributes, and not unintentionally by a too-low default PTX version.

Member Author

Okay, I've updated the cases to include each of the following:

  • no f16 abs due to insufficient hardware
  • no f16 abs due to insufficient ptx version (see the RUN-line sketch after this list)
  • no f16 abs due to explicit disable
  • f16 abs supported
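
For reference, a hedged sketch (assumed flags, not the exact RUN lines from the final patch) of how the insufficient-PTX-version case can be pinned; it follows the style of the existing RUN lines and assumes +ptx60 keeps the PTX version below the 6.5 threshold this patch checks:

; ## Some FP16 support, but PTX version too low for abs.f16.
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -mattr=+ptx60 \
; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK-NOF16 %s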

Member

Did you push those changes? The last commit GitHub shows me is e382e8adafe8ccb8.

Member Author

Apologies, I did not; I've just pushed 0c7af950b7dbefcb3ee545ddd74670ffcb6d7869.

@AlexMaclean AlexMaclean force-pushed the dev/amaclean/upstream-f16-abs branch from 0c7af95 to defd1a1 on November 18, 2024 at 20:03
@AlexMaclean AlexMaclean merged commit 5587627 into llvm:main Nov 18, 2024
6 of 7 checks passed