[NVPTX] Improve support for {ex2,lg2}.approx (#120519)

Prince781 · web-flow · commit 3ba339b5e702 · 2025-01-16T12:21:32.000-08:00
- Add support for `@llvm.exp2()`:
  - LLVM: `float`        -&gt; PTX: `ex2.approx{.ftz}.f32`
  - LLVM: `half`         -&gt; PTX: `ex2.approx.f16`
  - LLVM: `&lt;2 x half&gt;`   -&gt; PTX: `ex2.approx.f16x2`
  - LLVM: `bfloat`       -&gt; PTX: `ex2.approx.ftz.bf16`
  - LLVM: `&lt;2 x bfloat&gt;` -&gt; PTX: `ex2.approx.ftz.bf16x2`
  - Any operations with non-native vector widths are expanded. On
    targets not supporting f16/bf16, values are promoted to f32.

- Add *CONDITIONAL* support for `@llvm.log2()` [^1]:
  - LLVM: `float` -&gt; PTX: `lg2.approx{.ftz}.f32`
  - Support for f16/bf16 is emulated by promoting values to f32.

[1]: CUDA implements `exp2()` with `ex2.approx` but `log2()` is
implemented differently, so this is off by default. To enable, use the
flag `-nvptx-approx-log2f32`.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -94,6 +94,13 @@ static cl::opt<bool> UsePrecSqrtF32(
     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
     cl::init(true));
 
+/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
+/// does NOT use lg2.approx for log2, so this is disabled by default.
+static cl::opt<bool> UseApproxLog2F32(
+    "nvptx-approx-log2f32",
+    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
+    cl::init(false));
+
 static cl::opt<bool> ForceMinByValParamAlign(
     "nvptx-force-min-byval-param-align", cl::Hidden,
     cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
@@ -529,6 +536,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     case ISD::FMINIMUM:
       IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
       break;
+    case ISD::FEXP2:
+      IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
+      break;
     }
     setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
   };
@@ -959,7 +969,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
   setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
 
-  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
+  // FEXP2 support:
+  // - f32
+  // - f16/f16x2 (sm_70+, PTX 7.0+)
+  // - bf16/bf16x2 (sm_90+, PTX 7.8+)
+  // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
+  setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+  setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
+  setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
+  setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
+  setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
+
+  // FLOG2 supports f32 only
+  // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
+  if (UseApproxLog2F32) {
+    setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+    setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
+    setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
+    setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand);
+  }
+
   // No FPOW or FREM in PTX.
 
   // Now deduce the information based on the above mentioned
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -569,6 +569,18 @@ multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
 
 }
 
+// Variant where only .ftz.bf16 is supported.
+multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> {
+   def bf16_ftz :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+                           OpcStr # ".ftz.bf16 \t$dst, $a;",
+                           [(set bf16:$dst, (OpNode bf16:$a))]>,
+                           Requires<[hasSM<90>, hasPTX<78>]>;
+   def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+                           OpcStr # ".ftz.bf16x2 \t$dst, $a;",
+                           [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
+                           Requires<[hasSM<90>, hasPTX<78>]>;
+}
+
 //===----------------------------------------------------------------------===//
 // NVPTX Instructions.
 //===----------------------------------------------------------------------===//
@@ -1183,6 +1195,8 @@ defm FNEG_H: F2_Support_Half<"neg", fneg>;
 
 defm FSQRT : F2<"sqrt.rn", fsqrt>;
 
+defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>;
+
 //
 // F16 NEG
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1304,18 +1304,33 @@ def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
 def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
   Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
 def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;",
   Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
 def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;",
   Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
 
+def : Pat<(fexp2 f32:$a),
+          (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(fexp2 f32:$a),
+          (INT_NVVM_EX2_APPROX_F $a)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fexp2 f16:$a),
+          (INT_NVVM_EX2_APPROX_F16 $a)>, Requires<[useFP16Math]>;
+def : Pat<(fexp2 v2f16:$a),
+          (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>;
+
 def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
 def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
 def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
   Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
 
+def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_FTZ_F $a)>,
+          Requires<[doF32FTZ]>;
+def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_F $a)>,
+          Requires<[doNoF32FTZ]>;
+
 //
 // Sin  Cos
 //
diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll
@@ -1,21 +1,37 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | FileCheck %s
-; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %}
+target triple = "nvptx64-nvidia-cuda"
 
 declare half @llvm.nvvm.ex2.approx.f16(half)
 declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>)
 
-; CHECK-LABEL: exp2_half
-define half @exp2_half(half %0) {
-  ; CHECK-NOT: call
-  ; CHECK: ex2.approx.f16
-  %res = call half @llvm.nvvm.ex2.approx.f16(half %0);
+; CHECK-LABEL: ex2_half
+define half @ex2_half(half %0) {
+; CHECK-FP16-LABEL: ex2_half(
+; CHECK-FP16:       {
+; CHECK-FP16-NEXT:    .reg .b16 %rs<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT:  // %bb.0:
+; CHECK-FP16-NEXT:    ld.param.b16 %rs1, [ex2_half_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.f16 %rs2, %rs1;
+; CHECK-FP16-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-FP16-NEXT:    ret;
+  %res = call half @llvm.nvvm.ex2.approx.f16(half %0)
   ret half %res
 }
 
-; CHECK-LABEL: exp2_2xhalf
-define <2 x half> @exp2_2xhalf(<2 x half> %0) {
-  ; CHECK-NOT: call
-  ; CHECK: ex2.approx.f16x2
-  %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0);
+; CHECK-LABEL: ex2_2xhalf
+define <2 x half> @ex2_2xhalf(<2 x half> %0) {
+; CHECK-FP16-LABEL: ex2_2xhalf(
+; CHECK-FP16:       {
+; CHECK-FP16-NEXT:    .reg .b32 %r<3>;
+; CHECK-FP16-EMPTY:
+; CHECK-FP16-NEXT:  // %bb.0:
+; CHECK-FP16-NEXT:    ld.param.b32 %r1, [ex2_2xhalf_param_0];
+; CHECK-FP16-NEXT:    ex2.approx.f16x2 %r2, %r1;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-FP16-NEXT:    ret;
+  %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0)
   ret <2 x half> %res
 }
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_50 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %}
+target triple = "nvptx-nvidia-cuda"
+
+declare float @llvm.nvvm.ex2.approx.f(float)
+
+; CHECK-LABEL: ex2_float
+define float @ex2_float(float %0) {
+; CHECK-LABEL: ex2_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [ex2_float_param_0];
+; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.nvvm.ex2.approx.f(float %0)
+  ret float %res
+}
+
+; CHECK-LABEL: ex2_float_ftz
+define float @ex2_float_ftz(float %0) {
+; CHECK-LABEL: ex2_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [ex2_float_ftz_param_0];
+; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_20 -mattr=+ptx32 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 -mattr=+ptx32 | %ptxas-verify %}
+target triple = "nvptx-nvidia-cuda"
+
+declare float @llvm.nvvm.lg2.approx.f(float)
+declare float @llvm.nvvm.lg2.approx.ftz.f(float)
+
+; CHECK-LABEL: lg2_float
+define float @lg2_float(float %0) {
+; CHECK-LABEL: lg2_float(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [lg2_float_param_0];
+; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.nvvm.lg2.approx.f(float %0)
+  ret float %res
+}
+
+; CHECK-LABEL: lg2_float_ftz
+define float @lg2_float_ftz(float %0) {
+; CHECK-LABEL: lg2_float_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [lg2_float_ftz_param_0];
+; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll