[NVPTX] Check 'contract' fast-math flag in addition to global options (#131372)

AlexMaclean · web-flow · commit bed4c581c2a7 · 2025-03-25T09:20:42.000-07:00
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4950,7 +4950,9 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   if (N0.getOpcode() == ISD::FMUL) {
     const auto *TLI = static_cast<const NVPTXTargetLowering *>(
         &DCI.DAG.getTargetLoweringInfo());
-    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
+    if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
+          (N->getFlags().hasAllowContract() &&
+           N0->getFlags().hasAllowContract())))
       return SDValue();
 
     // For floating point:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -150,8 +150,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
 
 def doMulWide      : Predicate<"doMulWide">;
 
-def allowFMA : Predicate<"allowFMA()">;
-def noFMA : Predicate<"!allowFMA()">;
 def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
 def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;
 
@@ -367,167 +365,89 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
 // This multiclass should be used for nodes that can be folded to make fma ops.
 // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
 // just like the non ".rn" op, but prevents ptxas from creating FMAs.
-multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
-   def f64rr :
-     NVPTXInst<(outs Float64Regs:$dst),
-               (ins Float64Regs:$a, Float64Regs:$b),
-               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
-               [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
-               Requires<[allowFMA]>;
-   def f64ri :
-     NVPTXInst<(outs Float64Regs:$dst),
-               (ins Float64Regs:$a, f64imm:$b),
-               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
-               [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
-               Requires<[allowFMA]>;
-   def f32rr_ftz :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, Float32Regs:$b),
-               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
-               Requires<[allowFMA, doF32FTZ]>;
-   def f32ri_ftz :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, f32imm:$b),
-               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
-               Requires<[allowFMA, doF32FTZ]>;
-   def f32rr :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, Float32Regs:$b),
-               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
-               Requires<[allowFMA]>;
-   def f32ri :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, f32imm:$b),
-               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
-               Requires<[allowFMA]>;
-
-   def f16rr_ftz :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
-               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
-               Requires<[useFP16Math, allowFMA, doF32FTZ]>;
-   def f16rr :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
-               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
-               Requires<[useFP16Math, allowFMA]>;
+multiclass F3<string op_str, SDPatternOperator op_pat> { // One def per FP type.
+  def f64rr :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, Float64Regs:$b),
+              op_str # ".f64 \t$dst, $a, $b;",
+              [(set f64:$dst, (op_pat f64:$a, f64:$b))]>;
+  def f64ri :
+    NVPTXInst<(outs Float64Regs:$dst),
+              (ins Float64Regs:$a, f64imm:$b),
+              op_str # ".f64 \t$dst, $a, $b;",
+              [(set f64:$dst, (op_pat f64:$a, fpimm:$b))]>;
+  def f32rr_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              op_str # ".ftz.f32 \t$dst, $a, $b;",
+              [(set f32:$dst, (op_pat f32:$a, f32:$b))]>,
+              Requires<[doF32FTZ]>;
+  def f32ri_ftz :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              op_str # ".ftz.f32 \t$dst, $a, $b;",
+              [(set f32:$dst, (op_pat f32:$a, fpimm:$b))]>,
+              Requires<[doF32FTZ]>;
+  def f32rr :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, Float32Regs:$b),
+              op_str # ".f32 \t$dst, $a, $b;",
+              [(set f32:$dst, (op_pat f32:$a, f32:$b))]>;
+  def f32ri :
+    NVPTXInst<(outs Float32Regs:$dst),
+              (ins Float32Regs:$a, f32imm:$b),
+              op_str # ".f32 \t$dst, $a, $b;",
+              [(set f32:$dst, (op_pat f32:$a, fpimm:$b))]>;
+  // Scalar f16 forms: register-register only, gated on useFP16Math.
+  def f16rr_ftz :
+    NVPTXInst<(outs Int16Regs:$dst),
+              (ins Int16Regs:$a, Int16Regs:$b),
+              op_str # ".ftz.f16 \t$dst, $a, $b;",
+              [(set f16:$dst, (op_pat f16:$a, f16:$b))]>,
+              Requires<[useFP16Math, doF32FTZ]>;
+  def f16rr :
+    NVPTXInst<(outs Int16Regs:$dst),
+              (ins Int16Regs:$a, Int16Regs:$b),
+              op_str # ".f16 \t$dst, $a, $b;",
+              [(set f16:$dst, (op_pat f16:$a, f16:$b))]>,
+              Requires<[useFP16Math]>;
+  // Packed f16x2 forms, then bf16 forms (bf16 has no .ftz variant here).
+  def f16x2rr_ftz :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$a, Int32Regs:$b),
+              op_str # ".ftz.f16x2 \t$dst, $a, $b;",
+              [(set v2f16:$dst, (op_pat v2f16:$a, v2f16:$b))]>,
+              Requires<[useFP16Math, doF32FTZ]>;
+  def f16x2rr :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$a, Int32Regs:$b),
+              op_str # ".f16x2 \t$dst, $a, $b;",
+              [(set v2f16:$dst, (op_pat v2f16:$a, v2f16:$b))]>,
+              Requires<[useFP16Math]>;
+  def bf16rr :
+    NVPTXInst<(outs Int16Regs:$dst),
+              (ins Int16Regs:$a, Int16Regs:$b),
+              op_str # ".bf16 \t$dst, $a, $b;",
+              [(set bf16:$dst, (op_pat bf16:$a, bf16:$b))]>,
+              Requires<[hasBF16Math]>;
+
+  def bf16x2rr :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$a, Int32Regs:$b),
+              op_str # ".bf16x2 \t$dst, $a, $b;",
+              [(set v2bf16:$dst, (op_pat v2bf16:$a, v2bf16:$b))]>,
+              Requires<[hasBF16Math]>;
+}
 
-   def f16x2rr_ftz :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
-               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
-               Requires<[useFP16Math, allowFMA, doF32FTZ]>;
-   def f16x2rr :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
-               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
-               Requires<[useFP16Math, allowFMA]>;
-   def bf16rr :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
-               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
-               Requires<[hasBF16Math, allowFMA]>;
+// Matches `operator` only if FMA is allowed globally or N has 'contract'.
+class BinOpAllowsFMA<SDPatternOperator operator>
+    : PatFrag<(ops node:$A, node:$B), (operator node:$A, node:$B), [{
+  return allowFMA() || N->getFlags().hasAllowContract();
+}]>;
 
-   def bf16x2rr :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
-               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
-               Requires<[hasBF16Math, allowFMA]>;
-   // These have strange names so we don't perturb existing mir tests.
-   def _rnf64rr :
-     NVPTXInst<(outs Float64Regs:$dst),
-               (ins Float64Regs:$a, Float64Regs:$b),
-               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
-               [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
-               Requires<[noFMA]>;
-   def _rnf64ri :
-     NVPTXInst<(outs Float64Regs:$dst),
-               (ins Float64Regs:$a, f64imm:$b),
-               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
-               [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
-               Requires<[noFMA]>;
-   def _rnf32rr_ftz :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, Float32Regs:$b),
-               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, Float32Regs:$b))]>,
-               Requires<[noFMA, doF32FTZ]>;
-   def _rnf32ri_ftz :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, f32imm:$b),
-               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
-               Requires<[noFMA, doF32FTZ]>;
-   def _rnf32rr :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, Float32Regs:$b),
-               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
-               Requires<[noFMA]>;
-   def _rnf32ri :
-     NVPTXInst<(outs Float32Regs:$dst),
-               (ins Float32Regs:$a, f32imm:$b),
-               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
-               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
-               Requires<[noFMA]>;
-   def _rnf16rr_ftz :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
-               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
-               Requires<[useFP16Math, noFMA, doF32FTZ]>;
-   def _rnf16rr :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
-               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
-               Requires<[useFP16Math, noFMA]>;
-   def _rnf16x2rr_ftz :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
-               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
-               Requires<[useFP16Math, noFMA, doF32FTZ]>;
-   def _rnf16x2rr :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
-               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
-               Requires<[useFP16Math, noFMA]>;
-  def _rnbf16rr_ftz :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
-               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
-               Requires<[hasBF16Math, noFMA, doF32FTZ]>;
-   def _rnbf16rr :
-     NVPTXInst<(outs Int16Regs:$dst),
-               (ins Int16Regs:$a, Int16Regs:$b),
-               !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
-               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
-               Requires<[hasBF16Math, noFMA]>;
-   def _rnbf16x2rr_ftz :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
-               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
-               Requires<[hasBF16Math, noFMA, doF32FTZ]>;
-   def _rnbf16x2rr :
-     NVPTXInst<(outs Int32Regs:$dst),
-               (ins Int32Regs:$a, Int32Regs:$b),
-               !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
-               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
-               Requires<[hasBF16Math, noFMA]>;
+multiclass F3_fma_component<string op_str, SDNode op_node> {
+  defm "" : F3<op_str, BinOpAllowsFMA<op_node>>; // Plain op: FMA permitted.
+  defm _rn : F3<op_str # ".rn", op_node>; // ".rn" op: keeps ptxas from fusing.
 }
 
 // Template for operations which take two f32 or f64 operands.  Provides three
diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
 
 target triple = "nvptx64-unknown-cuda"
@@ -11,25 +12,105 @@ target triple = "nvptx64-unknown-cuda"
 ;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
 ;; we do not form fma.rn at the PTX level and explicitly generate add.rn
 ;; for all adds to prevent ptxas from fusion the ops.
-
-;; FAST-LABEL: @t0
-;; DEFAULT-LABEL: @t0
 define float @t0(float %a, float %b, float %c) {
-;; FAST: fma.rn.f32
-;; DEFAULT: mul.rn.f32
-;; DEFAULT: add.rn.f32
+; FAST-LABEL: t0(
+; FAST:       {
+; FAST-NEXT:    .reg .f32 %f<5>;
+; FAST-EMPTY:
+; FAST-NEXT:  // %bb.0:
+; FAST-NEXT:    ld.param.f32 %f1, [t0_param_0];
+; FAST-NEXT:    ld.param.f32 %f2, [t0_param_1];
+; FAST-NEXT:    ld.param.f32 %f3, [t0_param_2];
+; FAST-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
+; FAST-NEXT:    st.param.f32 [func_retval0], %f4;
+; FAST-NEXT:    ret;
+;
+; DEFAULT-LABEL: t0(
+; DEFAULT:       {
+; DEFAULT-NEXT:    .reg .f32 %f<6>;
+; DEFAULT-EMPTY:
+; DEFAULT-NEXT:  // %bb.0:
+; DEFAULT-NEXT:    ld.param.f32 %f1, [t0_param_0];
+; DEFAULT-NEXT:    ld.param.f32 %f2, [t0_param_1];
+; DEFAULT-NEXT:    mul.rn.f32 %f3, %f1, %f2;
+; DEFAULT-NEXT:    ld.param.f32 %f4, [t0_param_2];
+; DEFAULT-NEXT:    add.rn.f32 %f5, %f3, %f4;
+; DEFAULT-NEXT:    st.param.f32 [func_retval0], %f5;
+; DEFAULT-NEXT:    ret;
   %v0 = fmul float %a, %b
   %v1 = fadd float %v0, %c
   ret float %v1
 }
 
-;; FAST-LABEL: @t1
-;; DEFAULT-LABEL: @t1
-define float @t1(float %a, float %b) {
 ;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
 ;; to prevent ptxas from fusing this with anything else.
-;; FAST: add.f32
-;; DEFAULT: add.rn.f32
+define float @t1(float %a, float %b) {
+; FAST-LABEL: t1(
+; FAST:       {
+; FAST-NEXT:    .reg .f32 %f<6>;
+; FAST-EMPTY:
+; FAST-NEXT:  // %bb.0:
+; FAST-NEXT:    ld.param.f32 %f1, [t1_param_0];
+; FAST-NEXT:    ld.param.f32 %f2, [t1_param_1];
+; FAST-NEXT:    add.f32 %f3, %f1, %f2;
+; FAST-NEXT:    sub.f32 %f4, %f1, %f2;
+; FAST-NEXT:    mul.f32 %f5, %f3, %f4;
+; FAST-NEXT:    st.param.f32 [func_retval0], %f5;
+; FAST-NEXT:    ret;
+;
+; DEFAULT-LABEL: t1(
+; DEFAULT:       {
+; DEFAULT-NEXT:    .reg .f32 %f<6>;
+; DEFAULT-EMPTY:
+; DEFAULT-NEXT:  // %bb.0:
+; DEFAULT-NEXT:    ld.param.f32 %f1, [t1_param_0];
+; DEFAULT-NEXT:    ld.param.f32 %f2, [t1_param_1];
+; DEFAULT-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; DEFAULT-NEXT:    sub.rn.f32 %f4, %f1, %f2;
+; DEFAULT-NEXT:    mul.rn.f32 %f5, %f3, %f4;
+; DEFAULT-NEXT:    st.param.f32 [func_retval0], %f5;
+; DEFAULT-NEXT:    ret;
   %v1 = fadd float %a, %b
+  %v2 = fsub float %a, %b
+  %v3 = fmul float %v1, %v2
+  ret float %v3
+}
+
+;; Make sure we generate the non-".rn" versions when the "contract" flag is
+;; present on the instructions, regardless of the -fp-contract setting.
+define float @t2(float %a, float %b) {
+; CHECK-LABEL: t2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [t2_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [t2_param_1];
+; CHECK-NEXT:    add.f32 %f3, %f1, %f2;
+; CHECK-NEXT:    sub.f32 %f4, %f1, %f2;
+; CHECK-NEXT:    mul.f32 %f5, %f3, %f4;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f5;
+; CHECK-NEXT:    ret;
+  %v1 = fadd contract float %a, %b
+  %v2 = fsub contract float %a, %b
+  %v3 = fmul contract float %v1, %v2
+  ret float %v3
+}
+
+;; Make sure we always fold to fma when the "contract" flag is present
+define float @t3(float %a, float %b, float %c) {
+; CHECK-LABEL: t3(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [t3_param_0];
+; CHECK-NEXT:    ld.param.f32 %f2, [t3_param_1];
+; CHECK-NEXT:    ld.param.f32 %f3, [t3_param_2];
+; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    ret;
+  %v0 = fmul contract float %a, %b
+  %v1 = fadd contract float %v0, %c
   ret float %v1
 }