AMDGPU: Implement llvm.get.fpmode

arsenm · arsenm · commit 17bd80601efe · 2023-09-10T10:19:19.000+03:00
Currently s_getreg_b32 is missing the possible mode use. Really we need separate pseudos for mode-only accesses, but leave this as a pre-existing issue. https://reviews.llvm.org/D152710
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
@@ -999,6 +999,13 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   :ref:`llvm.stacksave.p5 <int_stacksave>`         Implemented, must use the alloca address space.
   :ref:`llvm.stackrestore.p5 <int_stackrestore>`   Implemented, must use the alloca address space.
 
+  :ref:`llvm.get.fpmode.i32 <int_get_fpmode>`      The natural floating-point mode type is i32. This is
+                                                   implemented by extracting relevant bits out of the MODE
+                                                   register with s_getreg_b32. The first 10 bits are the
+                                                   core floating-point mode. Bits 12:18 are the exception
+                                                   mask. On gfx9+, bit 23 is FP16_OVFL. Bitfields not
+                                                   relevant to floating-point instructions are 0s.
+
   :ref:`llvm.get.rounding<int_get_rounding>`       AMDGPU supports two separately controllable rounding
                                                    modes depending on the floating-point type. One
                                                    controls float, and the other controls both double and
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -25660,6 +25660,7 @@ The '``llvm.reset.fpenv``' intrinsic sets the current floating-point environment
 to default state. It is similar to the call 'fesetenv(FE_DFL_ENV)', except it
 does not return any value.
 
+.. _int_get_fpmode:
 
 '``llvm.get.fpmode``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -757,6 +757,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
 
+  // TODO: Could move this to custom lowering, which could benefit from
+  // combines on extracts of the relevant bits.
+  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
+
   setTargetDAGCombine({ISD::ADD,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -870,6 +870,8 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
 
 // This is hasSideEffects to allow its use in readcyclecounter selection.
 // FIXME: Need to truncate immediate to 16-bits.
+// FIXME: Missing mode register use. Should have separate pseudos for
+// accesses that may read MODE and for accesses that only read MODE.
 def S_GETREG_B32 : SOPK_Pseudo <
   "s_getreg_b32",
   (outs SReg_32:$sdst), (ins hwreg:$simm16),
@@ -1424,6 +1426,66 @@ def : GCNPat <
     (S_WAIT_EVENT (i16 0))
 >;
 
+// The first 10 bits of the mode register are the core FP mode on all
+// subtargets.
+//
+// The high bits include additional fields, intermixed with some
+// non-floating point environment information. We extract the full
+// register and clear non-relevant bits.
+//
+// EXCP_EN covers floating point exceptions, but also some other
+// non-FP exceptions.
+//
+// Bits 12-18 cover the relevant exception mask on all subtargets.
+//
+// FIXME: Bit 18 is int_div0, should this be in the FP environment? I
+// think the only source is v_rcp_iflag_i32.
+//
+// On GFX9+:
+// Bit 23 is the additional FP16_OVFL mode.
+//
+// Bits 19, 20, and 21 cover non-FP exceptions and differ between
+// gfx9/10/11, so we ignore them here.
+
+// TODO: Would it be cheaper to emit multiple s_getreg_b32 calls for
+// the ranges and combine the results?
+
+// Core FP mode: two 4-bit fields (round, denorm) then two single-bit flags.
+defvar fp_round_mask = !sub(!shl(1, 4), 1);
+defvar fp_denorm_mask = !shl(fp_round_mask, 4);
+defvar dx10_clamp_mask = !shl(1, 8);
+defvar ieee_mode_mask = !shl(1, 9);
+defvar fpmode_mask =
+  !or(ieee_mode_mask, dx10_clamp_mask, fp_denorm_mask, fp_round_mask);
+
+// A 7-bit exception-enable field at bit 12, plus the FP16_OVFL bit (bit 23).
+defvar fp_excp_en_mask = !shl(!sub(!shl(1, 7), 1), 12);
+defvar fp16_ovfl = !shl(1, 23);
+defvar fpmode_mask_gfx6plus = !or(fp_excp_en_mask, fpmode_mask);
+defvar fpmode_mask_gfx9plus = !or(fp16_ovfl, fpmode_mask_gfx6plus);
+
+// Select get_fpmode as an S_GETREG_B32 of MODE whose result is ANDed with
+// fpmode_mask. NOTE(review): the getHwRegImm arguments look like
+// (register, offset, width); width here is 1 + the mask's top bit index.
+class GetFPModePat<int fpmode_mask> : GCNPat<
+  (i32 get_fpmode),
+  (S_AND_B32 (i32 fpmode_mask), (S_GETREG_B32 getHwRegImm<
+     HWREG.MODE, 0, !add(1, !logtwo(fpmode_mask))>.ret))>;
+
+// TODO: Might be worth moving to custom lowering so the AND is
+// exposed to demanded-bits optimizations. Most users probably only
+// care about the rounding or denorm mode bits. We could also shrink
+// the getreg immediate to read only the demanded bits.
+// On GFX9+ the mask additionally covers FP16_OVFL (bit 23), which is
+// then the last bit read.
+let SubtargetPredicate = isGFX9Plus in
+def : GetFPModePat<fpmode_mask_gfx9plus>;
+
+// Otherwise the last bit read is EXCP_EN.int_div0
+// (bit 18).
+let SubtargetPredicate = isNotGFX9Plus in
+def : GetFPModePat<fpmode_mask_gfx6plus>;
+
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll