Commit 5f8ee45

AMDGPU: Implement llvm.get.rounding

There are really two rounding modes, so only return the standard values if both modes are the same. Otherwise, return an extended value representing the two modes. Annoyingly, the register does not use the same values as FLT_ROUNDS, so a simple integer table, indexed by a shift, is used to convert.

https://reviews.llvm.org/D153158

9 files changed: 380 additions and 0 deletions

llvm/docs/AMDGPUUsage.rst
Lines changed: 30 additions & 0 deletions

@@ -999,6 +999,20 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   :ref:`llvm.stacksave.p5 <int_stacksave>`       Implemented, must use the alloca address space.
   :ref:`llvm.stackrestore.p5 <int_stackrestore>` Implemented, must use the alloca address space.

+  :ref:`llvm.get.rounding<int_get_rounding>`     AMDGPU supports two separately controllable rounding
+                                                 modes depending on the floating-point type. One
+                                                 controls float, and the other controls both double and
+                                                 half operations. If both modes are the same, returns
+                                                 one of the standard return values. If the modes are
+                                                 different, returns one of :ref:`12 extended values
+                                                 <amdgpu-rounding-mode-enumeration-values-table>`
+                                                 describing the two modes.
+
+                                                 To nearest, ties away from zero is not a supported
+                                                 mode. The raw rounding mode values in the MODE
+                                                 register do not exactly match the FLT_ROUNDS values,
+                                                 so a conversion is performed.
+
   llvm.amdgcn.wave.reduce.umin                   Performs an arithmetic unsigned min reduction on the unsigned values
                                                  provided by each lane in the wavefront.
                                                  Intrinsic takes a hint for reduction strategy using second operand

@@ -4916,6 +4930,22 @@ The fields used by CP for code objects before V3 also match those specified in
     FLOAT_ROUND_MODE_ZERO                  3     Round Toward 0
     ====================================== ===== ==============================

+
+  .. table:: Extended FLT_ROUNDS Enumeration Values
+     :name: amdgpu-rounding-mode-enumeration-values-table
+
+     +------------------------+---------------+-------------------+--------------------+----------+
+     |                        | F32 NEAR_EVEN | F32 PLUS_INFINITY | F32 MINUS_INFINITY | F32 ZERO |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 NEAR_EVEN      |       1       |        11         |         14         |    17    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 PLUS_INFINITY  |       8       |         2         |         15         |    18    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 MINUS_INFINITY |       9       |        12         |          3         |    19    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 ZERO           |      10       |        13         |         16         |     0    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+
   ..

   .. table:: Floating Point Denorm Mode Enumeration Values
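
As a reading aid for the extended table above, the following standalone sketch (not part of this commit; the helper name decodeExtendedFltRounds is invented for illustration) splits one of the 12 extended return values back into the two standard FLT_ROUNDS values it represents, the first for float and the second for double/half:

#include <cassert>
#include <utility>

// Hypothetical helper, not part of the patch: split an extended
// llvm.get.rounding value (8..19) into the pair of standard FLT_ROUNDS
// values it stands for: {F32 mode, F64/F16 mode}.
static std::pair<int, int> decodeExtendedFltRounds(int Val) {
  assert(Val >= 8 && Val <= 19 && "not an extended value");
  // Hardware encodings 0..3 (NEAR_EVEN, PLUS_INF, MINUS_INF, ZERO) mapped to
  // the corresponding FLT_ROUNDS values.
  constexpr int HwToFltRounds[4] = {1, 2, 3, 0};
  int F32Hw = (Val - 8) / 3;           // Each F32 mode owns 3 extended entries.
  int Slot = (Val - 8) % 3;            // Position among the non-matching F64 modes.
  int F64Hw = Slot + (Slot >= F32Hw);  // Skip the F64 mode equal to the F32 one.
  return {HwToFltRounds[F32Hw], HwToFltRounds[F64Hw]};
}

int main() {
  // 17: F32 toward zero, F64/F16 nearest even.
  assert(decodeExtendedFltRounds(17) == std::make_pair(0, 1));
  // 12: F32 toward +infinity, F64/F16 toward -infinity.
  assert(decodeExtendedFltRounds(12) == std::make_pair(2, 3));
  return 0;
}

For instance, 17 decodes to float rounding toward zero (0) paired with double/half rounding to nearest even (1), matching the F32 ZERO column of the F64/F16 NEAR_EVEN row.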

llvm/docs/LangRef.rst
Lines changed: 2 additions & 0 deletions

@@ -25478,6 +25478,8 @@ These functions read or write floating point environment, such as rounding
 mode or state of floating point exceptions. Altering the floating point
 environment requires special care. See :ref:`Floating Point Environment <floatenv>`.

+.. _int_get_rounding:
+
 '``llvm.get.rounding``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

llvm/docs/ReleaseNotes.rst
Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ Changes to the AMDGPU Backend

 * Implemented `llvm.stacksave` and `llvm.stackrestore` intrinsics.

+* Implemented :ref:`llvm.get.rounding <int_get_rounding>`
+
 Changes to the ARM Backend
 --------------------------

llvm/include/llvm/CodeGen/ISDOpcodes.h
Lines changed: 1 addition & 0 deletions

@@ -872,6 +872,7 @@ enum NodeType {
   ///   2 Round to +inf
   ///   3 Round to -inf
   ///   4 Round to nearest, ties to zero
+  ///   Other values are target dependent.
   /// Result is rounding mode and chain. Input is a chain.
   GET_ROUNDING,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 74 additions & 0 deletions

@@ -755,6 +755,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                  Custom);

   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);

   setTargetDAGCombine({ISD::ADD,
                        ISD::UADDO_CARRY,

@@ -3541,6 +3542,77 @@ SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
 }

+SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  assert(Op.getValueType() == MVT::i32);
+
+  uint32_t BothRoundHwReg =
+      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+  SDValue IntrinID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
+  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
+                               Op.getOperand(0), IntrinID, GetRoundBothImm);
+
+  // There are two rounding modes, one for f32 and one for f64/f16. We only
+  // report in the standard value range if both are the same.
+  //
+  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
+  // ties away from zero is not supported, and the other values are rotated by
+  // 1.
+  //
+  // If the two rounding modes are not the same, report a target defined value.
+
+  // Mode register rounding mode fields:
+  //
+  // [1:0] Single-precision round mode.
+  // [3:2] Double/Half-precision round mode.
+  //
+  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
+  //
+  //                Hardware   Spec
+  //  Toward-0         3         0
+  //  Nearest Even     0         1
+  //  +Inf             1         2
+  //  -Inf             2         3
+  //  NearestAway0    N/A        4
+  //
+  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
+  // table we can index by the raw hardware mode.
+  //
+  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
+
+  SDValue BitTable =
+      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
+
+  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+  SDValue RoundModeTimesNumBits =
+      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
+
+  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
+  // knew only one mode was demanded.
+  SDValue TableValue =
+      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
+  SDValue TableEntry =
+      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
+
+  // There's a gap in the 4-bit encoded table and actual enum values, so offset
+  // if it's an extended value.
+  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+  SDValue IsStandardValue =
+      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
+  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
+  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+                               TableEntry, EnumOffset);
+
+  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)

@@ -5050,6 +5122,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::STACKSAVE:
     return LowerSTACKSAVE(Op, DAG);
+  case ISD::GET_ROUNDING:
+    return lowerGET_ROUNDING(Op, DAG);
   }
   return SDValue();
 }
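
For reference, the node sequence built above reduces to the following scalar computation (a sketch only, assuming the 4-bit MODE.fp_round field has already been read with s_getreg; the function name is illustrative and Table stands in for AMDGPU::FltRoundConversionTable):

#include <cstdint>

// Scalar model of the lowering: shift the 64-bit table right by 4 * mode,
// keep the low nibble, and rebase entries above 3 into the extended range.
static int getRoundingFromHwMode(uint32_t ModeFpRound, uint64_t Table) {
  uint32_t BitIndex = ModeFpRound << 2;                // SHL by 2: 4 bits per entry.
  uint32_t Entry = uint32_t(Table >> BitIndex) & 0xf;  // SRL, TRUNCATE, AND 0xf.
  // SETULT 4 + SELECT: entries 0..3 are already standard FLT_ROUNDS values;
  // larger entries get the extended-value offset added back.
  return Entry < 4 ? int(Entry) : int(Entry) + 4;
}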

llvm/lib/Target/AMDGPU/SIISelLowering.h
Lines changed: 1 addition & 0 deletions

@@ -411,6 +411,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
Lines changed: 132 additions & 0 deletions

@@ -36,3 +36,135 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
     FP64FP16Denormals = DenormMode;
   }
 }
+
+using namespace AMDGPU;
+
+/// Combine f32 and f64 rounding modes into a combined rounding mode value.
+static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val,
+                                                   uint32_t HWFP64Val) {
+  return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset;
+}
+
+static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal,
+                                               uint32_t HWF32Val,
+                                               uint32_t HWF64Val) {
+  uint32_t ModeVal = getModeRegisterRoundMode(HWF32Val, HWF64Val);
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+
+  uint32_t BitIndex = ModeVal << 2;
+  return static_cast<uint64_t>(FltRoundsVal) << BitIndex;
+}
+
+// Encode FLT_ROUNDS value where the two rounding modes are the same and use a
+// standard value
+static constexpr uint64_t
+encodeFltRoundsTableSame(AMDGPUFltRounds FltRoundsMode, uint32_t HWVal) {
+  return encodeFltRoundsTable(FltRoundsMode, HWVal, HWVal);
+}
+
+// Convert mode register encoded rounding mode to AMDGPUFltRounds
+static constexpr AMDGPUFltRounds
+decodeIndexFltRoundConversionTable(uint32_t HWMode) {
+  uint32_t TableRead = (FltRoundConversionTable >> (HWMode << 2)) & 0xf;
+  if (TableRead > TowardNegative)
+    TableRead += ExtendedFltRoundOffset;
+  return static_cast<AMDGPUFltRounds>(TableRead);
+}
+
+static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO;
+static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST;
+static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF;
+static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF;
+
+constexpr uint64_t AMDGPU::FltRoundConversionTable =
+    encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) |
+    encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64,
+                             HWNearestTiesToEven) |
+    encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64,
+                             HWTowardPositive) |
+    encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64,
+                             HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero,
+                         HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero,
+                         HWTowardPositive) |
+    encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64,
+                         HWNearestTiesToEven, HWTowardZero) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64,
+                         HWNearestTiesToEven, HWTowardPositive) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64,
+                         HWNearestTiesToEven, HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64,
+                         HWTowardPositive, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64,
+                         HWTowardNegative, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative,
+                         HWTowardPositive);
+
+// Verify evaluation of FltRoundConversionTable
+
+// If both modes are the same, should return the standard values.
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWNearestTiesToEven)) ==
+              AMDGPUFltRounds::NearestTiesToEven);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardPositive)) ==
+              AMDGPUFltRounds::TowardPositive);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardNegative)) ==
+              AMDGPUFltRounds::TowardNegative);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWNearestTiesToEven)) ==
+              TowardZeroF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) ==
+              TowardZeroF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) ==
+              TowardZeroF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardZero)) ==
+              NearestTiesToEvenF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardPositive)) ==
+              NearestTiesToEvenF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardNegative)) ==
+              NearestTiesToEvenF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) ==
+              TowardPositiveF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWNearestTiesToEven)) ==
+              TowardPositiveF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardNegative)) ==
+              TowardPositiveF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) ==
+              TowardNegativeF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWNearestTiesToEven)) ==
+              TowardNegativeF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardPositive)) ==
+              TowardNegativeF32_TowardPositiveF64);

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
Lines changed: 59 additions & 0 deletions

@@ -85,6 +85,65 @@ struct SIModeRegisterDefaults {
   }
 };

+namespace AMDGPU {
+
+/// Return values used for llvm.get.rounding
+///
+/// When both the F32 and F64/F16 modes are the same, returns the standard
+/// values. If they differ, returns an extended mode starting at 8.
+enum AMDGPUFltRounds : int8_t {
+  // Inherit everything from RoundingMode
+  TowardZero = static_cast<int8_t>(RoundingMode::TowardZero),
+  NearestTiesToEven = static_cast<int8_t>(RoundingMode::NearestTiesToEven),
+  TowardPositive = static_cast<int8_t>(RoundingMode::TowardPositive),
+  TowardNegative = static_cast<int8_t>(RoundingMode::TowardNegative),
+  NearestTiesToAwayUnsupported =
+      static_cast<int8_t>(RoundingMode::NearestTiesToAway),
+
+  Dynamic = static_cast<int8_t>(RoundingMode::Dynamic),
+
+  // Permute the mismatched rounding mode cases. If the modes are the same, use
+  // the standard values, otherwise, these values are sorted such that higher
+  // hardware encoded values have higher enum values.
+  NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven,
+  NearestTiesToEvenF32_TowardPositiveF64 = 8,
+  NearestTiesToEvenF32_TowardNegativeF64 = 9,
+  NearestTiesToEvenF32_TowardZeroF64 = 10,
+
+  TowardPositiveF32_NearestTiesToEvenF64 = 11,
+  TowardPositiveF32_TowardPositiveF64 = TowardPositive,
+  TowardPositiveF32_TowardNegativeF64 = 12,
+  TowardPositiveF32_TowardZeroF64 = 13,
+
+  TowardNegativeF32_NearestTiesToEvenF64 = 14,
+  TowardNegativeF32_TowardPositiveF64 = 15,
+  TowardNegativeF32_TowardNegativeF64 = TowardNegative,
+  TowardNegativeF32_TowardZeroF64 = 16,
+
+  TowardZeroF32_NearestTiesToEvenF64 = 17,
+  TowardZeroF32_TowardPositiveF64 = 18,
+  TowardZeroF32_TowardNegativeF64 = 19,
+  TowardZeroF32_TowardZeroF64 = TowardZero,
+
+  Invalid = static_cast<int8_t>(RoundingMode::Invalid)
+};
+
+/// Offset of nonstandard values for llvm.get.rounding results from the largest
+/// supported mode.
+static constexpr uint32_t ExtendedFltRoundOffset = 4;
+
+/// Offset in mode register of f32 rounding mode.
+static constexpr uint32_t F32FltRoundOffset = 0;
+
+/// Offset in mode register of f64/f16 rounding mode.
+static constexpr uint32_t F64FltRoundOffset = 2;
+
+// Bit indexed table to convert from hardware rounding mode values to
+// FLT_ROUNDS values.
+extern const uint64_t FltRoundConversionTable;
+
+} // end namespace AMDGPU
+
 } // end namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
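
A small consumer-side sketch (hypothetical helpers, not part of the patch) of how a result of llvm.get.rounding on AMDGPU could be classified against this enum:

#include <cstdint>

// 0..3 are the standard FLT_ROUNDS values, reported when both MODE fields
// agree; 8..19 are the extended values describing a mismatched pair.
static bool isStandardFltRounds(int32_t Rounds) {
  return Rounds >= 0 && Rounds <= 3; // Ties-away (4) is never produced here.
}

static bool isExtendedAMDGPUFltRounds(int32_t Rounds) {
  return Rounds >= 8 && Rounds <= 19;
}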
