Commit 5f8ee45

AMDGPU: Implement llvm.get.rounding

There are really two rounding modes, so only return the standard values if both modes are the same. Otherwise, return an extended value representing the two modes. Annoyingly, the register does not use the same values as FLT_ROUNDS, so a simple integer table, indexed by a shift, is used to convert.

https://reviews.llvm.org/D153158

9 files changed: 380 additions and 0 deletions

llvm/docs/AMDGPUUsage.rst
Lines changed: 30 additions & 0 deletions

@@ -999,6 +999,20 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   :ref:`llvm.stacksave.p5 <int_stacksave>`       Implemented, must use the alloca address space.
   :ref:`llvm.stackrestore.p5 <int_stackrestore>` Implemented, must use the alloca address space.

+  :ref:`llvm.get.rounding<int_get_rounding>`     AMDGPU supports two separately controllable rounding
+                                                 modes depending on the floating-point type. One
+                                                 controls float, and the other controls both double and
+                                                 half operations. If both modes are the same, returns
+                                                 one of the standard return values. If the modes are
+                                                 different, returns one of :ref:`12 extended values
+                                                 <amdgpu-rounding-mode-enumeration-values-table>`
+                                                 describing the two modes.
+
+                                                 To nearest, ties away from zero is not a supported
+                                                 mode. The raw rounding mode values in the MODE
+                                                 register do not exactly match the FLT_ROUNDS values,
+                                                 so a conversion is performed.
+
   llvm.amdgcn.wave.reduce.umin                   Performs an arithmetic unsigned min reduction on the unsigned values
                                                  provided by each lane in the wavefront.
                                                  Intrinsic takes a hint for reduction strategy using second operand

@@ -4916,6 +4930,22 @@ The fields used by CP for code objects before V3 also match those specified in
     FLOAT_ROUND_MODE_ZERO                  3     Round Toward 0
     ====================================== ===== ==============================

+
+  .. table:: Extended FLT_ROUNDS Enumeration Values
+     :name: amdgpu-rounding-mode-enumeration-values-table
+
+     +------------------------+---------------+-------------------+--------------------+----------+
+     |                        | F32 NEAR_EVEN | F32 PLUS_INFINITY | F32 MINUS_INFINITY | F32 ZERO |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 NEAR_EVEN      |       1       |        11         |         14         |    17    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 PLUS_INFINITY  |       8       |         2         |         15         |    18    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 MINUS_INFINITY |       9       |        12         |          3         |    19    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+     | F64/F16 ZERO           |      10       |        13         |         16         |     0    |
+     +------------------------+---------------+-------------------+--------------------+----------+
+
   ..

   .. table:: Floating Point Denorm Mode Enumeration Values
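
As a reading aid for the extended table above, the following standalone sketch (not part of this commit; the helper name decodeExtendedFltRounds is invented for illustration) splits one of the 12 extended return values back into the two standard FLT_ROUNDS values it represents, the first for float and the second for double/half:

#include <cassert>
#include <utility>

// Hypothetical helper, not part of the patch: split an extended
// llvm.get.rounding value (8..19) into the pair of standard FLT_ROUNDS
// values it stands for: {F32 mode, F64/F16 mode}.
static std::pair<int, int> decodeExtendedFltRounds(int Val) {
  assert(Val >= 8 && Val <= 19 && "not an extended value");
  // Hardware encodings 0..3 (NEAR_EVEN, PLUS_INF, MINUS_INF, ZERO) mapped to
  // the corresponding FLT_ROUNDS values.
  constexpr int HwToFltRounds[4] = {1, 2, 3, 0};
  int F32Hw = (Val - 8) / 3;           // Each F32 mode owns 3 extended entries.
  int Slot = (Val - 8) % 3;            // Position among the non-matching F64 modes.
  int F64Hw = Slot + (Slot >= F32Hw);  // Skip the F64 mode equal to the F32 one.
  return {HwToFltRounds[F32Hw], HwToFltRounds[F64Hw]};
}

int main() {
  // 17: F32 toward zero, F64/F16 nearest even.
  assert(decodeExtendedFltRounds(17) == std::make_pair(0, 1));
  // 12: F32 toward +infinity, F64/F16 toward -infinity.
  assert(decodeExtendedFltRounds(12) == std::make_pair(2, 3));
  return 0;
}

For instance, 17 decodes to float rounding toward zero (0) paired with double/half rounding to nearest even (1), matching the F32 ZERO column of the F64/F16 NEAR_EVEN row.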

llvm/docs/LangRef.rst
Lines changed: 2 additions & 0 deletions

@@ -25478,6 +25478,8 @@ These functions read or write floating point environment, such as rounding
 mode or state of floating point exceptions. Altering the floating point
 environment requires special care. See :ref:`Floating Point Environment <floatenv>`.

+.. _int_get_rounding:
+
 '``llvm.get.rounding``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

llvm/docs/ReleaseNotes.rst
Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ Changes to the AMDGPU Backend

 * Implemented `llvm.stacksave` and `llvm.stackrestore` intrinsics.

+* Implemented :ref:`llvm.get.rounding <int_get_rounding>`
+
 Changes to the ARM Backend
 --------------------------

llvm/include/llvm/CodeGen/ISDOpcodes.h
Lines changed: 1 addition & 0 deletions

@@ -872,6 +872,7 @@ enum NodeType {
   ///   2 Round to +inf
   ///   3 Round to -inf
   ///   4 Round to nearest, ties to zero
+  ///   Other values are target dependent.
   /// Result is rounding mode and chain. Input is a chain.
   GET_ROUNDING,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 74 additions & 0 deletions

@@ -755,6 +755,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                  Custom);

   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);

   setTargetDAGCombine({ISD::ADD,
                        ISD::UADDO_CARRY,

@@ -3541,6 +3542,77 @@ SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
 }

+SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  assert(Op.getValueType() == MVT::i32);
+
+  uint32_t BothRoundHwReg =
+      AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+  SDValue IntrinID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
+  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
+                               Op.getOperand(0), IntrinID, GetRoundBothImm);
+
+  // There are two rounding modes, one for f32 and one for f64/f16. We only
+  // report in the standard value range if both are the same.
+  //
+  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
+  // ties away from zero is not supported, and the other values are rotated by
+  // 1.
+  //
+  // If the two rounding modes are not the same, report a target defined value.
+
+  // Mode register rounding mode fields:
+  //
+  // [1:0] Single-precision round mode.
+  // [3:2] Double/Half-precision round mode.
+  //
+  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
+  //
+  //                Hardware   Spec
+  //  Toward-0         3         0
+  //  Nearest Even     0         1
+  //  +Inf             1         2
+  //  -Inf             2         3
+  //  NearestAway0    N/A        4
+  //
+  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
+  // table we can index by the raw hardware mode.
+  //
+  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
+
+  SDValue BitTable =
+      DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
+
+  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+  SDValue RoundModeTimesNumBits =
+      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
+
+  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
+  // knew only one mode was demanded.
+  SDValue TableValue =
+      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
+  SDValue TableEntry =
+      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
+
+  // There's a gap in the 4-bit encoded table and actual enum values, so offset
+  // if it's an extended value.
+  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+  SDValue IsStandardValue =
+      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
+  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
+  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+                               TableEntry, EnumOffset);
+
+  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)

@@ -5050,6 +5122,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::STACKSAVE:
     return LowerSTACKSAVE(Op, DAG);
+  case ISD::GET_ROUNDING:
+    return lowerGET_ROUNDING(Op, DAG);
   }
   return SDValue();
 }
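
For reference, the node sequence built above reduces to the following scalar computation (a sketch only, assuming the 4-bit MODE.fp_round field has already been read with s_getreg; the function name is illustrative and Table stands in for AMDGPU::FltRoundConversionTable):

#include <cstdint>

// Scalar model of the lowering: shift the 64-bit table right by 4 * mode,
// keep the low nibble, and rebase entries above 3 into the extended range.
static int getRoundingFromHwMode(uint32_t ModeFpRound, uint64_t Table) {
  uint32_t BitIndex = ModeFpRound << 2;                // SHL by 2: 4 bits per entry.
  uint32_t Entry = uint32_t(Table >> BitIndex) & 0xf;  // SRL, TRUNCATE, AND 0xf.
  // SETULT 4 + SELECT: entries 0..3 are already standard FLT_ROUNDS values;
  // larger entries get the extended-value offset added back.
  return Entry < 4 ? int(Entry) : int(Entry) + 4;
}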

llvm/lib/Target/AMDGPU/SIISelLowering.h
Lines changed: 1 addition & 0 deletions

@@ -411,6 +411,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
Lines changed: 132 additions & 0 deletions

@@ -36,3 +36,135 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
     FP64FP16Denormals = DenormMode;
   }
 }
+
+using namespace AMDGPU;
+
+/// Combine f32 and f64 rounding modes into a combined rounding mode value.
+static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val,
+                                                   uint32_t HWFP64Val) {
+  return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset;
+}
+
+static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal,
+                                               uint32_t HWF32Val,
+                                               uint32_t HWF64Val) {
+  uint32_t ModeVal = getModeRegisterRoundMode(HWF32Val, HWF64Val);
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+
+  uint32_t BitIndex = ModeVal << 2;
+  return static_cast<uint64_t>(FltRoundsVal) << BitIndex;
+}
+
+// Encode FLT_ROUNDS value where the two rounding modes are the same and use a
+// standard value
+static constexpr uint64_t
+encodeFltRoundsTableSame(AMDGPUFltRounds FltRoundsMode, uint32_t HWVal) {
+  return encodeFltRoundsTable(FltRoundsMode, HWVal, HWVal);
+}
+
+// Convert mode register encoded rounding mode to AMDGPUFltRounds
+static constexpr AMDGPUFltRounds
+decodeIndexFltRoundConversionTable(uint32_t HWMode) {
+  uint32_t TableRead = (FltRoundConversionTable >> (HWMode << 2)) & 0xf;
+  if (TableRead > TowardNegative)
+    TableRead += ExtendedFltRoundOffset;
+  return static_cast<AMDGPUFltRounds>(TableRead);
+}
+
+static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO;
+static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST;
+static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF;
+static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF;
+
+constexpr uint64_t AMDGPU::FltRoundConversionTable =
+    encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) |
+    encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64,
+                             HWNearestTiesToEven) |
+    encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64,
+                             HWTowardPositive) |
+    encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64,
+                             HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero,
+                         HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero,
+                         HWTowardPositive) |
+    encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64,
+                         HWNearestTiesToEven, HWTowardZero) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64,
+                         HWNearestTiesToEven, HWTowardPositive) |
+    encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64,
+                         HWNearestTiesToEven, HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64,
+                         HWTowardPositive, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive,
+                         HWTowardNegative) |
+
+    encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative,
+                         HWTowardZero) |
+    encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64,
+                         HWTowardNegative, HWNearestTiesToEven) |
+    encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative,
+                         HWTowardPositive);
+
+// Verify evaluation of FltRoundConversionTable
+
+// If both modes are the same, should return the standard values.
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWNearestTiesToEven)) ==
+              AMDGPUFltRounds::NearestTiesToEven);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardPositive)) ==
+              AMDGPUFltRounds::TowardPositive);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardNegative)) ==
+              AMDGPUFltRounds::TowardNegative);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardZero, HWNearestTiesToEven)) ==
+              TowardZeroF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) ==
+              TowardZeroF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) ==
+              TowardZeroF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardZero)) ==
+              NearestTiesToEvenF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardPositive)) ==
+              NearestTiesToEvenF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWNearestTiesToEven, HWTowardNegative)) ==
+              NearestTiesToEvenF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) ==
+              TowardPositiveF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWNearestTiesToEven)) ==
+              TowardPositiveF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardPositive, HWTowardNegative)) ==
+              TowardPositiveF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+                  getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) ==
+              TowardNegativeF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWNearestTiesToEven)) ==
+              TowardNegativeF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+                  HWTowardNegative, HWTowardPositive)) ==
+              TowardNegativeF32_TowardPositiveF64);

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
Lines changed: 59 additions & 0 deletions

@@ -85,6 +85,65 @@ struct SIModeRegisterDefaults {
   }
 };

+namespace AMDGPU {
+
+/// Return values used for llvm.get.rounding
+///
+/// When both the F32 and F64/F16 modes are the same, returns the standard
+/// values. If they differ, returns an extended mode starting at 8.
+enum AMDGPUFltRounds : int8_t {
+  // Inherit everything from RoundingMode
+  TowardZero = static_cast<int8_t>(RoundingMode::TowardZero),
+  NearestTiesToEven = static_cast<int8_t>(RoundingMode::NearestTiesToEven),
+  TowardPositive = static_cast<int8_t>(RoundingMode::TowardPositive),
+  TowardNegative = static_cast<int8_t>(RoundingMode::TowardNegative),
+  NearestTiesToAwayUnsupported =
+      static_cast<int8_t>(RoundingMode::NearestTiesToAway),
+
+  Dynamic = static_cast<int8_t>(RoundingMode::Dynamic),
+
+  // Permute the mismatched rounding mode cases. If the modes are the same, use
+  // the standard values, otherwise, these values are sorted such that higher
+  // hardware encoded values have higher enum values.
+  NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven,
+  NearestTiesToEvenF32_TowardPositiveF64 = 8,
+  NearestTiesToEvenF32_TowardNegativeF64 = 9,
+  NearestTiesToEvenF32_TowardZeroF64 = 10,
+
+  TowardPositiveF32_NearestTiesToEvenF64 = 11,
+  TowardPositiveF32_TowardPositiveF64 = TowardPositive,
+  TowardPositiveF32_TowardNegativeF64 = 12,
+  TowardPositiveF32_TowardZeroF64 = 13,
+
+  TowardNegativeF32_NearestTiesToEvenF64 = 14,
+  TowardNegativeF32_TowardPositiveF64 = 15,
+  TowardNegativeF32_TowardNegativeF64 = TowardNegative,
+  TowardNegativeF32_TowardZeroF64 = 16,
+
+  TowardZeroF32_NearestTiesToEvenF64 = 17,
+  TowardZeroF32_TowardPositiveF64 = 18,
+  TowardZeroF32_TowardNegativeF64 = 19,
+  TowardZeroF32_TowardZeroF64 = TowardZero,
+
+  Invalid = static_cast<int8_t>(RoundingMode::Invalid)
+};
+
+/// Offset of nonstandard values for llvm.get.rounding results from the largest
+/// supported mode.
+static constexpr uint32_t ExtendedFltRoundOffset = 4;
+
+/// Offset in mode register of f32 rounding mode.
+static constexpr uint32_t F32FltRoundOffset = 0;
+
+/// Offset in mode register of f64/f16 rounding mode.
+static constexpr uint32_t F64FltRoundOffset = 2;
+
+// Bit indexed table to convert from hardware rounding mode values to
+// FLT_ROUNDS values.
+extern const uint64_t FltRoundConversionTable;
+
+} // end namespace AMDGPU
+
 } // end namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
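
A small consumer-side sketch (hypothetical helpers, not part of the patch) of how a result of llvm.get.rounding on AMDGPU could be classified against this enum:

#include <cstdint>

// 0..3 are the standard FLT_ROUNDS values, reported when both MODE fields
// agree; 8..19 are the extended values describing a mismatched pair.
static bool isStandardFltRounds(int32_t Rounds) {
  return Rounds >= 0 && Rounds <= 3; // Ties-away (4) is never produced here.
}

static bool isExtendedAMDGPUFltRounds(int32_t Rounds) {
  return Rounds >= 8 && Rounds <= 19;
}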
