
Commit d654278

Reapply "AMDGPU: Implement llvm.set.rounding (#88587)" series (#91113)
Revert "Revert 4 last AMDGPU commits to unbreak Windows bots" This reverts commit 0d493ed. MSVC does not like constexpr on the definition after an extern declaration of a global.
1 parent db532ff commit d654278

File tree

8 files changed: +1890 −0 lines changed


llvm/docs/AMDGPUUsage.rst

Lines changed: 6 additions & 0 deletions
@@ -1157,6 +1157,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    register do not exactly match the FLT_ROUNDS values,
                                                    so a conversion is performed.

+  :ref:`llvm.set.rounding<int_set_rounding>`       Input value expected to be one of the valid results
+                                                   from '``llvm.get.rounding``'. Rounding mode is
+                                                   undefined if not passed a valid input. This should be
+                                                   a wave uniform value. In case of a divergent input
+                                                   value, the first active lane's value will be used.
+
   :ref:`llvm.get.fpenv<int_get_fpenv>`             Returns the current value of the AMDGPU floating point environment.
                                                    This stores information related to the current rounding mode,
                                                    denormalization mode, enabled traps, and floating point exceptions.
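As a hedged, standalone sketch (not part of this commit) of how a frontend might honor the rules documented above — saving the current mode, switching, and restoring — here is an IRBuilder helper with hypothetical function and variable names:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Hypothetical helper: run `emitBody` under a caller-provided rounding mode.
// Per the documentation above, `NewMode` should be wave uniform on AMDGPU.
void emitWithRoundingMode(IRBuilder<> &Builder, Value *NewMode,
                          function_ref<void()> emitBody) {
  // llvm.get.rounding returns one of the valid mode values, so feeding its
  // result back into llvm.set.rounding satisfies the validity requirement.
  Value *SavedMode = Builder.CreateIntrinsic(Intrinsic::get_rounding, {}, {});
  Builder.CreateIntrinsic(Intrinsic::set_rounding, {}, {NewMode});
  emitBody(); // ... rounding-sensitive code ...
  Builder.CreateIntrinsic(Intrinsic::set_rounding, {}, {SavedMode});
}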

llvm/docs/LangRef.rst

Lines changed: 2 additions & 0 deletions
@@ -26739,6 +26739,8 @@ specified by C standard:
 Other values may be used to represent additional rounding modes, supported by a
 target. These values are target-specific.

+.. _int_set_rounding:
+
 '``llvm.set.rounding``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

llvm/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
@@ -81,6 +81,8 @@ Changes to the AMDGPU Backend

 * Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.

+* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`
+
 Changes to the ARM Backend
 --------------------------

 * FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 88 additions & 0 deletions
@@ -877,6 +877,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
   setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
   setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

@@ -4059,6 +4060,91 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
   return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
 }

+SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue NewMode = Op.getOperand(1);
+  assert(NewMode.getValueType() == MVT::i32);
+
+  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to
+  // the hardware MODE.fp_round values.
+  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
+    uint32_t ClampedVal = std::min(
+        static_cast<uint32_t>(ConstMode->getZExtValue()),
+        static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
+    NewMode = DAG.getConstant(
+        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
+  } else {
+    // If we know the input can only be one of the supported standard modes in
+    // the range 0-3, we can use a simplified mapping to hardware values.
+    KnownBits KB = DAG.computeKnownBits(NewMode);
+    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
+    // The supported standard values are 0-3. The extended values start at 8.
+    // We need to offset by 4 if the value is in the extended range.
+
+    if (UseReducedTable) {
+      // Truncate to the low 32-bits.
+      SDValue BitTable = DAG.getConstant(
+          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
+
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
+
+      NewMode =
+          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
+
+      // TODO: SimplifyDemandedBits on the setreg source here can likely
+      // reduce the table extracted bits into inline immediates.
+    } else {
+      // table_index = umin(value, value - 4)
+      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
+      SDValue BitTable =
+          DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
+
+      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
+      SDValue IndexVal =
+          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
+
+      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+      SDValue RoundModeTimesNumBits =
+          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
+
+      SDValue TableValue =
+          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+      // No need to mask out the high bits since the setreg will ignore them
+      // anyway.
+      NewMode = TruncTable;
+    }
+
+    // Insert a readfirstlane in case the value is a VGPR. We could do this
+    // earlier and keep more operations scalar, but that interferes with
+    // combining the source.
+    SDValue ReadFirstLaneID =
+        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
+    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
+                          ReadFirstLaneID, NewMode);
+  }
+
+  // N.B. The setreg will be later folded into s_round_mode on supported
+  // targets.
+  SDValue IntrinID =
+      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
+  uint32_t BothRoundHwReg =
+      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
+  SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+  SDValue SetReg =
+      DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
+                  IntrinID, RoundBothImm, NewMode);
+
+  return SetReg;
+}
+
 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
   if (Op->isDivergent())
     return SDValue();

@@ -5754,6 +5840,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerSTACKSAVE(Op, DAG);
   case ISD::GET_ROUNDING:
     return lowerGET_ROUNDING(Op, DAG);
+  case ISD::SET_ROUNDING:
+    return lowerSET_ROUNDING(Op, DAG);
   case ISD::PREFETCH:
     return lowerPREFETCH(Op, DAG);
   case ISD::FP_EXTEND:
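For intuition, here is the same lookup the DAG nodes above build, written as a plain scalar function (a sketch with illustrative names, not code from this commit). It mirrors the `table_index = umin(value, value - 4)` comment: standard FLT_ROUNDS values occupy indices 0-3, the AMDGPU extended values start at 8, and the unsigned-wrapping subtraction plus umin folds both ranges into one contiguous index.

#include <algorithm>
#include <cstdint>

static uint32_t lookupHWRoundMode(uint64_t BitTable, uint32_t FltRounds) {
  // For 0-3 the subtraction wraps to a huge unsigned value, so umin keeps
  // FltRounds itself; for extended values >= 8 it yields indices 4-11.
  uint32_t TableIndex = std::min(FltRounds, FltRounds - 4u);
  // 4-bit entries: shift by index * 4 and keep the low nibble. The DAG
  // version can skip the mask because the setreg ignores the high bits.
  return static_cast<uint32_t>(BitTable >> (TableIndex << 2)) & 0xf;
}

When computeKnownBits proves the input is already in 0-3, only the first four nibbles of the table can ever be selected, which is why the reduced path above shrinks the 64-bit table to a 32-bit constant and skips the umin entirely.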

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -422,6 +422,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp

Lines changed: 119 additions & 0 deletions
@@ -174,3 +174,122 @@ static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
 static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
                   HWTowardNegative, HWTowardPositive)) ==
               TowardNegativeF32_TowardPositiveF64);
+
+// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
+// the same and use a standard value
+static constexpr uint64_t encodeFltRoundsToHWTableSame(uint32_t HWVal,
+                                                       uint32_t FltRoundsVal) {
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+
+  return static_cast<uint64_t>(getModeRegisterRoundMode(HWVal, HWVal))
+         << (FltRoundsVal << 2);
+}
+
+/// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
+/// different and use an extended value.
+static constexpr uint64_t encodeFltRoundsToHWTable(uint32_t HWF32Val,
+                                                   uint32_t HWF64Val,
+                                                   uint32_t FltRoundsVal) {
+  if (FltRoundsVal > TowardNegative)
+    FltRoundsVal -= ExtendedFltRoundOffset;
+  return static_cast<uint64_t>(getModeRegisterRoundMode(HWF32Val, HWF64Val))
+         << (FltRoundsVal << 2);
+}
+
+const uint64_t AMDGPU::FltRoundToHWConversionTable =
+    encodeFltRoundsToHWTableSame(HWTowardZero, TowardZeroF32_TowardZeroF64) |
+    encodeFltRoundsToHWTableSame(HWNearestTiesToEven,
+                                 NearestTiesToEvenF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTableSame(HWTowardPositive,
+                                 TowardPositiveF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTableSame(HWTowardNegative,
+                                 TowardNegativeF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardZero, HWNearestTiesToEven,
+                             TowardZeroF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardZero, HWTowardPositive,
+                             TowardZeroF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTable(HWTowardZero, HWTowardNegative,
+                             TowardZeroF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardZero,
+                             NearestTiesToEvenF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardPositive,
+                             NearestTiesToEvenF32_TowardPositiveF64) |
+    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardNegative,
+                             NearestTiesToEvenF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardZero,
+                             TowardPositiveF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWTowardPositive, HWNearestTiesToEven,
+                             TowardPositiveF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardNegative,
+                             TowardPositiveF32_TowardNegativeF64) |
+
+    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardZero,
+                             TowardNegativeF32_TowardZeroF64) |
+    encodeFltRoundsToHWTable(HWTowardNegative, HWNearestTiesToEven,
+                             TowardNegativeF32_NearestTiesToEvenF64) |
+    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardPositive,
+                             TowardNegativeF32_TowardPositiveF64);
+
+/// Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
+static constexpr uint32_t
+decodeFltRoundToHWConversionTable(uint64_t FltRoundToHWConversionTable,
+                                  uint32_t FltRounds) {
+  uint32_t IndexVal = FltRounds;
+  if (IndexVal > TowardNegative)
+    IndexVal -= ExtendedFltRoundOffset;
+  return (FltRoundToHWConversionTable >> (IndexVal << 2)) & 0xf;
+}
+
+uint32_t AMDGPU::decodeFltRoundToHWConversionTable(uint32_t FltRounds) {
+  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
+                                             FltRounds);
+}
+
+static constexpr uint32_t decodeFltRoundToHW(uint32_t FltRounds) {
+  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
+                                             FltRounds);
+}
+
+// Verify evaluation of FltRoundToHWConversionTable
+
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardZero) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardZero));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::NearestTiesToEven) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven,
+                                       HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardPositive) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive));
+static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardNegative) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative));
+
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardPositive));
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardNegative));
+static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardPositiveF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardNegative));
+static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWTowardPositive, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardNegativeF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardPositive));
+static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardZeroF64) ==
+              getModeRegisterRoundMode(HWTowardNegative, HWTowardZero));
+
+static_assert(decodeFltRoundToHW(TowardZeroF32_NearestTiesToEvenF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWNearestTiesToEven));
+static_assert(decodeFltRoundToHW(TowardZeroF32_TowardPositiveF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardPositive));
+static_assert(decodeFltRoundToHW(TowardZeroF32_TowardNegativeF64) ==
+              getModeRegisterRoundMode(HWTowardZero, HWTowardNegative));
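The whole table above is evaluated at compile time: each encode call returns one 4-bit entry shifted into its slot, the calls are OR'd into a single 64-bit constant, and the static_asserts round-trip every mode through the decoder. A minimal self-contained miniature of that pattern (names here are illustrative, not from the commit):

#include <cstdint>

// Place a 4-bit entry into slot Index of a packed 64-bit table.
static constexpr uint64_t packEntry(uint32_t Value, uint32_t Index) {
  return static_cast<uint64_t>(Value & 0xf) << (Index << 2);
}

// Read a 4-bit entry back out.
static constexpr uint32_t unpackEntry(uint64_t Table, uint32_t Index) {
  return static_cast<uint32_t>(Table >> (Index << 2)) & 0xf;
}

// Four-entry demo table, built the same way FltRoundToHWConversionTable
// OR's together one encode call per FLT_ROUNDS value.
static constexpr uint64_t DemoTable =
    packEntry(3, 0) | packEntry(0, 1) | packEntry(1, 2) | packEntry(2, 3);

// Compile-time round-trip checks, in the style of the asserts above.
static_assert(unpackEntry(DemoTable, 0) == 3, "slot 0 round-trips");
static_assert(unpackEntry(DemoTable, 2) == 1, "slot 2 round-trips");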

llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h

Lines changed: 7 additions & 0 deletions
@@ -144,6 +144,13 @@ static constexpr uint32_t F64FltRoundOffset = 2;
 // values.
 extern const uint64_t FltRoundConversionTable;

+// Bit indexed table to convert from FLT_ROUNDS values to hardware rounding
+// mode values
+extern const uint64_t FltRoundToHWConversionTable;
+
+/// Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
+uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds);
+
 } // end namespace AMDGPU

 } // end namespace llvm
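A note on why the table's definition in SIModeRegisterDefaults.cpp is plain const even though everything feeding it is constexpr: per the commit message, MSVC rejects adding constexpr to a definition that follows a non-constexpr extern declaration like the one above. A minimal sketch of the pattern (hypothetical name):

#include <cstdint>

// Header: extern declaration, no constexpr.
extern const uint64_t DemoConversionTable;

// Implementation file: writing `constexpr uint64_t DemoConversionTable = ...`
// here is rejected by MSVC (gcc and clang accept it), so the definition stays
// plain const and the compile-time work lives in constexpr helper functions,
// as SIModeRegisterDefaults.cpp does.
const uint64_t DemoConversionTable = 0x3210;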
