
Reapply "AMDGPU: Implement llvm.set.rounding (#88587)" series #91113

Merged · 1 commit · May 6, 2024
6 changes: 6 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1157,6 +1157,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
register do not exactly match the FLT_ROUNDS values,
so a conversion is performed.

:ref:`llvm.set.rounding<int_set_rounding>` The input value is expected to be one of the valid
results from '``llvm.get.rounding``'. The rounding
mode is undefined if not passed a valid input. This
should be a wave-uniform value; in case of a divergent
input value, the first active lane's value will be used.

:ref:`llvm.get.fpenv<int_get_fpenv>` Returns the current value of the AMDGPU floating point environment.
This stores information related to the current rounding mode,
denormalization mode, enabled traps, and floating point exceptions.
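
For illustration, the ``llvm.set.rounding`` entry above implies a save/modify/restore discipline around rounding-sensitive code. A minimal frontend-side sketch with IRBuilder, assuming the standard Intrinsic::get_rounding / Intrinsic::set_rounding IDs; the helper name is illustrative and not part of this patch:

// Sketch: save/set/restore the rounding mode via IRBuilder. Assumes an
// IRBuilder<> already positioned inside a function; illustrative only.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

static void emitWithRoundingMode(IRBuilder<> &B, Value *NewMode) {
  // Save the current FLT_ROUNDS-style mode value.
  Value *OldMode =
      B.CreateIntrinsic(Intrinsic::get_rounding, {}, {}, nullptr, "old.rm");
  // Switch modes. Per the table entry above, NewMode should be wave-uniform
  // and one of llvm.get.rounding's valid results.
  B.CreateIntrinsic(Intrinsic::set_rounding, {}, {NewMode});
  // ... emit rounding-sensitive FP operations here ...
  // Restore the saved mode.
  B.CreateIntrinsic(Intrinsic::set_rounding, {}, {OldMode});
}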
2 changes: 2 additions & 0 deletions llvm/docs/LangRef.rst
@@ -26739,6 +26739,8 @@ specified by C standard:
Other values may be used to represent additional rounding modes supported by a
target. These values are target-specific.
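
For orientation, these are the same encodings C's ``FLT_ROUNDS`` macro uses. A host-side C++ sketch, not part of the patch, assuming a C11-conforming ``FLT_ROUNDS`` that tracks the dynamic rounding mode:

// Prints the standard FLT_ROUNDS encodings referenced above.
#include <cfenv>
#include <cfloat>
#include <cstdio>

int main() {
  std::fesetround(FE_TOWARDZERO);
  std::printf("%d\n", FLT_ROUNDS); // expected 0: toward zero
  std::fesetround(FE_TONEAREST);
  std::printf("%d\n", FLT_ROUNDS); // expected 1: to nearest, ties to even
  std::fesetround(FE_UPWARD);
  std::printf("%d\n", FLT_ROUNDS); // expected 2: toward positive infinity
  std::fesetround(FE_DOWNWARD);
  std::printf("%d\n", FLT_ROUNDS); // expected 3: toward negative infinity
  return 0;
}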

.. _int_set_rounding:

'``llvm.set.rounding``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.rst
@@ -81,6 +81,8 @@ Changes to the AMDGPU Backend

* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.

* Implemented the :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>` intrinsics.

Changes to the ARM Backend
--------------------------
* FEAT_F32MM is no longer activated by default when using `+sve` on v8.6-A or greater. The feature is still available and can be used by adding `+f32mm` to the command line options.
88 changes: 88 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -877,6 +877,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

@@ -4059,6 +4060,91 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
}

SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue NewMode = Op.getOperand(1);
  assert(NewMode.getValueType() == MVT::i32);

  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
  // hardware MODE.fp_round values.
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
    uint32_t ClampedVal = std::min(
        static_cast<uint32_t>(ConstMode->getZExtValue()),
        static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
    NewMode = DAG.getConstant(
        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
  } else {
    // If we know the input can only be one of the supported standard modes in
    // the range 0-3, we can use a simplified mapping to hardware values.
    KnownBits KB = DAG.computeKnownBits(NewMode);
    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
    // The supported standard values are 0-3. The extended values start at 8.
    // We need to offset by 4 if the value is in the extended range.

    if (UseReducedTable) {
      // Only the standard modes are needed, which occupy the low 16 bits of
      // the table.
      SDValue BitTable = DAG.getConstant(
          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);

      NewMode =
          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);

      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
      // the table extracted bits into inline immediates.
    } else {
      // table_index = umin(value, value - 4)
      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
      SDValue BitTable =
          DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);

      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
      SDValue IndexVal =
          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);

      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);

      SDValue TableValue =
          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);

      // No need to mask out the high bits since the setreg will ignore them
      // anyway.
      NewMode = TruncTable;
    }

    // Insert a readfirstlane in case the value is a VGPR. We could do this
    // earlier and keep more operations scalar, but that interferes with
    // combining the source.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
                          ReadFirstLaneID, NewMode);
  }

  // N.B. The setreg will be later folded into s_round_mode on supported
  // targets.
  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
  SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);

  SDValue SetReg =
      DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
                  IntrinID, RoundBothImm, NewMode);

  return SetReg;
}
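
A standalone sketch of the indexing trick lowered above: umin(value, value - 4) folds the standard FLT_ROUNDS range 0-3 and the extended range starting at 8 onto consecutive indices into a packed table of 4-bit entries. The table constant below is a toy value, not the real AMDGPU::FltRoundToHWConversionTable:

// Self-contained demonstration of the bit-table indexing; toy values only.
#include <algorithm>
#include <cstdint>

// table_index = umin(value, value - 4). For values 0-3 the subtraction wraps
// to a huge unsigned number, so umin keeps the value itself; for extended
// values (8 and up) it subtracts the offset of 4.
static constexpr uint32_t tableIndex(uint32_t Value) {
  return std::min(Value, Value - 4u);
}

// MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
static constexpr uint32_t lookup(uint64_t BitTable, uint32_t Value) {
  return (BitTable >> (tableIndex(Value) << 2)) & 0xf;
}

// Toy table: nibble 0 holds 0xA; nibble 4 (reached by extended value 8)
// holds 0xB.
static constexpr uint64_t ToyTable = 0xAull | (0xBull << 16);

static_assert(tableIndex(3) == 3, "standard values index directly");
static_assert(tableIndex(8) == 4, "extended values are offset by 4");
static_assert(lookup(ToyTable, 0) == 0xA, "standard lookup");
static_assert(lookup(ToyTable, 8) == 0xB, "extended lookup");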

SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
  if (Op->isDivergent())
    return SDValue();
@@ -5754,6 +5840,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return lowerSET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -422,6 +422,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
119 changes: 119 additions & 0 deletions llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -174,3 +174,122 @@ static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
                  HWTowardNegative, HWTowardPositive)) ==
              TowardNegativeF32_TowardPositiveF64);

// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
// the same and use a standard value.
static constexpr uint64_t encodeFltRoundsToHWTableSame(uint32_t HWVal,
                                                       uint32_t FltRoundsVal) {
  if (FltRoundsVal > TowardNegative)
    FltRoundsVal -= ExtendedFltRoundOffset;

  return static_cast<uint64_t>(getModeRegisterRoundMode(HWVal, HWVal))
         << (FltRoundsVal << 2);
}

/// Decode FLT_ROUNDS into the hardware value where the two rounding modes are
/// different and use an extended value.
static constexpr uint64_t encodeFltRoundsToHWTable(uint32_t HWF32Val,
                                                   uint32_t HWF64Val,
                                                   uint32_t FltRoundsVal) {
  if (FltRoundsVal > TowardNegative)
    FltRoundsVal -= ExtendedFltRoundOffset;
  return static_cast<uint64_t>(getModeRegisterRoundMode(HWF32Val, HWF64Val))
         << (FltRoundsVal << 2);
}
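
To make the packing concrete, here is one entry worked out in isolation. This assumes, consistent with the asserts in this file, that getModeRegisterRoundMode packs the two 2-bit hardware fields as F32 | (F64 << 2) and that HWTowardZero == 3; the values are restated locally so the sketch is self-contained:

// Worked example: the toward-zero/toward-zero entry lands in nibble 0.
#include <cstdint>

constexpr uint32_t HWTowardZero = 3; // hardware MODE.fp_round code (assumed)
constexpr uint32_t TowardZero = 0;   // FLT_ROUNDS value, also the table index

constexpr uint32_t getModeRegisterRoundMode(uint32_t F32, uint32_t F64) {
  return F32 | (F64 << 2);
}

constexpr uint64_t Entry =
    static_cast<uint64_t>(getModeRegisterRoundMode(HWTowardZero, HWTowardZero))
    << (TowardZero << 2);

static_assert(Entry == 0xf, "both fields toward zero occupy nibble 0");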

const uint64_t AMDGPU::FltRoundToHWConversionTable =
    encodeFltRoundsToHWTableSame(HWTowardZero, TowardZeroF32_TowardZeroF64) |
    encodeFltRoundsToHWTableSame(HWNearestTiesToEven,
                                 NearestTiesToEvenF32_NearestTiesToEvenF64) |
    encodeFltRoundsToHWTableSame(HWTowardPositive,
                                 TowardPositiveF32_TowardPositiveF64) |
    encodeFltRoundsToHWTableSame(HWTowardNegative,
                                 TowardNegativeF32_TowardNegativeF64) |

    encodeFltRoundsToHWTable(HWTowardZero, HWNearestTiesToEven,
                             TowardZeroF32_NearestTiesToEvenF64) |
    encodeFltRoundsToHWTable(HWTowardZero, HWTowardPositive,
                             TowardZeroF32_TowardPositiveF64) |
    encodeFltRoundsToHWTable(HWTowardZero, HWTowardNegative,
                             TowardZeroF32_TowardNegativeF64) |

    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardZero,
                             NearestTiesToEvenF32_TowardZeroF64) |
    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardPositive,
                             NearestTiesToEvenF32_TowardPositiveF64) |
    encodeFltRoundsToHWTable(HWNearestTiesToEven, HWTowardNegative,
                             NearestTiesToEvenF32_TowardNegativeF64) |

    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardZero,
                             TowardPositiveF32_TowardZeroF64) |
    encodeFltRoundsToHWTable(HWTowardPositive, HWNearestTiesToEven,
                             TowardPositiveF32_NearestTiesToEvenF64) |
    encodeFltRoundsToHWTable(HWTowardPositive, HWTowardNegative,
                             TowardPositiveF32_TowardNegativeF64) |

    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardZero,
                             TowardNegativeF32_TowardZeroF64) |
    encodeFltRoundsToHWTable(HWTowardNegative, HWNearestTiesToEven,
                             TowardNegativeF32_NearestTiesToEvenF64) |
    encodeFltRoundsToHWTable(HWTowardNegative, HWTowardPositive,
                             TowardNegativeF32_TowardPositiveF64);

/// Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
static constexpr uint32_t
decodeFltRoundToHWConversionTable(uint64_t FltRoundToHWConversionTable,
                                  uint32_t FltRounds) {
  uint32_t IndexVal = FltRounds;
  if (IndexVal > TowardNegative)
    IndexVal -= ExtendedFltRoundOffset;
  return (FltRoundToHWConversionTable >> (IndexVal << 2)) & 0xf;
}

uint32_t AMDGPU::decodeFltRoundToHWConversionTable(uint32_t FltRounds) {
  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
                                             FltRounds);
}

static constexpr uint32_t decodeFltRoundToHW(uint32_t FltRounds) {
  return ::decodeFltRoundToHWConversionTable(FltRoundToHWConversionTable,
                                             FltRounds);
}

// Verify evaluation of FltRoundToHWConversionTable

static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardZero) ==
              getModeRegisterRoundMode(HWTowardZero, HWTowardZero));
static_assert(decodeFltRoundToHW(AMDGPUFltRounds::NearestTiesToEven) ==
              getModeRegisterRoundMode(HWNearestTiesToEven,
                                       HWNearestTiesToEven));
static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardPositive) ==
              getModeRegisterRoundMode(HWTowardPositive, HWTowardPositive));
static_assert(decodeFltRoundToHW(AMDGPUFltRounds::TowardNegative) ==
              getModeRegisterRoundMode(HWTowardNegative, HWTowardNegative));

static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardPositiveF64) ==
              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardPositive));
static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardNegativeF64) ==
              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardNegative));
static_assert(decodeFltRoundToHW(NearestTiesToEvenF32_TowardZeroF64) ==
              getModeRegisterRoundMode(HWNearestTiesToEven, HWTowardZero));

static_assert(decodeFltRoundToHW(TowardPositiveF32_NearestTiesToEvenF64) ==
              getModeRegisterRoundMode(HWTowardPositive, HWNearestTiesToEven));
static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardNegativeF64) ==
              getModeRegisterRoundMode(HWTowardPositive, HWTowardNegative));
static_assert(decodeFltRoundToHW(TowardPositiveF32_TowardZeroF64) ==
              getModeRegisterRoundMode(HWTowardPositive, HWTowardZero));

static_assert(decodeFltRoundToHW(TowardNegativeF32_NearestTiesToEvenF64) ==
              getModeRegisterRoundMode(HWTowardNegative, HWNearestTiesToEven));
static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardPositiveF64) ==
              getModeRegisterRoundMode(HWTowardNegative, HWTowardPositive));
static_assert(decodeFltRoundToHW(TowardNegativeF32_TowardZeroF64) ==
              getModeRegisterRoundMode(HWTowardNegative, HWTowardZero));

static_assert(decodeFltRoundToHW(TowardZeroF32_NearestTiesToEvenF64) ==
              getModeRegisterRoundMode(HWTowardZero, HWNearestTiesToEven));
static_assert(decodeFltRoundToHW(TowardZeroF32_TowardPositiveF64) ==
              getModeRegisterRoundMode(HWTowardZero, HWTowardPositive));
static_assert(decodeFltRoundToHW(TowardZeroF32_TowardNegativeF64) ==
              getModeRegisterRoundMode(HWTowardZero, HWTowardNegative));
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -144,6 +144,13 @@ static constexpr uint32_t F64FltRoundOffset = 2;
// values.
extern const uint64_t FltRoundConversionTable;

// Bit-indexed table to convert from FLT_ROUNDS values to hardware rounding
// mode values.
extern const uint64_t FltRoundToHWConversionTable;

/// Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds);

} // end namespace AMDGPU

} // end namespace llvm