Skip to content

Commit b4e751e

Browse files
authored
AMDGPU: Optimize set_rounding if input is known to fit in 2 bits (llvm#88588)
When the input is known to fit in 2 bits, we don't need to figure out the weird extended rounding modes or handle the index offsets used to keep the lookup table within 64 bits. See https://reviews.llvm.org/D153258. Depends on llvm#88587.
1 parent 7c64b53 commit b4e751e

File tree

2 files changed

+145
-329
lines changed

2 files changed

+145
-329
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4076,34 +4076,54 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
40764076
NewMode = DAG.getConstant(
40774077
AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
40784078
} else {
4079-
SDValue BitTable =
4080-
DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4081-
4079+
// If we know the input can only be one of the supported standard modes in
4080+
// the range 0-3, we can use a simplified mapping to hardware values.
4081+
KnownBits KB = DAG.computeKnownBits(NewMode);
4082+
const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
40824083
// The supported standard values are 0-3. The extended values start at 8. We
40834084
// need to offset by 4 if the value is in the extended range.
40844085

4085-
// is_standard = value < 4;
4086-
// table_index = is_standard ? value : (value - 4)
4087-
// MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4086+
if (UseReducedTable) {
4087+
// Keep only the low 16 bits of the table (four 4-bit entries, modes 0-3).
4088+
SDValue BitTable = DAG.getConstant(
4089+
AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4090+
4091+
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4092+
SDValue RoundModeTimesNumBits =
4093+
DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
40884094

4089-
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4090-
SDValue IsStandardValue =
4091-
DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4092-
SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4093-
SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4094-
NewMode, OffsetEnum);
4095+
NewMode =
4096+
DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4097+
4098+
// TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4099+
// the table extracted bits into inline immediates.
4100+
} else {
4101+
// is_standard = value < 4;
4102+
// table_index = is_standard ? value : (value - 4)
4103+
// MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4104+
SDValue BitTable =
4105+
DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
40954106

4096-
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4097-
SDValue RoundModeTimesNumBits =
4098-
DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4107+
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4108+
SDValue IsStandardValue =
4109+
DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4110+
SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
40994111

4100-
SDValue TableValue =
4101-
DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4102-
SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4112+
SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4113+
NewMode, OffsetEnum);
41034114

4104-
// No need to mask out the high bits since the setreg will ignore them
4105-
// anyway.
4106-
NewMode = TruncTable;
4115+
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4116+
SDValue RoundModeTimesNumBits =
4117+
DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4118+
4119+
SDValue TableValue =
4120+
DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4121+
SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4122+
4123+
// No need to mask out the high bits since the setreg will ignore them
4124+
// anyway.
4125+
NewMode = TruncTable;
4126+
}
41074127

41084128
// Insert a readfirstlane in case the value is a VGPR. We could do this
41094129
// earlier and keep more operations scalar, but that interferes with

0 commit comments

Comments
 (0)