Skip to content

Commit 722c313

Browse files
committed
AMDGPU: Optimize set_rounding if input is known to fit in 2 bits
When the input is known to be one of the standard modes 0-3, we don't need to decode the extended rounding modes or apply the index offset that keeps the lookup table within 64 bits. https://reviews.llvm.org/D153258
1 parent a037b7f commit 722c313

File tree

2 files changed

+143
-326
lines changed

2 files changed

+143
-326
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4073,34 +4073,55 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
40734073
NewMode = DAG.getConstant(
40744074
AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
40754075
} else {
4076-
SDValue BitTable =
4077-
DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4078-
4076+
// If we know the input can only be one of the supported standard modes in
4077+
// the range 0-3, we can use a simplified mapping to hardware values.
4078+
KnownBits KB = DAG.computeKnownBits(NewMode);
4079+
const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
40794080
// The supported standard values are 0-3. The extended values start at 8. We
40804081
// need to offset by 4 if the value is in the extended range.
40814082

4082-
// is_standard = value < 4;
4083-
// table_index = is_standard ? value : (value - 4)
4084-
// MODE.fp_round = (bit_table >> table_index) & 0xf
4083+
if (UseReducedTable) {
4084+
// Keep only the low 16 bits: four 4-bit entries cover modes 0-3.
4085+
SDValue BitTable = DAG.getConstant(
4086+
AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4087+
4088+
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4089+
SDValue RoundModeTimesNumBits =
4090+
DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
40854091

4086-
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4087-
SDValue IsStandardValue =
4092+
SDValue TableValue =
4093+
DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4094+
NewMode = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4095+
4096+
// TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4097+
// the table extracted bits into inline immediates.
4098+
} else {
4099+
// is_standard = value < 4;
4100+
// table_index = is_standard ? value : (value - 4)
4101+
// MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4102+
SDValue BitTable =
4103+
DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4104+
4105+
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4106+
SDValue IsStandardValue =
40884107
DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4089-
SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4090-
SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4091-
NewMode, OffsetEnum);
4108+
SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
40924109

4093-
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4094-
SDValue RoundModeTimesNumBits =
4110+
SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4111+
NewMode, OffsetEnum);
4112+
4113+
SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4114+
SDValue RoundModeTimesNumBits =
40954115
DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
40964116

4097-
SDValue TableValue =
4117+
SDValue TableValue =
40984118
DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4099-
SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4119+
SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
41004120

4101-
// No need to mask out the high bits since the setreg will ignore them
4102-
// anyway.
4103-
NewMode = TruncTable;
4121+
// No need to mask out the high bits since the setreg will ignore them
4122+
// anyway.
4123+
NewMode = TruncTable;
4124+
}
41044125

41054126
// Insert a readfirstlane in case the value is a VGPR. We could do this
41064127
// earlier and keep more operations scalar, but that interferes with

0 commit comments

Comments
 (0)