@@ -4073,34 +4073,55 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4073
4073
NewMode = DAG.getConstant(
4074
4074
AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4075
4075
} else {
4076
- SDValue BitTable =
4077
- DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4078
-
4076
+ // If we know the input can only be one of the supported standard modes in
4077
+ // the range 0-3, we can use a simplified mapping to hardware values.
4078
+ KnownBits KB = DAG.computeKnownBits(NewMode);
4079
+ const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4079
4080
// The supported standard values are 0-3. The extended values start at 8. We
4080
4081
// need to offset by 4 if the value is in the extended range.
4081
4082
4082
- // is_standard = value < 4;
4083
- // table_index = is_standard ? value : (value - 4)
4084
- // MODE.fp_round = (bit_table >> table_index) & 0xf
4083
+ if (UseReducedTable) {
4084
+ // Keep only the low 16 bits of the table (four 4-bit entries, one per
+ // standard mode 0-3) so it fits in a 32-bit constant.
4085
+ SDValue BitTable = DAG.getConstant(
4086
+ AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4087
+
4088
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4089
+ SDValue RoundModeTimesNumBits =
4090
+ DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4085
4091
4086
- SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4087
- SDValue IsStandardValue =
4092
+ SDValue TableValue =
4093
+ DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4094
+ NewMode = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4095
+
4096
+ // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4097
+ // the table extracted bits into inline immediates.
4098
+ } else {
4099
+ // is_standard = value < 4;
4100
+ // table_index = is_standard ? value : (value - 4)
4101
+ // MODE.fp_round = (bit_table >> (table_index * 4)) & 0xf
4102
+ SDValue BitTable =
4103
+ DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4104
+
4105
+ SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4106
+ SDValue IsStandardValue =
4088
4107
DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4089
- SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4090
- SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4091
- NewMode, OffsetEnum);
4108
+ SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4092
4109
4093
- SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4094
- SDValue RoundModeTimesNumBits =
4110
+ SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4111
+ NewMode, OffsetEnum);
4112
+
4113
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4114
+ SDValue RoundModeTimesNumBits =
4095
4115
DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4096
4116
4097
- SDValue TableValue =
4117
+ SDValue TableValue =
4098
4118
DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4099
- SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4119
+ SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4100
4120
4101
- // No need to mask out the high bits since the setreg will ignore them
4102
- // anyway.
4103
- NewMode = TruncTable;
4121
+ // No need to mask out the high bits since the setreg will ignore them
4122
+ // anyway.
4123
+ NewMode = TruncTable;
4124
+ }
4104
4125
4105
4126
// Insert a readfirstlane in case the value is a VGPR. We could do this
4106
4127
// earlier and keep more operations scalar, but that interferes with
0 commit comments