@@ -4076,34 +4076,54 @@ SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4076
4076
NewMode = DAG.getConstant(
4077
4077
AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4078
4078
} else {
4079
- SDValue BitTable =
4080
- DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4081
-
4079
+ // If we know the input can only be one of the supported standard modes in
4080
+ // the range 0-3, we can use a simplified mapping to hardware values.
4081
+ KnownBits KB = DAG.computeKnownBits(NewMode);
4082
+ const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4082
4083
// The supported standard values are 0-3. The extended values start at 8. We
4083
4084
// need to offset by 4 if the value is in the extended range.
4084
4085
4085
- // is_standard = value < 4;
4086
- // table_index = is_standard ? value : (value - 4)
4087
- // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4086
+ if (UseReducedTable) {
4087
+ // Keep only the low 16 bits (four 4-bit entries, modes 0-3) as an i32.
4088
+ SDValue BitTable = DAG.getConstant(
4089
+ AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4090
+
4091
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4092
+ SDValue RoundModeTimesNumBits =
4093
+ DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4088
4094
4089
- SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4090
- SDValue IsStandardValue =
4091
- DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4092
- SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4093
- SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4094
- NewMode, OffsetEnum);
4095
+ NewMode =
4096
+ DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4097
+
4098
+ // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4099
+ // the table extracted bits into inline immediates.
4100
+ } else {
4101
+ // is_standard = value < 4;
4102
+ // table_index = is_standard ? value : (value - 4)
4103
+ // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4104
+ SDValue BitTable =
4105
+ DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4095
4106
4096
- SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4097
- SDValue RoundModeTimesNumBits =
4098
- DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4107
+ SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4108
+ SDValue IsStandardValue =
4109
+ DAG.getSetCC(SL, MVT::i1, NewMode, Four, ISD::SETULT);
4110
+ SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4099
4111
4100
- SDValue TableValue =
4101
- DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4102
- SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4112
+ SDValue IndexVal = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4113
+ NewMode, OffsetEnum);
4103
4114
4104
- // No need to mask out the high bits since the setreg will ignore them
4105
- // anyway.
4106
- NewMode = TruncTable;
4115
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4116
+ SDValue RoundModeTimesNumBits =
4117
+ DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4118
+
4119
+ SDValue TableValue =
4120
+ DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4121
+ SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4122
+
4123
+ // No need to mask out the high bits since the setreg will ignore them
4124
+ // anyway.
4125
+ NewMode = TruncTable;
4126
+ }
4107
4127
4108
4128
// Insert a readfirstlane in case the value is a VGPR. We could do this
4109
4129
// earlier and keep more operations scalar, but that interferes with
0 commit comments