Skip to content

Commit 0b9f3f5

Browse files
committed
[AMDGPU] Adopt a new lowering sequence for fdiv16
The current lowering of fdiv16 can generate incorrectly rounded results in some cases. Fixes SWDEV-47760.
1 parent 5fa742e commit 0b9f3f5

File tree

8 files changed

+2060
-901
lines changed

8 files changed

+2060
-901
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4902,14 +4902,18 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
49024902

49034903
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
49044904
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4905-
4906-
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4905+
auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4906+
auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
49074907
.addUse(RHSExt.getReg(0))
49084908
.setMIFlags(Flags);
4909-
4910-
auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4911-
auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4912-
4909+
auto Quot = B.buildFMul(S32, LHSExt, Rcp);
4910+
auto Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
4911+
Quot = B.buildFMA(S32, Err, Rcp, Quot);
4912+
Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
4913+
auto Tmp = B.buildFMul(S32, Err, Rcp);
4914+
Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4915+
Quot = B.buildFAdd(S32, Tmp, Quot);
4916+
auto RDst = B.buildFPTrunc(S16, Quot, Flags);
49134917
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
49144918
.addUse(RDst.getReg(0))
49154919
.addUse(RHS)

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10606,19 +10606,25 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
1060610606
return FastLowered;
1060710607

1060810608
SDLoc SL(Op);
10609-
SDValue Src0 = Op.getOperand(0);
10610-
SDValue Src1 = Op.getOperand(1);
10611-
10612-
SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10613-
SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10614-
10615-
SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10616-
SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10617-
10609+
SDValue LHS = Op.getOperand(0);
10610+
SDValue RHS = Op.getOperand(1);
10611+
SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10612+
SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10613+
SDValue NegRHSExt =DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10614+
SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
10615+
SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
10616+
SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10617+
Quot = DAG.getNode(ISD::FMA, SL, MVT::f32, Err, Rcp, Quot);
10618+
Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10619+
SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
10620+
SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10621+
TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10622+
DAG.getTargetConstant(0xff800000, SL, MVT::i32));
10623+
Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10624+
Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
1061810625
SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10619-
SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10620-
10621-
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10626+
SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10627+
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
1062210628
}
1062310629

1062410630
// Faster 2.5 ULP division that does not support denormals.

0 commit comments

Comments
 (0)