@@ -10606,19 +10606,25 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10606
10606
return FastLowered;
10607
10607
10608
10608
SDLoc SL(Op);
10609
- SDValue Src0 = Op.getOperand(0);
10610
- SDValue Src1 = Op.getOperand(1);
10611
-
10612
- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10613
- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10614
-
10615
- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10616
- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10617
-
10609
+ SDValue LHS = Op.getOperand(0);
10610
+ SDValue RHS = Op.getOperand(1);
10611
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10612
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10613
+ SDValue NegRHSExt =DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10614
+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
10615
+ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
10616
+ SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10617
+ Quot = DAG.getNode(ISD::FMA, SL, MVT::f32, Err, Rcp, Quot);
10618
+ Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
10619
+ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
10620
+ SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10621
+ TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10622
+ DAG.getTargetConstant(0xff800000, SL, MVT::i32));
10623
+ Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10624
+ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
10618
10625
SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10619
- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10620
-
10621
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10626
+ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10627
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
10622
10628
}
10623
10629
10624
10630
// Faster 2.5 ULP division that does not support denormals.
0 commit comments