@@ -10693,19 +10693,48 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10693
10693
return FastLowered;
10694
10694
10695
10695
SDLoc SL(Op);
10696
- SDValue Src0 = Op.getOperand(0);
10697
- SDValue Src1 = Op.getOperand(1);
10698
-
10699
- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10700
- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10701
-
10702
- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10703
- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10704
-
10705
- SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10706
- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10696
+ SDValue LHS = Op.getOperand(0);
10697
+ SDValue RHS = Op.getOperand(1);
10707
10698
10708
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10699
+ // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
10700
+ // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
10701
+ // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
10702
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
10703
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10704
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
10705
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
10706
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
10707
+ // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000); // keep sign and exponent bits
10708
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
10709
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
10710
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
10711
+
10712
+ // We will use ISD::FMA on targets that don't support ISD::FMAD.
10713
+ unsigned FMADOpCode =
10714
+ isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
10715
+
10716
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
10717
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
10718
+ SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
10719
+ SDValue Rcp =
10720
+ DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
10721
+ SDValue Quot =
10722
+ DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
10723
+ SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10724
+ Op->getFlags());
10725
+ Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
10726
+ Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
10727
+ Op->getFlags());
10728
+ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
10729
+ SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
10730
+ TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
10731
+ DAG.getConstant(0xff800000, SL, MVT::i32));
10732
+ Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
10733
+ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
10734
+ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
10735
+ DAG.getConstant(0, SL, MVT::i32));
10736
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
10737
+ Op->getFlags());
10709
10738
}
10710
10739
10711
10740
// Faster 2.5 ULP division that does not support denormals.
0 commit comments