Commit 79e7b78
[AMDGPU] Adapt new lowering sequence for fdiv16
The current lowering of fdiv16 can generate an incorrectly rounded result in some cases. Fixes SWDEV-47760.
1 parent b7e1fa3 commit 79e7b78
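For readers following the instruction sequence in the diff comments below, here is a minimal host-side sketch of the new lowering (not LLVM code; it assumes round-to-nearest f32 arithmetic and uses 1.0f / b as a stand-in for V_RCP_F32, whose approximate hardware result is exactly what the two error/correction steps absorb):

#include <cmath>
#include <cstdint>
#include <cstring>

// Emulates the f32 sequence that replaces the old single rcp*mul lowering.
float fdiv16_refined(float a32, float b32) {
  float r = 1.0f / b32;             // stand-in for V_RCP_F32 (rcp = 1 / d)
  float q = a32 * r;                // q = n * rcp
  float e = std::fma(-b32, q, a32); // err = -d * q + n
  q = std::fma(e, r, q);            // refine the quotient once
  e = std::fma(-b32, q, a32);       // recompute the residual error
  float tmp = e * r;
  uint32_t u;
  std::memcpy(&u, &tmp, sizeof(u));
  u &= 0xff800000u;                 // keep only sign + exponent bits
  std::memcpy(&tmp, &u, sizeof(u));
  return tmp + q;                   // f32 quotient; FP_ROUND + DIV_FIXUP follow
}

The final tmp + q nudges the quotient across the f16 rounding boundary before V_CVT_F16_F32 and V_DIV_FIXUP_F16 finish the job.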

File tree

7 files changed: +1037 −402 lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 30 additions & 6 deletions
@@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
   LLT S16 = LLT::scalar(16);
   LLT S32 = LLT::scalar(32);
 
+  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+  // q16.u = opx(V_CVT_F16_F32, q32.u);
+  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
-
-  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
+  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                  .addUse(RHSExt.getReg(0))
                  .setMIFlags(Flags);
-
-  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
-  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
-
+  auto Quot = B.buildFMul(S32, LHSExt, Rcp);
+  MachineInstrBuilder Err;
+  if (ST.hasMadMacF32Insts()) {
+    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt);
+    Quot = B.buildFMAD(S32, Err, Rcp, Quot);
+    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt);
+  } else {
+    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
+    Quot = B.buildFMA(S32, Err, Rcp, Quot);
+    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
+  }
+  auto Tmp = B.buildFMul(S32, Err, Rcp);
+  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
+  Quot = B.buildFAdd(S32, Tmp, Quot);
+  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
       .addUse(RDst.getReg(0))
       .addUse(RHS)
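A note on the buildFMAD/buildFMA split above: G_FMAD multiplies and adds with two separate f32 roundings, matching v_mad_f32/v_mac_f32 on subtargets where ST.hasMadMacF32Insts() returns true, while G_FMA rounds once. A hedged sketch of the two semantics; either appears to suffice here since the f32 intermediates carry far more precision than the final f16 result, and V_DIV_FIXUP_F16 settles the last-ulp cases:

#include <cmath>

// G_FMAD semantics: separately rounded multiply then add (two roundings),
// as v_mad_f32 / v_mac_f32 behave.
float fmad_f32(float a, float b, float c) { return a * b + c; }

// G_FMA semantics: fused multiply-add with a single final rounding.
float fma_f32(float a, float b, float c) { return std::fma(a, b, c); }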

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 36 additions & 12 deletions
@@ -10616,19 +10616,43 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
     return FastLowered;
 
   SDLoc SL(Op);
-  SDValue Src0 = Op.getOperand(0);
-  SDValue Src1 = Op.getOperand(1);
-
-  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
-  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
-
-  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
-  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
-
-  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
-  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
 
-  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+  // q16.u = opx(V_CVT_F16_F32, q32.u);
+  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
+  // We will use ISD::FMA on targets that don't support ISD::FMAD.
+  unsigned FMADOpCode =
+      isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
+
+  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
+  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
+  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
+  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot);
+  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
+  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
+  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
+  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
+                        DAG.getConstant(0xff800000, SL, MVT::i32));
+  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
+  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
+  SDValue FPRoundFlag = DAG.getConstant(0, SL, MVT::i32);
+  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
 }
 
 // Faster 2.5 ULP division that does not support denormals.
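A worked example of the 0xff800000 mask both lowerings share: under the IEEE-754 binary32 layout (1 sign bit, 8 exponent bits, 23 mantissa bits), ANDing the bit pattern with 0xff800000 zeroes the mantissa, snapping the correction term to a signed power of two of equal or smaller magnitude:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float tmp = -3.14f;               // stand-in for the correction term
  uint32_t u;
  std::memcpy(&u, &tmp, sizeof(u)); // the ISD::BITCAST f32 -> i32 above
  u &= 0xff800000u;                 // keep sign(1) + exponent(8) bits only
  std::memcpy(&tmp, &u, sizeof(u)); // bitcast back to f32
  std::printf("%g\n", tmp);         // prints -2, i.e. sign * 2^exponent
  return 0;
}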

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 62 additions & 20 deletions
@@ -46,8 +46,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
@@ -554,19 +560,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    s_lshr_b32 s3, s0, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v3
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
 ; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
+; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
+; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s1
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
@@ -691,41 +709,65 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    s_lshr_b32 s8, s0, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_lshr_b32 s6, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    s_lshr_b32 s9, s1, 16
+; VI-NEXT:    s_lshr_b32 s7, s3, 16
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
 ; VI-NEXT:    v_mov_b32_e32 v2, s8
-; VI-NEXT:    v_rcp_f32_e32 v4, v4
-; VI-NEXT:    s_lshr_b32 s9, s1, 16
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
 ; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT:    s_lshr_b32 s7, s3, 16
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
+; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
+; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_rcp_f32_e32 v5, v5
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
 ; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT:    v_rcp_f32_e32 v5, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_mul_f32_e32 v2, v2, v4
+; VI-NEXT:    v_mul_f32_e32 v6, v2, v5
+; VI-NEXT:    v_mad_f32 v7, -v4, v6, v2
+; VI-NEXT:    v_mac_f32_e32 v6, v7, v5
+; VI-NEXT:    v_mad_f32 v2, -v4, v6, v2
+; VI-NEXT:    v_mul_f32_e32 v2, v2, v5
+; VI-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; VI-NEXT:    v_add_f32_e32 v2, v2, v6
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
 ; VI-NEXT:    v_mov_b32_e32 v4, s9
 ; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
 ; VI-NEXT:    v_trunc_f16_e32 v2, v2
 ; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
+; VI-NEXT:    v_rcp_f32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
+; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
+; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
+; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
+; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
+; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; VI-NEXT:    v_add_f32_e32 v3, v3, v7
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
 ; VI-NEXT:    v_trunc_f16_e32 v3, v3