-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AMDGPU] Adopt new lowering sequence for fdiv16
#109295
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes: The current lowering of fdiv16 can generate an incorrectly rounded result in some cases. Fixes SWDEV-477608. Patch is 459.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109295.diff 9 Files Affected:
diff --git a/8925731.diff b/8925731.diff
new file mode 100644
index 00000000000000..a1f9fa92350375
--- /dev/null
+++ b/8925731.diff
@@ -0,0 +1,4542 @@
+From 8925731b320f6afbbd6a7d0b1b3520f616688aa5 Mon Sep 17 00:00:00 2001
+From: Shilei Tian <[email protected]>
+Date: Tue, 10 Sep 2024 15:38:37 -0400
+Subject: [PATCH] SWDEV-477608 - Adapt new lowering of fdiv16
+
+The current lowering of fdiv16 can generate incorrectly rounded result in some
+cases.
+
+Change-Id: I302365dbd9ceedfc364a25f7806798072b3b14af
+---
+
+diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+index e657f66..86d488e 100644
+--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
++++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+@@ -4902,14 +4902,18 @@
+
+ auto LHSExt = B.buildFPExt(S32, LHS, Flags);
+ auto RHSExt = B.buildFPExt(S32, RHS, Flags);
+-
+- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
++ auto NegRHSExt = B.buildFNeg(S32, RHSExt);
++ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ .addUse(RHSExt.getReg(0))
+ .setMIFlags(Flags);
+-
+- auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
+- auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
+-
++ auto Quot = B.buildFMul(S32, LHSExt, Rcp);
++ auto Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
++ Quot = B.buildFMA(S32, Err, Rcp, Quot);
++ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
++ auto Tmp = B.buildFMul(S32, Err, Rcp);
++ Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
++ Quot = B.buildFAdd(S32, Tmp, Quot);
++ auto RDst = B.buildFPTrunc(S16, Quot, Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
+ .addUse(RDst.getReg(0))
+ .addUse(RHS)
+diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+index fd35314..ece40a3 100644
+--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
++++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+@@ -10816,19 +10816,25 @@
+ return FastLowered;
+
+ SDLoc SL(Op);
+- SDValue Src0 = Op.getOperand(0);
+- SDValue Src1 = Op.getOperand(1);
+-
+- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+-
+- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
+- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+-
++ SDValue LHS = Op.getOperand(0);
++ SDValue RHS = Op.getOperand(1);
++ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
++ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
++ SDValue NegRHSExt =DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
++ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
++ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
++ SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
++ Quot = DAG.getNode(ISD::FMA, SL, MVT::f32, Err, Rcp, Quot);
++ Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
++ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
++ SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
++ TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
++ DAG.getTargetConstant(0xff800000, SL, MVT::i32));
++ Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
++ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
+ SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+-
+- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
++ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
++ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
+ }
+
+ // Faster 2.5 ULP division that does not support denormals.
+diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+index 89cd18a..c224621 100644
+--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
++++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+@@ -57,43 +57,37 @@
+ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+ ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+ ;
+-; GFX8-LABEL: v_fdiv_f16:
+-; GFX8: ; %bb.0:
+-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX8-NEXT: s_setpc_b64 s[30:31]
+-;
+-; GFX9-IEEE-LABEL: v_fdiv_f16:
+-; GFX9-IEEE: ; %bb.0:
+-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+-;
+-; GFX9-FLUSH-LABEL: v_fdiv_f16:
+-; GFX9-FLUSH: ; %bb.0:
+-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
++; GFX89-LABEL: v_fdiv_f16:
++; GFX89: ; %bb.0:
++; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
++; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
++; GFX89-NEXT: v_rcp_f32_e32 v4, v2
++; GFX89-NEXT: v_mul_f32_e32 v5, v3, v4
++; GFX89-NEXT: v_fma_f32 v6, -v2, v5, v3
++; GFX89-NEXT: v_fma_f32 v5, v6, v4, v5
++; GFX89-NEXT: v_fma_f32 v2, -v2, v5, v3
++; GFX89-NEXT: v_mul_f32_e32 v2, v2, v4
++; GFX89-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX89-NEXT: v_add_f32_e32 v2, v2, v5
++; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
++; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
++; GFX89-NEXT: s_setpc_b64 s[30:31]
+ ;
+ ; GFX10-LABEL: v_fdiv_f16:
+ ; GFX10: ; %bb.0:
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+ ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
++; GFX10-NEXT: v_mul_f32_e32 v3, v3, v2
++; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v2
++; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
++; GFX10-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
++; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+ ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
+ ;
+@@ -101,9 +95,17 @@
+ ; GFX11: ; %bb.0:
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
+ ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+ ; GFX11-NEXT: s_waitcnt_depctr 0xfff
+-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
++; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
++; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
++; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
++; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
++; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+ ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv half %a, %b
+@@ -188,43 +190,37 @@
+ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+ ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+ ;
+-; GFX8-LABEL: v_fdiv_f16_ulp25:
+-; GFX8: ; %bb.0:
+-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX8-NEXT: s_setpc_b64 s[30:31]
+-;
+-; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
+-; GFX9-IEEE: ; %bb.0:
+-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+-;
+-; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25:
+-; GFX9-FLUSH: ; %bb.0:
+-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
++; GFX89-LABEL: v_fdiv_f16_ulp25:
++; GFX89: ; %bb.0:
++; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
++; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
++; GFX89-NEXT: v_rcp_f32_e32 v4, v2
++; GFX89-NEXT: v_mul_f32_e32 v5, v3, v4
++; GFX89-NEXT: v_fma_f32 v6, -v2, v5, v3
++; GFX89-NEXT: v_fma_f32 v5, v6, v4, v5
++; GFX89-NEXT: v_fma_f32 v2, -v2, v5, v3
++; GFX89-NEXT: v_mul_f32_e32 v2, v2, v4
++; GFX89-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX89-NEXT: v_add_f32_e32 v2, v2, v5
++; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
++; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
++; GFX89-NEXT: s_setpc_b64 s[30:31]
+ ;
+ ; GFX10-LABEL: v_fdiv_f16_ulp25:
+ ; GFX10: ; %bb.0:
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+ ; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
++; GFX10-NEXT: v_mul_f32_e32 v3, v3, v2
++; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v2
++; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
++; GFX10-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
++; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+ ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
+ ;
+@@ -232,9 +228,17 @@
+ ; GFX11: ; %bb.0:
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
+ ; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+ ; GFX11-NEXT: s_waitcnt_depctr 0xfff
+-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
++; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
++; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
++; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
++; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
++; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+ ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv half %a, %b
+@@ -673,59 +677,67 @@
+ ; GFX8-LABEL: v_fdiv_v2f16:
+ ; GFX8: ; %bb.0:
+ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
+-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
+-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
+-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
+-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
+-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+-; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
+-; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
+-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+-; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
++; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
++; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
++; GFX8-NEXT: v_cvt_f32_f16_e32 v8, v6
++; GFX8-NEXT: v_rcp_f32_e32 v5, v2
++; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
++; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v3
++; GFX8-NEXT: v_mul_f32_e32 v9, v4, v5
++; GFX8-NEXT: v_fma_f32 v10, -v2, v9, v4
++; GFX8-NEXT: v_fma_f32 v9, v10, v5, v9
++; GFX8-NEXT: v_fma_f32 v2, -v2, v9, v4
++; GFX8-NEXT: v_rcp_f32_e32 v4, v8
++; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
++; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX8-NEXT: v_add_f32_e32 v2, v2, v9
++; GFX8-NEXT: v_mul_f32_e32 v5, v7, v4
++; GFX8-NEXT: v_fma_f32 v9, -v8, v5, v7
++; GFX8-NEXT: v_fma_f32 v5, v9, v4, v5
++; GFX8-NEXT: v_fma_f32 v7, -v8, v5, v7
++; GFX8-NEXT: v_mul_f32_e32 v4, v7, v4
++; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v4
++; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
++; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
++; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
++; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
++; GFX8-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+ ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+ ; GFX8-NEXT: s_setpc_b64 s[30:31]
+ ;
+-; GFX9-IEEE-LABEL: v_fdiv_v2f16:
+-; GFX9-IEEE: ; %bb.0:
+-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
+-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
+-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
+-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
+-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
+-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
+-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
+-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
+-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+-;
+-; GFX9-FLUSH-LABEL: v_fdiv_v2f16:
+-; GFX9-FLUSH: ; %bb.0:
+-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
+-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
+-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
+-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
+-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
++; GFX9-LABEL: v_fdiv_v2f16:
++; GFX9: ; %bb.0:
++; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
++; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
++; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
++; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
++; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v6
++; GFX9-NEXT: v_rcp_f32_e32 v5, v2
++; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
++; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3
++; GFX9-NEXT: v_mul_f32_e32 v9, v4, v5
++; GFX9-NEXT: v_fma_f32 v10, -v2, v9, v4
++; GFX9-NEXT: v_fma_f32 v9, v10, v5, v9
++; GFX9-NEXT: v_fma_f32 v2, -v2, v9, v4
++; GFX9-NEXT: v_rcp_f32_e32 v4, v8
++; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
++; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v2
++; GFX9-NEXT: v_add_f32_e32 v2, v2, v9
++; GFX9-NEXT: v_mul_f32_e32 v5, v7, v4
++; GFX9-NEXT: v_fma_f32 v9, -v8, v5, v7
++; GFX9-NEXT: v_fma_f32 v5, v9, v4, v5
++; GFX9-NEXT: v_fma_f32 v7, -v8, v5, v7
++; GFX9-NEXT: v_mul_f32_e32 v4, v7, v4
++; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4
++; GFX9-NEXT: v_add_f32_e32 v4, v4, v5
++; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
++; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
++; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
++; GFX9-NEXT: v_div_fixup_f16 v1, v4, v6, v3
++; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
++; GFX9-NEXT: s_setpc_b64 s[30:31]
+ ;
+ ; GFX10-LABEL: v_fdiv_v2f16:
+ ; GFX10: ; %bb.0:
+@@ -733,11 +745,27 @@
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+ ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
++; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0
+ ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+ ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
++; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
+ ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+-; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
++; GFX10-NEXT: v_mul_f32_e32 v6, v6, v3
++; GFX10-NEXT: v_mul_f32_e32 v7, v7, v4
++; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v3
++; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v4
++; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
++; GFX10-NEXT: v_mul_f32_e32 v3, v8, v3
++; GFX10-NEXT: v_mul_f32_e32 v4, v9, v4
++; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3
++; GFX10-NEXT: v_and_b32_e32 v4, 0xff800000, v4
++; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
++; GFX10-NEXT: v_add_f32_e32 v4, v4, v7
++; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
++; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+ ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+ ; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+ ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+@@ -749,12 +777,24 @@
+ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+ ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+ ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
++; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
+ ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+ ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
++; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
+ ; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+ ; GFX11-NEXT: s_waitcnt_depctr 0xfff
+-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+-; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
++; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
++; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
++; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
++; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
++; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
++; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4
++; GFX11-NEXT: v_add_f32_e32 v4, v4, v7
++; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
++; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+ ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+ ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+ ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+@@ -900,59 +940,67 @@
+ ; GFX8-LABEL: v_fdiv_v2f16_ulp25:
+ ; GFX8: ; %bb.0:
+ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
+-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
+-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
+-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
+-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
+-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+-; GFX...
[truncated]
|
fa5cc64
to
0b9f3f5
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Can you explain the algorithm? Or at least add a comment showing what the algorithm does in pseudocode, so I don't have to work it out from all the getNode calls? |
Done. Please refer to the ticket (as well as the Gerrit PR attached to it) for more details. |
I meant as a comment in the code. |
0b9f3f5
to
f8e0d13
Compare
Done. |
f8e0d13
to
0ef2672
Compare
Did you mean "adopt"? |
fdiv16
fdiv16
😂 indeed |
2b7d74a
to
03ada68
Compare
SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); | ||
SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt); | ||
SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp); | ||
SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should switch to FMAD or FMA based on legality (same in both paths)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought legalizer will legalize the (unsupported) instruction, since we are not using a target node here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They are semantically different. They are generally not interchangeable
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we use FMA
for newer targets because FMAD
is not available right? I think a more appropriate approach would be to use FMAD
(since it is the one that HW team was using), and let the legalizer to handle the case when FMAD
is not supported.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, because you cannot simply legalize FMAD into FMA. They are different operations. You have to use the one you know is preferable from the beginning
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment from hardware had V_MAD_F32 in it to begin with. I'm still skeptical this is the optimal sequence though
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The point still stands that you can't choose between FMA and FMAD based on preference unless you have verified that both work correctly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jayfoad The HW sequence is FMAD
, but I did my experiments using FMA
since the very beginning. It works perfectly fine (passing both @b-sumner 's test case as well as OpenCL CTS). That's why the first version of this PR uses FMA
regardless, until @arsenm pointed it out in this thread.
I understand FMA
and FMAD
are (slightly) semantically different, but since FMA
is widely available and passes the OpenCL CTS (not just math, but also all the others FP16 related), do you think it makes sense here to just use FMA
? @jayfoad @arsenm @b-sumner What's more, @arsenm already suggested to choose between FMA
and FMAD
based on availability, this also indicates FMA
can be a "direct" replacement.
BTW, the reason I chose FMA
to begin with was, as I mentioned in the ticket, I found FMAD
is not always available and I asked the question in the ticket, both @arsenm and @b-sumner said FMA
can be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FMA is slow on older targets, and it's not hard to just swap the emitted opcode for their benefit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jayfoad The HW sequence is
FMAD
, but I did my experiments usingFMA
since the very beginning. It works perfectly fine (passing both @b-sumner 's test case as well as OpenCL CTS). That's why the first version of this PR usesFMA
regardless, until @arsenm pointed it out in this thread.I understand
FMA
andFMAD
are (slightly) semantically different, but sinceFMA
is widely available and passes the OpenCL CTS (not just math, but also all the others FP16 related), do you think it makes sense here to just useFMA
? @jayfoad @arsenm @b-sumner What's more, @arsenm already suggested to choose betweenFMA
andFMAD
based on availability, this also indicatesFMA
can be a "direct" replacement.BTW, the reason I chose
FMA
to begin with was, as I mentioned in the ticket, I foundFMAD
is not always available and I asked the question in the ticket, both @arsenm and @b-sumner saidFMA
can be used.
I don't know how thorough the OpenCL conformance tests are. Is it possible to exhaustively test this on all ~2^32 possible inputs?
50afd63
to
5871a21
Compare
5871a21
to
8eb2259
Compare
7f665f0
to
8131471
Compare
8eb2259
to
76a9615
Compare
8131471
to
b7e1fa3
Compare
76a9615
to
79e7b78
Compare
b7e1fa3
to
0edac25
Compare
79e7b78
to
63f3d8f
Compare
0edac25
to
38b55c0
Compare
63f3d8f
to
99052f4
Compare
38b55c0
to
9f1f15e
Compare
99052f4
to
2eb020e
Compare
9f1f15e
to
494e9fa
Compare
2eb020e
to
d3e71ba
Compare
494e9fa
to
c5680dd
Compare
d3e71ba
to
1fcfeea
Compare
The current lowering of fdiv16 can generate an incorrectly rounded result in some cases. Fixes SWDEV-477608.
1fcfeea
to
a5a095c
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/7151 Here is the relevant piece of the build log for the reference
|
The current lowering of `fdiv16` can generate an incorrectly rounded result in some cases. The new sequence was provided by the HW team, as shown below written in C++. ``` half fdiv(half a, half b) { float a32 = float(a); float b32 = float(b); float r32 = 1.0f / b32; float q32 = a32 * r32; float e32 = -b32 * q32 + a32; q32 = e32 * r32 + q32; e32 = -b32 * q32 + a32; float tmp = e32 * r32; uint32_t tmp32 = std::bit_cast<uint32_t>(tmp); tmp32 = tmp32 & 0xff800000; tmp = std::bit_cast<float>(tmp32); q32 = tmp + q32; half q16 = half(q32); q16 = div_fixup_f16(q16); return q16; } ``` Fixes SWDEV-477608. (cherry picked from commit 88a239d) Change-Id: I4d1dffc2037c66d2ebe0cf7b2e144eae78008d04
The current lowering of
fdiv16
can generate an incorrectly rounded result in some cases. The new sequence was provided by the HW team, as shown below written in C++. Fixes SWDEV-477608.