Fix reciprocal round-trip error in FDIV emulation (OCL)

michalpaszkowski · igcbot · commit a422a1acac6b · 2024-05-12T07:41:43.000+02:00
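This commit simplifies the FDIV instruction expansion by folding the
case where x == y directly into the main computation flow. For OpenCL
kernels, when x and y are equal and are normal values (neither is +/-0,
+/-NaN, +/-Inf, or subnormal), a select is added to set the division
result to 1.0, thus avoiding the potential reciprocal round-trip error
of computing rcp(y) * x. To make the shader type available,
expandFDIVInstructions now takes a ShaderType argument, which both
callers pass from the code generation context.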
diff --git a/IGC/Compiler/LegalizationPass.cpp b/IGC/Compiler/LegalizationPass.cpp
@@ -37,7 +37,7 @@ using namespace IGC::IGCMD;
 
 namespace IGC {
 
-    bool expandFDIVInstructions(llvm::Function& F);
+    bool expandFDIVInstructions(llvm::Function &F, ShaderType ShaderTy);
 
 } // namespace IGC
 
@@ -108,7 +108,7 @@ bool Legalization::runOnFunction(Function& F)
 
     // Legalize fdiv if any
     if (!m_ctx->platform.hasFDIV())
-        expandFDIVInstructions(F);
+        expandFDIVInstructions(F, m_ctx->type);
     return true;
 }
 
@@ -2806,8 +2806,7 @@ static bool needsNoScaling(Value* Val)
 //       S = 2^(-32) if exp(y) >= 200,
 //       S = 1.0f otherwise
 //
-bool IGC::expandFDIVInstructions(llvm::Function& F)
-{
+bool IGC::expandFDIVInstructions(llvm::Function &F, ShaderType ShaderTy) {
     bool Changed = false;
     for (auto& BB : F.getBasicBlockList()) {
         for (auto Iter = BB.begin(); Iter != BB.end();) {
@@ -2855,28 +2854,36 @@ bool IGC::expandFDIVInstructions(llvm::Function& F)
                 V = Builder.CreateFMul(Y, X);
             }
             else {
+                Value* YAsInt32 = Builder.CreateBitCast(Y, Builder.getInt32Ty());
+                Value* YExp = Builder.CreateAnd(YAsInt32, Builder.getInt32(0x7f800000));
+
                 float S32 = uint64_t(1) << 32;
                 ConstantFP* C0 = ConstantFP::get(Ctx, APFloat(S32));
                 ConstantFP* C1 = ConstantFP::get(Ctx, APFloat(1.0f));
                 ConstantFP* C2 = ConstantFP::get(Ctx, APFloat(1.0f / S32));
 
-                Value* Exp = Builder.CreateAnd(
-                    Builder.CreateBitCast(Y, Builder.getInt32Ty()),
-                    Builder.getInt32(0x7f800000));
-
-                // Check if B's exponent is 0, scale up.
-                Value* P1 = Builder.CreateICmpEQ(Exp, Builder.getInt32(0));
-                Value* Scale = Builder.CreateSelect(P1, C0, C1);
-
-                // Check if B's exponent >= 200, scale down.
-                Value* P2 = Builder.CreateICmpUGE(Exp, Builder.getInt32(200 << 23));
-                Scale = Builder.CreateSelect(P2, C2, Scale);
+                // Determine the appropriate scale based on Y's exponent.
+                Value* ScaleUp = Builder.CreateSelect(Builder.CreateICmpEQ(YExp, Builder.getInt32(0)), C0, C1);
+                Value* Scale = Builder.CreateSelect(Builder.CreateICmpUGE(YExp, Builder.getInt32(200 << 23)), C2, ScaleUp);
 
                 // Compute rcp(y * S) * x * S
-                V = Builder.CreateFMul(Y, Scale);
-                V = Builder.CreateFDiv(C1, V);
-                V = Builder.CreateFMul(V, X);
+                Value *ScaledY = Builder.CreateFMul(Y, Scale);
+                ScaledY = Builder.CreateFDiv(C1, ScaledY);
+                V = Builder.CreateFMul(ScaledY, X);
                 V = Builder.CreateFMul(V, Scale);
+
+                // For OpenCL kernels: if x == y (ordered compare, so neither is NaN) and Y's exponent and mantissa
+                // fields are both nonzero (rules out +/-0, subnormals, +/-Inf and exact powers of two), select 1.0f.
+                if (ShaderTy == ShaderType::OPENCL_SHADER) {
+                    Value* CmpXY = Builder.CreateFCmpOEQ(X, Y);
+                    Value* YMantissa = Builder.CreateAnd(YAsInt32, Builder.getInt32(0x007fffff));
+                    Value* CmpYExpZero = Builder.CreateICmpEQ(YExp, Builder.getInt32(0));
+                    Value* CmpYMantissaZero = Builder.CreateICmpEQ(YMantissa, Builder.getInt32(0));
+                    Value* CmpYIsZeroOrSubnormal = Builder.CreateOr(CmpYExpZero, CmpYMantissaZero);
+                    Value* CmpYIsNotZeroOrSubnormal = Builder.CreateNot(CmpYIsZeroOrSubnormal);
+                    V = Builder.CreateSelect(Builder.CreateAnd(CmpXY, CmpYIsNotZeroOrSubnormal),
+                        ConstantFP::get(Ctx, APFloat(1.0f)), V);
+                }
             }
 
             Inst->replaceAllUsesWith(V);
@@ -2898,6 +2905,7 @@ namespace IGC {
         void getAnalysisUsage(AnalysisUsage& AU) const override
         {
             AU.setPreservesCFG();
+            AU.addRequired<CodeGenContextWrapper>();
         }
     };
 
@@ -2920,6 +2928,7 @@ GenFDIVEmulation::GenFDIVEmulation()
 
 bool GenFDIVEmulation::runOnFunction(Function& F)
 {
+    IGC::CodeGenContext* m_ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
     // Always emulate fdiv instructions.
-    return expandFDIVInstructions(F);
+    return expandFDIVInstructions(F, m_ctx->type);
 }
diff --git a/IGC/Compiler/tests/GenFDIVEmulation/basic.ll b/IGC/Compiler/tests/GenFDIVEmulation/basic.ll
@@ -16,20 +16,28 @@
 ; CHECK: CheckModuleDebugify: PASS
 
 define void @test_fdiv(float %a, float %b) {
-; CHECK-LABEL: @test_fdiv(
-; CHECK:    [[TMP1:%[A-z0-9]*]] = bitcast float [[B:%[A-z0-9]*]] to i32
-; CHECK:    [[TMP2:%[A-z0-9]*]] = and i32 [[TMP1]], 2139095040
-; CHECK:    [[TMP3:%[A-z0-9]*]] = icmp eq i32 [[TMP2]], 0
-; CHECK:    [[TMP4:%[A-z0-9]*]] = select i1 [[TMP3]], float 0x41F0000000000000, float 1.000000e+00
-; CHECK:    [[TMP5:%[A-z0-9]*]] = icmp uge i32 [[TMP2]], 1677721600
-; CHECK:    [[TMP6:%[A-z0-9]*]] = select i1 [[TMP5]], float 0x3DF0000000000000, float [[TMP4]]
-; CHECK:    [[TMP7:%[A-z0-9]*]] = fmul float [[B]], [[TMP6]]
-; CHECK:    [[TMP8:%[A-z0-9]*]] = fdiv float 1.000000e+00, [[TMP7]]
-; CHECK:    [[TMP9:%[A-z0-9]*]] = fmul float [[TMP8]], [[A:%[A-z0-9]*]]
-; CHECK:    [[TMP10:%[A-z0-9]*]] = fmul float [[TMP9]], [[TMP6]]
-; CHECK:    call void @use.f32(float [[TMP10]])
-; CHECK:    ret void
-;
+  ; CHECK-LABEL: @test_fdiv(
+  ; CHECK:    [[TMP1:%[A-z0-9]+]] = bitcast float [[B:%[A-z0-9]*]] to i32
+  ; CHECK:    [[TMP2:%[A-z0-9]+]] = and i32 [[TMP1]], 2139095040
+  ; CHECK:    [[TMP3:%[A-z0-9]+]] = icmp eq i32 [[TMP2]], 0
+  ; CHECK:    [[TMP4:%[A-z0-9]+]] = select i1 [[TMP3]], float 0x41F0000000000000, float 1.000000e+00
+  ; CHECK:    [[TMP5:%[A-z0-9]+]] = icmp uge i32 [[TMP2]], 1677721600
+  ; CHECK:    [[TMP6:%[A-z0-9]+]] = select i1 [[TMP5]], float 0x3DF0000000000000, float [[TMP4]]
+  ; CHECK:    [[TMP7:%[A-z0-9]+]] = fmul float [[B]], [[TMP6]]
+  ; CHECK:    [[TMP8:%[A-z0-9]+]] = fdiv float 1.000000e+00, [[TMP7]]
+  ; CHECK:    [[TMP9:%[A-z0-9]+]] = fmul float [[TMP8]], [[A:%[A-z0-9]*]]
+  ; CHECK:    [[TMP10:%[A-z0-9]+]] = fmul float [[TMP9]], [[TMP6]]
+  ; CHECK:    [[TMP11:%[A-z0-9]+]] = fcmp oeq float [[A]], [[B]]
+  ; CHECK:    [[TMP12:%[A-z0-9]+]] = and i32 [[TMP1]], 8388607
+  ; CHECK:    [[TMP13:%[A-z0-9]+]] = icmp eq i32 [[TMP2]], 0
+  ; CHECK:    [[TMP14:%[A-z0-9]+]] = icmp eq i32 [[TMP12]], 0
+  ; CHECK:    [[TMP15:%[A-z0-9]+]] = or i1 [[TMP13]], [[TMP14]]
+  ; CHECK:    [[TMP16:%[A-z0-9]+]] = xor i1 [[TMP15]], true
+  ; CHECK:    [[TMP17:%[A-z0-9]+]] = and i1 [[TMP11]], [[TMP16]]
+  ; CHECK:    [[TMP18:%[A-z0-9]+]] = select i1 [[TMP17]], float 1.000000e+00, float [[TMP10]]
+  ; CHECK:    call void @use.f32(float [[TMP18]])
+  ; CHECK:    ret void
+  ;
   %1 = fdiv float %a, %b
   call void @use.f32(float %1)
   ret void