Skip to content

Commit a422a1a

Browse files
michalpaszkowski authored and igcbot committed
Fix reciprocal round-trip error in FDIV emulation (OCL)
This commit simplifies the FDIV instruction expansion by incorporating the x == y case directly into the main computation flow. For OpenCL kernels, when x and y are equal and are normal values (neither is +/-0, +/-NaN, +/-Inf, or subnormal), a select is added to set the division result to 1.0. This avoids the potential reciprocal round-trip error, where x * rcp(y) does not round to exactly 1.0 because rcp(y) is itself rounded.
1 parent 91a9ce5 commit a422a1a

File tree

2 files changed

+50
-33
lines changed

2 files changed

+50
-33
lines changed

IGC/Compiler/LegalizationPass.cpp

Lines changed: 28 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ using namespace IGC::IGCMD;
3737

3838
namespace IGC {
3939

40-
bool expandFDIVInstructions(llvm::Function& F);
40+
bool expandFDIVInstructions(llvm::Function &F, ShaderType ShaderTy);
4141

4242
} // namespace IGC
4343

@@ -108,7 +108,7 @@ bool Legalization::runOnFunction(Function& F)
108108

109109
// Legalize fdiv if any
110110
if (!m_ctx->platform.hasFDIV())
111-
expandFDIVInstructions(F);
111+
expandFDIVInstructions(F, m_ctx->type);
112112
return true;
113113
}
114114

@@ -2806,8 +2806,7 @@ static bool needsNoScaling(Value* Val)
28062806
// S = 2^(-32) if exp(y) >= 200,
28072807
// S = 1.0f otherwise
28082808
//
2809-
bool IGC::expandFDIVInstructions(llvm::Function& F)
2810-
{
2809+
bool IGC::expandFDIVInstructions(llvm::Function &F, ShaderType ShaderTy) {
28112810
bool Changed = false;
28122811
for (auto& BB : F.getBasicBlockList()) {
28132812
for (auto Iter = BB.begin(); Iter != BB.end();) {
@@ -2855,28 +2854,36 @@ bool IGC::expandFDIVInstructions(llvm::Function& F)
28552854
V = Builder.CreateFMul(Y, X);
28562855
}
28572856
else {
2857+
Value* YAsInt32 = Builder.CreateBitCast(Y, Builder.getInt32Ty());
2858+
Value* YExp = Builder.CreateAnd(YAsInt32, Builder.getInt32(0x7f800000));
2859+
28582860
float S32 = uint64_t(1) << 32;
28592861
ConstantFP* C0 = ConstantFP::get(Ctx, APFloat(S32));
28602862
ConstantFP* C1 = ConstantFP::get(Ctx, APFloat(1.0f));
28612863
ConstantFP* C2 = ConstantFP::get(Ctx, APFloat(1.0f / S32));
28622864

2863-
Value* Exp = Builder.CreateAnd(
2864-
Builder.CreateBitCast(Y, Builder.getInt32Ty()),
2865-
Builder.getInt32(0x7f800000));
2866-
2867-
// Check if B's exponent is 0, scale up.
2868-
Value* P1 = Builder.CreateICmpEQ(Exp, Builder.getInt32(0));
2869-
Value* Scale = Builder.CreateSelect(P1, C0, C1);
2870-
2871-
// Check if B's exponent >= 200, scale down.
2872-
Value* P2 = Builder.CreateICmpUGE(Exp, Builder.getInt32(200 << 23));
2873-
Scale = Builder.CreateSelect(P2, C2, Scale);
2865+
// Determine the appropriate scale based on Y's exponent.
2866+
Value* ScaleUp = Builder.CreateSelect(Builder.CreateICmpEQ(YExp, Builder.getInt32(0)), C0, C1);
2867+
Value* Scale = Builder.CreateSelect(Builder.CreateICmpUGE(YExp, Builder.getInt32(200 << 23)), C2, ScaleUp);
28742868

28752869
// Compute rcp(y * S) * x * S
2876-
V = Builder.CreateFMul(Y, Scale);
2877-
V = Builder.CreateFDiv(C1, V);
2878-
V = Builder.CreateFMul(V, X);
2870+
Value *ScaledY = Builder.CreateFMul(Y, Scale);
2871+
ScaledY = Builder.CreateFDiv(C1, ScaledY);
2872+
V = Builder.CreateFMul(ScaledY, X);
28792873
V = Builder.CreateFMul(V, Scale);
2874+
2875+
// In case of OpenCL kernels, create comparisons to check if X or Y is +/-0, +/-Inf, +/-NaN,
2876+
// or subnormal. If x == y and y is a normal number, select 1.0f as a result for better precision.
2877+
if (ShaderTy == ShaderType::OPENCL_SHADER) {
2878+
Value* CmpXY = Builder.CreateFCmpOEQ(X, Y);
2879+
Value* YMantissa = Builder.CreateAnd(YAsInt32, Builder.getInt32(0x007fffff));
2880+
Value* CmpYExpZero = Builder.CreateICmpEQ(YExp, Builder.getInt32(0));
2881+
Value* CmpYMantissaZero = Builder.CreateICmpEQ(YMantissa, Builder.getInt32(0));
2882+
Value* CmpYIsZeroOrSubnormal = Builder.CreateOr(CmpYExpZero, CmpYMantissaZero);
2883+
Value* CmpYIsNotZeroOrSubnormal = Builder.CreateNot(CmpYIsZeroOrSubnormal);
2884+
V = Builder.CreateSelect(Builder.CreateAnd(CmpXY, CmpYIsNotZeroOrSubnormal),
2885+
ConstantFP::get(Ctx, APFloat(1.0f)), V);
2886+
}
28802887
}
28812888

28822889
Inst->replaceAllUsesWith(V);
@@ -2898,6 +2905,7 @@ namespace IGC {
28982905
void getAnalysisUsage(AnalysisUsage& AU) const override
28992906
{
29002907
AU.setPreservesCFG();
2908+
AU.addRequired<CodeGenContextWrapper>();
29012909
}
29022910
};
29032911

@@ -2920,6 +2928,7 @@ GenFDIVEmulation::GenFDIVEmulation()
29202928

29212929
bool GenFDIVEmulation::runOnFunction(Function& F)
29222930
{
2931+
IGC::CodeGenContext* m_ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
29232932
// Always emulate fdiv instructions.
2924-
return expandFDIVInstructions(F);
2933+
return expandFDIVInstructions(F, m_ctx->type);
29252934
}

IGC/Compiler/tests/GenFDIVEmulation/basic.ll

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,20 +16,28 @@
1616
; CHECK: CheckModuleDebugify: PASS
1717

1818
define void @test_fdiv(float %a, float %b) {
19-
; CHECK-LABEL: @test_fdiv(
20-
; CHECK: [[TMP1:%[A-z0-9]*]] = bitcast float [[B:%[A-z0-9]*]] to i32
21-
; CHECK: [[TMP2:%[A-z0-9]*]] = and i32 [[TMP1]], 2139095040
22-
; CHECK: [[TMP3:%[A-z0-9]*]] = icmp eq i32 [[TMP2]], 0
23-
; CHECK: [[TMP4:%[A-z0-9]*]] = select i1 [[TMP3]], float 0x41F0000000000000, float 1.000000e+00
24-
; CHECK: [[TMP5:%[A-z0-9]*]] = icmp uge i32 [[TMP2]], 1677721600
25-
; CHECK: [[TMP6:%[A-z0-9]*]] = select i1 [[TMP5]], float 0x3DF0000000000000, float [[TMP4]]
26-
; CHECK: [[TMP7:%[A-z0-9]*]] = fmul float [[B]], [[TMP6]]
27-
; CHECK: [[TMP8:%[A-z0-9]*]] = fdiv float 1.000000e+00, [[TMP7]]
28-
; CHECK: [[TMP9:%[A-z0-9]*]] = fmul float [[TMP8]], [[A:%[A-z0-9]*]]
29-
; CHECK: [[TMP10:%[A-z0-9]*]] = fmul float [[TMP9]], [[TMP6]]
30-
; CHECK: call void @use.f32(float [[TMP10]])
31-
; CHECK: ret void
32-
;
19+
; CHECK-LABEL: @test_fdiv(
20+
; CHECK: [[TMP1:%[A-z0-9]+]] = bitcast float [[B:%[A-z0-9]*]] to i32
21+
; CHECK: [[TMP2:%[A-z0-9]+]] = and i32 [[TMP1]], 2139095040
22+
; CHECK: [[TMP3:%[A-z0-9]+]] = icmp eq i32 [[TMP2]], 0
23+
; CHECK: [[TMP4:%[A-z0-9]+]] = select i1 [[TMP3]], float 0x41F0000000000000, float 1.000000e+00
24+
; CHECK: [[TMP5:%[A-z0-9]+]] = icmp uge i32 [[TMP2]], 1677721600
25+
; CHECK: [[TMP6:%[A-z0-9]+]] = select i1 [[TMP5]], float 0x3DF0000000000000, float [[TMP4]]
26+
; CHECK: [[TMP7:%[A-z0-9]+]] = fmul float [[B]], [[TMP6]]
27+
; CHECK: [[TMP8:%[A-z0-9]+]] = fdiv float 1.000000e+00, [[TMP7]]
28+
; CHECK: [[TMP9:%[A-z0-9]+]] = fmul float [[TMP8]], [[A:%[A-z0-9]*]]
29+
; CHECK: [[TMP10:%[A-z0-9]+]] = fmul float [[TMP9]], [[TMP6]]
30+
; CHECK: [[TMP11:%[A-z0-9]+]] = fcmp oeq float [[A]], [[B]]
31+
; CHECK: [[TMP12:%[A-z0-9]+]] = and i32 [[TMP1]], 8388607
32+
; CHECK: [[TMP13:%[A-z0-9]+]] = icmp eq i32 [[TMP2]], 0
33+
; CHECK: [[TMP14:%[A-z0-9]+]] = icmp eq i32 [[TMP12]], 0
34+
; CHECK: [[TMP15:%[A-z0-9]+]] = or i1 [[TMP13]], [[TMP14]]
35+
; CHECK: [[TMP16:%[A-z0-9]+]] = xor i1 [[TMP15]], true
36+
; CHECK: [[TMP17:%[A-z0-9]+]] = and i1 [[TMP11]], [[TMP16]]
37+
; CHECK: [[TMP18:%[A-z0-9]+]] = select i1 [[TMP17]], float 1.000000e+00, float [[TMP10]]
38+
; CHECK: call void @use.f32(float [[TMP18]])
39+
; CHECK: ret void
40+
;
3341
%1 = fdiv float %a, %b
3442
call void @use.f32(float %1)
3543
ret void

0 commit comments

Comments
 (0)