Skip to content

Commit bac2a07

Browse files
committed
clang: Attach !fpmath metadata to __builtin_sqrt based on language flags
OpenCL and HIP have -cl-fp32-correctly-rounded-divide-sqrt and -fno-hip-correctly-rounded-divide-sqrt. The corresponding fpmath metadata was only set on fdiv, and not sqrt. The backend is currently underutilizing sqrt lowering options, and the responsibility is split between the libraries and backend and this metadata is needed. CUDA/NVCC has -prec-div and -prev-sqrt but clang doesn't appear to be aiming for compatibility with those. Don't know if OpenMP has a similar control.
1 parent c4ccd6e commit bac2a07

File tree

6 files changed

+98
-24
lines changed

6 files changed

+98
-24
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2544,11 +2544,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
25442544
case Builtin::BI__builtin_sqrtf:
25452545
case Builtin::BI__builtin_sqrtf16:
25462546
case Builtin::BI__builtin_sqrtl:
2547-
case Builtin::BI__builtin_sqrtf128:
2548-
return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2549-
Intrinsic::sqrt,
2550-
Intrinsic::experimental_constrained_sqrt));
2551-
2547+
case Builtin::BI__builtin_sqrtf128: {
2548+
llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
2549+
*this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
2550+
SetSqrtFPAccuracy(Call);
2551+
return RValue::get(Call);
2552+
}
25522553
case Builtin::BItrunc:
25532554
case Builtin::BItruncf:
25542555
case Builtin::BItruncl:

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5577,6 +5577,48 @@ void CodeGenFunction::SetFPAccuracy(llvm::Value *Val, float Accuracy) {
55775577
cast<llvm::Instruction>(Val)->setMetadata(llvm::LLVMContext::MD_fpmath, Node);
55785578
}
55795579

5580+
void CodeGenFunction::SetSqrtFPAccuracy(llvm::Value *Val) {
5581+
llvm::Type *EltTy = Val->getType()->getScalarType();
5582+
if (!EltTy->isFloatTy())
5583+
return;
5584+
5585+
if ((getLangOpts().OpenCL &&
5586+
!CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
5587+
(getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
5588+
!CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
5589+
// OpenCL v1.1 s7.4: minimum accuracy of single precision / is 3ulp
5590+
//
5591+
// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
5592+
// build option allows an application to specify that single precision
5593+
// floating-point divide (x/y and 1/x) and sqrt used in the program
5594+
// source are correctly rounded.
5595+
//
5596+
// TODO: CUDA has a prec-sqrt flag
5597+
SetFPAccuracy(Val, 3.0f);
5598+
}
5599+
}
5600+
5601+
void CodeGenFunction::SetDivFPAccuracy(llvm::Value *Val) {
5602+
llvm::Type *EltTy = Val->getType()->getScalarType();
5603+
if (!EltTy->isFloatTy())
5604+
return;
5605+
5606+
if ((getLangOpts().OpenCL &&
5607+
!CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
5608+
(getLangOpts().HIP && getLangOpts().CUDAIsDevice &&
5609+
!CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
5610+
// OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
5611+
//
5612+
// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
5613+
// build option allows an application to specify that single precision
5614+
// floating-point divide (x/y and 1/x) and sqrt used in the program
5615+
// source are correctly rounded.
5616+
//
5617+
// TODO: CUDA has a prec-div flag
5618+
SetFPAccuracy(Val, 2.5f);
5619+
}
5620+
}
5621+
55805622
namespace {
55815623
struct LValueOrRValue {
55825624
LValue LV;

clang/lib/CodeGen/CGExprScalar.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3478,21 +3478,7 @@ Value *ScalarExprEmitter::EmitDiv(const BinOpInfo &Ops) {
34783478
llvm::Value *Val;
34793479
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
34803480
Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div");
3481-
if ((CGF.getLangOpts().OpenCL &&
3482-
!CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) ||
3483-
(CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice &&
3484-
!CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) {
3485-
// OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp
3486-
// OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt
3487-
// build option allows an application to specify that single precision
3488-
// floating-point divide (x/y and 1/x) and sqrt used in the program
3489-
// source are correctly rounded.
3490-
llvm::Type *ValTy = Val->getType();
3491-
if (ValTy->isFloatTy() ||
3492-
(isa<llvm::VectorType>(ValTy) &&
3493-
cast<llvm::VectorType>(ValTy)->getElementType()->isFloatTy()))
3494-
CGF.SetFPAccuracy(Val, 2.5);
3495-
}
3481+
CGF.SetDivFPAccuracy(Val);
34963482
return Val;
34973483
}
34983484
else if (Ops.isFixedPointOp())

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4708,6 +4708,14 @@ class CodeGenFunction : public CodeGenTypeCache {
47084708
/// point operation, expressed as the maximum relative error in ulp.
47094709
void SetFPAccuracy(llvm::Value *Val, float Accuracy);
47104710

4711+
/// Set the minimum required accuracy of the given sqrt operation
4712+
/// based on CodeGenOpts.
4713+
void SetSqrtFPAccuracy(llvm::Value *Val);
4714+
4715+
/// Set the minimum required accuracy of the given sqrt operation based on
4716+
/// CodeGenOpts.
4717+
void SetDivFPAccuracy(llvm::Value *Val);
4718+
47114719
/// Set the codegen fast-math flags.
47124720
void SetFastMathFlags(FPOptions FPFeatures);
47134721

clang/test/CodeGenCUDA/correctly-rounded-div.cu

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,18 @@ __device__ double dpscalardiv(double a, double b) {
3232
return a / b;
3333
}
3434

35-
// NCRDIV: ![[MD]] = !{float 2.500000e+00}
35+
// COMMON-LABEL: @_Z12spscalarsqrt
36+
// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
37+
// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
38+
__device__ float spscalarsqrt(float a) {
39+
return __builtin_sqrtf(a);
40+
}
41+
42+
// COMMON-LABEL: @_Z12dpscalarsqrt
43+
// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
44+
// COMMON-NOT: !fpmath
45+
__device__ double dpscalarsqrt(double a) {
46+
return __builtin_sqrt(a);
47+
}
48+
49+
// NCRSQRT: ![[MD]] = !{float 2.500000e+00}

clang/test/CodeGenOpenCL/fpmath.cl

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,26 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
88
float spscalardiv(float a, float b) {
99
// CHECK: @spscalardiv
1010
// CHECK: fdiv{{.*}},
11-
// NODIVOPT: !fpmath ![[MD:[0-9]+]]
11+
// NODIVOPT: !fpmath ![[MD_FDIV:[0-9]+]]
1212
// DIVOPT-NOT: !fpmath !{{[0-9]+}}
1313
return a / b;
1414
}
1515

1616
float4 spvectordiv(float4 a, float4 b) {
1717
// CHECK: @spvectordiv
1818
// CHECK: fdiv{{.*}},
19-
// NODIVOPT: !fpmath ![[MD]]
19+
// NODIVOPT: !fpmath ![[MD_FDIV]]
2020
// DIVOPT-NOT: !fpmath !{{[0-9]+}}
2121
return a / b;
2222
}
2323

24+
float spscalarsqrt(float a) {
25+
// CHECK-LABEL: @spscalarsqrt
26+
// NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
27+
// DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
28+
return __builtin_sqrtf(a);
29+
}
30+
2431
#if __OPENCL_C_VERSION__ >=120
2532
void printf(constant char* fmt, ...);
2633

@@ -34,11 +41,27 @@ void testdbllit(long *val) {
3441

3542
#ifndef NOFP64
3643
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
44+
typedef __attribute__(( ext_vector_type(4) )) double double4;
45+
3746
double dpscalardiv(double a, double b) {
3847
// CHECK: @dpscalardiv
3948
// CHECK-NOT: !fpmath
4049
return a / b;
4150
}
51+
52+
double4 dpvectordiv(double4 a, double4 b) {
53+
// CHECK: @dpvectordiv
54+
// CHECK-NOT: !fpmath
55+
return a / b;
56+
}
57+
58+
double dpscalarsqrt(double a) {
59+
// CHECK-LABEL: @dpscalarsqrt
60+
// CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
61+
return __builtin_sqrt(a);
62+
}
63+
4264
#endif
4365

44-
// NODIVOPT: ![[MD]] = !{float 2.500000e+00}
66+
// NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
67+
// NODIVOPT: ![[MD_SQRT]] = !{float 3.000000e+00}

0 commit comments

Comments
 (0)