[flang] Add runtime support for Fortran intrinsic ERFC_SCALED (#95040)

d-parks · David Parks · web-flow · commit a03e93e1b217 · 2024-06-11T09:40:00.000-07:00
Co-authored-by: David Parks &lt;dparks@nvidia.com&gt;
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -204,6 +204,8 @@ struct IntrinsicLibrary {
                                            llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
                                         llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genErfcScaled(mlir::Type resultType,
+                            llvm::ArrayRef<mlir::Value> args);
   void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>);
   void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -18,6 +18,10 @@ class FirOpBuilder;
 
 namespace fir::runtime {
 
+/// Generate call to ErfcScaled intrinsic runtime routine.
+mlir::Value genErfcScaled(fir::FirOpBuilder &builder, mlir::Location loc,
+                          mlir::Value x);
+
 /// Generate call to Exponent intrinsic runtime routine.
 mlir::Value genExponent(fir::FirOpBuilder &builder, mlir::Location loc,
                         mlir::Type resultType, mlir::Value x);
diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h
@@ -73,6 +73,20 @@ CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling16_16)(
 #endif
 #endif
 
+// ERFC_SCALED
+CppTypeFor<TypeCategory::Real, 4> RTDECL(ErfcScaled4)(
+    CppTypeFor<TypeCategory::Real, 4>);
+CppTypeFor<TypeCategory::Real, 8> RTDECL(ErfcScaled8)(
+    CppTypeFor<TypeCategory::Real, 8>);
+#if LDBL_MANT_DIG == 64
+CppTypeFor<TypeCategory::Real, 10> RTDECL(ErfcScaled10)(
+    CppTypeFor<TypeCategory::Real, 10>);
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDECL(ErfcScaled16)(
+    CppTypeFor<TypeCategory::Real, 16>);
+#endif
+
 // EXPONENT is defined to return default INTEGER; support INTEGER(4 & 8)
 CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent4_4)(
     CppTypeFor<TypeCategory::Real, 4>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -224,6 +224,7 @@ static constexpr IntrinsicHandler handlers[]{
        {"boundary", asBox, handleDynamicOptional},
        {"dim", asValue}}},
      /*isElemental=*/false},
+    {"erfc_scaled", &I::genErfcScaled},
     {"etime",
      &I::genEtime,
      {{{"values", asBox}, {"time", asBox}}},
@@ -5878,6 +5879,16 @@ mlir::Value IntrinsicLibrary::genRRSpacing(mlir::Type resultType,
       fir::runtime::genRRSpacing(builder, loc, fir::getBase(args[0])));
 }
 
+// ERFC_SCALED
+mlir::Value IntrinsicLibrary::genErfcScaled(mlir::Type resultType,
+                                            llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 1);
+
+  return builder.createConvert(
+      loc, resultType,
+      fir::runtime::genErfcScaled(builder, loc, fir::getBase(args[0])));
+}
+
 // SAME_TYPE_AS
 fir::ExtendedValue
 IntrinsicLibrary::genSameTypeAs(mlir::Type resultType,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -22,6 +22,28 @@ using namespace Fortran::runtime;
 // may not have them in their runtime library. This can occur in the
 // case of cross compilation, for example.
 
+/// Placeholder for real*10 version of ErfcScaled Intrinsic
+struct ForcedErfcScaled10 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled10));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF80(ctx);
+      return mlir::FunctionType::get(ctx, {ty}, {ty});
+    };
+  }
+};
+
+/// Placeholder for real*16 version of ErfcScaled Intrinsic
+struct ForcedErfcScaled16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ErfcScaled16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF128(ctx);
+      return mlir::FunctionType::get(ctx, {ty}, {ty});
+    };
+  }
+};
+
 /// Placeholder for real*10 version of Exponent Intrinsic
 struct ForcedExponent10_4 {
   static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Exponent10_4));
@@ -444,6 +466,30 @@ mlir::Value fir::runtime::genRRSpacing(fir::FirOpBuilder &builder,
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
 
+/// Generate call to ErfcScaled intrinsic runtime routine.
+mlir::Value fir::runtime::genErfcScaled(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, mlir::Value x) {
+  mlir::func::FuncOp func;
+  mlir::Type fltTy = x.getType();
+
+  if (fltTy.isF32())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ErfcScaled4)>(loc, builder);
+  else if (fltTy.isF64())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ErfcScaled8)>(loc, builder);
+  else if (fltTy.isF80())
+    func = fir::runtime::getRuntimeFunc<ForcedErfcScaled10>(loc, builder);
+  else if (fltTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedErfcScaled16>(loc, builder);
+  else
+    fir::intrinsicTypeTODO(builder, fltTy, loc, "ERFC_SCALED");
+
+  auto funcTy = func.getFunctionType();
+  llvm::SmallVector<mlir::Value> args = {
+      builder.createConvert(loc, funcTy.getInput(0), x)};
+
+  return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
 /// Generate call to Scale intrinsic runtime routine.
 mlir::Value fir::runtime::genScale(fir::FirOpBuilder &builder,
                                    mlir::Location loc, mlir::Value x,
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
@@ -354,6 +354,110 @@ template <int PREC, typename T> inline RT_API_ATTRS T Spacing(T x) {
   }
 }
 
+// ERFC_SCALED (16.9.71)
+template <typename T> inline RT_API_ATTRS T ErfcScaled(T arg) {
+  // Coefficients for approximation to erfc in the first interval.
+  static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02,
+      3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1};
+  static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02,
+      1.28261652607737228e03, 2.84423683343917062e03};
+
+  // Coefficients for approximation to erfc in the second interval.
+  static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00,
+      6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02,
+      1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03,
+      2.15311535474403846e-8};
+  static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02,
+      5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03,
+      4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03};
+
+  // Coefficients for approximation to erfc in the third interval.
+  static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1,
+      1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4,
+      1.63153871373020978e-2};
+  static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00,
+      5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3};
+
+  constexpr T sqrtpi{1.7724538509078120380404576221783883301349L};
+  constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L};
+  constexpr T epsilonby2{std::numeric_limits<T>::epsilon() * 0.5};
+  constexpr T xneg{-26.628e0};
+  constexpr T xhuge{6.71e7};
+  constexpr T thresh{0.46875e0};
+  constexpr T zero{0.0};
+  constexpr T one{1.0};
+  constexpr T four{4.0};
+  constexpr T sixteen{16.0};
+  constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits<T>::min())};
+  static_assert(xmax > xhuge, "xmax must be greater than xhuge");
+
+  T ysq;
+  T xnum;
+  T xden;
+  T del;
+  T result;
+
+  auto x{arg};
+  auto y{std::fabs(x)};
+
+  if (y <= thresh) {
+    // evaluate erf for  |x| <= 0.46875
+    ysq = zero;
+    if (y > epsilonby2) {
+      ysq = y * y;
+    }
+    xnum = a[4] * ysq;
+    xden = ysq;
+    for (int i{0}; i < 3; i++) {
+      xnum = (xnum + a[i]) * ysq;
+      xden = (xden + b[i]) * ysq;
+    }
+    result = x * (xnum + a[3]) / (xden + b[3]);
+    result = one - result;
+    result = std::exp(ysq) * result;
+    return result;
+  } else if (y <= four) {
+    //  evaluate erfc for 0.46875 < |x| <= 4.0
+    xnum = c[8] * y;
+    xden = y;
+    for (int i{0}; i < 7; ++i) {
+      xnum = (xnum + c[i]) * y;
+      xden = (xden + d[i]) * y;
+    }
+    result = (xnum + c[7]) / (xden + d[7]);
+  } else {
+    //  evaluate erfc for |x| > 4.0
+    result = zero;
+    if (y >= xhuge) {
+      if (y < xmax) {
+        result = rsqrtpi / y;
+      }
+    } else {
+      ysq = one / (y * y);
+      xnum = p[5] * ysq;
+      xden = ysq;
+      for (int i{0}; i < 4; ++i) {
+        xnum = (xnum + p[i]) * ysq;
+        xden = (xden + q[i]) * ysq;
+      }
+      result = ysq * (xnum + p[4]) / (xden + q[4]);
+      result = (rsqrtpi - result) / y;
+    }
+  }
+  //  fix up for negative argument, erf, etc.
+  if (x < zero) {
+    if (x < xneg) {
+      result = std::numeric_limits<T>::max();
+    } else {
+      ysq = trunc(x * sixteen) / sixteen;
+      del = (x - ysq) * (x + ysq);
+      y = std::exp((ysq * ysq)) * std::exp((del));
+      result = (y + y) - result;
+    }
+  }
+  return result;
+}
+
 } // namespace Fortran::runtime
 
 #endif // FORTRAN_RUNTIME_NUMERIC_TEMPLATES_H_
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
@@ -316,6 +316,27 @@ CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling16_16)(
 #endif
 #endif
 
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ErfcScaled4)(
+    CppTypeFor<TypeCategory::Real, 4> x) {
+  return ErfcScaled(x);
+}
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ErfcScaled8)(
+    CppTypeFor<TypeCategory::Real, 8> x) {
+  return ErfcScaled(x);
+}
+#if LDBL_MANT_DIG == 64
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ErfcScaled10)(
+    CppTypeFor<TypeCategory::Real, 10> x) {
+  return ErfcScaled(x);
+}
+#endif
+#if LDBL_MANT_DIG == 113
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfcScaled16)(
+    CppTypeFor<TypeCategory::Real, 16> x) {
+  return ErfcScaled(x);
+}
+#endif
+
 CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent4_4)(
     CppTypeFor<TypeCategory::Real, 4> x) {
   return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
diff --git a/flang/test/Lower/Intrinsics/erfc_scaled.f90 b/flang/test/Lower/Intrinsics/erfc_scaled.f90
@@ -0,0 +1,23 @@
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPerfc_scaled4(
+! CHECK-SAME: %[[x:[^:]+]]: !fir.ref<f32>{{.*}}) -> f32
+function erfc_scaled4(x)
+  real(kind=4) :: erfc_scaled4
+  real(kind=4) :: x
+  erfc_scaled4 = erfc_scaled(x);
+! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref<f32>
+! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled4(%[[a1]]) {{.*}}: (f32) -> f32
+end function erfc_scaled4
+
+
+! CHECK-LABEL: func @_QPerfc_scaled8(
+! CHECK-SAME: %[[x:[^:]+]]: !fir.ref<f64>{{.*}}) -> f64
+function erfc_scaled8(x)
+  real(kind=8) :: erfc_scaled8
+  real(kind=8) :: x
+  erfc_scaled8 = erfc_scaled(x);
+! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref<f64>
+! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled8(%[[a1]]) {{.*}}: (f64) -> f64
+end function erfc_scaled8
diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp
@@ -31,6 +31,14 @@ TEST(Numeric, Floor) {
   EXPECT_EQ(RTNAME(Floor4_1)(Real<4>{0}), 0);
 }
 
+TEST(Numeric, Erfc_scaled) {
+  EXPECT_NEAR(RTNAME(ErfcScaled4)(Real<4>{20.0}), 0.02817434874, 1.0e-8);
+  EXPECT_NEAR(RTNAME(ErfcScaled8)(Real<8>{20.0}), 0.02817434874, 1.0e-11);
+#if LDBL_MANT_DIG == 64
+  EXPECT_NEAR(RTNAME(ErfcScaled10)(Real<10>{20.0}), 0.02817434874, 1.0e-8);
+#endif
+}
+
 TEST(Numeric, Exponent) {
   EXPECT_EQ(RTNAME(Exponent4_4)(Real<4>{0}), 0);
   EXPECT_EQ(RTNAME(Exponent4_8)(Real<4>{1.0}), 1);