Skip to content

[libc][math] Fix overflow shifts for dyadic floats and add skip accuracy option for expm1. #98048

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions libc/src/__support/FPUtil/dyadic_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,26 @@ template <size_t Bits> struct DyadicFloat {
}

// Used for aligning exponents. Output might not be normalized.
LIBC_INLINE constexpr DyadicFloat &shift_left(int shift_length) {
exponent -= shift_length;
mantissa <<= static_cast<size_t>(shift_length);
LIBC_INLINE constexpr DyadicFloat &shift_left(unsigned shift_length) {
if (shift_length < Bits) {
exponent -= static_cast<int>(shift_length);
mantissa <<= shift_length;
} else {
exponent = 0;
mantissa = MantissaType(0);
}
return *this;
}

// Used for aligning exponents. Output might not be normalized.
LIBC_INLINE constexpr DyadicFloat &shift_right(int shift_length) {
exponent += shift_length;
mantissa >>= static_cast<size_t>(shift_length);
LIBC_INLINE constexpr DyadicFloat &shift_right(unsigned shift_length) {
if (shift_length < Bits) {
exponent += static_cast<int>(shift_length);
mantissa >>= shift_length;
} else {
exponent = 0;
mantissa = MantissaType(0);
}
return *this;
}

Expand Down Expand Up @@ -261,9 +271,9 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,

// Align exponents
if (a.exponent > b.exponent)
b.shift_right(a.exponent - b.exponent);
b.shift_right(static_cast<unsigned>(a.exponent - b.exponent));
else if (b.exponent > a.exponent)
a.shift_right(b.exponent - a.exponent);
a.shift_right(static_cast<unsigned>(b.exponent - a.exponent));

DyadicFloat<Bits> result;

Expand Down
31 changes: 21 additions & 10 deletions libc/src/math/generic/expm1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
#include "src/__support/integer_literals.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY

#include <errno.h>
#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
#define LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
#endif

// #define DEBUGDEBUG

Expand All @@ -51,7 +53,7 @@ constexpr double LOG2_E = 0x1.71547652b82fep+0;
constexpr uint64_t ERR_D = 0x3c08000000000000;
// Errors when using double-double precision.
// 0x1.0p-99
constexpr uint64_t ERR_DD = 0x39c0000000000000;
[[maybe_unused]] constexpr uint64_t ERR_DD = 0x39c0000000000000;

// -2^-12 * log(2)
// > a = -2^-12 * log(2);
Expand Down Expand Up @@ -108,7 +110,7 @@ DoubleDouble poly_approx_dd(const DoubleDouble &dx) {
// Return (exp(dx) - 1)/dx ~ 1 + dx / 2 + dx^2 / 6 + ... + dx^6 / 5040
// For |dx| < 2^-13 + 2^-30:
// | output - exp(dx) | < 2^-126.
Float128 poly_approx_f128(const Float128 &dx) {
[[maybe_unused]] Float128 poly_approx_f128(const Float128 &dx) {
constexpr Float128 COEFFS_128[]{
{Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0
{Sign::POS, -128, 0x80000000'00000000'00000000'00000000_u128}, // 0.5
Expand All @@ -127,21 +129,22 @@ Float128 poly_approx_f128(const Float128 &dx) {

#ifdef DEBUGDEBUG
std::ostream &operator<<(std::ostream &OS, const Float128 &r) {
OS << (r.sign ? "-(" : "(") << r.mantissa.val[0] << " + " << r.mantissa.val[1]
<< " * 2^64) * 2^" << r.exponent << "\n";
OS << (r.sign == Sign::NEG ? "-(" : "(") << r.mantissa.val[0] << " + "
<< r.mantissa.val[1] << " * 2^64) * 2^" << r.exponent << "\n";
return OS;
}

std::ostream &operator<<(std::ostream &OS, const DoubleDouble &r) {
OS << std::hexfloat << r.hi << " + " << r.lo << std::defaultfloat << "\n";
OS << std::hexfloat << "(" << r.hi << " + " << r.lo << ")"
<< std::defaultfloat << "\n";
return OS;
}
#endif

// Compute exp(x) - 1 using 128-bit precision.
// TODO(lntue): investigate triple-double precision implementation for this
// step.
Float128 expm1_f128(double x, double kd, int idx1, int idx2) {
[[maybe_unused]] Float128 expm1_f128(double x, double kd, int idx1, int idx2) {
// Recalculate dx:

double t1 = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact
Expand Down Expand Up @@ -182,9 +185,10 @@ Float128 expm1_f128(double x, double kd, int idx1, int idx2) {
#ifdef DEBUGDEBUG
std::cout << "=== VERY SLOW PASS ===\n"
<< " kd: " << kd << "\n"
<< " dx: " << dx << "exp_mid_m1: " << exp_mid_m1
<< " exp_mid: " << exp_mid << " p: " << p
<< " r: " << r << std::endl;
<< " hi: " << hi << "\n"
<< " minus_one: " << minus_one << " dx: " << dx
<< "exp_mid_m1: " << exp_mid_m1 << " exp_mid: " << exp_mid
<< " p: " << p << " r: " << r << std::endl;
#endif

return r;
Expand Down Expand Up @@ -479,6 +483,12 @@ LLVM_LIBC_FUNCTION(double, expm1, (double x)) {
// Use double-double
DoubleDouble r_dd = exp_double_double(x, kd, exp_mid, hi_part);

#ifdef LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
double r =
cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(r_dd.hi + r_dd.lo));
return r;
#else
double err_dd = cpp::bit_cast<double>(ERR_DD + err);

double upper_dd = r_dd.hi + (r_dd.lo + err_dd);
Expand All @@ -494,6 +504,7 @@ LLVM_LIBC_FUNCTION(double, expm1, (double x)) {
Float128 r_f128 = expm1_f128(x, kd, idx1, idx2);

return static_cast<double>(r_f128);
#endif // LIBC_MATH_EXPM1_SKIP_ACCURATE_PASS
}

} // namespace LIBC_NAMESPACE
45 changes: 17 additions & 28 deletions libc/test/src/math/expm1_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "test/UnitTest/Test.h"
#include "utils/MPFRWrapper/MPFRUtils.h"

#include <errno.h>
#include <stdint.h>

using LlvmLibcExpm1Test = LIBC_NAMESPACE::testing::FPTest<double>;
Expand All @@ -23,34 +22,24 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
using LIBC_NAMESPACE::testing::tlog;

TEST_F(LlvmLibcExpm1Test, TrickyInputs) {
constexpr int N = 21;
constexpr uint64_t INPUTS[N] = {
0x3FD79289C6E6A5C0, // x=0x1.79289c6e6a5cp-2
0x3FD05DE80A173EA0, // x=0x1.05de80a173eap-2
0xbf1eb7a4cb841fcc, // x=-0x1.eb7a4cb841fccp-14
0xbf19a61fb925970d, // x=-0x1.9a61fb925970dp-14
0x3fda7b764e2cf47a, // x=0x1.a7b764e2cf47ap-2
0xc04757852a4b93aa, // x=-0x1.757852a4b93aap+5
0x4044c19e5712e377, // x=0x1.4c19e5712e377p+5
0xbf19a61fb925970d, // x=-0x1.9a61fb925970dp-14
0xc039a74cdab36c28, // x=-0x1.9a74cdab36c28p+4
0xc085b3e4e2e3bba9, // x=-0x1.5b3e4e2e3bba9p+9
0xc086960d591aec34, // x=-0x1.6960d591aec34p+9
0xc086232c09d58d91, // x=-0x1.6232c09d58d91p+9
0xc0874910d52d3051, // x=-0x1.74910d52d3051p9
0xc0867a172ceb0990, // x=-0x1.67a172ceb099p+9
0xc08ff80000000000, // x=-0x1.ff8p+9
0xbc971547652b82fe, // x=-0x1.71547652b82fep-54
0xbce465655f122ff6, // x=-0x1.465655f122ff6p-49
0x3d1bc8ee6b28659a, // x=0x1.bc8ee6b28659ap-46
0x3f18442b169f672d, // x=0x1.8442b169f672dp-14
0xc02b4f0cfb15ca0f, // x=-0x1.b4f0cfb15ca0fp+3
0xc042b708872320dd, // x=-0x1.2b708872320ddp+5
constexpr double INPUTS[] = {
0x1.71547652b82fep-54, 0x1.465655f122ff6p-49, 0x1.bc8ee6b28659ap-46,
0x1.8442b169f672dp-14, 0x1.9a61fb925970dp-14, 0x1.eb7a4cb841fccp-14,
0x1.05de80a173eap-2, 0x1.79289c6e6a5cp-2, 0x1.a7b764e2cf47ap-2,
0x1.b4f0cfb15ca0fp+3, 0x1.9a74cdab36c28p+4, 0x1.2b708872320ddp+5,
0x1.4c19e5712e377p+5, 0x1.757852a4b93aap+5, 0x1.77f74111e0894p+6,
0x1.a6c3780bbf824p+6, 0x1.e3d57e4c557f6p+6, 0x1.f07560077985ap+6,
0x1.1f0da93354198p+7, 0x1.71018579c0758p+7, 0x1.204684c1167e9p+8,
0x1.5b3e4e2e3bba9p+9, 0x1.6232c09d58d91p+9, 0x1.67a172ceb099p+9,
0x1.6960d591aec34p+9, 0x1.74910d52d3051p+9, 0x1.ff8p+9,
};
constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]);
for (int i = 0; i < N; ++i) {
double x = FPBits(INPUTS[i]).get_val();
double x = INPUTS[i];
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
LIBC_NAMESPACE::expm1(x), 0.5);
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, -x,
LIBC_NAMESPACE::expm1(-x), 0.5);
}
}

Expand Down Expand Up @@ -98,10 +87,10 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) {
}
}
}
tlog << " Expm1 failed: " << fails << "/" << count << "/" << cc
<< " tests.\n";
tlog << " Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
if (fails) {
tlog << " Expm1 failed: " << fails << "/" << count << "/" << cc
<< " tests.\n";
tlog << " Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
EXPECT_MPFR_MATCH(mpfr::Operation::Expm1, mx, mr, 0.5, rounding_mode);
}
};
Expand Down
18 changes: 10 additions & 8 deletions libc/test/src/math/tan_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ TEST_F(LlvmLibcTanTest, TrickyInputs) {
constexpr double INPUTS[] = {
0x1.d130383d17321p-27, 0x1.8000000000009p-23, 0x1.8000000000024p-22,
0x1.800000000009p-21, 0x1.20000000000f3p-20, 0x1.800000000024p-20,
0x1.e0000000001c2p-20, 0x1.0da8cc189b47dp-10, 0x1.00a33764a0a83p-7,
0x1.911a18779813fp-7, 0x1.940c877fb7dacp-7, 0x1.f42fb19b5b9b2p-6,
0x1.0285070f9f1bcp-5, 0x1.6ca9ef729af76p-1, 0x1.23f40dccdef72p+0,
0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, 0x1.ae78d360afa15p+0,
0x1.fe81868fc47fep+1, 0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2,
0x1.f7898d5a756ddp+2, 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3,
0x1.aaf85537ea4c7p+3, 0x1.4f2b874135d27p+4, 0x1.13114266f9764p+4,
0x1.a211877de55dbp+4, 0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4,
0x1.e0000000001c2p-20, 0x1.00452f0e0134dp-13, 0x1.0da8cc189b47dp-10,
0x1.00a33764a0a83p-7, 0x1.911a18779813fp-7, 0x1.940c877fb7dacp-7,
0x1.f42fb19b5b9b2p-6, 0x1.0285070f9f1bcp-5, 0x1.89f0f5241255bp-2,
0x1.6ca9ef729af76p-1, 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0,
0x1.addf3b9722265p+0, 0x1.ae78d360afa15p+0, 0x1.fe81868fc47fep+1,
0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2, 0x1.f7898d5a756ddp+2,
0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, 0x1.aaf85537ea4c7p+3,
0x1.4f2b874135d27p+4, 0x1.13114266f9764p+4, 0x1.a211877de55dbp+4,
0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4, 0x1.045457ae3994p+5,
0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, 0x1.c96e28eb679f8p+5,
0x1.da1838053b866p+5, 0x1.be886d9c2324dp+6, 0x1.ab514bfc61c76p+7,
0x1.14823229799c2p+7, 0x1.48ff1782ca91dp+8, 0x1.dcbfda0c7559ep+8,
Expand All @@ -42,6 +43,7 @@ TEST_F(LlvmLibcTanTest, TrickyInputs) {
0x1.6ac5b262ca1ffp+843, 0x1.8bb5847d49973p+845, 0x1.6ac5b262ca1ffp+849,
0x1.f08b14e1c4d0fp+890, 0x1.2b5fe88a9d8d5p+903, 0x1.a880417b7b119p+1023,
0x1.f6d7518808571p+1023,

};
constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]);

Expand Down
Loading