-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libc][math] Implement fast pass for double precision pow function with up to 1ULP error. #101926
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…th up to 1ULP error.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: None (lntue) ChangesPatch is 41.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101926.diff 17 Files Affected:
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 3e4cb3cebce9b..dfbba63d769ee 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -227,6 +227,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nexttoward
libc.src.math.nexttowardf
libc.src.math.nexttowardl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainderf
libc.src.math.remainder
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index dff8f25142dd9..149c368338276 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -521,6 +521,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nextup
libc.src.math.nextupf
libc.src.math.nextupl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainder
libc.src.math.remainderf
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 90aae962080cd..3dc8aca518305 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -350,6 +350,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nextup
libc.src.math.nextupf
libc.src.math.nextupl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainder
libc.src.math.remainderf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 29969fce6adf8..15a6827c040c7 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -520,6 +520,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nextup
libc.src.math.nextupf
libc.src.math.nextupl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainder
libc.src.math.remainderf
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 2ed287dc8542e..0c54fb267f855 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -520,6 +520,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nextup
libc.src.math.nextupf
libc.src.math.nextupl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainder
libc.src.math.remainderf
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index b7aac225ee055..d66e7f1fed81f 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -240,6 +240,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.nexttoward
libc.src.math.nexttowardf
libc.src.math.nexttowardl
+ libc.src.math.pow
libc.src.math.powf
libc.src.math.remainderf
libc.src.math.remainder
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index defd075d10997..14ef59eaa03f1 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -320,7 +320,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| logp1 | | | | | | 7.12.6.14 | F.10.3.14 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| pow | |check| | | | | | 7.12.7.5 | F.10.4.5 |
+| pow | |check| | 1 ULP | | | | 7.12.7.5 | F.10.4.5 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| powi\* | | | | | | | |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/src/math/amdgpu/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt
index 2ceb12785c607..86785e56347e7 100644
--- a/libc/src/math/amdgpu/CMakeLists.txt
+++ b/libc/src/math/amdgpu/CMakeLists.txt
@@ -456,18 +456,6 @@ add_entrypoint_object(
VENDOR
)
-add_entrypoint_object(
- pow
- SRCS
- pow.cpp
- HDRS
- ../pow.h
- COMPILE_OPTIONS
- ${bitcode_link_flags}
- -O2
- VENDOR
-)
-
add_entrypoint_object(
powi
SRCS
diff --git a/libc/src/math/amdgpu/pow.cpp b/libc/src/math/amdgpu/pow.cpp
deleted file mode 100644
index 979ad6cda6ef0..0000000000000
--- a/libc/src/math/amdgpu/pow.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- Implementation of the pow function for GPU ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/math/pow.h"
-#include "src/__support/common.h"
-
-#include "declarations.h"
-#include "src/__support/macros/config.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) {
- return __ocml_pow_f64(x, y);
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 58daffaf4e932..8d59f5de9c310 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1552,6 +1552,26 @@ add_entrypoint_object(
-O3
)
+add_entrypoint_object(
+ pow
+ SRCS
+ pow.cpp
+ HDRS
+ ../pow.h
+ DEPENDS
+ .common_constants
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.FPUtil.sqrt
+ libc.src.__support.macros.optimization
+ COMPILE_OPTIONS
+ -O3
+)
+
add_entrypoint_object(
copysign
SRCS
diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp
new file mode 100644
index 0000000000000..49f3ca602c207
--- /dev/null
+++ b/libc/src/math/generic/pow.cpp
@@ -0,0 +1,490 @@
+//===-- Double-precision x^y function -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/pow.h"
+#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/sqrt.h" // Speedup for pow(x, 1/2) = sqrt(x)
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+using fputil::DoubleDouble;
+
+namespace {
+
+// Constants for log2(x) range reduction, generated by Sollya with:
+// > for i from 0 to 127 do {
+// r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) );
+// b = nearestint(log2(r) * 2^41) * 2^-41;
+// c = round(log2(r) - b, D, RN);
+// print("{", -c, ",", -b, "},");
+// };
+// This is the same as -log2(RD[i]), with the least significant bits of the
+// high part set to be 2^-41, so that the sum of high parts + e_x is exact in
+// double precision.
+// We also replace the first and the last ones to be 0.
+constexpr DoubleDouble LOG2_R_DD[128] = {
+ {0.0, 0.0},
+ {-0x1.19b14945cf6bap-44, 0x1.72c7ba21p-7},
+ {-0x1.95539356f93dcp-43, 0x1.743ee862p-6},
+ {0x1.abe0a48f83604p-43, 0x1.184b8e4c5p-5},
+ {0x1.635577970e04p-43, 0x1.77394c9d9p-5},
+ {-0x1.401fbaaa67e3cp-45, 0x1.d6ebd1f2p-5},
+ {-0x1.5b1799ceaeb51p-43, 0x1.1bb32a6008p-4},
+ {0x1.7c407050799bfp-43, 0x1.4c560fe688p-4},
+ {0x1.da6339da288fcp-43, 0x1.7d60496cf8p-4},
+ {0x1.be4f6f22dbbadp-43, 0x1.960caf9ab8p-4},
+ {-0x1.c760bc9b188c4p-45, 0x1.c7b528b71p-4},
+ {0x1.164e932b2d51cp-44, 0x1.f9c95dc1dp-4},
+ {0x1.924ae921f7ecap-45, 0x1.097e38ce6p-3},
+ {-0x1.6d25a5b8a19b2p-44, 0x1.22dadc2ab4p-3},
+ {0x1.e50a1644ac794p-43, 0x1.3c6fb650ccp-3},
+ {0x1.f34baa74a7942p-43, 0x1.494f863b8cp-3},
+ {-0x1.8f7aac147fdc1p-46, 0x1.633a8bf438p-3},
+ {0x1.f84be19cb9578p-43, 0x1.7046031c78p-3},
+ {-0x1.66cccab240e9p-46, 0x1.8a8980abfcp-3},
+ {-0x1.3f7a55cd2af4cp-47, 0x1.97c1cb13c8p-3},
+ {0x1.3458cde69308cp-43, 0x1.b2602497d4p-3},
+ {-0x1.667f21fa8423fp-44, 0x1.bfc67a8p-3},
+ {0x1.d2fe4574e09b9p-47, 0x1.dac22d3e44p-3},
+ {0x1.367bde40c5e6dp-43, 0x1.e857d3d36p-3},
+ {0x1.d45da26510033p-46, 0x1.01d9bbcfa6p-2},
+ {-0x1.7204f55bbf90dp-44, 0x1.08bce0d96p-2},
+ {-0x1.d4f1b95e0ff45p-43, 0x1.169c05364p-2},
+ {0x1.c20d74c0211bfp-44, 0x1.1d982c9d52p-2},
+ {0x1.ad89a083e072ap-43, 0x1.249cd2b13cp-2},
+ {0x1.cd0cb4492f1bcp-43, 0x1.32bfee370ep-2},
+ {-0x1.2101a9685c779p-47, 0x1.39de8e155ap-2},
+ {0x1.9451cd394fe8dp-43, 0x1.4106017c3ep-2},
+ {0x1.661e393a16b95p-44, 0x1.4f6fbb2cecp-2},
+ {-0x1.c6d8d86531d56p-44, 0x1.56b22e6b58p-2},
+ {0x1.c1c885adb21d3p-43, 0x1.5dfdcf1eeap-2},
+ {0x1.3bb5921006679p-45, 0x1.6552b49986p-2},
+ {0x1.1d406db502403p-43, 0x1.6cb0f6865cp-2},
+ {0x1.55a63e278bad5p-43, 0x1.7b89f02cf2p-2},
+ {-0x1.66ae2a7ada553p-49, 0x1.8304d90c12p-2},
+ {-0x1.66cccab240e9p-45, 0x1.8a8980abfcp-2},
+ {-0x1.62404772a151dp-45, 0x1.921800924ep-2},
+ {0x1.ac9bca36fd02ep-44, 0x1.99b072a96cp-2},
+ {0x1.4bc302ffa76fbp-43, 0x1.a8ff97181p-2},
+ {0x1.01fea1ec47c71p-43, 0x1.b0b67f4f46p-2},
+ {-0x1.f20203b3186a6p-43, 0x1.b877c57b1cp-2},
+ {-0x1.2642415d47384p-45, 0x1.c043859e3p-2},
+ {-0x1.bc76a2753b99bp-50, 0x1.c819dc2d46p-2},
+ {-0x1.da93ae3a5f451p-43, 0x1.cffae611aep-2},
+ {-0x1.50e785694a8c6p-43, 0x1.d7e6c0abc4p-2},
+ {0x1.c56138c894641p-43, 0x1.dfdd89d586p-2},
+ {0x1.5669df6a2b592p-43, 0x1.e7df5fe538p-2},
+ {-0x1.ea92d9e0e8ac2p-48, 0x1.efec61b012p-2},
+ {0x1.a0331af2e6feap-43, 0x1.f804ae8d0cp-2},
+ {0x1.9518ce032f41dp-48, 0x1.0014332bep-1},
+ {-0x1.b3b3864c60011p-44, 0x1.042bd4b9a8p-1},
+ {-0x1.103e8f00d41c8p-45, 0x1.08494c66b9p-1},
+ {0x1.65be75cc3da17p-43, 0x1.0c6caaf0c5p-1},
+ {0x1.3676289cd3dd4p-43, 0x1.1096015deep-1},
+ {-0x1.41dfc7d7c3321p-43, 0x1.14c560fe69p-1},
+ {0x1.e0cda8bd74461p-44, 0x1.18fadb6e2dp-1},
+ {0x1.2a606046ad444p-44, 0x1.1d368296b5p-1},
+ {0x1.f9ea977a639cp-43, 0x1.217868b0c3p-1},
+ {-0x1.50520a377c7ecp-45, 0x1.25c0a0463cp-1},
+ {0x1.6e3cb71b554e7p-47, 0x1.2a0f3c3407p-1},
+ {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1},
+ {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1},
+ {-0x1.979a5db68721dp-45, 0x1.32bfee370fp-1},
+ {0x1.1ee969a95f529p-43, 0x1.37222bb707p-1},
+ {0x1.bb4b69336b66ep-43, 0x1.3b8b1c68fap-1},
+ {0x1.d5e6a8a4fb059p-45, 0x1.3ffad4e74fp-1},
+ {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1},
+ {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1},
+ {-0x1.9bcaf1aa4168ap-43, 0x1.48eef19318p-1},
+ {0x1.1646b761c48dep-44, 0x1.4d7380dcc4p-1},
+ {0x1.2f0c0bfe9dbecp-43, 0x1.51ff2e3021p-1},
+ {0x1.29904613e33cp-43, 0x1.5692101d9bp-1},
+ {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1},
+ {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1},
+ {-0x1.125d6cbcd1095p-44, 0x1.5fcdce2728p-1},
+ {-0x1.bd9b32266d92cp-43, 0x1.6476d98adap-1},
+ {0x1.54243b21709cep-44, 0x1.6927781d93p-1},
+ {0x1.54243b21709cep-44, 0x1.6927781d93p-1},
+ {-0x1.ce60916e52e91p-44, 0x1.6ddfc2a79p-1},
+ {0x1.f1f5ae718f241p-43, 0x1.729fd26b7p-1},
+ {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1},
+ {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1},
+ {0x1.fed21f9cb2cc5p-43, 0x1.7c37a9227ep-1},
+ {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1},
+ {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1},
+ {0x1.5b338360c2ae2p-43, 0x1.85efd062c6p-1},
+ {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1},
+ {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1},
+ {-0x1.bdc81c4db3134p-44, 0x1.8fc924c89bp-1},
+ {0x1.36c101ee1344p-43, 0x1.94c287492cp-1},
+ {0x1.36c101ee1344p-43, 0x1.94c287492cp-1},
+ {0x1.e41fa0a62e6aep-44, 0x1.99c48be206p-1},
+ {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1},
+ {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1},
+ {-0x1.3f94e00e7d6bcp-46, 0x1.a3e2f4ac44p-1},
+ {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1},
+ {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1},
+ {0x1.1659d8e2d7d38p-44, 0x1.ae255819fp-1},
+ {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1},
+ {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1},
+ {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1},
+ {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1},
+ {0x1.871a7610e40bdp-45, 0x1.bdce9dcc96p-1},
+ {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1},
+ {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1},
+ {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1},
+ {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1},
+ {-0x1.9ad57391924a7p-43, 0x1.cdcebd2374p-1},
+ {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1},
+ {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1},
+ {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1},
+ {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1},
+ {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1},
+ {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1},
+ {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1},
+ {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1},
+ {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1},
+ {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1},
+ {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1},
+ {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1},
+ {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1},
+ {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1},
+ {0x1.ef5d00e390ap-44, 0x1.fa406bd244p-1},
+ {0.0, 1.0},
+};
+
+bool is_odd_integer(double x) {
+ using FPBits = fputil::FPBits<double>;
+ FPBits xbits(x);
+ uint64_t x_u = xbits.uintval();
+ unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
+ unsigned lsb =
+ static_cast<unsigned>(cpp::countr_zero(x_u | FPBits::EXP_MASK));
+ constexpr unsigned UNIT_EXPONENT =
+ static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
+ return (x_e + lsb == UNIT_EXPONENT);
+}
+
+bool is_integer(double x) {
+ using FPBits = fputil::FPBits<double>;
+ FPBits xbits(x);
+ uint64_t x_u = xbits.uintval();
+ unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
+ unsigned lsb =
+ static_cast<unsigned>(cpp::countr_zero(x_u | FPBits::EXP_MASK));
+ constexpr unsigned UNIT_EXPONENT =
+ static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
+ return (x_e + lsb >= UNIT_EXPONENT);
+}
+
+} // namespace
+
+LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) {
+ using FPBits = fputil::FPBits<double>;
+
+ FPBits xbits(x), ybits(y);
+
+ bool x_sign = xbits.sign() == Sign::NEG;
+ bool y_sign = ybits.sign() == Sign::NEG;
+
+ FPBits x_abs = xbits.abs();
+ FPBits y_abs = ybits.abs();
+
+ uint64_t x_mant = xbits.get_mantissa();
+ uint64_t y_mant = ybits.get_mantissa();
+ uint64_t x_u = xbits.uintval();
+ uint64_t x_a = x_abs.uintval();
+ uint64_t y_a = y_abs.uintval();
+
+ double e_x = static_cast<double>(xbits.get_exponent());
+ uint64_t sign = 0;
+
+ ///////// BEGIN - Check exceptional cases ////////////////////////////////////
+
+ // The double precision number that is closest to 1 is (1 - 2^-53), which has
+ // log2(1 - 2^-53) ~ -1.715...p-53.
+ // So if |y| > 1075 / log2(1 - 2^-53), and x is finite:
+ // |y * log2(x)| = 0 or > 1075.
+ // Hence x^y will either overflow or underflow if x is not zero.
+ if (LIBC_UNLIKELY(y_mant == 0 || y_a > 0x43d74910d52d3052 ||
+ x_u == FPBits::one().uintval() ||
+ x_u >= FPBits::inf().uintval() ||
+ x_u < FPBits::min_normal().uintval())) {
+ // Exceptional exponents.
+ switch (y_a) {
+ case 0: // y = +-0.0
+ return 1.0;
+ case 0x3fe0'0000'0000'0000: // y = +-0.5
+ // TODO: speed up x^(-1/2) with rsqrt(x) when available.
+ return y_sign ? (1.0 / fputil::sqrt<double>(x)) : fputil::sqrt<double>(x);
+ case 0x3ff0'0000'0000'0000: // y = +-1.0
+ return y_sign ? (1.0 / x) : x;
+ case 0x4000'0000'0000'0000: // y = +-2.0;
+ return y_sign ? (1.0 / (x * x)) : (x * x);
+ }
+
+ if (y_a > 0x43d7'4910'd52d'3052) {
+ if (y_a >= 0x7ff0'0000'0000'0000) {
+ // y is inf or nan
+ if (y_mant != 0)
+ // y is NaN
+ // pow(1, NaN) = 1
+ // pow(x, NaN) = NaN
+ return (x_u == FPBits::one().uintval()) ? 1.0 : y;
+
+ // Now y is +-Inf
+ if (x_abs.is_nan())
+ // pow(NaN, +-Inf) = NaN
+ return x;
+
+ if (x_a == 0x3ff0'0000'0000'0000)
+ // pow(+-1, +-Inf) = 1.0
+ return 1.0;
+
+ if (x_a == 0 && y_sign) {
+ // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+ return FPBits::inf().get_val();
+ }
+ // pow (|x| < 1, -inf) = +inf
+ // pow (|x| < 1, +inf) = 0.0f
+ // pow (|x| > 1, -inf) = 0.0f
+ // pow (|x| > 1, +inf) = +inf
+ return ((x_a < FPBits::one().uintval()) == y_sign)
+ ? FPBits::inf().get_val()
+ : 0.0;
+ }
+ // x^y will be overflow / underflow in double precision. Set y to a
+ // large enough exponent but not too large, so that the computations
+ // won't be overflow in double precision.
+ y = y_sign ? -0x1.0p100 : 0x1.0p100;
+ }
+
+ // y is finite and non-zero.
+
+ if (x_u == FPBits::one().uintval())
+ // pow(1, y) = 1
+ return 1.0;
+
+ // TODO: Speed things up with pow(2, y) = exp2(y) and pow(10, y) = exp10(y).
+
+ if (x_a == 0) {
+ const bool out_is_neg = x_sign && is_odd_integer(y);
+ if (y_sign) {
+ // pow(0, negative number) = inf
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+ return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val();
+ }
+ // pow(0, positive number) = 0
+ return out_is_neg ? -0.0 : 0.0;
+ }
+
+ if (x_a == FPBits::inf().uintval()) {
+ const bool out_is_neg = x_sign && is_odd_integer(y);
+ if (y_sign)
+ return out_is_neg ? -0.0f : 0.0f;
+ return FPBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val();
+ }
+
+ if (x_a > FPBits::inf().uintval()) {
+ // x is NaN.
+ // pow (aNaN, 0) is already taken care above.
+ return x;
+ }
+
+ // Normalize denormal inputs.
+ if (x_a < FPBits::min_normal().uintval()) {
+ e_x -= 64.0;
+ x_mant = FPBits(x * 0x1.0p64).get_mantissa();
+ }
+
+ // x is finite and negative, and y is a finite integer.
+ if (x_sign) {
+ if (is_integer(y)) {
+ x = -x;
+ if (is_odd_integer(y))
+ // sign = -1.0;
+ sign = 0x8000'0000'0000'0000ULL;
+ } else {
+ // pow( negative, non-integer ) = NaN
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+ }
+ }
+
+ ///////// END - Check exceptional cases //////////////////////////////////////
+
+ // x^y = 2^( y * log2(x) )
+ // = 2^( y * ( e_x + log2(m_x) ) )
+ // First we compute log2(x) = e_x + log2(m_x)
+
+ // Extract exponent field of x.
+
+ // Use the highest 7 fractional bits of m_x as the index for look up tables.
+ unsigned idx_x = static_c...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Passes the GPU tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a few more nits.
No description provided.