17
17
#include " src/__support/macros/config.h"
18
18
#include " src/__support/macros/optimization.h" // LIBC_UNLIKELY
19
19
#include " src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
20
+ #include " src/math/generic/range_reduction_double_common.h"
20
21
#include " src/math/generic/sincos_eval.h"
21
22
22
- // TODO: We might be able to improve the performance of large range reduction of
23
- // non-FMA targets further by operating directly on 25-bit chunks of 128/pi and
24
- // pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of
25
- // those lookup table.
26
- #include " range_reduction_double_common.h"
27
-
28
- #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
29
- #define LIBC_MATH_COS_SKIP_ACCURATE_PASS
30
- #endif
23
+ #ifdef LIBC_TARGET_CPU_HAS_FMA
24
+ #include " range_reduction_double_fma.h"
25
+ #else
26
+ #include " range_reduction_double_nofma.h"
27
+ #endif // LIBC_TARGET_CPU_HAS_FMA
31
28
32
29
namespace LIBC_NAMESPACE_DECL {
33
30
@@ -42,22 +39,29 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
42
39
43
40
DoubleDouble y;
44
41
unsigned k;
45
- generic:: LargeRangeReduction<NO_FMA> range_reduction_large{};
42
+ LargeRangeReduction range_reduction_large{};
46
43
47
- // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA)
44
+ // |x| < 2^16.
48
45
if (LIBC_LIKELY (x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
49
- // |x| < 2^-27
50
- if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 27 )) {
51
- // Signed zeros.
52
- if (LIBC_UNLIKELY (x == 0.0 ))
53
- return 1.0 ;
54
-
55
- // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
56
- return fputil::round_result_slightly_down (1.0 );
46
+ // |x| < 2^-7
47
+ if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 7 )) {
48
+ // |x| < 2^-27
49
+ if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 27 )) {
50
+ // Signed zeros.
51
+ if (LIBC_UNLIKELY (x == 0.0 ))
52
+ return 1.0 ;
53
+
54
+ // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
55
+ return fputil::round_result_slightly_down (1.0 );
56
+ }
57
+ // No range reduction needed.
58
+ k = 0 ;
59
+ y.lo = 0.0 ;
60
+ y.hi = x;
61
+ } else {
62
+ // Small range reduction.
63
+ k = range_reduction_small (x, y);
57
64
}
58
-
59
- // // Small range reduction.
60
- k = range_reduction_small (x, y);
61
65
} else {
62
66
// Inf or NaN
63
67
if (LIBC_UNLIKELY (x_e > 2 * FPBits::EXP_BIAS)) {
@@ -70,70 +74,51 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
70
74
}
71
75
72
76
// Large range reduction.
73
- k = range_reduction_large.compute_high_part (x);
74
- y = range_reduction_large.fast ();
77
+ k = range_reduction_large.fast (x, y);
75
78
}
76
79
77
80
DoubleDouble sin_y, cos_y;
78
81
79
- generic::sincos_eval (y, sin_y, cos_y);
82
+ [[maybe_unused]] double err = generic::sincos_eval (y, sin_y, cos_y);
80
83
81
84
// Look up sin(k * pi/128) and cos(k * pi/128)
82
- // Memory saving versions:
83
-
84
- // Use 128-entry table instead:
85
- // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127];
86
- // uint64_t sin_s = static_cast<uint64_t>((k + 128) & 128) << (63 - 7);
87
- // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
88
- // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
89
- // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127];
90
- // uint64_t cos_s = static_cast<uint64_t>((k + 64) & 128) << (63 - 7);
91
- // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
92
- // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
93
-
94
- // Use 64-entry table instead:
95
- // auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
96
- // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
97
- // DoubleDouble ans = SIN_K_PI_OVER_128[idx];
98
- // if (kk & 128) {
99
- // ans.hi = -ans.hi;
100
- // ans.lo = -ans.lo;
101
- // }
102
- // return ans;
103
- // };
104
- // DoubleDouble sin_k = get_idx_dd(k + 128);
105
- // DoubleDouble cos_k = get_idx_dd(k + 64);
106
-
85
+ #ifdef LIBC_MATH_HAS_SMALL_TABLES
86
+ // Memory saving versions. Use 65-entry table.
87
+ auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
88
+ unsigned idx = (kk & 64 ) ? 64 - (kk & 63 ) : (kk & 63 );
89
+ DoubleDouble ans = SIN_K_PI_OVER_128[idx];
90
+ if (kk & 128 ) {
91
+ ans.hi = -ans.hi ;
92
+ ans.lo = -ans.lo ;
93
+ }
94
+ return ans;
95
+ };
96
+ DoubleDouble sin_k = get_idx_dd (k + 128 );
97
+ DoubleDouble cos_k = get_idx_dd (k + 64 );
98
+ #else
107
99
// Fast look up version, but needs 256-entry table.
108
100
// -sin(k * pi/128) = sin((k + 128) * pi/128)
109
101
// cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
110
102
DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128 ) & 255 ];
111
103
DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64 ) & 255 ];
104
+ #endif // LIBC_MATH_HAS_SMALL_TABLES
112
105
113
106
// After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128).
114
107
// So k is an integer and -pi / 256 <= y <= pi / 256.
115
108
// Then cos(x) = cos((k * pi/128 + y)
116
109
// = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128)
117
- DoubleDouble cos_k_cos_y = fputil::quick_mult<NO_FMA> (cos_y, cos_k);
118
- DoubleDouble msin_k_sin_y = fputil::quick_mult<NO_FMA> (sin_y, msin_k);
110
+ DoubleDouble cos_k_cos_y = fputil::quick_mult (cos_y, cos_k);
111
+ DoubleDouble msin_k_sin_y = fputil::quick_mult (sin_y, msin_k);
119
112
120
113
DoubleDouble rr = fputil::exact_add<false >(cos_k_cos_y.hi , msin_k_sin_y.hi );
121
114
rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo ;
122
115
123
- #ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS
116
+ #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
124
117
return rr.hi + rr.lo ;
125
118
#else
126
119
127
- // Accurate test and pass for correctly rounded implementation.
128
- #ifdef LIBC_TARGET_CPU_HAS_FMA
129
- constexpr double ERR = 0x1 .0p-70 ;
130
- #else
131
- // TODO: Improve non-FMA fast pass accuracy.
132
- constexpr double ERR = 0x1 .0p-66 ;
133
- #endif // LIBC_TARGET_CPU_HAS_FMA
134
-
135
- double rlp = rr.lo + ERR;
136
- double rlm = rr.lo - ERR;
120
+ double rlp = rr.lo + err;
121
+ double rlm = rr.lo - err;
137
122
138
123
double r_upper = rr.hi + rlp; // (rr.lo + ERR);
139
124
double r_lower = rr.hi + rlm; // (rr.lo - ERR);
@@ -144,15 +129,15 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
144
129
145
130
Float128 u_f128, sin_u, cos_u;
146
131
if (LIBC_LIKELY (x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
147
- u_f128 = generic:: range_reduction_small_f128 (x);
132
+ u_f128 = range_reduction_small_f128 (x);
148
133
else
149
134
u_f128 = range_reduction_large.accurate ();
150
135
151
136
generic::sincos_eval (u_f128, sin_u, cos_u);
152
137
153
138
auto get_sin_k = [](unsigned kk) -> Float128 {
154
139
unsigned idx = (kk & 64 ) ? 64 - (kk & 63 ) : (kk & 63 );
155
- Float128 ans = generic:: SIN_K_PI_OVER_128_F128[idx];
140
+ Float128 ans = SIN_K_PI_OVER_128_F128[idx];
156
141
if (kk & 128 )
157
142
ans.sign = Sign::NEG;
158
143
return ans;
@@ -172,7 +157,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
172
157
// https://github.com/llvm/llvm-project/issues/96452.
173
158
174
159
return static_cast <double >(r);
175
- #endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS
160
+ #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
176
161
}
177
162
178
163
} // namespace LIBC_NAMESPACE_DECL
0 commit comments