@@ -2847,7 +2847,7 @@ based on ideas from three sources:
2847
2847
2848
2848
The double length routines allow for quite a bit of instruction
2849
2849
level parallelism. On a 3.22 Ghz Apple M1 Max, the incremental
2850
- cost of increasing the input vector size by one is 6.25 nsec.
2850
+ cost of increasing the input vector size by one is 6.0 nsec.
2851
2851
2852
2852
dl_zero() returns an extended precision zero
2853
2853
dl_split() exactly splits a double into two half precision components.
@@ -2860,22 +2860,25 @@ dl_to_d() converts from extended precision to double precision.
2860
2860
2861
2861
typedef struct { double hi ; double lo ; } DoubleLength ;
2862
2862
2863
+ static const DoubleLength dl_zero = {0.0 , 0.0 };
2864
+
2863
2865
static inline DoubleLength
2864
- dl_zero ( )
2866
+ twosum ( double a , double b )
2865
2867
{
2866
- return (DoubleLength ) {0.0 , 0.0 };
2868
+ double s = a + b ;
2869
+ double ap = s - b ;
2870
+ double bp = s - a ;
2871
+ double da = a - ap ;
2872
+ double db = b - bp ;
2873
+ double t = da + db ;
2874
+ return (DoubleLength ) {s , t };
2867
2875
}
2876
+
2868
2877
static inline DoubleLength
2869
2878
dl_add (DoubleLength total , double x )
2870
2879
{
2871
- double s = total .hi + x ;
2872
- double c = total .lo ;
2873
- if (fabs (total .hi ) >= fabs (x )) {
2874
- c += (total .hi - s ) + x ;
2875
- } else {
2876
- c += (x - s ) + total .hi ;
2877
- }
2878
- return (DoubleLength ) {s , c };
2880
+ DoubleLength s = twosum (total .hi , x );
2881
+ return (DoubleLength ) {s .hi , total .lo + s .lo };
2879
2882
}
2880
2883
2881
2884
static inline DoubleLength
@@ -2941,7 +2944,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
2941
2944
bool int_path_enabled = true, int_total_in_use = false;
2942
2945
bool flt_path_enabled = true, flt_total_in_use = false;
2943
2946
long int_total = 0 ;
2944
- DoubleLength flt_total = dl_zero () ;
2947
+ DoubleLength flt_total = dl_zero ;
2945
2948
2946
2949
p_it = PyObject_GetIter (p );
2947
2950
if (p_it == NULL ) {
@@ -3101,7 +3104,7 @@ math_sumprod_impl(PyObject *module, PyObject *p, PyObject *q)
3101
3104
Py_SETREF (total , new_total );
3102
3105
new_total = NULL ;
3103
3106
Py_CLEAR (term_i );
3104
- flt_total = dl_zero () ;
3107
+ flt_total = dl_zero ;
3105
3108
flt_total_in_use = false;
3106
3109
}
3107
3110
}
0 commit comments