Skip to content

[3.10] bpo-20499: Rounding error in statistics.pvariance (GH-28230) #28248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 36 additions & 38 deletions Lib/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,21 +147,17 @@ class StatisticsError(ValueError):

# === Private utilities ===

def _sum(data, start=0):
"""_sum(data [, start]) -> (type, sum, count)
def _sum(data):
"""_sum(data) -> (type, sum, count)

Return a high-precision sum of the given numeric data as a fraction,
together with the type to be converted to and the count of items.

If optional argument ``start`` is given, it is added to the total.
If ``data`` is empty, ``start`` (defaulting to 0) is returned.


Examples
--------

>>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
(<class 'float'>, Fraction(11, 1), 5)
>>> _sum([3, 2.25, 4.5, -0.5, 0.25])
(<class 'float'>, Fraction(19, 2), 5)

Some sources of round-off error will be avoided:

Expand All @@ -184,10 +180,9 @@ def _sum(data, start=0):
allowed.
"""
count = 0
n, d = _exact_ratio(start)
partials = {d: n}
partials = {}
partials_get = partials.get
T = _coerce(int, type(start))
T = int
for typ, values in groupby(data, type):
T = _coerce(T, typ) # or raise TypeError
for n, d in map(_exact_ratio, values):
Expand All @@ -200,8 +195,7 @@ def _sum(data, start=0):
assert not _isfinite(total)
else:
# Sum all the partial sums using builtin sum.
# FIXME is this faster if we sum them in order of the denominator?
total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
total = sum(Fraction(n, d) for d, n in partials.items())
return (T, total, count)


Expand Down Expand Up @@ -252,27 +246,19 @@ def _exact_ratio(x):
x is expected to be an int, Fraction, Decimal or float.
"""
try:
# Optimise the common case of floats. We expect that the most often
# used numeric type will be builtin floats, so try to make this as
# fast as possible.
if type(x) is float or type(x) is Decimal:
return x.as_integer_ratio()
try:
# x may be an int, Fraction, or Integral ABC.
return (x.numerator, x.denominator)
except AttributeError:
try:
# x may be a float or Decimal subclass.
return x.as_integer_ratio()
except AttributeError:
# Just give up?
pass
return x.as_integer_ratio()
except AttributeError:
pass
except (OverflowError, ValueError):
# float NAN or INF.
assert not _isfinite(x)
return (x, None)
msg = "can't convert type '{}' to numerator/denominator"
raise TypeError(msg.format(type(x).__name__))
try:
# x may be an Integral ABC.
return (x.numerator, x.denominator)
except AttributeError:
msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
raise TypeError(msg)


def _convert(value, T):
Expand Down Expand Up @@ -719,14 +705,20 @@ def _ss(data, c=None):
if c is not None:
T, total, count = _sum((x-c)**2 for x in data)
return (T, total)
c = mean(data)
T, total, count = _sum((x-c)**2 for x in data)
# The following sum should mathematically equal zero, but due to rounding
# error may not.
U, total2, count2 = _sum((x - c) for x in data)
assert T == U and count == count2
total -= total2 ** 2 / len(data)
assert not total < 0, 'negative sum of square deviations: %f' % total
T, total, count = _sum(data)
mean_n, mean_d = (total / count).as_integer_ratio()
partials = Counter()
for n, d in map(_exact_ratio, data):
diff_n = n * mean_d - d * mean_n
diff_d = d * mean_d
partials[diff_d * diff_d] += diff_n * diff_n
if None in partials:
# The sum will be a NAN or INF. We can ignore all the finite
# partials, and just look at this special one.
total = partials[None]
assert not _isfinite(total)
else:
total = sum(Fraction(n, d) for d, n in partials.items())
return (T, total)


Expand Down Expand Up @@ -830,6 +822,9 @@ def stdev(data, xbar=None):
1.0810874155219827

"""
# Fixme: Despite the exact sum of squared deviations, some inaccuracy
# remain because there are two rounding steps. The first occurs in
# the _convert() step for variance(), the second occurs in math.sqrt().
var = variance(data, xbar)
try:
return var.sqrt()
Expand All @@ -846,6 +841,9 @@ def pstdev(data, mu=None):
0.986893273527251

"""
# Fixme: Despite the exact sum of squared deviations, some inaccuracy
# remain because there are two rounding steps. The first occurs in
# the _convert() step for pvariance(), the second occurs in math.sqrt().
var = pvariance(data, mu)
try:
return var.sqrt()
Expand Down
28 changes: 14 additions & 14 deletions Lib/test/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1247,20 +1247,14 @@ def test_empty_data(self):
# Override test for empty data.
for data in ([], (), iter([])):
self.assertEqual(self.func(data), (int, Fraction(0), 0))
self.assertEqual(self.func(data, 23), (int, Fraction(23), 0))
self.assertEqual(self.func(data, 2.3), (float, Fraction(2.3), 0))

def test_ints(self):
self.assertEqual(self.func([1, 5, 3, -4, -8, 20, 42, 1]),
(int, Fraction(60), 8))
self.assertEqual(self.func([4, 2, 3, -8, 7], 1000),
(int, Fraction(1008), 5))

def test_floats(self):
self.assertEqual(self.func([0.25]*20),
(float, Fraction(5.0), 20))
self.assertEqual(self.func([0.125, 0.25, 0.5, 0.75], 1.5),
(float, Fraction(3.125), 4))

def test_fractions(self):
self.assertEqual(self.func([Fraction(1, 1000)]*500),
Expand All @@ -1281,14 +1275,6 @@ def test_compare_with_math_fsum(self):
data = [random.uniform(-100, 1000) for _ in range(1000)]
self.assertApproxEqual(float(self.func(data)[1]), math.fsum(data), rel=2e-16)

def test_start_argument(self):
# Test that the optional start argument works correctly.
data = [random.uniform(1, 1000) for _ in range(100)]
t = self.func(data)[1]
self.assertEqual(t+42, self.func(data, 42)[1])
self.assertEqual(t-23, self.func(data, -23)[1])
self.assertEqual(t+Fraction(1e20), self.func(data, 1e20)[1])

def test_strings_fail(self):
# Sum of strings should fail.
self.assertRaises(TypeError, self.func, [1, 2, 3], '999')
Expand Down Expand Up @@ -2077,6 +2063,13 @@ def test_decimals(self):
self.assertEqual(result, exact)
self.assertIsInstance(result, Decimal)

def test_accuracy_bug_20499(self):
data = [0, 0, 1]
exact = 2 / 9
result = self.func(data)
self.assertEqual(result, exact)
self.assertIsInstance(result, float)


class TestVariance(VarianceStdevMixin, NumericTestCase, UnivariateTypeMixin):
# Tests for sample variance.
Expand Down Expand Up @@ -2117,6 +2110,13 @@ def test_center_not_at_mean(self):
self.assertEqual(self.func(data), 0.5)
self.assertEqual(self.func(data, xbar=2.0), 1.0)

def test_accuracy_bug_20499(self):
data = [0, 0, 2]
exact = 4 / 3
result = self.func(data)
self.assertEqual(result, exact)
self.assertIsInstance(result, float)

class TestPStdev(VarianceStdevMixin, NumericTestCase):
# Tests for population standard deviation.
def setUp(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve the speed and accuracy of statistics.pvariance().