Skip to content

[3.10] Improve speed and accuracy for correlation() (GH-26135) #26151

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 5, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions Lib/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,12 @@
__all__ = [
'NormalDist',
'StatisticsError',
+ 'correlation',
+ 'covariance',
'fmean',
'geometric_mean',
'harmonic_mean',
+ 'linear_regression',
'mean',
'median',
'median_grouped',
Expand All @@ -122,9 +125,6 @@
'quantiles',
'stdev',
'variance',
- 'correlation',
- 'covariance',
- 'linear_regression',
]

import math
Expand Down Expand Up @@ -882,10 +882,10 @@ def covariance(x, y, /):
raise StatisticsError('covariance requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('covariance requires at least two data points')
- xbar = fmean(x)
- ybar = fmean(y)
- total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
- return total / (n - 1)
+ xbar = fsum(x) / n
+ ybar = fsum(y) / n
+ sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+ return sxy / (n - 1)


def correlation(x, y, /):
Expand All @@ -910,11 +910,13 @@ def correlation(x, y, /):
raise StatisticsError('correlation requires that both inputs have same number of data points')
if n < 2:
raise StatisticsError('correlation requires at least two data points')
- cov = covariance(x, y)
- stdx = stdev(x)
- stdy = stdev(y)
+ xbar = fsum(x) / n
+ ybar = fsum(y) / n
+ sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
+ s2x = fsum((xi - xbar) ** 2.0 for xi in x)
+ s2y = fsum((yi - ybar) ** 2.0 for yi in y)
try:
- return cov / (stdx * stdy)
+ return sxy / sqrt(s2x * s2y)
except ZeroDivisionError:
raise StatisticsError('at least one of the inputs is constant')

Expand Down Expand Up @@ -958,7 +960,7 @@ def linear_regression(regressor, dependent_variable, /):
sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
s2x = fsum((xi - xbar) ** 2.0 for xi in x)
try:
- slope = sxy / s2x
+ slope = sxy / s2x # equivalent to: covariance(x, y) / variance(x)
except ZeroDivisionError:
raise StatisticsError('regressor is constant')
intercept = ybar - slope * xbar
Expand Down