Skip to content

Commit 1d74b3e

Browse files
authored
CLN: use numpy quantile in qcut (#43991)
1 parent a83b780 commit 1d74b3e

File tree

4 files changed

+8
-99
lines changed

4 files changed

+8
-99
lines changed

pandas/core/algorithms.py

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,89 +1117,6 @@ def checked_add_with_arr(
11171117
return arr + b
11181118

11191119

1120-
def quantile(x, q, interpolation_method="fraction"):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    Missing values (as reported by ``isna``) are dropped and the remaining
    values are sorted before any quantile is extracted.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Quantile(s) at which to extract score, expressed as a fraction of
        the sorted data (``0`` is the minimum, ``1`` the maximum). The
        value is not range-checked here.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : scalar or ndarray of float64
        Score at the requested quantile. A scalar `q` yields a scalar; an
        array-like `q` yields a float64 ndarray with one entry per quantile.
        ``nan`` is returned when no non-missing values remain.

    Raises
    ------
    ValueError
        If interpolation is required and `interpolation_method` is not one
        of the supported values.

    Examples
    --------
    >>> a = np.arange(100)
    >>> quantile(a, 0.5)
    49.5
    """
    x = np.asarray(x)
    mask = isna(x)

    x = x[~mask]

    values = np.sort(x)

    def _interpolate(a, b, fraction):
        """
        Returns the point at the given fraction between a and b, where
        'fraction' must be between 0 and 1.
        """
        return a + (b - a) * fraction

    def _get_score(at):
        if len(values) == 0:
            return np.nan

        # Fractional position of the requested quantile in the sorted data.
        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            # Lands exactly on a data point: no interpolation needed.
            score = values[int(idx)]
        else:
            if interpolation_method == "fraction":
                score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
            elif interpolation_method == "lower":
                # np.floor returns a float; NumPy only accepts integer indices.
                score = values[int(np.floor(idx))]
            elif interpolation_method == "higher":
                # np.ceil returns a float; NumPy only accepts integer indices.
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError(
                    "interpolation_method can only be 'fraction' "
                    ", 'lower' or 'higher'"
                )

        return score

    if is_scalar(q):
        return _get_score(q)

    q = np.asarray(q, np.float64)
    result = [_get_score(at) for at in q]
    return np.array(result, dtype=np.float64)
1201-
1202-
12031120
# --------------- #
12041121
# select n #
12051122
# --------------- #

pandas/core/reshape/tile.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -367,11 +367,12 @@ def qcut(
367367
x = _preprocess_for_cut(x)
368368
x, dtype = _coerce_to_type(x)
369369

370-
if is_integer(q):
371-
quantiles = np.linspace(0, 1, q + 1)
372-
else:
373-
quantiles = q
374-
bins = algos.quantile(x, quantiles)
370+
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
371+
372+
x_np = np.asarray(x)
373+
x_np = x_np[~np.isnan(x_np)]
374+
bins = np.quantile(x_np, quantiles)
375+
375376
fac, bins = _bins_to_cuts(
376377
x,
377378
bins,

pandas/tests/reshape/test_qcut.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
)
2222
import pandas._testing as tm
2323
from pandas.api.types import CategoricalDtype as CDT
24-
from pandas.core.algorithms import quantile
2524

2625
from pandas.tseries.offsets import (
2726
Day,
@@ -34,8 +33,8 @@ def test_qcut():
3433

3534
# We store the bins as Index that have been
3635
# rounded, so comparisons are a bit tricky.
37-
labels, bins = qcut(arr, 4, retbins=True)
38-
ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
36+
labels, _ = qcut(arr, 4, retbins=True)
37+
ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
3938

4039
result = labels.categories.left.values
4140
assert np.allclose(result, ex_bins[:-1], atol=1e-2)

pandas/tests/test_algos.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1731,14 +1731,6 @@ def test_hashtable_large_sizehint(self, hashtable):
17311731
tbl = hashtable(size_hint=size_hint) # noqa
17321732

17331733

1734-
def test_quantile():
    """Check ``algos.quantile`` agrees for a Series and its underlying ndarray."""
    probs = [0, 0.25, 0.5, 0.75, 1.0]
    ser = Series(np.random.randn(100))

    from_series = algos.quantile(ser, probs)
    from_array = algos.quantile(ser.values, probs)
    tm.assert_almost_equal(from_series, from_array)
1740-
1741-
17421734
def test_unique_label_indices():
17431735

17441736
a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)

0 commit comments

Comments
 (0)