Skip to content

Adding more documentation for upsampling with replacement and error m… #29444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 8, 2019
19 changes: 19 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4964,6 +4964,21 @@ def sample(
dog 4 0 2
fish 0 0 8


An upsampled sample of the ``DataFrame`` with replacement.
Note that the `replace` parameter has to be `True` when the `frac` parameter is > 1:

>>> df.sample(frac=2, replace=True, random_state=1)
num_legs num_wings num_specimen_seen
dog 4 0 2
fish 0 0 8
falcon 2 2 10
falcon 2 2 10
fish 0 0 8
dog 4 0 2
fish 0 0 8
dog 4 0 2

Using a DataFrame column as weights. Rows with larger value in the
`num_specimen_seen` column are more likely to be sampled.

Expand Down Expand Up @@ -5039,6 +5054,10 @@ def sample(
# If no frac or n, default to n=1.
if n is None and frac is None:
n = 1
elif frac is not None and frac > 1 and not replace:
raise ValueError(
"Replace has to be set to `True` when upsampling the population `frac` > 1"
)
elif n is not None and frac is None and n % 1 != 0:
raise ValueError("Only integers accepted as `n` values")
elif n is None and frac is not None:
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/generic/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ def test_sample(self):
self._compare(
o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed)
)

self._compare(
o.sample(frac=0.7, random_state=seed),
o.sample(frac=0.7, random_state=seed),
Expand All @@ -337,6 +338,15 @@ def test_sample(self):
o.sample(frac=0.7, random_state=np.random.RandomState(test)),
)

self._compare(
o.sample(
frac=2, replace=True, random_state=np.random.RandomState(test)
),
o.sample(
frac=2, replace=True, random_state=np.random.RandomState(test)
),
)

os1, os2 = [], []
for _ in range(2):
np.random.seed(test)
Expand Down Expand Up @@ -424,6 +434,14 @@ def test_sample(self):
weights_with_None[5] = 0.5
self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6])

def test_sample_upsampling_without_replacement(self):
    """Upsampling (``frac`` > 1) with ``replace=False`` must raise ``ValueError``."""
    df = pd.DataFrame({"A": list("abc")})
    msg = (
        "Replace has to be set to `True` when upsampling the population `frac` > 1"
    )
    # ``sample`` rejects this argument combination with a ValueError during
    # validation, before any result object exists. Assert on that exception
    # directly rather than wrapping the call in ``hash`` and expecting a
    # TypeError, which would let the real ValueError escape and fail the test.
    with pytest.raises(ValueError, match=msg):
        df.sample(frac=2, replace=False)

def test_size_compat(self):
# GH8846
# size property should be defined
Expand Down