Skip to content

Commit 7b7b025

Browse files
authored
BUG: Unstack/pivot raising ValueError on large result (#45084)
1 parent abf85d9 commit 7b7b025

File tree

4 files changed

+55
-14
lines changed

4 files changed

+55
-14
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -364,10 +364,29 @@ second column is instead renamed to ``a.2``.
364364
365365
res
366366
367-
.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:
367+
.. _whatsnew_140.notable_bug_fixes.unstack_pivot_int32_limit:
368368

369-
notable_bug_fix3
370-
^^^^^^^^^^^^^^^^
369+
unstack and pivot_table no longer raise ValueError for results that would exceed int32 limit
370+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
371+
372+
Previously :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack` would raise a ``ValueError`` if the operation
373+
could produce a result with more than ``2**31 - 1`` elements. This operation now emits a :class:`errors.PerformanceWarning`
374+
instead (:issue:`26314`).
375+
376+
*Previous behavior*:
377+
378+
.. code-block:: ipython
379+
380+
In [3]: df = DataFrame({"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0})
381+
In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count")
382+
ValueError: Unstacked DataFrame is too big, causing int32 overflow
383+
384+
*New behavior*:
385+
386+
.. code-block:: ipython
387+
388+
In [4]: df.pivot_table(index="ind1", columns="ind2", values="count", aggfunc="count")
389+
PerformanceWarning: The following operation may generate 4294967296 cells in the resulting pandas object.
371390
372391
.. ---------------------------------------------------------------------------
373392

pandas/core/reshape/reshape.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import itertools
44
from typing import TYPE_CHECKING
5+
import warnings
56

67
import numpy as np
78

@@ -11,6 +12,7 @@
1112
Dtype,
1213
npt,
1314
)
15+
from pandas.errors import PerformanceWarning
1416
from pandas.util._decorators import cache_readonly
1517

1618
from pandas.core.dtypes.cast import maybe_promote
@@ -125,10 +127,15 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None):
125127
num_columns = self.removed_level.size
126128

127129
# GH20601: Compute the number of cells in the result to detect overly large outputs.
128-
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
129-
130-
if num_rows > 0 and num_columns > 0 and num_cells <= 0:
131-
raise ValueError("Unstacked DataFrame is too big, causing int32 overflow")
130+
num_cells = num_rows * num_columns
131+
132+
# GH 26314: Previous ValueError raised was too restrictive for many users.
133+
if num_cells > np.iinfo(np.int32).max:
134+
warnings.warn(
135+
f"The following operation may generate {num_cells} cells "
136+
f"in the resulting pandas object.",
137+
PerformanceWarning,
138+
)
132139

133140
self._make_selectors()
134141

pandas/tests/frame/test_stack_unstack.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import numpy as np
66
import pytest
77

8+
from pandas.errors import PerformanceWarning
9+
810
import pandas as pd
911
from pandas import (
1012
DataFrame,
@@ -1819,11 +1821,17 @@ def test_unstack_unobserved_keys(self):
18191821
@pytest.mark.slow
18201822
def test_unstack_number_of_levels_larger_than_int32(self):
18211823
# GH#20601
1824+
# GH 26314: Change ValueError to PerformanceWarning
18221825
df = DataFrame(
18231826
np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]
18241827
)
1825-
with pytest.raises(ValueError, match="int32 overflow"):
1826-
df.unstack()
1828+
msg = "The following operation may generate"
1829+
with tm.assert_produces_warning(PerformanceWarning, match=msg):
1830+
try:
1831+
df.unstack()
1832+
except MemoryError:
1833+
# Just checking the warning
1834+
return
18271835

18281836
def test_stack_order_with_unsorted_levels(self):
18291837
# GH#16323

pandas/tests/reshape/test_pivot.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import numpy as np
99
import pytest
1010

11+
from pandas.errors import PerformanceWarning
12+
1113
import pandas as pd
1214
from pandas import (
1315
Categorical,
@@ -1991,15 +1993,20 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
19911993
@pytest.mark.slow
19921994
def test_pivot_number_of_levels_larger_than_int32(self):
19931995
# GH 20601
1996+
# GH 26314: Change ValueError to PerformanceWarning
19941997
df = DataFrame(
19951998
{"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0}
19961999
)
19972000

1998-
msg = "Unstacked DataFrame is too big, causing int32 overflow"
1999-
with pytest.raises(ValueError, match=msg):
2000-
df.pivot_table(
2001-
index="ind1", columns="ind2", values="count", aggfunc="count"
2002-
)
2001+
msg = "The following operation may generate"
2002+
with tm.assert_produces_warning(PerformanceWarning, match=msg):
2003+
try:
2004+
df.pivot_table(
2005+
index="ind1", columns="ind2", values="count", aggfunc="count"
2006+
)
2007+
except MemoryError:
2008+
# Just checking the warning
2009+
return
20032010

20042011
def test_pivot_table_aggfunc_dropna(self, dropna):
20052012
# GH 22159

0 commit comments

Comments
 (0)