Skip to content

Commit bbb3cae

Browse files
authored
PERF: unstack (#43025)
1 parent df8f428 commit bbb3cae

File tree

5 files changed

+59
-9
lines changed

5 files changed

+59
-9
lines changed

pandas/_libs/internals.pyi

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ class BlockPlacement:
4444
def __len__(self) -> int: ...
4545
def delete(self, loc) -> BlockPlacement: ...
4646
def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
47+
def tile_for_unstack(self, factor: int) -> np.ndarray: ...
4748

4849
class SharedBlock:
4950
_mgr_locs: BlockPlacement

pandas/_libs/internals.pyx

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -210,6 +210,27 @@ cdef class BlockPlacement:
210210

211211
return self._as_slice
212212

213+
def tile_for_unstack(self, factor: int) -> np.ndarray:
    """
    Find the new mgr_locs for the un-stacked version of a Block.

    Each location ``x`` in this placement expands to the ``factor``
    consecutive locations ``x * factor, ..., (x + 1) * factor - 1``.

    Parameters
    ----------
    factor : int
        Number of new locations produced per existing location.

    Returns
    -------
    np.ndarray
        intp-dtype array of the expanded locations, following the
        iteration order of this placement.
    """
    cdef:
        slice slc = self._ensure_has_slice()
        ndarray new_placement

    if slc is not None and slc.step == 1:
        # Contiguous unit-step placement: the per-location ranges abut, so
        # the result is one arange over the scaled slice bounds.
        new_placement = np.arange(
            slc.start * factor, slc.stop * factor, dtype=np.intp
        )
    else:
        # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
        mapped = [
            np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
            for x in self
        ]
        new_placement = np.concatenate(mapped)
    return new_placement
233+
213234

214235
cdef slice slice_canonize(slice s):
215236
"""

pandas/core/internals/managers.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1363,10 +1363,16 @@ def unstack(self, unstacker, fill_value) -> BlockManager:
13631363
new_blocks: list[Block] = []
13641364
columns_mask: list[np.ndarray] = []
13651365

1366+
if len(self.items) == 0:
1367+
factor = 1
1368+
else:
1369+
fac = len(new_columns) / len(self.items)
1370+
assert fac == int(fac)
1371+
factor = int(fac)
1372+
13661373
for blk in self.blocks:
1367-
blk_cols = self.items[blk.mgr_locs.indexer]
1368-
new_items = unstacker.get_new_columns(blk_cols)
1369-
new_placement = new_columns.get_indexer(new_items)
1374+
mgr_locs = blk.mgr_locs
1375+
new_placement = mgr_locs.tile_for_unstack(factor)
13701376

13711377
blocks, mask = blk._unstack(
13721378
unstacker,

pandas/core/reshape/reshape.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -287,7 +287,7 @@ def get_new_values(self, values, fill_value=None):
287287

288288
return new_values, new_mask
289289

290-
def get_new_columns(self, value_columns):
290+
def get_new_columns(self, value_columns: Index | None):
291291
if value_columns is None:
292292
if self.lift == 0:
293293
return self.removed_level._rename(name=self.removed_name)
@@ -308,6 +308,16 @@ def get_new_columns(self, value_columns):
308308
new_names = [value_columns.name, self.removed_name]
309309
new_codes = [propagator]
310310

311+
repeater = self._repeater
312+
313+
# The entire level is then just a repetition of the single chunk:
314+
new_codes.append(np.tile(repeater, width))
315+
return MultiIndex(
316+
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
317+
)
318+
319+
@cache_readonly
320+
def _repeater(self) -> np.ndarray:
311321
# The two indices differ only if the unstacked level had unused items:
312322
if len(self.removed_level_full) != len(self.removed_level):
313323
# In this case, we remap the new codes to the original level:
@@ -316,13 +326,10 @@ def get_new_columns(self, value_columns):
316326
repeater = np.insert(repeater, 0, -1)
317327
else:
318328
# Otherwise, we just use each level item exactly once:
329+
stride = len(self.removed_level) + self.lift
319330
repeater = np.arange(stride) - self.lift
320331

321-
# The entire level is then just a repetition of the single chunk:
322-
new_codes.append(np.tile(repeater, width))
323-
return MultiIndex(
324-
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
325-
)
332+
return repeater
326333

327334
@cache_readonly
328335
def new_index(self):

pandas/tests/frame/test_stack_unstack.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1285,6 +1285,21 @@ def test_stack_positional_level_duplicate_column_names():
12851285
tm.assert_frame_equal(result, expected)
12861286

12871287

1288+
def test_unstack_non_slice_like_blocks(using_array_manager):
    # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like

    mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
    # Seeded generator so that any failure is reproducible from run to run.
    rng = np.random.default_rng(2)
    df = DataFrame(rng.standard_normal((15, 4)), index=mi)
    # Give one interior column a different dtype; the assertion below checks
    # that this leaves at least one block with non-slice-like mgr_locs.
    df[1] = df[1].astype(np.int64)
    if not using_array_manager:
        assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)

    res = df.unstack()

    # Unstacking column-by-column and concatenating must match the
    # whole-frame unstack.
    expected = pd.concat([df[n].unstack() for n in range(4)], keys=range(4), axis=1)
    tm.assert_frame_equal(res, expected)
1301+
1302+
12881303
class TestStackUnstackMultiLevel:
12891304
def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
12901305
# just check that it works for now

0 commit comments

Comments
 (0)