Skip to content

Commit 3c79e28

Browse files
CoW: Use weakref callbacks to track dead references
Co-authored-by: José Lucas Silva Mayer <[email protected]>
1 parent 73bc5f4 commit 3c79e28

File tree

2 files changed, with 47 additions and 33 deletions.

pandas/_libs/internals.pyx

Lines changed: 25 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -886,29 +886,33 @@ cdef class BlockValuesRefs:
886886
"""
887887
cdef:
888888
public list referenced_blocks
889-
public int clear_counter
889+
public int dead_counter
890+
object __weakref__
891+
object _weakref_cb
890892

891893
def __cinit__(self, blk: Block | None = None) -> None:
894+
def _weakref_cb(
895+
item: weakref.ref,
896+
selfref: weakref.ref = weakref.ref(self)
897+
) -> None:
898+
self = selfref()
899+
if self is not None:
900+
self.dead_counter += 1
901+
if self.dead_counter > 256:
902+
if self.dead_counter > len(self.referenced_blocks) // 2:
903+
self._clear_dead_references()
904+
self._weakref_cb = _weakref_cb
892905
if blk is not None:
893-
self.referenced_blocks = [weakref.ref(blk)]
906+
self.referenced_blocks = [weakref.ref(blk, self._weakref_cb)]
894907
else:
895908
self.referenced_blocks = []
896-
self.clear_counter = 500 # set reasonably high
897-
898-
def _clear_dead_references(self, force=False) -> None:
899-
# Use exponential backoff to decide when we want to clear references
900-
# if force=False. Clearing for every insertion causes slowdowns if
901-
# all these objects stay alive, e.g. df.items() for wide DataFrames
902-
# see GH#55245 and GH#55008
903-
if force or len(self.referenced_blocks) > self.clear_counter:
904-
self.referenced_blocks = [
905-
ref for ref in self.referenced_blocks if ref() is not None
906-
]
907-
nr_of_refs = len(self.referenced_blocks)
908-
if nr_of_refs < self.clear_counter // 2:
909-
self.clear_counter = max(self.clear_counter // 2, 500)
910-
elif nr_of_refs > self.clear_counter:
911-
self.clear_counter = max(self.clear_counter * 2, nr_of_refs)
909+
910+
def _clear_dead_references(self) -> None:
911+
old_len = len(self.referenced_blocks)
912+
self.referenced_blocks = [
913+
ref for ref in self.referenced_blocks if ref() is not None
914+
]
915+
self.dead_counter = self.dead_counter - (old_len - len(self.referenced_blocks))
912916

913917
def add_reference(self, blk: Block) -> None:
914918
"""Adds a new reference to our reference collection.
@@ -918,8 +922,7 @@ cdef class BlockValuesRefs:
918922
blk : Block
919923
The block that the new references should point to.
920924
"""
921-
self._clear_dead_references()
922-
self.referenced_blocks.append(weakref.ref(blk))
925+
self.referenced_blocks.append(weakref.ref(blk, self._weakref_cb))
923926

924927
def add_index_reference(self, index: object) -> None:
925928
"""Adds a new reference to our reference collection when creating an index.
@@ -929,8 +932,7 @@ cdef class BlockValuesRefs:
929932
index : Index
930933
The index that the new reference should point to.
931934
"""
932-
self._clear_dead_references()
933-
self.referenced_blocks.append(weakref.ref(index))
935+
self.referenced_blocks.append(weakref.ref(index, self._weakref_cb))
934936

935937
def has_reference(self) -> bool:
936938
"""Checks if block has foreign references.
@@ -942,6 +944,5 @@ cdef class BlockValuesRefs:
942944
-------
943945
bool
944946
"""
945-
self._clear_dead_references(force=True)
946947
# Checking for more references than block pointing to itself
947-
return len(self.referenced_blocks) > 1
948+
return len(self.referenced_blocks) - self.dead_counter > 1

pandas/tests/copy_view/test_internals.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -115,31 +115,44 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
115115
assert not np.shares_memory(get_array(df, col), get_array(df2, col))
116116

117117

118-
def test_exponential_backoff():
119-
# GH#55518
118+
def test_clear_dead_references():
119+
# GH#55539
120120
df = DataFrame({"a": [1, 2, 3]})
121121
for i in range(490):
122122
df.copy(deep=False)
123123

124-
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491
124+
assert (
125+
len(df._mgr.blocks[0].refs.referenced_blocks)
126+
- df._mgr.blocks[0].refs.dead_counter
127+
== 1
128+
)
125129

126130
df = DataFrame({"a": [1, 2, 3]})
127131
dfs = [df.copy(deep=False) for i in range(510)]
128132

129133
for i in range(20):
130134
df.copy(deep=False)
131-
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
132-
assert df._mgr.blocks[0].refs.clear_counter == 1000
135+
assert (
136+
len(df._mgr.blocks[0].refs.referenced_blocks)
137+
- df._mgr.blocks[0].refs.dead_counter
138+
== 511
139+
)
133140

134141
for i in range(500):
135142
df.copy(deep=False)
136143

137-
# Don't reduce since we still have over 500 objects alive
138-
assert df._mgr.blocks[0].refs.clear_counter == 1000
144+
assert (
145+
len(df._mgr.blocks[0].refs.referenced_blocks)
146+
- df._mgr.blocks[0].refs.dead_counter
147+
== 511
148+
)
139149

140150
dfs = dfs[:300]
141151
for i in range(500):
142152
df.copy(deep=False)
143153

144-
# Reduce since there are less than 500 objects alive
145-
assert df._mgr.blocks[0].refs.clear_counter == 500
154+
assert (
155+
len(df._mgr.blocks[0].refs.referenced_blocks)
156+
- df._mgr.blocks[0].refs.dead_counter
157+
== 301
158+
)

0 commit comments

Comments
 (0)