Skip to content

Commit 5b76bdb

Browse files
bpo-31993: Do not use memoryview when pickle large strings. (#5154)
PyMemoryView_FromMemory() created a memoryview referring to the internal data of the string. When the string is destroyed, the memoryview becomes a reference to freed memory.
1 parent f3031b8 commit 5b76bdb

File tree

3 files changed

+36
-35
lines changed

3 files changed

+36
-35
lines changed

Lib/test/pickletester.py

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2169,67 +2169,67 @@ def remove_frames(pickled, keep_frame=None):
21692169
def test_framed_write_sizes_with_delayed_writer(self):
21702170
class ChunkAccumulator:
21712171
"""Accumulate pickler output in a list of raw chunks."""
2172-
21732172
def __init__(self):
21742173
self.chunks = []
2175-
21762174
def write(self, chunk):
21772175
self.chunks.append(chunk)
2178-
21792176
def concatenate_chunks(self):
2180-
# Some chunks can be memoryview instances, we need to convert
2181-
# them to bytes to be able to call join
2182-
return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
2183-
for c in self.chunks])
2184-
2185-
small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
2186-
for i in range(int(1e4))]
2177+
return b"".join(self.chunks)
21872178

21882179
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
2180+
objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
2181+
for i in range(int(1e4))]
2182+
# Add a large unique ASCII string
2183+
objects.append('0123456789abcdef' *
2184+
(self.FRAME_SIZE_TARGET // 16 + 1))
2185+
21892186
# Protocol 4 packs groups of small objects into frames and issues
21902187
# calls to write only once or twice per frame:
21912188
# The C pickler issues one call to write per-frame (header and
21922189
# contents) while Python pickler issues two calls to write: one for
21932190
# the frame header and one for the frame binary contents.
21942191
writer = ChunkAccumulator()
2195-
self.pickler(writer, proto).dump(small_objects)
2192+
self.pickler(writer, proto).dump(objects)
21962193

21972194
# Actually read the binary content of the chunks after the end
2198-
# of the call to dump: ant memoryview passed to write should not
2195+
# of the call to dump: any memoryview passed to write should not
21992196
# be released otherwise this delayed access would not be possible.
22002197
pickled = writer.concatenate_chunks()
22012198
reconstructed = self.loads(pickled)
2202-
self.assertEqual(reconstructed, small_objects)
2199+
self.assertEqual(reconstructed, objects)
22032200
self.assertGreater(len(writer.chunks), 1)
22042201

2205-
n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET)
2206-
if remainder > 0:
2207-
n_frames += 1
2202+
# memoryviews should own the memory.
2203+
del objects
2204+
support.gc_collect()
2205+
self.assertEqual(writer.concatenate_chunks(), pickled)
22082206

2207+
n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
22092208
# There should be at least one call to write per frame
22102209
self.assertGreaterEqual(len(writer.chunks), n_frames)
22112210

22122211
# but not too many either: there can be one for the proto,
2213-
# one per-frame header and one per frame for the actual contents.
2214-
self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks))
2212+
# one per-frame header, one per frame for the actual contents,
2213+
# and two for the header.
2214+
self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)
22152215

2216-
chunk_sizes = [len(c) for c in writer.chunks[:-1]]
2216+
chunk_sizes = [len(c) for c in writer.chunks]
22172217
large_sizes = [s for s in chunk_sizes
22182218
if s >= self.FRAME_SIZE_TARGET]
2219-
small_sizes = [s for s in chunk_sizes
2220-
if s < self.FRAME_SIZE_TARGET]
2219+
medium_sizes = [s for s in chunk_sizes
2220+
if 9 < s < self.FRAME_SIZE_TARGET]
2221+
small_sizes = [s for s in chunk_sizes if s <= 9]
22212222

22222223
# Large chunks should not be too large:
22232224
for chunk_size in large_sizes:
2224-
self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size)
2225-
2226-
last_chunk_size = len(writer.chunks[-1])
2227-
self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size)
2228-
2229-
# Small chunks (if any) should be very small
2230-
# (only proto and frame headers)
2231-
for chunk_size in small_sizes:
2232-
self.assertGreaterEqual(9, chunk_size)
2225+
self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
2226+
chunk_sizes)
2227+
# There shouldn't be too many small chunks: the protocol header,
2228+
# the frame headers and the large string headers are written
2229+
# in small chunks.
2230+
self.assertLessEqual(len(small_sizes),
2231+
len(large_sizes) + len(medium_sizes) + 3,
2232+
chunk_sizes)
22332233

22342234
def test_nested_names(self):
22352235
global Nested

Misc/NEWS.d/3.7.0a4.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
693693
.. nonce: -OMNg8
694694
.. section: Library
695695
696-
The picklers no longer allocate temporary memory when dumping large
697-
``bytes`` and ``str`` objects into a file object. Instead the data is
698-
directly streamed into the underlying file object.
696+
The pickler now uses less memory when serializing large bytes and str
697+
objects into a file. Pickles created with protocol 4 will require less
698+
memory for unpickling large bytes and str objects.
699699

700700
..
701701

Modules/_pickle.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
21842184
/* Stream write the payload into the file without going through the
21852185
output buffer. */
21862186
if (payload == NULL) {
2187-
payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
2188-
PyBUF_READ);
2187+
/* TODO: It would be better to use a memoryview with a linked
2188+
original string if this is possible. */
2189+
payload = mem = PyBytes_FromStringAndSize(data, data_size);
21892190
if (payload == NULL) {
21902191
return -1;
21912192
}

0 commit comments

Comments
 (0)