bpo-31993: Do not use memoryview when pickle large strings. #5154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
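What the change is about: the C pickler's streaming path handed the file object memoryview chunks over memory it did not own, and a file-like object is allowed to keep a chunk and only read it after dump() returns. The revised test below exercises exactly that delayed-writer pattern. Here is a standalone sketch of the same scenario (the ChunkAccumulator name mirrors the test class; treating 64 KiB as the protocol 4 frame size target is an assumption):

    import pickle

    class ChunkAccumulator:
        """File-like object that keeps every chunk for delayed reading."""
        def __init__(self):
            self.chunks = []

        def write(self, chunk):
            # Nothing obliges a file object to consume a chunk right away,
            # so whatever the pickler passes in must stay valid.
            self.chunks.append(chunk)

    writer = ChunkAccumulator()
    big = '0123456789abcdef' * (4 * 1024 + 1)   # just over one 64 KiB frame
    pickle.Pickler(writer, protocol=4).dump(big)
    del big  # drop the original string before touching the chunks

    # The delayed chunks must still reassemble into a valid pickle:
    payload = b"".join(bytes(c) for c in writer.chunks)
    assert pickle.loads(payload) == '0123456789abcdef' * (4 * 1024 + 1)

With this fix the reassembly succeeds even though the source string is gone, because each chunk owns its memory.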
Lib/test/pickletester.py (30 additions, 30 deletions)

@@ -2169,67 +2169,67 @@ def remove_frames(pickled, keep_frame=None):
     def test_framed_write_sizes_with_delayed_writer(self):
         class ChunkAccumulator:
             """Accumulate pickler output in a list of raw chunks."""

             def __init__(self):
                 self.chunks = []

             def write(self, chunk):
                 self.chunks.append(chunk)

             def concatenate_chunks(self):
-                # Some chunks can be memoryview instances, we need to convert
-                # them to bytes to be able to call join
-                return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
-                                 for c in self.chunks])
-
-        small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
-                         for i in range(int(1e4))]
+                return b"".join(self.chunks)

         for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
+            objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
+                       for i in range(int(1e4))]
+            # Add a large unique ASCII string
+            objects.append('0123456789abcdef' *
+                           (self.FRAME_SIZE_TARGET // 16 + 1))
+
             # Protocol 4 packs groups of small objects into frames and issues
             # calls to write only once or twice per frame:
             # The C pickler issues one call to write per-frame (header and
             # contents) while Python pickler issues two calls to write: one for
             # the frame header and one for the frame binary contents.
             writer = ChunkAccumulator()
-            self.pickler(writer, proto).dump(small_objects)
+            self.pickler(writer, proto).dump(objects)

             # Actually read the binary content of the chunks after the end
-            # of the call to dump: ant memoryview passed to write should not
+            # of the call to dump: any memoryview passed to write should not
             # be released otherwise this delayed access would not be possible.
             pickled = writer.concatenate_chunks()
             reconstructed = self.loads(pickled)
-            self.assertEqual(reconstructed, small_objects)
+            self.assertEqual(reconstructed, objects)
             self.assertGreater(len(writer.chunks), 1)

-            n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET)
-            if remainder > 0:
-                n_frames += 1
+            # memoryviews should own the memory.
+            del objects
+            support.gc_collect()
+            self.assertEqual(writer.concatenate_chunks(), pickled)
+
+            n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
             # There should be at least one call to write per frame
             self.assertGreaterEqual(len(writer.chunks), n_frames)

             # but not too many either: there can be one for the proto,
-            # one per-frame header and one per frame for the actual contents.
-            self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks))
+            # one per-frame header, one per frame for the actual contents,
+            # and two for the header.
+            self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)

-            chunk_sizes = [len(c) for c in writer.chunks[:-1]]
+            chunk_sizes = [len(c) for c in writer.chunks]
             large_sizes = [s for s in chunk_sizes
                            if s >= self.FRAME_SIZE_TARGET]
-            small_sizes = [s for s in chunk_sizes
-                           if s < self.FRAME_SIZE_TARGET]
+            medium_sizes = [s for s in chunk_sizes
+                            if 9 < s < self.FRAME_SIZE_TARGET]
+            small_sizes = [s for s in chunk_sizes if s <= 9]

             # Large chunks should not be too large:
             for chunk_size in large_sizes:
-                self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size)
-
-            last_chunk_size = len(writer.chunks[-1])
-            self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size)
-
-            # Small chunks (if any) should be very small
-            # (only proto and frame headers)
-            for chunk_size in small_sizes:
-                self.assertGreaterEqual(9, chunk_size)
+                self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
+                                chunk_sizes)
+            # There shouldn't be too many small chunks: the protocol header,
+            # the frame headers and the large string headers are written
+            # in small chunks.
+            self.assertLessEqual(len(small_sizes),
+                                 len(large_sizes) + len(medium_sizes) + 3,
+                                 chunk_sizes)

     def test_nested_names(self):
         global Nested
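A side note on the updated frame-count assertion: the divmod-plus-fixup computation was replaced by a closed-form ceiling division. A quick sanity check that the two agree for every positive pickle length (the FRAME_SIZE_TARGET value here is illustrative, mirroring the test attribute):

    FRAME_SIZE_TARGET = 64 * 1024  # assumed value of the test attribute

    def n_frames_divmod(n):
        # the formulation the test used before this change
        q, r = divmod(n, FRAME_SIZE_TARGET)
        return q + 1 if r > 0 else q

    def n_frames_ceil(n):
        # the closed-form replacement: ceil(n / FRAME_SIZE_TARGET)
        return (n - 1) // FRAME_SIZE_TARGET + 1

    boundaries = (1, FRAME_SIZE_TARGET - 1, FRAME_SIZE_TARGET,
                  FRAME_SIZE_TARGET + 1, 10 * FRAME_SIZE_TARGET)
    assert all(n_frames_divmod(n) == n_frames_ceil(n) for n in boundaries)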
Misc/NEWS.d/3.7.0a4.rst (3 additions, 3 deletions)

@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
 .. nonce: -OMNg8
 .. section: Library

-The picklers no longer allocate temporary memory when dumping large
-``bytes`` and ``str`` objects into a file object. Instead the data is
-directly streamed into the underlying file object.
+The pickler now uses less memory when serializing large bytes and str
+objects into a file. Pickles created with protocol 4 will require less
+memory for unpickling large bytes and str objects.

..

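The reworded NEWS entry claims a lower memory footprint when dumping large bytes and str objects straight into a file. A rough, illustrative way to observe this, assuming tracemalloc captures the relevant allocations: pickle.dumps() has to materialize the whole pickle in memory, while pickle.dump() into a file can stream large payloads frame by frame.

    import pickle
    import tempfile
    import tracemalloc

    payload = b'x' * (32 * 1024 * 1024)  # 32 MiB, allocated before tracing

    def peak_during(fn):
        tracemalloc.start()
        fn()
        peak = tracemalloc.get_traced_memory()[1]
        tracemalloc.stop()
        return peak

    with tempfile.TemporaryFile() as f:
        streamed = peak_during(lambda: pickle.dump(payload, f, protocol=4))
    buffered = peak_during(lambda: pickle.dumps(payload, protocol=4))

    # dumps() must build the entire ~32 MiB pickle in memory; dump() to a
    # file should peak well below that.
    print(f"dump-to-file peak: {streamed / 2**20:.1f} MiB")
    print(f"dumps peak:        {buffered / 2**20:.1f} MiB")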
Modules/_pickle.c (3 additions, 2 deletions)

@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
     /* Stream write the payload into the file without going through the
        output buffer. */
     if (payload == NULL) {
-        payload = mem = PyMemoryView_FromMemory((char *) data, data_size,
-                                                PyBUF_READ);
+        /* TODO: It would be better to use a memoryview with a linked
+           original string if this is possible. */
+        payload = mem = PyBytes_FromStringAndSize(data, data_size);
         if (payload == NULL) {
             return -1;
         }
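The C change swaps a non-owning view for an owning copy: PyMemoryView_FromMemory wraps a raw pointer without keeping the underlying storage alive, while PyBytes_FromStringAndSize copies the data into a new object, at the cost of the temporary allocation the TODO hopes to remove one day. A Python-level analogy of why an owning copy is the safe thing to hand to an arbitrary write() method:

    chunks = []
    buf = bytearray(b"frame-contents")

    chunks.append(memoryview(buf))  # zero-copy view into the producer's buffer
    chunks.append(bytes(buf))       # owning, immutable copy

    buf[:5] = b"XXXXX"              # the producer later reuses its buffer

    print(bytes(chunks[0]))  # b'XXXXX-contents' -- the view saw the mutation
    print(chunks[1])         # b'frame-contents' -- the copy is unaffected

At the C level the hazard is sharper still: the raw buffer behind the view can be freed outright, leaving the file object with a dangling pointer.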