Skip to content

Commit 1211c9a

Browse files
bpo-32503: Avoid creating too small frames in pickles. (#5127)
1 parent bd5c7d2 commit 1211c9a

File tree

4 files changed

+62
-51
lines changed

4 files changed

+62
-51
lines changed

Lib/pickle.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ def __init__(self, value):
183183

184184
class _Framer:
185185

186+
_FRAME_SIZE_MIN = 4
186187
_FRAME_SIZE_TARGET = 64 * 1024
187188

188189
def __init__(self, file_write):
@@ -203,11 +204,12 @@ def commit_frame(self, force=False):
203204
if f.tell() >= self._FRAME_SIZE_TARGET or force:
204205
data = f.getbuffer()
205206
write = self.file_write
206-
# Issue a single call to the write method of the underlying
207-
# file object for the frame opcode with the size of the
208-
# frame. The concatenation is expected to be less expensive
209-
# than issuing an additional call to write.
210-
write(FRAME + pack("<Q", len(data)))
207+
if len(data) >= self._FRAME_SIZE_MIN:
208+
# Issue a single call to the write method of the underlying
209+
# file object for the frame opcode with the size of the
210+
# frame. The concatenation is expected to be less expensive
211+
# than issuing an additional call to write.
212+
write(FRAME + pack("<Q", len(data)))
211213

212214
# Issue a separate call to write to append the frame
213215
# contents without concatenation to the above to avoid a

Lib/test/pickletester.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2037,6 +2037,7 @@ def test_setitems_on_non_dicts(self):
20372037

20382038
# Exercise framing (proto >= 4) for significant workloads
20392039

2040+
FRAME_SIZE_MIN = 4
20402041
FRAME_SIZE_TARGET = 64 * 1024
20412042

20422043
def check_frame_opcodes(self, pickled):
@@ -2047,36 +2048,43 @@ def check_frame_opcodes(self, pickled):
20472048
framed by default and are therefore considered a frame by themselves in
20482049
the following consistency check.
20492050
"""
2050-
last_arg = last_pos = last_frame_opcode_size = None
2051-
frameless_opcode_sizes = {
2052-
'BINBYTES': 5,
2053-
'BINUNICODE': 5,
2054-
'BINBYTES8': 9,
2055-
'BINUNICODE8': 9,
2056-
}
2051+
frame_end = frameless_start = None
2052+
frameless_opcodes = {'BINBYTES', 'BINUNICODE', 'BINBYTES8', 'BINUNICODE8'}
20572053
for op, arg, pos in pickletools.genops(pickled):
2058-
if op.name in frameless_opcode_sizes:
2059-
if len(arg) > self.FRAME_SIZE_TARGET:
2060-
frame_opcode_size = frameless_opcode_sizes[op.name]
2061-
arg = len(arg)
2062-
else:
2063-
continue
2064-
elif op.name == 'FRAME':
2065-
frame_opcode_size = 9
2066-
else:
2067-
continue
2068-
2069-
if last_pos is not None:
2070-
# The previous frame's size should be equal to the number
2071-
# of bytes up to the current frame.
2072-
frame_size = pos - last_pos - last_frame_opcode_size
2073-
self.assertEqual(frame_size, last_arg)
2074-
last_arg, last_pos = arg, pos
2075-
last_frame_opcode_size = frame_opcode_size
2076-
# The last frame's size should be equal to the number of bytes up
2077-
# to the pickle's end.
2078-
frame_size = len(pickled) - last_pos - last_frame_opcode_size
2079-
self.assertEqual(frame_size, last_arg)
2054+
if frame_end is not None:
2055+
self.assertLessEqual(pos, frame_end)
2056+
if pos == frame_end:
2057+
frame_end = None
2058+
2059+
if frame_end is not None: # framed
2060+
self.assertNotEqual(op.name, 'FRAME')
2061+
if op.name in frameless_opcodes:
2062+
# Only short bytes and str objects should be written
2063+
# in a frame
2064+
self.assertLessEqual(len(arg), self.FRAME_SIZE_TARGET)
2065+
2066+
else: # not framed
2067+
if (op.name == 'FRAME' or
2068+
(op.name in frameless_opcodes and
2069+
len(arg) > self.FRAME_SIZE_TARGET)):
2070+
# Frame or large bytes or str object
2071+
if frameless_start is not None:
2072+
# Only short data should be written outside of a frame
2073+
self.assertLess(pos - frameless_start,
2074+
self.FRAME_SIZE_MIN)
2075+
frameless_start = None
2076+
elif frameless_start is None and op.name != 'PROTO':
2077+
frameless_start = pos
2078+
2079+
if op.name == 'FRAME':
2080+
self.assertGreaterEqual(arg, self.FRAME_SIZE_MIN)
2081+
frame_end = pos + 9 + arg
2082+
2083+
pos = len(pickled)
2084+
if frame_end is not None:
2085+
self.assertEqual(frame_end, pos)
2086+
elif frameless_start is not None:
2087+
self.assertLess(pos - frameless_start, self.FRAME_SIZE_MIN)
20802088

20812089
def test_framing_many_objects(self):
20822090
obj = list(range(10**5))
@@ -2095,7 +2103,8 @@ def test_framing_many_objects(self):
20952103

20962104
def test_framing_large_objects(self):
20972105
N = 1024 * 1024
2098-
obj = [b'x' * N, b'y' * N, 'z' * N]
2106+
small_items = [[i] for i in range(10)]
2107+
obj = [b'x' * N, *small_items, b'y' * N, 'z' * N]
20992108
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
21002109
for fast in [False, True]:
21012110
with self.subTest(proto=proto, fast=fast):
@@ -2119,12 +2128,9 @@ def test_framing_large_objects(self):
21192128
# Perform full equality check if the lengths match.
21202129
self.assertEqual(obj, unpickled)
21212130
n_frames = count_opcode(pickle.FRAME, pickled)
2122-
if not fast:
2123-
# One frame per memoize for each large object.
2124-
self.assertGreaterEqual(n_frames, len(obj))
2125-
else:
2126-
# One frame at the beginning and one at the end.
2127-
self.assertGreaterEqual(n_frames, 2)
2131+
# A single frame for small objects between
2132+
# first two large objects.
2133+
self.assertEqual(n_frames, 1)
21282134
self.check_frame_opcodes(pickled)
21292135

21302136
def test_optional_frames(self):
@@ -2152,7 +2158,9 @@ def remove_frames(pickled, keep_frame=None):
21522158

21532159
frame_size = self.FRAME_SIZE_TARGET
21542160
num_frames = 20
2155-
obj = [bytes([i]) * frame_size for i in range(num_frames)]
2161+
# Large byte objects (dict values) intermitted with small objects
2162+
# (dict keys)
2163+
obj = {i: bytes([i]) * frame_size for i in range(num_frames)}
21562164

21572165
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
21582166
pickled = self.dumps(obj, proto)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Pickling with protocol 4 no longer creates too small frames.

Modules/_pickle.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,8 @@ enum {
119119
/* Prefetch size when unpickling (disabled on unpeekable streams) */
120120
PREFETCH = 8192 * 16,
121121

122+
FRAME_SIZE_MIN = 4,
122123
FRAME_SIZE_TARGET = 64 * 1024,
123-
124124
FRAME_HEADER_SIZE = 9
125125
};
126126

@@ -949,13 +949,6 @@ _write_size64(char *out, size_t value)
949949
}
950950
}
951951

952-
static void
953-
_Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len)
954-
{
955-
qdata[0] = FRAME;
956-
_write_size64(qdata + 1, frame_len);
957-
}
958-
959952
static int
960953
_Pickler_CommitFrame(PicklerObject *self)
961954
{
@@ -966,7 +959,14 @@ _Pickler_CommitFrame(PicklerObject *self)
966959
return 0;
967960
frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE;
968961
qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start;
969-
_Pickler_WriteFrameHeader(self, qdata, frame_len);
962+
if (frame_len >= FRAME_SIZE_MIN) {
963+
qdata[0] = FRAME;
964+
_write_size64(qdata + 1, frame_len);
965+
}
966+
else {
967+
memmove(qdata, qdata + FRAME_HEADER_SIZE, frame_len);
968+
self->output_len -= FRAME_HEADER_SIZE;
969+
}
970970
self->frame_start = -1;
971971
return 0;
972972
}

0 commit comments

Comments
 (0)