bpo-36311: Fixes decoding multibyte characters around chunk boundaries and improves decoding performance #15083

Merged (3 commits) on Aug 21, 2019
20 changes: 17 additions & 3 deletions Lib/test/test_codecs.py
@@ -3075,13 +3075,13 @@ def test_mbcs_alias(self):
         self.assertEqual(codec.name, 'mbcs')

     @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
-    def test_large_input(self):
+    def test_large_input(self, size):
         # Test input longer than INT_MAX.
         # Input should contain undecodable bytes before and after
         # the INT_MAX limit.
-        encoded = (b'01234567' * (2**28-1) +
+        encoded = (b'01234567' * ((size//8)-1) +
                    b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
-        self.assertEqual(len(encoded), 2**31+2)
+        self.assertEqual(len(encoded), size+2)
         decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
         self.assertEqual(decoded[1], len(encoded))
         del encoded
@@ -3092,6 +3092,20 @@ def test_large_input(self):
                          '\udc85\udc86\udcea\udceb\udcec'
                          '\udcef\udcfc\udcfd\udcfe\udcff')

+    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
+    def test_large_utf8_input(self, size):
+        # Test input longer than INT_MAX.
+        # Input should contain a decodable multi-byte character
+        # surrounding INT_MAX
+        encoded = (b'0123456\xed\x84\x80' * (size//8))
+        self.assertEqual(len(encoded), size // 8 * 10)
+        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
+        self.assertEqual(decoded[1], len(encoded))
+        del encoded
+        self.assertEqual(len(decoded[0]), size)
+        self.assertEqual(decoded[0][:10], '0123456\ud10001')
+        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
+

 class ASCIITest(unittest.TestCase):
     def test_encode(self):
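The new test_large_utf8_input above exercises exactly the failure this PR fixes: a multi-byte UTF-8 character (b'\xed\x84\x80', which decodes to U+D100) repeated so that one copy straddles the chunk boundary near INT_MAX. The same failure mode can be reproduced at small scale without Windows; the sketch below uses plain UTF-8 decoding with an illustrative 8-byte chunk size instead of the code-page API:

```python
import codecs

data = b'0123456\xed\x84\x80' * 4  # b'\xed\x84\x80' is UTF-8 for '\ud100'

# Naive fixed-size chunking: an 8-byte chunk boundary falls inside the
# 3-byte character, so 'ignore' silently drops its pieces.
naive = ''.join(data[i:i+8].decode('utf-8', 'ignore')
                for i in range(0, len(data), 8))

# Stateful chunking carries the partial character over to the next chunk.
dec = codecs.getincrementaldecoder('utf-8')('ignore')
stateful = ''.join(dec.decode(data[i:i+8]) for i in range(0, len(data), 8))
stateful += dec.decode(b'', final=True)

assert stateful == '0123456\ud100' * 4
assert naive != stateful  # the naive version lost a character at a boundary
```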
New file (NEWS entry):
@@ -0,0 +1,2 @@
+Decoding bytes objects larger than 2GiB is faster and no longer fails when a
+multibyte character spans a chunk boundary.
16 changes: 10 additions & 6 deletions Objects/unicodeobject.c
@@ -7186,6 +7186,12 @@ PyUnicode_AsASCIIString(PyObject *unicode)
 #define NEED_RETRY
 #endif

+/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
+   transcoding from UTF-16), but INT_MAX / 4 performs better in
+   both cases and also avoids partial characters overrunning the
+   length limit in MultiByteToWideChar on Windows */
+#define DECODING_CHUNK_SIZE (INT_MAX/4)
+
 #ifndef WC_ERR_INVALID_CHARS
 # define WC_ERR_INVALID_CHARS 0x0080
 #endif
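For intuition, here is a minimal Python sketch (not the C implementation) of the stateful chunking loop that DECODING_CHUNK_SIZE bounds: each pass feeds at most one chunk to the decoder without finalizing, so a trailing partial character is held back and completed by the next chunk. CHUNK and decode_in_chunks are illustrative names, not part of the patch:

```python
import codecs

CHUNK = 16  # stand-in for DECODING_CHUNK_SIZE (INT_MAX/4 in the C code)

def decode_in_chunks(data, encoding='utf-8', errors='strict'):
    """Decode data in fixed-size chunks, carrying partial characters over."""
    decoder = codecs.getincrementaldecoder(encoding)(errors)
    out = []
    for start in range(0, len(data), CHUNK):
        out.append(decoder.decode(data[start:start + CHUNK]))
    out.append(decoder.decode(b'', final=True))  # flush; errors on a truncated tail
    return ''.join(out)

# The 3-byte character (U+D100) straddles the 16-byte chunk boundary here,
# yet the incremental decoder reassembles it correctly.
assert decode_in_chunks(b'0123456789abcde\xed\x84\x80') == '0123456789abcde\ud100'
```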
@@ -7422,8 +7428,8 @@ decode_code_page_stateful(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
+        if (size > DECODING_CHUNK_SIZE) {
+            chunk_size = DECODING_CHUNK_SIZE;
             final = 0;
             done = 0;
         }
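The final = 0 branch relies on the decoder's stateful contract: when a chunk ends mid-character, the trailing bytes are reported as unconsumed rather than decoded or rejected. A sketch of the expected behavior, assuming a Windows build (codecs.code_page_decode exists only on Windows):

```python
import codecs  # codecs.code_page_decode is available only on Windows

data = b'abc\xed\x84'  # ends with the first two bytes of a 3-byte UTF-8 character
text, consumed = codecs.code_page_decode(65001, data, 'strict', False)  # final=False
assert text == 'abc' and consumed == 3  # the partial character waits for more input
```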
@@ -7827,10 +7833,8 @@ encode_code_page(int code_page,
     do
     {
 #ifdef NEED_RETRY
-        /* UTF-16 encoding may double the size, so use only INT_MAX/2
-           chunks. */
-        if (len > INT_MAX/2) {
-            chunk_len = INT_MAX/2;
+        if (len > DECODING_CHUNK_SIZE) {
+            chunk_len = DECODING_CHUNK_SIZE;
             done = 0;
         }
         else
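The deleted comment's rationale is folded into DECODING_CHUNK_SIZE: encode_code_page converts the str to UTF-16 first, and a single code point can expand to two 16-bit units, which is why the old cap was INT_MAX/2. A quick illustration of that doubling:

```python
s = '\U00010000'  # one code point outside the BMP
assert len(s) == 1
assert len(s.encode('utf-16-le')) == 4  # a surrogate pair: two 16-bit units
```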