
Commit 2c7fd46

bpo-32583: Fix possible crash in builtin Unicode decoders (#5325)
When a customized decode error handler is used, it is possible for the builtin decoders to write out of bounds and crash.
1 parent 8452104 commit 2c7fd46
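
For background: a decode error handler registered with codecs.register_error() receives the UnicodeDecodeError and returns a (replacement, resume_position) tuple; it may also mutate exc.object to swap in different input. Either capability can invalidate the output-size estimate a decoder made before the error occurred. A minimal sketch of the two handler shapes the new tests exercise (the demo.* names are illustrative, not part of this commit):

import codecs

def rewind_one(exc):
    # Resume *before* exc.end, so the decoder reprocesses bytes its
    # initial output-size estimate already accounted for.
    if isinstance(exc, UnicodeDecodeError):
        return ('\ufffd', exc.start + 1)
    raise TypeError("don't know how to handle %r" % exc)

def swap_input(exc):
    # Replace the input with something longer than the original.
    if isinstance(exc, UnicodeDecodeError):
        exc.object = b"\x00" * 8
        return ('\ufffd', exc.start)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("demo.rewind_one", rewind_one)
codecs.register_error("demo.swap_input", swap_input)

On an unpatched interpreter, decoding with handlers like these could make a builtin decoder write past its buffer; the tests below pin down the expected output once the writer is grown correctly.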

3 files changed: +74 −2 lines

Lib/test/test_codeccallbacks.py

Lines changed: 52 additions & 0 deletions
@@ -1044,6 +1044,58 @@ def mutating(exc):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # Generate one more character than needed, filling the extra
+        # space slot, so the test fails reliably in a debug build.
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # one-character replacement, resuming at 0 < forward < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
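
The expected strings follow from the decoders' sizing rule: the UTF-16 decoders reserve (size + 1) / 2 output characters up front (the UTF-32 decoders similarly reserve about one character per four bytes), so a handler that resumes before exc.end makes the decoder visit input bytes more than once and can produce more characters than were reserved. A self-contained sketch of the first assertion, with the arithmetic spelled out (demo.forward is an illustrative handler name):

import codecs

def forward_shorter_than_end(exc):
    if isinstance(exc, UnicodeDecodeError):
        return ('\ufffd', exc.start + 1)    # resume before exc.end
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("demo.forward", forward_shorter_than_end)

data = b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'
reserved = (len(data) + 1) // 2             # decoder pre-sizes 4 characters
result = data.decode('utf-16-le', 'demo.forward')
# Bytes 0-3 each trigger an unpaired-surrogate error and rewind by one,
# yielding four U+FFFD; bytes 4-7 then decode normally to '\xd8\x00'.
assert result == '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
assert len(result) == 6 > reserved          # more output than was pre-sized

Without the fix, the writer was never told about those extra characters, which is the out-of-bounds write the commit title refers to.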
NEWS entry (new file)

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Fix a possible crash in builtin Unicode decoders caused by out-of-bounds
+writes when a customized decode error handler is used.

Objects/unicodeobject.c

Lines changed: 20 additions & 2 deletions
@@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
     if (!inputobj)
         goto onError;
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here, so we assume the worst
+           case: every remaining byte decodes to one unicode character.
+           If a single byte could decode to more than one character, the
+           decoder could still write out of bounds; it is unclear whether
+           any algorithm using this function can do that. */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                                      PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_DECREF(restuple);
@@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally. The error handler will take care of
+       resizing the buffer when needed. */
     _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
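
In plain terms: before calling the handler, the function now records remain, the number of input bytes past the reported error; after the handler returns, decoding resumes at newpos inside the (possibly replaced) input, and if more bytes are left to decode than remain, min_length grows by the difference, i.e. the worst case of one output character per extra byte. A small Python model of that arithmetic (illustrative names; the C code works with pointers into the buffer rather than sizes):

def extra_slots(old_size, endinpos, new_size, newpos):
    """Extra output slots the writer must reserve after the handler ran."""
    remain = old_size - endinpos    # bytes after the error, before the handler
    rest = new_size - newpos        # bytes still to decode, after the handler
    return max(0, rest - remain)    # worst case: one character per extra byte

# The b'\x00'.decode('utf-16', 'test.replace_with_long') case: the whole
# 1-byte input is the error (so remain == 0); the handler swaps in 8 NUL
# bytes and resumes at 0, so 8 extra slots are reserved.
print(extra_slots(old_size=1, endinpos=1, new_size=8, newpos=0))  # -> 8

Those 8 slots comfortably hold the U+FFFD plus the four NUL characters the replacement input decodes to, matching '\ufffd\x00\x00\x00\x00' in the test above.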
