
Commit ea94fce

[3.6] bpo-32583: Fix possible crashing in builtin Unicode decoders (GH-5325) (#5459)
When using customized decode error handlers, it is possible for builtin decoders to write out of bounds and then crash. (cherry picked from commit 2c7fd46)
1 parent eb126ed commit ea94fce
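
For context, the failure mode is simple to state: a custom error handler may return a resume position before the end of the reported error range, or enlarge exc.object, so the decoder produces more output than its preallocated buffer was sized for. The sketch below is a minimal reproduction modeled on the tests in this commit (the "demo." registry name is illustrative); on unpatched builds such input could make the UTF-16 decoder write past its buffer, while fixed builds simply return the replacement characters.

import codecs

def forward_shorter_than_end(exc):
    # Emit one replacement character but resume *inside* the error range
    # (0 < forward < exc.end), so the decoder re-reads input it had
    # already accounted for.
    if isinstance(exc, UnicodeDecodeError):
        return ('\ufffd', exc.start + 1)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("demo.forward_shorter_than_end",
                      forward_shorter_than_end)

# Unpatched builds could crash here; patched builds return
# '\ufffd\ufffd\ufffd\ufffd\xd8\x00' (see the tests below).
b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
    'utf-16-le', 'demo.forward_shorter_than_end')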

File tree

3 files changed: +74 -2 lines changed

Lib/test/test_codeccallbacks.py

Lines changed: 52 additions & 0 deletions

@@ -1044,6 +1044,58 @@ def mutating(exc):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    # issue32583
+    def test_crashing_decode_handler(self):
+        # Generate one character more than the reserved space can hold,
+        # so that a debug build fails reliably.
+        def forward_shorter_than_end(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                # size one character, 0 < forward < exc.end
+                return ('\ufffd', exc.start+1)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error(
+            "test.forward_shorter_than_end", forward_shorter_than_end)
+
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
+                'utf-16-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
+                'utf-16-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
+                'utf-32-le', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+        self.assertEqual(
+            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
+                'utf-32-be', 'test.forward_shorter_than_end'),
+            '\ufffd\ufffd\ufffd\u1111\x00'
+        )
+
+        def replace_with_long(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                exc.object = b"\x00" * 8
+                return ('\ufffd', exc.start)
+            else:
+                raise TypeError("don't know how to handle %r" % exc)
+        codecs.register_error("test.replace_with_long", replace_with_long)
+
+        self.assertEqual(
+            b'\x00'.decode('utf-16', 'test.replace_with_long'),
+            '\ufffd\x00\x00\x00\x00'
+        )
+        self.assertEqual(
+            b'\x00'.decode('utf-32', 'test.replace_with_long'),
+            '\ufffd\x00\x00'
+        )
+
+
     def test_fake_error_class(self):
         handlers = [
             codecs.strict_errors,
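
A note on the protocol these tests exercise: an error handler registered with codecs.register_error receives the UnicodeDecodeError instance and must return a (replacement string, resume position) tuple; it may also rebind exc.object, which is why the expected outputs above do not match the raw inputs. A minimal runnable example (the handler and registry names are invented for illustration):

import codecs

def skip_one_extra(exc):
    # Replace the offending byte with '?' and resume decoding two bytes
    # past the start of the error range (skipping one extra byte).
    if isinstance(exc, UnicodeDecodeError):
        return ('?', exc.start + 2)
    raise TypeError("don't know how to handle %r" % exc)

codecs.register_error("example.skip_one_extra", skip_one_extra)
print(b'\xff\xfeA\x00'.decode('ascii', 'example.skip_one_extra'))  # '?A\x00'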
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+Fix a possible crash in builtin Unicode decoders caused by out-of-bounds
+writes when using customized decode error handlers.

Objects/unicodeobject.c

Lines changed: 20 additions & 2 deletions

@@ -4429,7 +4429,10 @@ unicode_decode_call_errorhandler_writer(
     Py_ssize_t insize;
     Py_ssize_t newpos;
     Py_ssize_t replen;
+    Py_ssize_t remain;
     PyObject *inputobj = NULL;
+    int need_to_grow = 0;
+    const char *new_inptr;
 
     if (*errorHandler == NULL) {
         *errorHandler = PyCodec_LookupError(errors);
@@ -4463,6 +4466,7 @@ unicode_decode_call_errorhandler_writer(
     if (!PyBytes_Check(inputobj)) {
         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
     }
+    remain = *inend - *input - *endinpos;
     *input = PyBytes_AS_STRING(inputobj);
     insize = PyBytes_GET_SIZE(inputobj);
     *inend = *input + insize;
@@ -4482,6 +4486,19 @@ unicode_decode_call_errorhandler_writer(
     replen = PyUnicode_GET_LENGTH(repunicode);
     if (replen > 1) {
         writer->min_length += replen - 1;
+        need_to_grow = 1;
+    }
+    new_inptr = *input + newpos;
+    if (*inend - new_inptr > remain) {
+        /* We don't know the decoding algorithm here, so we make the worst
+           assumption: one byte decodes to one Unicode character.  If a
+           byte could decode to more than one character, the decoder could
+           still write out of bounds; is that possible for the algorithms
+           using this function? */
+        writer->min_length += *inend - new_inptr - remain;
+        need_to_grow = 1;
+    }
+    if (need_to_grow) {
         writer->overallocate = 1;
         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@@ -4491,7 +4508,7 @@ unicode_decode_call_errorhandler_writer(
         goto onError;
 
     *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
 
     /* we made it! */
     Py_XDECREF(restuple);
@@ -5663,7 +5680,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
 
     /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally.  The error handler takes care of
+       resizing when needed. */
     _PyUnicodeWriter_Init(&writer);
     writer.min_length = (e - q + 1) / 2;
     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
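
To restate the sizing rule the C change implements: the writer's minimum length must grow when the replacement string is longer than one character, and also when more input remains to decode after the handler ran than before it ran (because the handler rewound the position or swapped in a larger exc.object), assuming the worst case of one output character per remaining byte. A plain-Python sketch of that accounting (the function and parameter names are invented for illustration):

def extra_min_length(insize, endinpos, newpos, new_insize=None):
    # Bytes still pending before the handler ran.
    remain = insize - endinpos
    # Bytes pending after it ran; the handler may have replaced
    # exc.object with a bytes object of a different size.
    if new_insize is None:
        new_insize = insize
    left_after = new_insize - newpos
    # Worst case: one output character per pending input byte.
    return max(0, left_after - remain)

# A handler rewinds into an error spanning bytes [4, 8) of an 8-byte
# input and resumes at byte 5: three extra characters must fit.
print(extra_min_length(insize=8, endinpos=8, newpos=5))                # 3
# A handler replaces a 1-byte input with 8 bytes and resumes at byte 0.
print(extra_min_length(insize=1, endinpos=1, newpos=0, new_insize=8))  # 8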
