Skip to content

Commit 39aa983

Browse files
bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944)
They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior.
1 parent d413c50 commit 39aa983

File tree

7 files changed

+116
-35
lines changed

7 files changed

+116
-35
lines changed

Include/cpython/unicodeobject.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
796796
string. */
797797
);
798798

799+
/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
800+
801+
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
802+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
803+
const char *string, /* Raw-Unicode-Escape encoded string */
804+
Py_ssize_t length, /* size of string */
805+
const char *errors, /* error handling */
806+
Py_ssize_t *consumed /* bytes consumed */
807+
);
808+
799809
/* --- Latin-1 Codecs ----------------------------------------------------- */
800810

801811
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(

Lib/encodings/raw_unicode_escape.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2121
def encode(self, input, final=False):
2222
return codecs.raw_unicode_escape_encode(input, self.errors)[0]
2323

24-
class IncrementalDecoder(codecs.IncrementalDecoder):
25-
def decode(self, input, final=False):
26-
return codecs.raw_unicode_escape_decode(input, self.errors)[0]
24+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
25+
def _buffer_decode(self, input, errors, final):
26+
return codecs.raw_unicode_escape_decode(input, errors, final)
2727

2828
class StreamWriter(Codec,codecs.StreamWriter):
2929
pass
3030

3131
class StreamReader(Codec,codecs.StreamReader):
32-
pass
32+
def decode(self, input, errors='strict'):
33+
return codecs.raw_unicode_escape_decode(input, errors, False)
3334

3435
### encodings module API
3536

Lib/test/test_codecs.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2483,7 +2483,11 @@ def test_partial(self):
24832483
]
24842484
)
24852485

2486-
class RawUnicodeEscapeTest(unittest.TestCase):
2486+
class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
2487+
encoding = "raw-unicode-escape"
2488+
2489+
test_lone_surrogates = None
2490+
24872491
def test_empty(self):
24882492
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
24892493
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@@ -2532,6 +2536,35 @@ def test_decode_errors(self):
25322536
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
25332537
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
25342538

2539+
def test_partial(self):
2540+
self.check_partial(
2541+
"\x00\t\n\r\\\xff\uffff\U00010000",
2542+
[
2543+
'\x00',
2544+
'\x00\t',
2545+
'\x00\t\n',
2546+
'\x00\t\n\r',
2547+
'\x00\t\n\r',
2548+
'\x00\t\n\r\\\xff',
2549+
'\x00\t\n\r\\\xff',
2550+
'\x00\t\n\r\\\xff',
2551+
'\x00\t\n\r\\\xff',
2552+
'\x00\t\n\r\\\xff',
2553+
'\x00\t\n\r\\\xff',
2554+
'\x00\t\n\r\\\xff\uffff',
2555+
'\x00\t\n\r\\\xff\uffff',
2556+
'\x00\t\n\r\\\xff\uffff',
2557+
'\x00\t\n\r\\\xff\uffff',
2558+
'\x00\t\n\r\\\xff\uffff',
2559+
'\x00\t\n\r\\\xff\uffff',
2560+
'\x00\t\n\r\\\xff\uffff',
2561+
'\x00\t\n\r\\\xff\uffff',
2562+
'\x00\t\n\r\\\xff\uffff',
2563+
'\x00\t\n\r\\\xff\uffff',
2564+
'\x00\t\n\r\\\xff\uffff\U00010000',
2565+
]
2566+
)
2567+
25352568

25362569
class EscapeEncodeTest(unittest.TestCase):
25372570

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.
2+
Previously they failed if an escape sequence was split between input chunks.

Modules/_codecsmodule.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
509509
_codecs.raw_unicode_escape_decode
510510
data: Py_buffer(accept={str, buffer})
511511
errors: str(accept={str, NoneType}) = None
512+
final: bool(accept={int}) = True
512513
/
513514
[clinic start generated code]*/
514515

515516
static PyObject *
516517
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
517-
const char *errors)
518-
/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
518+
const char *errors, int final)
519+
/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
519520
{
520-
PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
521-
errors);
522-
return codec_tuple(decoded, data->len);
521+
Py_ssize_t consumed = data->len;
522+
PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
523+
errors,
524+
final ? NULL : &consumed);
525+
return codec_tuple(decoded, consumed);
523526
}
524527

525528
/*[clinic input]

Modules/clinic/_codecsmodule.c.h

Lines changed: 13 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Objects/unicodeobject.c

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
63796379
unsigned char c = (unsigned char) *s++;
63806380
Py_UCS4 ch;
63816381
int count;
6382-
Py_ssize_t startinpos;
6383-
Py_ssize_t endinpos;
63846382
const char *message;
63856383

63866384
#define WRITE_ASCII_CHAR(ch) \
@@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
64076405
continue;
64086406
}
64096407

6410-
startinpos = s - starts - 1;
6408+
Py_ssize_t startinpos = s - starts - 1;
64116409
/* \ - Escapes */
64126410
if (s >= end) {
64136411
message = "\\ at end of string";
@@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
65546552
*consumed = startinpos;
65556553
break;
65566554
}
6557-
error:
6558-
endinpos = s-starts;
6555+
error:;
6556+
Py_ssize_t endinpos = s-starts;
65596557
writer.min_length = end - s + writer.pos;
65606558
if (unicode_decode_call_errorhandler_writer(
65616559
errors, &errorHandler,
@@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
67356733
/* --- Raw Unicode Escape Codec ------------------------------------------- */
67366734

67376735
PyObject *
6738-
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6739-
Py_ssize_t size,
6740-
const char *errors)
6736+
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6737+
Py_ssize_t size,
6738+
const char *errors,
6739+
Py_ssize_t *consumed)
67416740
{
67426741
const char *starts = s;
67436742
_PyUnicodeWriter writer;
@@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
67466745
PyObject *exc = NULL;
67476746

67486747
if (size == 0) {
6748+
if (consumed) {
6749+
*consumed = 0;
6750+
}
67496751
_Py_RETURN_UNICODE_EMPTY();
67506752
}
67516753

@@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
67646766
unsigned char c = (unsigned char) *s++;
67656767
Py_UCS4 ch;
67666768
int count;
6767-
Py_ssize_t startinpos;
6768-
Py_ssize_t endinpos;
67696769
const char *message;
67706770

67716771
#define WRITE_CHAR(ch) \
@@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
67806780
} while(0)
67816781

67826782
/* Non-escape characters are interpreted as Unicode ordinals */
6783-
if (c != '\\' || s >= end) {
6783+
if (c != '\\' || (s >= end && !consumed)) {
67846784
WRITE_CHAR(c);
67856785
continue;
67866786
}
67876787

6788+
Py_ssize_t startinpos = s - starts - 1;
6789+
/* \ - Escapes */
6790+
if (s >= end) {
6791+
assert(consumed);
6792+
// Set message to silence a compiler warning.
6793+
// Actually it is never used.
6794+
message = "\\ at end of string";
6795+
goto incomplete;
6796+
}
6797+
67886798
c = (unsigned char) *s++;
67896799
if (c == 'u') {
67906800
count = 4;
@@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
68006810
WRITE_CHAR(c);
68016811
continue;
68026812
}
6803-
startinpos = s - starts - 2;
68046813

68056814
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6806-
for (ch = 0; count && s < end; ++s, --count) {
6815+
for (ch = 0; count; ++s, --count) {
6816+
if (s >= end) {
6817+
goto incomplete;
6818+
}
68076819
c = (unsigned char)*s;
68086820
ch <<= 4;
68096821
if (c >= '0' && c <= '9') {
@@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
68166828
ch += c - ('A' - 10);
68176829
}
68186830
else {
6819-
break;
6831+
goto error;
68206832
}
68216833
}
6822-
if (!count) {
6823-
if (ch <= MAX_UNICODE) {
6824-
WRITE_CHAR(ch);
6825-
continue;
6826-
}
6834+
if (ch > MAX_UNICODE) {
68276835
message = "\\Uxxxxxxxx out of range";
6836+
goto error;
68286837
}
6838+
WRITE_CHAR(ch);
6839+
continue;
68296840

6830-
endinpos = s-starts;
6841+
incomplete:
6842+
if (consumed) {
6843+
*consumed = startinpos;
6844+
break;
6845+
}
6846+
error:;
6847+
Py_ssize_t endinpos = s-starts;
68316848
writer.min_length = end - s + writer.pos;
68326849
if (unicode_decode_call_errorhandler_writer(
68336850
errors, &errorHandler,
@@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
68496866
Py_XDECREF(errorHandler);
68506867
Py_XDECREF(exc);
68516868
return NULL;
6869+
}
68526870

6871+
PyObject *
6872+
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6873+
Py_ssize_t size,
6874+
const char *errors)
6875+
{
6876+
return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
68536877
}
68546878

68556879

0 commit comments

Comments
 (0)