Skip to content

Commit 22eb689

Browse files
authored
bpo-37388: Development mode check encoding and errors (GH-14341)
In development mode and in debug builds, the encoding and errors arguments are now checked on string encoding and decoding operations. Examples: open(), str.encode() and bytes.decode(). By default, for best performance, the errors argument is only checked at the first encoding/decoding error, and the encoding argument is sometimes ignored for empty strings.
1 parent e1a63c4 commit 22eb689

File tree

10 files changed

+315
-6
lines changed

10 files changed

+315
-6
lines changed

Doc/library/stdtypes.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,9 +1559,16 @@ expression support in the :mod:`re` module).
15591559
:func:`codecs.register_error`, see section :ref:`error-handlers`. For a
15601560
list of possible encodings, see section :ref:`standard-encodings`.
15611561

1562+
By default, the *errors* argument is not checked for best performance, but
1563+
only used at the first encoding error. Enable the development mode
1564+
(:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
1565+
15621566
.. versionchanged:: 3.1
15631567
Support for keyword arguments added.
15641568

1569+
.. versionchanged:: 3.9
1570+
The *errors* argument is now checked in development mode and in debug builds.
1571+
15651572

15661573
.. method:: str.endswith(suffix[, start[, end]])
15671574

@@ -2575,6 +2582,10 @@ arbitrary binary data.
25752582
:func:`codecs.register_error`, see section :ref:`error-handlers`. For a
25762583
list of possible encodings, see section :ref:`standard-encodings`.
25772584

2585+
By default, the *errors* argument is not checked for best performance, but
2586+
only used at the first decoding error. Enable the development mode
2587+
(:option:`-X` ``dev`` option), or use a debug build, to check *errors*.
2588+
25782589
.. note::
25792590

25802591
Passing the *encoding* argument to :class:`str` allows decoding any
@@ -2584,6 +2595,9 @@ arbitrary binary data.
25842595
.. versionchanged:: 3.1
25852596
Added support for keyword arguments.
25862597

2598+
.. versionchanged:: 3.9
2599+
The *errors* argument is now checked in development mode and in debug builds.
2600+
25872601

25882602
.. method:: bytes.endswith(suffix[, start[, end]])
25892603
bytearray.endswith(suffix[, start[, end]])

Doc/using/cmdline.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,9 @@ Miscellaneous options
429429
not be more verbose than the default if the code is correct: new warnings
430430
are only emitted when an issue is detected. Effect of the developer mode:
431431

432+
* Check *encoding* and *errors* arguments on string encoding and decoding
433+
operations. Examples: :func:`open`, :meth:`str.encode` and
434+
:meth:`bytes.decode`.
432435
* Add ``default`` warning filter, as :option:`-W` ``default``.
433436
* Install debug hooks on memory allocators: see the
434437
:c:func:`PyMem_SetupDebugHooks` C function.
@@ -469,6 +472,10 @@ Miscellaneous options
469472
The ``-X pycache_prefix`` option. The ``-X dev`` option now logs
470473
``close()`` exceptions in :class:`io.IOBase` destructor.
471474

475+
.. versionchanged:: 3.9
476+
The ``-X dev`` option now checks *encoding* and *errors* arguments on
477+
string encoding and decoding operations.
478+
472479

473480
Options you shouldn't use
474481
~~~~~~~~~~~~~~~~~~~~~~~~~

Doc/whatsnew/3.9.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,15 @@ Other Language Changes
8484
this case.
8585
(Contributed by Victor Stinner in :issue:`20443`.)
8686

87+
* In development mode and in debug build, *encoding* and *errors* arguments are
88+
now checked on string encoding and decoding operations. Examples:
89+
:func:`open`, :meth:`str.encode` and :meth:`bytes.decode`.
90+
91+
By default, for best performance, the *errors* argument is only checked at
92+
the first encoding/decoding error, and the *encoding* argument is sometimes
93+
ignored for empty strings.
94+
(Contributed by Victor Stinner in :issue:`37388`.)
95+
8796

8897
New Modules
8998
===========

Lib/_pyio.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
# Does io.IOBase finalizer log the exception if the close() method fails?
3737
# The exception is ignored silently by default in release build.
3838
_IOBASE_EMITS_UNRAISABLE = (hasattr(sys, "gettotalrefcount") or sys.flags.dev_mode)
39+
# Does open() check its 'errors' argument?
40+
_CHECK_ERRORS = _IOBASE_EMITS_UNRAISABLE
3941

4042

4143
def open(file, mode="r", buffering=-1, encoding=None, errors=None,
@@ -2022,6 +2024,8 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
20222024
else:
20232025
if not isinstance(errors, str):
20242026
raise ValueError("invalid errors: %r" % errors)
2027+
if _CHECK_ERRORS:
2028+
codecs.lookup_error(errors)
20252029

20262030
self._buffer = buffer
20272031
self._decoded_chars = '' # buffer for text returned from decoder

Lib/test/test_bytes.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,14 @@
1212
import functools
1313
import pickle
1414
import tempfile
15+
import textwrap
1516
import unittest
1617

1718
import test.support
1819
import test.string_tests
1920
import test.list_tests
2021
from test.support import bigaddrspacetest, MAX_Py_ssize_t
22+
from test.support.script_helper import assert_python_failure
2123

2224

2325
if sys.flags.bytes_warning:
@@ -315,6 +317,62 @@ def test_decode(self):
315317
# Default encoding is utf-8
316318
self.assertEqual(self.type2test(b'\xe2\x98\x83').decode(), '\u2603')
317319

320+
def test_check_encoding_errors(self):
    # bpo-37388: bytes(str) and bytes.decode() must check encoding
    # and errors arguments in dev mode
    invalid = 'Boom, Shaka Laka, Boom!'
    encodings = ('ascii', 'utf8', 'latin1')
    # The checks run in a child interpreter started with -X dev.  Each
    # check that fails to raise LookupError exits with its own code
    # (21-25); the script exits with 10 only if every check passed.
    code = textwrap.dedent(f'''
        import sys
        type2test = {self.type2test.__name__}
        encodings = {encodings!r}

        for data in ('', 'short string'):
            try:
                type2test(data, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)

            for encoding in encodings:
                try:
                    type2test(data, encoding=encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(22)

        for data in (b'', b'short string'):
            data = type2test(data)
            try:
                data.decode(encoding={invalid!r})
            except LookupError:
                # Expected: keep going so that the errors checks below
                # are executed as well.
                pass
            else:
                sys.exit(23)

            try:
                data.decode(errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

            for encoding in encodings:
                try:
                    data.decode(encoding=encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(25)

        sys.exit(10)
    ''')
    proc = assert_python_failure('-X', 'dev', '-c', code)
    self.assertEqual(proc.rc, 10, proc)
375+
318376
def test_from_int(self):
319377
b = self.type2test(0)
320378
self.assertEqual(b, self.type2test())

Lib/test/test_io.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import signal
3030
import sys
3131
import sysconfig
32+
import textwrap
3233
import threading
3334
import time
3435
import unittest
@@ -37,7 +38,8 @@
3738
from collections import deque, UserList
3839
from itertools import cycle, count
3940
from test import support
40-
from test.support.script_helper import assert_python_ok, run_python_until_end
41+
from test.support.script_helper import (
42+
assert_python_ok, assert_python_failure, run_python_until_end)
4143
from test.support import FakePath
4244

4345
import codecs
@@ -4130,6 +4132,51 @@ def test_open_allargs(self):
41304132
# there used to be a buffer overflow in the parser for rawmode
41314133
self.assertRaises(ValueError, self.open, support.TESTFN, 'rwax+')
41324134

4135+
def test_check_encoding_errors(self):
    # bpo-37388: in dev mode, open() and TextIOWrapper must reject an
    # unknown encoding name and an unknown error handler name.
    invalid = 'Boom, Shaka Laka, Boom!'
    filename = __file__
    mod = self.io.__name__
    # Child script: every check that does not raise LookupError exits with
    # its own code (21-24); exit code 10 means that all checks passed.
    script = textwrap.dedent(f'''
        import sys
        from {mod} import open, TextIOWrapper

        try:
            open({filename!r}, encoding={invalid!r})
        except LookupError:
            pass
        else:
            sys.exit(21)

        try:
            open({filename!r}, errors={invalid!r})
        except LookupError:
            pass
        else:
            sys.exit(22)

        fp = open({filename!r}, "rb")
        with fp:
            try:
                TextIOWrapper(fp, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(23)

            try:
                TextIOWrapper(fp, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

        sys.exit(10)
    ''')
    args = ('-X', 'dev', '-c', script)
    result = assert_python_failure(*args)
    self.assertEqual(result.rc, 10, result)
4179+
41334180

41344181
class CMiscIOTest(MiscIOTest):
41354182
io = io

Lib/test/test_unicode.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
import operator
1212
import struct
1313
import sys
14+
import textwrap
1415
import unittest
1516
import warnings
1617
from test import support, string_tests
18+
from test.support.script_helper import assert_python_failure
1719

1820
# Error handling (bad decoder return)
1921
def search_function(encoding):
@@ -2436,6 +2438,66 @@ def test_free_after_iterating(self):
24362438
support.check_free_after_iterating(self, iter, str)
24372439
support.check_free_after_iterating(self, reversed, str)
24382440

2441+
def test_check_encoding_errors(self):
    # bpo-37388: str(bytes) and str.encode() must check encoding and errors
    # arguments in dev mode
    encodings = ('ascii', 'utf8', 'latin1')
    invalid = 'Boom, Shaka Laka, Boom!'
    # The checks run in a child interpreter started with -X dev.  Each
    # check that fails to raise LookupError exits with a unique code
    # (21-26) so that a failure identifies the offending call; the script
    # exits with 10 only if every check passed.
    code = textwrap.dedent(f'''
        import sys
        encodings = {encodings!r}

        for data in (b'', b'short string'):
            try:
                str(data, encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(21)

            try:
                str(data, errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(22)

            for encoding in encodings:
                try:
                    str(data, encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

        for data in ('', 'short string'):
            try:
                data.encode(encoding={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(24)

            try:
                data.encode(errors={invalid!r})
            except LookupError:
                pass
            else:
                sys.exit(25)

            for encoding in encodings:
                try:
                    data.encode(encoding, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(26)

        sys.exit(10)
    ''')
    proc = assert_python_failure('-X', 'dev', '-c', code)
    self.assertEqual(proc.rc, 10, proc)
2500+
24392501

24402502
class CAPITest(unittest.TestCase):
24412503

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
In development mode and in debug build, *encoding* and *errors* arguments are
2+
now checked on string encoding and decoding operations. Examples: :func:`open`,
3+
:meth:`str.encode` and :meth:`bytes.decode`.
4+
5+
By default, for best performance, the *errors* argument is only checked at the
6+
first encoding/decoding error, and the *encoding* argument is sometimes ignored
7+
for empty strings.

Modules/_io/textio.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -988,6 +988,46 @@ _textiowrapper_fix_encoder_state(textio *self)
988988
return 0;
989989
}
990990

991+
static int
992+
io_check_errors(PyObject *errors)
993+
{
994+
assert(errors != NULL && errors != Py_None);
995+
996+
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
997+
#ifndef Py_DEBUG
998+
/* In release mode, only check in development mode (-X dev) */
999+
if (!interp->config.dev_mode) {
1000+
return 0;
1001+
}
1002+
#else
1003+
/* Always check in debug mode */
1004+
#endif
1005+
1006+
/* Avoid calling PyCodec_LookupError() before the codec registry is ready:
1007+
before_PyUnicode_InitEncodings() is called. */
1008+
if (!interp->fs_codec.encoding) {
1009+
return 0;
1010+
}
1011+
1012+
Py_ssize_t name_length;
1013+
const char *name = PyUnicode_AsUTF8AndSize(errors, &name_length);
1014+
if (name == NULL) {
1015+
return -1;
1016+
}
1017+
if (strlen(name) != (size_t)name_length) {
1018+
PyErr_SetString(PyExc_ValueError, "embedded null character in errors");
1019+
return -1;
1020+
}
1021+
PyObject *handler = PyCodec_LookupError(name);
1022+
if (handler != NULL) {
1023+
Py_DECREF(handler);
1024+
return 0;
1025+
}
1026+
return -1;
1027+
}
1028+
1029+
1030+
9911031
/*[clinic input]
9921032
_io.TextIOWrapper.__init__
9931033
buffer: object
@@ -1057,6 +1097,9 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
10571097
errors->ob_type->tp_name);
10581098
return -1;
10591099
}
1100+
else if (io_check_errors(errors)) {
1101+
return -1;
1102+
}
10601103

10611104
if (validate_newline(newline) < 0) {
10621105
return -1;

0 commit comments

Comments
 (0)