Skip to content

Commit b92c159

Browse files
authored
[3.6] bpo-32555: Fix locale encodings (#5193)
On FreeBSD and Solaris, os.strerror() now always decodes the byte string from the current locale encoding, rather than using ASCII/surrogateescape in some cases. Changes: * Add _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx(), which have an additional current_locale parameter. * PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() and PyUnicode_EncodeLocale() now always use the current locale encoding, instead of using Py_DecodeLocale()/Py_EncodeLocale(). * Document the encoding in the Py_DecodeLocale() and Py_EncodeLocale() documentation. * Add USE_FORCE_ASCII define to not define decode_ascii_surrogateescape() on Android.
1 parent 5f959c4 commit b92c159

File tree

6 files changed

+227
-105
lines changed

6 files changed

+227
-105
lines changed

Doc/c-api/sys.rst

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,18 @@ Operating System Utilities
6666
surrogate character, escape the bytes using the surrogateescape error
6767
handler instead of decoding them.
6868
69+
Encoding, highest priority to lowest priority:
70+
71+
* ``UTF-8`` on macOS and Android;
72+
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
73+
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
74+
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
75+
``ISO-8859-1`` encoding.
76+
* the current locale encoding (``LC_CTYPE`` locale).
77+
6978
Return a pointer to a newly allocated wide character string, use
7079
:c:func:`PyMem_RawFree` to free the memory. If size is not ``NULL``, write
71-
the number of wide characters excluding the null character into ``*size``
80+
the number of wide characters excluding the null character into ``*size``.
7281
7382
Return ``NULL`` on decoding error or memory allocation error. If *size* is
7483
not ``NULL``, ``*size`` is set to ``(size_t)-1`` on memory error or set to
@@ -94,6 +103,15 @@ Operating System Utilities
94103
:ref:`surrogateescape error handler <surrogateescape>`: surrogate characters
95104
in the range U+DC80..U+DCFF are converted to bytes 0x80..0xFF.
96105
106+
Encoding, highest priority to lowest priority:
107+
108+
* ``UTF-8`` on macOS and Android;
109+
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
110+
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
111+
and :c:func:`mbstowcs` and :c:func:`wcstombs` functions use the
112+
``ISO-8859-1`` encoding.
113+
* the current locale encoding.
114+
97115
Return a pointer to a newly allocated byte string, use :c:func:`PyMem_Free`
98116
to free the memory. Return ``NULL`` on encoding error or memory allocation
99117
error

Doc/c-api/unicode.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,12 @@ system.
773773
774774
.. versionadded:: 3.3
775775
776+
.. versionchanged:: 3.6.5
777+
The function now also uses the current locale encoding for the
778+
``surrogateescape`` error handler. Previously, :c:func:`Py_DecodeLocale`
779+
was used for ``surrogateescape``, and the current locale encoding was
780+
used for ``strict``.
781+
776782
777783
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors)
778784
@@ -800,6 +806,12 @@ system.
800806
801807
.. versionadded:: 3.3
802808
809+
.. versionchanged:: 3.6.5
810+
The function now also uses the current locale encoding for the
811+
``surrogateescape`` error handler. Previously, :c:func:`Py_EncodeLocale`
812+
was used for ``surrogateescape``, and the current locale encoding was
813+
used for ``strict``.
814+
803815
804816
File System Encoding
805817
""""""""""""""""""""

Include/fileutils.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ PyAPI_FUNC(char*) Py_EncodeLocale(
1717

1818
#ifndef Py_LIMITED_API
1919

20+
PyAPI_FUNC(wchar_t *) _Py_DecodeLocaleEx(
21+
const char *arg,
22+
size_t *size,
23+
int current_locale);
24+
25+
PyAPI_FUNC(char*) _Py_EncodeLocaleEx(
26+
const wchar_t *text,
27+
size_t *error_pos,
28+
int current_locale);
29+
2030
PyAPI_FUNC(PyObject *) _Py_device_encoding(int);
2131

2232
#ifdef MS_WINDOWS
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
On FreeBSD and Solaris, os.strerror() now always decodes the byte string from
2+
the current locale encoding, rather than using ASCII/surrogateescape in some
3+
cases.

Objects/unicodeobject.c

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3439,8 +3439,9 @@ locale_error_handler(const char *errors, int *surrogateescape)
34393439
}
34403440
}
34413441

3442-
PyObject *
3443-
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3442+
static PyObject *
3443+
unicode_encode_locale(PyObject *unicode, const char *errors,
3444+
int current_locale)
34443445
{
34453446
Py_ssize_t wlen, wlen2;
34463447
wchar_t *wstr;
@@ -3469,7 +3470,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
34693470
/* "surrogateescape" error handler */
34703471
char *str;
34713472

3472-
str = Py_EncodeLocale(wstr, &error_pos);
3473+
str = _Py_EncodeLocaleEx(wstr, &error_pos, current_locale);
34733474
if (str == NULL) {
34743475
if (error_pos == (size_t)-1) {
34753476
PyErr_NoMemory();
@@ -3549,6 +3550,12 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
35493550
return NULL;
35503551
}
35513552

3553+
PyObject *
3554+
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3555+
{
3556+
return unicode_encode_locale(unicode, errors, 1);
3557+
}
3558+
35523559
PyObject *
35533560
PyUnicode_EncodeFSDefault(PyObject *unicode)
35543561
{
@@ -3571,7 +3578,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
35713578
Py_FileSystemDefaultEncodeErrors);
35723579
}
35733580
else {
3574-
return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3581+
return unicode_encode_locale(unicode,
3582+
Py_FileSystemDefaultEncodeErrors, 0);
35753583
}
35763584
#endif
35773585
}
@@ -3741,9 +3749,9 @@ mbstowcs_errorpos(const char *str, size_t len)
37413749
return 0;
37423750
}
37433751

3744-
PyObject*
3745-
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3746-
const char *errors)
3752+
static PyObject*
3753+
unicode_decode_locale(const char *str, Py_ssize_t len,
3754+
const char *errors, int current_locale)
37473755
{
37483756
wchar_t smallbuf[256];
37493757
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
@@ -3766,7 +3774,7 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
37663774

37673775
if (surrogateescape) {
37683776
/* "surrogateescape" error handler */
3769-
wstr = Py_DecodeLocale(str, &wlen);
3777+
wstr = _Py_DecodeLocaleEx(str, &wlen, current_locale);
37703778
if (wstr == NULL) {
37713779
if (wlen == (size_t)-1)
37723780
PyErr_NoMemory();
@@ -3844,11 +3852,18 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
38443852
return NULL;
38453853
}
38463854

3855+
PyObject*
3856+
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t size,
3857+
const char *errors)
3858+
{
3859+
return unicode_decode_locale(str, size, errors, 1);
3860+
}
3861+
38473862
PyObject*
38483863
PyUnicode_DecodeLocale(const char *str, const char *errors)
38493864
{
38503865
Py_ssize_t size = (Py_ssize_t)strlen(str);
3851-
return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3866+
return unicode_decode_locale(str, size, errors, 1);
38523867
}
38533868

38543869

@@ -3880,7 +3895,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
38803895
Py_FileSystemDefaultEncodeErrors);
38813896
}
38823897
else {
3883-
return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3898+
return unicode_decode_locale(s, size,
3899+
Py_FileSystemDefaultEncodeErrors, 0);
38843900
}
38853901
#endif
38863902
}

0 commit comments

Comments
 (0)