Skip to content

Commit 82458b6

Browse files
authored
bpo-42236: Enhance _locale._get_locale_encoding() (GH-23083)
* Rename _Py_GetLocaleEncoding() to _Py_GetLocaleEncodingObject().
* Add _Py_GetLocaleEncoding(), which returns a wchar_t* string, to share code between _Py_GetLocaleEncodingObject() and config_get_locale_encoding().
* _Py_GetLocaleEncodingObject() now decodes nl_langinfo(CODESET) from the current locale encoding with surrogateescape, rather than using UTF-8.
1 parent 1f7dfb2 commit 82458b6

File tree

5 files changed

+76
-52
lines changed

5 files changed

+76
-52
lines changed

Include/internal/pycore_fileutils.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
5050

5151
PyAPI_FUNC(void) _Py_closerange(int first, int last);
5252

53-
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void);
53+
PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg);
54+
PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
5455

5556
#ifdef __cplusplus
5657
}

Modules/_io/textio.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1155,7 +1155,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
11551155
}
11561156
}
11571157
if (encoding == NULL && self->encoding == NULL) {
1158-
self->encoding = _Py_GetLocaleEncoding();
1158+
self->encoding = _Py_GetLocaleEncodingObject();
11591159
if (self->encoding == NULL) {
11601160
goto error;
11611161
}

Modules/_localemodule.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,7 @@ static PyObject *
783783
_locale__get_locale_encoding_impl(PyObject *module)
784784
/*[clinic end generated code: output=e8e2f6f6f184591a input=513d9961d2f45c76]*/
785785
{
786-
return _Py_GetLocaleEncoding();
786+
return _Py_GetLocaleEncodingObject();
787787
}
788788

789789

Python/fileutils.c

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -821,43 +821,87 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
821821
}
822822

823823

824-
// Get the current locale encoding: locale.getpreferredencoding(False).
824+
// Get the current locale encoding name:
825+
//
826+
// - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android)
827+
// - Return "UTF-8" if the UTF-8 Mode is enabled
828+
// - On Windows, return the ANSI code page (ex: "cp1250")
829+
// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string
830+
// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS).
831+
// - Otherwise, return nl_langinfo(CODESET).
832+
//
833+
// Return NULL and set errmsg to an error message
834+
// if nl_langinfo(CODESET) fails.
835+
//
836+
// Return NULL and set errmsg to NULL on memory allocation failure.
837+
//
825838
// See also config_get_locale_encoding()
826-
PyObject *
827-
_Py_GetLocaleEncoding(void)
839+
wchar_t*
840+
_Py_GetLocaleEncoding(const char **errmsg)
828841
{
842+
*errmsg = NULL;
829843
#ifdef _Py_FORCE_UTF8_LOCALE
830844
// On Android langinfo.h and CODESET are missing,
831845
// and UTF-8 is always used in mbstowcs() and wcstombs().
832-
return PyUnicode_FromString("UTF-8");
846+
return _PyMem_RawWcsdup(L"UTF-8");
833847
#else
834848
const PyPreConfig *preconfig = &_PyRuntime.preconfig;
835849
if (preconfig->utf8_mode) {
836-
return PyUnicode_FromString("UTF-8");
850+
return _PyMem_RawWcsdup(L"UTF-8");
837851
}
838852

839-
#if defined(MS_WINDOWS)
840-
return PyUnicode_FromFormat("cp%u", GetACP());
853+
#ifdef MS_WINDOWS
854+
wchar_t encoding[23];
855+
unsigned int ansi_codepage = GetACP();
856+
swprintf(encoding, Py_ARRAY_LENGTH(encoding), L"cp%u", ansi_codepage);
857+
encoding[Py_ARRAY_LENGTH(encoding) - 1] = 0;
858+
return _PyMem_RawWcsdup(encoding);
841859
#else
842860
const char *encoding = nl_langinfo(CODESET);
843861
if (!encoding || encoding[0] == '\0') {
844862
#ifdef _Py_FORCE_UTF8_FS_ENCODING
845863
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
846864
// not supported. Default to UTF-8 in that case, because UTF-8 is the
847865
// default charset on macOS.
848-
encoding = "UTF-8";
866+
return _PyMem_RawWcsdup(L"UTF-8");
849867
#else
850-
PyErr_SetString(PyExc_ValueError,
851-
"failed to get the locale encoding: "
852-
"nl_langinfo(CODESET) returns an empty string");
868+
*errmsg = "failed to get the locale encoding: "
869+
"nl_langinfo(CODESET) returns an empty string";
853870
return NULL;
854871
#endif
855872
}
856-
// Decode from UTF-8
857-
return PyUnicode_FromString(encoding);
858-
#endif // !CODESET
859873

860-
#endif
874+
wchar_t *wstr;
875+
int res = decode_current_locale(encoding, &wstr, NULL,
876+
errmsg, _Py_ERROR_SURROGATEESCAPE);
877+
if (res < 0) {
878+
return NULL;
879+
}
880+
return wstr;
881+
#endif // !MS_WINDOWS
882+
883+
#endif // !_Py_FORCE_UTF8_LOCALE
884+
}
885+
886+
887+
PyObject *
888+
_Py_GetLocaleEncodingObject(void)
889+
{
890+
const char *errmsg;
891+
wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
892+
if (encoding == NULL) {
893+
if (errmsg != NULL) {
894+
PyErr_SetString(PyExc_ValueError, errmsg);
895+
}
896+
else {
897+
PyErr_NoMemory();
898+
}
899+
return NULL;
900+
}
901+
902+
PyObject *str = PyUnicode_FromWideChar(encoding, -1);
903+
PyMem_RawFree(encoding);
904+
return str;
861905
}
862906

863907

Python/initconfig.c

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111

1212
#include "osdefs.h" // DELIM
1313
#include <locale.h> // setlocale()
14-
#ifdef HAVE_LANGINFO_H
15-
# include <langinfo.h> // nl_langinfo(CODESET)
16-
#endif
1714
#if defined(MS_WINDOWS) || defined(__CYGWIN__)
18-
# include <windows.h> // GetACP()
1915
# ifdef HAVE_IO_H
2016
# include <io.h>
2117
# endif
@@ -1497,41 +1493,24 @@ config_get_stdio_errors(const PyPreConfig *preconfig)
14971493
}
14981494

14991495

1500-
// See also _Py_GetLocaleEncoding() and config_get_fs_encoding()
1496+
// See also config_get_fs_encoding()
15011497
static PyStatus
15021498
config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
15031499
wchar_t **locale_encoding)
15041500
{
1505-
#ifdef _Py_FORCE_UTF8_LOCALE
1506-
return PyConfig_SetString(config, locale_encoding, L"utf-8");
1507-
#else
1508-
if (preconfig->utf8_mode) {
1509-
return PyConfig_SetString(config, locale_encoding, L"utf-8");
1510-
}
1511-
1512-
#ifdef MS_WINDOWS
1513-
char encoding[20];
1514-
PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
1515-
return PyConfig_SetBytesString(config, locale_encoding, encoding);
1516-
#else
1517-
const char *encoding = nl_langinfo(CODESET);
1518-
if (!encoding || encoding[0] == '\0') {
1519-
#ifdef _Py_FORCE_UTF8_FS_ENCODING
1520-
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
1521-
// not supported. Default to UTF-8 in that case, because UTF-8 is the
1522-
// default charset on macOS.
1523-
encoding = "UTF-8";
1524-
#else
1525-
return _PyStatus_ERR("failed to get the locale encoding: "
1526-
"nl_langinfo(CODESET) returns an empty string");
1527-
#endif
1501+
const char *errmsg;
1502+
wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg);
1503+
if (encoding == NULL) {
1504+
if (errmsg != NULL) {
1505+
return _PyStatus_ERR(errmsg);
1506+
}
1507+
else {
1508+
return _PyStatus_NO_MEMORY();
1509+
}
15281510
}
1529-
/* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
1530-
return CONFIG_SET_BYTES_STR(config,
1531-
locale_encoding, encoding,
1532-
"nl_langinfo(CODESET)");
1533-
#endif // !MS_WINDOWS
1534-
#endif // !_Py_FORCE_UTF8_LOCALE
1511+
PyStatus status = PyConfig_SetString(config, locale_encoding, encoding);
1512+
PyMem_RawFree(encoding);
1513+
return status;
15351514
}
15361515

15371516

0 commit comments

Comments
 (0)