Skip to content

Commit 6a16b18

Browse files
authored
bpo-36297: remove "unicode_internal" codec (GH-12342)
1 parent 6fb544d commit 6a16b18

File tree

12 files changed: 41 additions (+41) and 530 deletions (-530).

Doc/library/codecs.rst

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,16 +1316,10 @@ encodings.
13161316
| | | code actually uses UTF-8 |
13171317
| | | by default. |
13181318
+--------------------+---------+---------------------------+
1319-
| unicode_internal | | Return the internal |
1320-
| | | representation of the |
1321-
| | | operand. Stateful codecs |
1322-
| | | are not supported. |
1323-
| | | |
1324-
| | | .. deprecated:: 3.3 |
1325-
| | | This representation is |
1326-
| | | obsoleted by |
1327-
| | | :pep:`393`. |
1328-
+--------------------+---------+---------------------------+
1319+
1320+
.. versionchanged:: 3.8
1321+
"unicode_internal" codec is removed.
1322+
13291323

13301324
.. _binary-transforms:
13311325

Doc/whatsnew/3.8.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,9 @@ The following features and APIs have been removed from Python 3.8:
573573
* Removed the ``doctype()`` method of :class:`~xml.etree.ElementTree.XMLParser`.
574574
(Contributed by Serhiy Storchaka in :issue:`29209`.)
575575

576+
* "unicode_internal" codec is removed.
577+
(Contributed by Inada Naoki in :issue:`36297`.)
578+
576579

577580
Porting to Python 3.8
578581
=====================

Include/cpython/unicodeobject.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -896,15 +896,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
896896
Py_ssize_t length /* Number of Py_UNICODE chars to encode */
897897
) Py_DEPRECATED(3.3);
898898

899-
/* --- Unicode Internal Codec --------------------------------------------- */
900-
901-
/* Only for internal use in _codecsmodule.c */
902-
PyObject *_PyUnicode_DecodeUnicodeInternal(
903-
const char *string,
904-
Py_ssize_t length,
905-
const char *errors
906-
);
907-
908899
/* --- Latin-1 Codecs ----------------------------------------------------- */
909900

910901
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(

Lib/encodings/unicode_internal.py

Lines changed: 0 additions & 45 deletions
This file was deleted.

Lib/test/test_codeccallbacks.py

Lines changed: 11 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -211,42 +211,6 @@ def test_charmapencode(self):
211211
charmap[ord("?")] = "XYZ" # wrong type in mapping
212212
self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
213213

214-
def test_decodeunicodeinternal(self):
215-
with test.support.check_warnings(('unicode_internal codec has been '
216-
'deprecated', DeprecationWarning)):
217-
self.assertRaises(
218-
UnicodeDecodeError,
219-
b"\x00\x00\x00\x00\x00".decode,
220-
"unicode-internal",
221-
)
222-
if len('\0'.encode('unicode-internal')) == 4:
223-
def handler_unicodeinternal(exc):
224-
if not isinstance(exc, UnicodeDecodeError):
225-
raise TypeError("don't know how to handle %r" % exc)
226-
return ("\x01", 1)
227-
228-
self.assertEqual(
229-
b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
230-
"\u0000"
231-
)
232-
233-
self.assertEqual(
234-
b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
235-
"\u0000\ufffd"
236-
)
237-
238-
self.assertEqual(
239-
b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"),
240-
"\u0000\\x00"
241-
)
242-
243-
codecs.register_error("test.hui", handler_unicodeinternal)
244-
245-
self.assertEqual(
246-
b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
247-
"\u0000\u0001\u0000"
248-
)
249-
250214
def test_callbacks(self):
251215
def handler1(exc):
252216
r = range(exc.start, exc.end)
@@ -794,16 +758,13 @@ def test_badhandlerresults(self):
794758
("ascii", b"\xff"),
795759
("utf-8", b"\xff"),
796760
("utf-7", b"+x-"),
797-
("unicode-internal", b"\x00"),
798761
):
799-
with test.support.check_warnings():
800-
# unicode-internal has been deprecated
801-
self.assertRaises(
802-
TypeError,
803-
bytes.decode,
804-
enc,
805-
"test.badhandler"
806-
)
762+
self.assertRaises(
763+
TypeError,
764+
bytes.decode,
765+
enc,
766+
"test.badhandler"
767+
)
807768

808769
def test_lookup(self):
809770
self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
@@ -1013,7 +974,6 @@ def test_mutatingdecodehandler(self):
1013974
("utf-32", b"\xff"),
1014975
("unicode-escape", b"\\u123g"),
1015976
("raw-unicode-escape", b"\\u123g"),
1016-
("unicode-internal", b"\xff"),
1017977
]
1018978

1019979
def replacing(exc):
@@ -1024,11 +984,9 @@ def replacing(exc):
1024984
raise TypeError("don't know how to handle %r" % exc)
1025985
codecs.register_error("test.replacing", replacing)
1026986

1027-
with test.support.check_warnings():
1028-
# unicode-internal has been deprecated
1029-
for (encoding, data) in baddata:
1030-
with self.assertRaises(TypeError):
1031-
data.decode(encoding, "test.replacing")
987+
for (encoding, data) in baddata:
988+
with self.assertRaises(TypeError):
989+
data.decode(encoding, "test.replacing")
1032990

1033991
def mutating(exc):
1034992
if isinstance(exc, UnicodeDecodeError):
@@ -1039,10 +997,8 @@ def mutating(exc):
1039997
codecs.register_error("test.mutating", mutating)
1040998
# If the decoder doesn't pick up the modified input the following
1041999
# will lead to an endless loop
1042-
with test.support.check_warnings():
1043-
# unicode-internal has been deprecated
1044-
for (encoding, data) in baddata:
1045-
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
1000+
for (encoding, data) in baddata:
1001+
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
10461002

10471003
# issue32583
10481004
def test_crashing_decode_handler(self):

Lib/test/test_codecs.py

Lines changed: 5 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,16 +1239,6 @@ def test_errors(self):
12391239
self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
12401240

12411241

1242-
class RecodingTest(unittest.TestCase):
1243-
def test_recoding(self):
1244-
f = io.BytesIO()
1245-
with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
1246-
f2.write("a")
1247-
# Python used to crash on this at exit because of a refcount
1248-
# bug in _codecsmodule.c
1249-
1250-
self.assertTrue(f.closed)
1251-
12521242
# From RFC 3492
12531243
punycode_testcases = [
12541244
# A Arabic (Egyptian):
@@ -1378,87 +1368,6 @@ def test_decode(self):
13781368
self.assertEqual(uni, puny.decode("punycode"))
13791369

13801370

1381-
class UnicodeInternalTest(unittest.TestCase):
1382-
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
1383-
def test_bug1251300(self):
1384-
# Decoding with unicode_internal used to not correctly handle "code
1385-
# points" above 0x10ffff on UCS-4 builds.
1386-
ok = [
1387-
(b"\x00\x10\xff\xff", "\U0010ffff"),
1388-
(b"\x00\x00\x01\x01", "\U00000101"),
1389-
(b"", ""),
1390-
]
1391-
not_ok = [
1392-
b"\x7f\xff\xff\xff",
1393-
b"\x80\x00\x00\x00",
1394-
b"\x81\x00\x00\x00",
1395-
b"\x00",
1396-
b"\x00\x00\x00\x00\x00",
1397-
]
1398-
for internal, uni in ok:
1399-
if sys.byteorder == "little":
1400-
internal = bytes(reversed(internal))
1401-
with support.check_warnings():
1402-
self.assertEqual(uni, internal.decode("unicode_internal"))
1403-
for internal in not_ok:
1404-
if sys.byteorder == "little":
1405-
internal = bytes(reversed(internal))
1406-
with support.check_warnings(('unicode_internal codec has been '
1407-
'deprecated', DeprecationWarning)):
1408-
self.assertRaises(UnicodeDecodeError, internal.decode,
1409-
"unicode_internal")
1410-
if sys.byteorder == "little":
1411-
invalid = b"\x00\x00\x11\x00"
1412-
invalid_backslashreplace = r"\x00\x00\x11\x00"
1413-
else:
1414-
invalid = b"\x00\x11\x00\x00"
1415-
invalid_backslashreplace = r"\x00\x11\x00\x00"
1416-
with support.check_warnings():
1417-
self.assertRaises(UnicodeDecodeError,
1418-
invalid.decode, "unicode_internal")
1419-
with support.check_warnings():
1420-
self.assertEqual(invalid.decode("unicode_internal", "replace"),
1421-
'\ufffd')
1422-
with support.check_warnings():
1423-
self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
1424-
invalid_backslashreplace)
1425-
1426-
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
1427-
def test_decode_error_attributes(self):
1428-
try:
1429-
with support.check_warnings(('unicode_internal codec has been '
1430-
'deprecated', DeprecationWarning)):
1431-
b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1432-
except UnicodeDecodeError as ex:
1433-
self.assertEqual("unicode_internal", ex.encoding)
1434-
self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1435-
self.assertEqual(4, ex.start)
1436-
self.assertEqual(8, ex.end)
1437-
else:
1438-
self.fail()
1439-
1440-
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
1441-
def test_decode_callback(self):
1442-
codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1443-
decoder = codecs.getdecoder("unicode_internal")
1444-
with support.check_warnings(('unicode_internal codec has been '
1445-
'deprecated', DeprecationWarning)):
1446-
ab = "ab".encode("unicode_internal").decode()
1447-
ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1448-
"ascii"),
1449-
"UnicodeInternalTest")
1450-
self.assertEqual(("ab", 12), ignored)
1451-
1452-
def test_encode_length(self):
1453-
with support.check_warnings(('unicode_internal codec has been '
1454-
'deprecated', DeprecationWarning)):
1455-
# Issue 3739
1456-
encoder = codecs.getencoder("unicode_internal")
1457-
self.assertEqual(encoder("a")[1], 1)
1458-
self.assertEqual(encoder("\xe9\u0142")[1], 2)
1459-
1460-
self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
1461-
14621371
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
14631372
nameprep_tests = [
14641373
# 3.1 Map to nothing.
@@ -1949,7 +1858,6 @@ def test_basic(self):
19491858
"shift_jisx0213",
19501859
"tis_620",
19511860
"unicode_escape",
1952-
"unicode_internal",
19531861
"utf_16",
19541862
"utf_16_be",
19551863
"utf_16_le",
@@ -1969,7 +1877,6 @@ def test_basic(self):
19691877
# The following encodings don't work in stateful mode
19701878
broken_unicode_with_stateful = [
19711879
"punycode",
1972-
"unicode_internal"
19731880
]
19741881

19751882

@@ -1984,12 +1891,10 @@ def test_basics(self):
19841891
name = "latin_1"
19851892
self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
19861893

1987-
with support.check_warnings():
1988-
# unicode-internal has been deprecated
1989-
(b, size) = codecs.getencoder(encoding)(s)
1990-
self.assertEqual(size, len(s), "encoding=%r" % encoding)
1991-
(chars, size) = codecs.getdecoder(encoding)(b)
1992-
self.assertEqual(chars, s, "encoding=%r" % encoding)
1894+
(b, size) = codecs.getencoder(encoding)(s)
1895+
self.assertEqual(size, len(s), "encoding=%r" % encoding)
1896+
(chars, size) = codecs.getdecoder(encoding)(b)
1897+
self.assertEqual(chars, s, "encoding=%r" % encoding)
19931898

19941899
if encoding not in broken_unicode_with_stateful:
19951900
# check stream reader/writer
@@ -2116,9 +2021,7 @@ def test_bad_decode_args(self):
21162021
def test_bad_encode_args(self):
21172022
for encoding in all_unicode_encodings:
21182023
encoder = codecs.getencoder(encoding)
2119-
with support.check_warnings():
2120-
# unicode-internal has been deprecated
2121-
self.assertRaises(TypeError, encoder)
2024+
self.assertRaises(TypeError, encoder)
21222025

21232026
def test_encoding_map_type_initialized(self):
21242027
from encodings import cp1140

0 commit comments

Comments (0)