Skip to content

Commit 4d840e4

Browse files
[3.8] bpo-42318: Fix support of non-BMP characters in Tkinter on macOS (GH-23281). (GH-23784) (GH-23787)
(cherry picked from commit a26215d) (cherry picked from commit 28bf6ab)
1 parent 0178a6b commit 4d840e4

File tree

3 files changed

+94
-7
lines changed

3 files changed

+94
-7
lines changed

Lib/test/test_tcl.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import unittest
2+
import locale
23
import re
34
import subprocess
45
import sys
@@ -58,6 +59,10 @@ def test_eval_null_in_result(self):
5859
tcl = self.interp
5960
self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b')
6061

62+
def test_eval_surrogates_in_result(self):
63+
tcl = self.interp
64+
self.assertIn(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>')
65+
6166
def testEvalException(self):
6267
tcl = self.interp
6368
self.assertRaises(TclError,tcl.eval,'set a')
@@ -190,29 +195,48 @@ def test_getboolean(self):
190195

191196
def testEvalFile(self):
192197
tcl = self.interp
193-
with open(support.TESTFN, 'w') as f:
194-
self.addCleanup(support.unlink, support.TESTFN)
198+
filename = support.TESTFN
199+
self.addCleanup(support.unlink, filename)
200+
with open(filename, 'w') as f:
195201
f.write("""set a 1
196202
set b 2
197203
set c [ expr $a + $b ]
198204
""")
199-
tcl.evalfile(support.TESTFN)
205+
tcl.evalfile(filename)
200206
self.assertEqual(tcl.eval('set a'),'1')
201207
self.assertEqual(tcl.eval('set b'),'2')
202208
self.assertEqual(tcl.eval('set c'),'3')
203209

204210
def test_evalfile_null_in_result(self):
205211
tcl = self.interp
206-
with open(support.TESTFN, 'w') as f:
207-
self.addCleanup(support.unlink, support.TESTFN)
212+
filename = support.TESTFN
213+
self.addCleanup(support.unlink, filename)
214+
with open(filename, 'w') as f:
208215
f.write("""
209216
set a "a\0b"
210217
set b "a\\0b"
211218
""")
212-
tcl.evalfile(support.TESTFN)
219+
tcl.evalfile(filename)
213220
self.assertEqual(tcl.eval('set a'), 'a\x00b')
214221
self.assertEqual(tcl.eval('set b'), 'a\x00b')
215222

223+
def test_evalfile_surrogates_in_result(self):
# Evaluate a script file that spells the non-BMP character U+1F4BB in two
# ways and check that both variables read back as that single character.
224+
tcl = self.interp
225+
# Remember Tcl's current system encoding and register its restoration
# before switching to UTF-8 for the duration of this test.
encoding = tcl.call('encoding', 'system')
226+
self.addCleanup(tcl.call, 'encoding', 'system', encoding)
227+
tcl.call('encoding', 'system', 'utf-8')
228+
229+
filename = support.TESTFN
230+
self.addCleanup(support.unlink, filename)
231+
# Binary mode so the bytes below reach the file verbatim.
# "a" holds the raw bytes ED A0 BD ED B2 BB: the surrogates U+D83D and
# U+DCBB, each encoded as three UTF-8 bytes (CESU-8 form of U+1F4BB).
# "b" spells the same surrogate pair with Tcl \u escapes.
with open(filename, 'wb') as f:
232+
f.write(b"""
233+
set a "<\xed\xa0\xbd\xed\xb2\xbb>"
234+
set b "<\\ud83d\\udcbb>"
235+
""")
236+
tcl.evalfile(filename)
237+
self.assertEqual(tcl.eval('set a'), '<\U0001f4bb>')
238+
self.assertEqual(tcl.eval('set b'), '<\U0001f4bb>')
239+
216240
def testEvalFileException(self):
217241
tcl = self.interp
218242
filename = "doesnotexists"
@@ -435,6 +459,11 @@ def passValue(value):
435459
self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac')
436460
self.assertEqual(passValue('str\x00ing\U0001f4bb'),
437461
'str\x00ing\U0001f4bb')
462+
if sys.platform != 'win32':
463+
self.assertEqual(passValue('<\udce2\udc82\udcac>'),
464+
'<\u20ac>')
465+
self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'),
466+
'<\U0001f4bb>')
438467
self.assertEqual(passValue(b'str\x00ing'),
439468
b'str\x00ing' if self.wantobjects else 'str\x00ing')
440469
self.assertEqual(passValue(b'str\xc0\x80ing'),
@@ -494,6 +523,9 @@ def float_eq(actual, expected):
494523
check('string\xbd')
495524
check('string\u20ac')
496525
check('string\U0001f4bb')
526+
if sys.platform != 'win32':
527+
check('<\udce2\udc82\udcac>', '<\u20ac>')
528+
check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>')
497529
check('')
498530
check(b'string', 'string')
499531
check(b'string\xe2\x82\xac', 'string\xe2\x82\xac')
@@ -537,6 +569,8 @@ def test_splitlist(self):
537569
('a \u20ac', ('a', '\u20ac')),
538570
('a \U0001f4bb', ('a', '\U0001f4bb')),
539571
(b'a \xe2\x82\xac', ('a', '\u20ac')),
572+
(b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')),
573+
(b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')),
540574
(b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')),
541575
('a {b c}', ('a', 'b c')),
542576
(r'a b\ c', ('a', 'b c')),
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fixed support of non-BMP characters in :mod:`tkinter` on macOS.

Modules/_tkinter.c

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
397397

398398
char *buf = NULL;
399399
PyErr_Clear();
400-
/* Tcl encodes null character as \xc0\x80 */
400+
/* Tcl encodes null character as \xc0\x80.
401+
https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
401402
if (memchr(s, '\xc0', size)) {
402403
char *q;
403404
const char *e = s + size;
@@ -421,6 +422,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
421422
if (buf != NULL) {
422423
PyMem_Free(buf);
423424
}
425+
if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
426+
return r;
427+
}
428+
429+
/* In CESU-8 non-BMP characters are represented as a surrogate pair,
430+
like in UTF-16, and then each surrogate code point is encoded in UTF-8.
431+
https://en.wikipedia.org/wiki/CESU-8 */
432+
Py_ssize_t len = PyUnicode_GET_LENGTH(r);
433+
Py_ssize_t i, j;
434+
/* All encoded surrogate characters start with \xED. */
435+
i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
436+
if (i == -2) {
437+
Py_DECREF(r);
438+
return NULL;
439+
}
440+
if (i == -1) {
441+
return r;
442+
}
443+
Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
444+
Py_DECREF(r);
445+
if (u == NULL) {
446+
return NULL;
447+
}
448+
Py_UCS4 ch;
449+
/* Join each CESU-8 encoded surrogate pair into one code point, compacting
   u[] in place: i is the read index, j the write index (j <= i), and the
   loop increment stores the current ch at u[j++] on every path, including
   `continue`.  Each undecoded byte appears here as 0xDC00 + byte (0xdcED
   for \xED, etc.) -- presumably from a surrogateescape decode earlier in
   this function; confirm against the code above this hunk.
   Reads at u[i + 1] .. u[i + 5] stay in bounds because each one is only
   reached after the previous element matched a non-zero value, and the
   buffer from PyUnicode_AsUCS4Copy() ends with an extra null code point. */
for (j = i; i < len; i++, u[j++] = ch) {
450+
Py_UCS4 ch1, ch2, ch3, high, low;
451+
/* High surrogates U+D800 - U+DBFF are encoded as
452+
\xED\xA0\x80 - \xED\xAF\xBF. */
453+
ch1 = ch = u[i];
454+
if (ch1 != 0xdcED) continue;
455+
ch2 = u[i + 1];
456+
if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
457+
ch3 = u[i + 2];
458+
if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
459+
high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
460+
assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
461+
/* Low surrogates U+DC00 - U+DFFF are encoded as
462+
\xED\xB0\x80 - \xED\xBF\xBF. */
463+
ch1 = u[i + 3];
464+
if (ch1 != 0xdcED) continue;
465+
ch2 = u[i + 4];
466+
if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
467+
ch3 = u[i + 5];
468+
if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
469+
low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
470+
assert(Py_UNICODE_IS_LOW_SURROGATE(low));
471+
ch = Py_UNICODE_JOIN_SURROGATES(high, low);
472+
/* Six elements were consumed; the loop's own i++ accounts for the sixth. */
i += 5;
473+
}
474+
r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
475+
PyMem_Free(u);
424476
return r;
425477
}
426478

0 commit comments

Comments
 (0)