Skip to content

Commit 2e15785

Browse files
authored
gh-119182: Add PyUnicodeWriter_WriteUCS4() function (#120849)
1 parent a47abdb commit 2e15785

File tree

7 files changed

+131
-6
lines changed

7 files changed

+131
-6
lines changed

Doc/c-api/unicode.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,6 +1563,15 @@ object.
15631563
On success, return ``0``.
15641564
On error, set an exception, leave the writer unchanged, and return ``-1``.
15651565
1566+
.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size)
1567+
1568+
Write the UCS4 string *str* into *writer*.
1569+
1570+
*size* is the number of UCS4 characters of *str* to write.
1571+
1572+
On success, return ``0``.
1573+
On error, set an exception, leave the writer unchanged, and return ``-1``.
1574+
15661575
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
15671576
15681577
Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*.

Doc/whatsnew/3.14.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ New Features
314314
* :c:func:`PyUnicodeWriter_Finish`.
315315
* :c:func:`PyUnicodeWriter_WriteChar`.
316316
* :c:func:`PyUnicodeWriter_WriteUTF8`.
317+
* :c:func:`PyUnicodeWriter_WriteUCS4`.
317318
* :c:func:`PyUnicodeWriter_WriteWideChar`.
318319
* :c:func:`PyUnicodeWriter_WriteStr`.
319320
* :c:func:`PyUnicodeWriter_WriteRepr`.

Include/cpython/unicodeobject.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
463463
PyUnicodeWriter *writer,
464464
const wchar_t *str,
465465
Py_ssize_t size);
466+
PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4(
467+
PyUnicodeWriter *writer,
468+
Py_UCS4 *str,
469+
Py_ssize_t size);
466470

467471
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
468472
PyUnicodeWriter *writer,

Lib/test/test_capi/test_unicode.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1826,8 +1826,42 @@ def test_widechar(self):
18261826
writer.write_widechar("latin1=\xE9")
18271827
writer.write_widechar("-")
18281828
writer.write_widechar("euro=\u20AC")
1829+
writer.write_char("-")
1830+
writer.write_widechar("max=\U0010ffff")
18291831
writer.write_char('.')
1830-
self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.")
1832+
self.assertEqual(writer.finish(),
1833+
"latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1834+
1835+
def test_ucs4(self):
1836+
writer = self.create_writer(0)
1837+
writer.write_ucs4("ascii IGNORED", 5)
1838+
writer.write_char("-")
1839+
writer.write_ucs4("latin1=\xe9", 8)
1840+
writer.write_char("-")
1841+
writer.write_ucs4("euro=\u20ac", 6)
1842+
writer.write_char("-")
1843+
writer.write_ucs4("max=\U0010ffff", 5)
1844+
writer.write_char(".")
1845+
self.assertEqual(writer.finish(),
1846+
"ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1847+
1848+
# Test some special characters
1849+
writer = self.create_writer(0)
1850+
# Lone surrogate character
1851+
writer.write_ucs4("lone\uDC80", 5)
1852+
writer.write_char("-")
1853+
# Surrogate pair
1854+
writer.write_ucs4("pair\uDBFF\uDFFF", 5)
1855+
writer.write_char("-")
1856+
writer.write_ucs4("null[\0]", 7)
1857+
self.assertEqual(writer.finish(),
1858+
"lone\udc80-pair\udbff-null[\0]")
1859+
1860+
# invalid size
1861+
writer = self.create_writer(0)
1862+
with self.assertRaises(ValueError):
1863+
writer.write_ucs4("text", -1)
1864+
18311865

18321866

18331867
@unittest.skipIf(ctypes is None, 'need ctypes')

Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object:
55
* :c:func:`PyUnicodeWriter_Finish`.
66
* :c:func:`PyUnicodeWriter_WriteChar`.
77
* :c:func:`PyUnicodeWriter_WriteUTF8`.
8+
* :c:func:`PyUnicodeWriter_WriteUCS4`.
9+
* :c:func:`PyUnicodeWriter_WriteWideChar`.
810
* :c:func:`PyUnicodeWriter_WriteStr`.
911
* :c:func:`PyUnicodeWriter_WriteRepr`.
1012
* :c:func:`PyUnicodeWriter_WriteSubstring`.
1113
* :c:func:`PyUnicodeWriter_Format`.
14+
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
1215

1316
Patch by Victor Stinner.

Modules/_testcapi/unicode.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args)
360360
}
361361

362362

363+
static PyObject*
364+
writer_write_ucs4(PyObject *self_raw, PyObject *args)
365+
{
366+
WriterObject *self = (WriterObject *)self_raw;
367+
if (writer_check(self) < 0) {
368+
return NULL;
369+
}
370+
371+
PyObject *str;
372+
Py_ssize_t size;
373+
if (!PyArg_ParseTuple(args, "Un", &str, &size)) {
374+
return NULL;
375+
}
376+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
377+
size = Py_MIN(size, len);
378+
379+
Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);
380+
if (ucs4 == NULL) {
381+
return NULL;
382+
}
383+
384+
int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size);
385+
PyMem_Free(ucs4);
386+
if (res < 0) {
387+
return NULL;
388+
}
389+
Py_RETURN_NONE;
390+
}
391+
392+
363393
static PyObject*
364394
writer_write_str(PyObject *self_raw, PyObject *args)
365395
{
@@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = {
484514
{"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS},
485515
{"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS},
486516
{"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS},
517+
{"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS},
487518
{"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS},
488519
{"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS},
489520
{"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS},

Objects/unicodeobject.c

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
20352035
if (!converted) {
20362036
return -1;
20372037
}
2038-
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2039-
PyMem_Free(converted);
20402038

2041-
int res = _PyUnicodeWriter_WriteStr(writer, unicode);
2042-
Py_DECREF(unicode);
2039+
int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2040+
PyMem_Free(converted);
20432041
return res;
20442042
}
20452043
#endif
@@ -2289,6 +2287,51 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
22892287
return res;
22902288
}
22912289

2290+
2291+
int
2292+
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2293+
Py_UCS4 *str,
2294+
Py_ssize_t size)
2295+
{
2296+
_PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2297+
2298+
if (size < 0) {
2299+
PyErr_SetString(PyExc_ValueError,
2300+
"size must be positive");
2301+
return -1;
2302+
}
2303+
2304+
if (size == 0) {
2305+
return 0;
2306+
}
2307+
2308+
Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2309+
2310+
if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2311+
return -1;
2312+
}
2313+
2314+
int kind = writer->kind;
2315+
void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2316+
if (kind == PyUnicode_1BYTE_KIND) {
2317+
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2318+
str, str + size,
2319+
data);
2320+
}
2321+
else if (kind == PyUnicode_2BYTE_KIND) {
2322+
_PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2323+
str, str + size,
2324+
data);
2325+
}
2326+
else {
2327+
memcpy(data, str, size * sizeof(Py_UCS4));
2328+
}
2329+
writer->pos += size;
2330+
2331+
return 0;
2332+
}
2333+
2334+
22922335
PyObject*
22932336
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
22942337
{
@@ -13357,7 +13400,7 @@ PyUnicodeWriter*
1335713400
PyUnicodeWriter_Create(Py_ssize_t length)
1335813401
{
1335913402
if (length < 0) {
13360-
PyErr_SetString(PyExc_TypeError,
13403+
PyErr_SetString(PyExc_ValueError,
1336113404
"length must be positive");
1336213405
return NULL;
1336313406
}

0 commit comments

Comments
 (0)