Skip to content

Commit c7ad974

Browse files
methanevstinner
andauthored
bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)
Co-authored-by: Victor Stinner <[email protected]>
1 parent 8fb02b6 commit c7ad974

File tree

5 files changed

+284
-6
lines changed

5 files changed

+284
-6
lines changed

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,19 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
734734

735735
/* --- Manage the default encoding ---------------------------------------- */
736736

737+
/* Get a buffer to the UTF-8 encoding of the Unicode object unicode.
738+
Returns -1 on error.
739+
740+
Successful calls must be paired with
741+
calls to PyBuffer_Release.
742+
*/
743+
744+
PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer(
745+
PyObject *unicode, /* Unicode object */
746+
const char *errors, /* error handling */
747+
Py_buffer *view /* (out) buffer to the UTF-8 encoding */
748+
);
749+
737750
/* Returns a pointer to the default encoding (UTF-8) of the
738751
Unicode object unicode and the size of the encoded representation
739752
in bytes stored in *size.
@@ -746,12 +759,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
746759
747760
_PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
748761
support the previous internal function with the same behaviour.
749-
750-
*** This API is for interpreter INTERNAL USE ONLY and will likely
751-
*** be removed or changed in the future.
752-
753-
*** If you need to access the Unicode object as UTF-8 bytes string,
754-
*** please use PyUnicode_AsUTF8String() instead.
755762
*/
756763

757764
PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(

Lib/test/test_unicode.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2830,6 +2830,28 @@ def test_asucs4(self):
28302830
self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
28312831
self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
28322832

2833+
# Test _PyUnicode_GetUTF8Buffer()
2834+
@support.cpython_only
def test_getutf8buffer(self):
    from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer

    # Run the checks written in C; they raise an error if any of them fails.
    unicode_test_getutf8buffer()

    # Each text must encode to exactly the paired UTF-8 bytes.
    encodable = (
        ("foo", b'foo'),                        # ASCII
        ('\u0100', b'\xc4\x80'),                # BMP, 2-byte sequence
        ('\uffff', b'\xef\xbf\xbf'),            # BMP, 3-byte sequence
        (chr(0x10ffff), b'\xf4\x8f\xbf\xbf'),   # non-BMP
    )
    for text, expected in encodable:
        self.assertEqual(unicode_getutf8buffer(text), expected)

    # Lone surrogates fail without an explicit error handler, but are
    # encoded when "surrogatepass" is requested.
    lone_surrogates = 'a\ud800b\udfffc'
    self.assertRaises(UnicodeEncodeError,
                      unicode_getutf8buffer, lone_surrogates)
    self.assertEqual(unicode_getutf8buffer(lone_surrogates, "surrogatepass"),
                     b'a\xed\xa0\x80b\xed\xbf\xbfc')
2854+
28332855
# Test PyUnicode_AsUTF8()
28342856
@support.cpython_only
28352857
def test_asutf8(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Add new ``_PyUnicode_GetUTF8Buffer`` private API to get the UTF-8 encoding of the
2+
unicode object without cache or extra allocation.

Modules/_testcapimodule.c

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1967,6 +1967,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
19671967
return Py_BuildValue("(Nn)", result, utf8_len);
19681968
}
19691969

1970+
/* Test helper exposed to Python as unicode_getutf8buffer(obj[, errors]).

   Obtains a buffer through _PyUnicode_GetUTF8Buffer() and returns a copy of
   its contents as a new bytes object.  Returns NULL with an exception set
   when argument parsing, the buffer request, or the copy fails. */
static PyObject *
unicode_getutf8buffer(PyObject *self, PyObject *args)
{
    PyObject *str;
    const char *error_handler = NULL;
    Py_buffer view;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "O|s", &str, &error_handler)) {
        return NULL;
    }
    if (_PyUnicode_GetUTF8Buffer(str, error_handler, &view) < 0) {
        return NULL;
    }

    /* The view must be backed either by the string itself or by an exact
       bytes object. */
    assert(view.obj != NULL);
    assert(view.obj == str || PyBytes_CheckExact(view.obj));

    /* Copy the data out before releasing the view; release it whether or
       not the copy succeeded. */
    encoded = PyBytes_FromStringAndSize(view.buf, view.len);
    PyBuffer_Release(&view);
    return encoded;
}
1991+
1992+
/* C-level checks for _PyUnicode_GetUTF8Buffer().

   Three scenarios are exercised:
     1. an ASCII string: the view must point into the string itself;
     2. a non-ASCII string: the view must be backed by an exact bytes
        object and the string's reference count must be unchanged;
     3. the same string after PyUnicode_AsUTF8() has been called on it:
        the view must point at the pointer PyUnicode_AsUTF8() returned.

   Returns None when every check passes.  On failure, returns NULL with an
   exception set (TestError for a failed check). */
static PyObject *
unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    Py_buffer buf;

    // Test 1: ASCII string
    PyObject *str = PyUnicode_FromString("hello");
    if (str == NULL) {
        return NULL;
    }
    Py_ssize_t refcnt = Py_REFCNT(str);

    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
    // Check the result explicitly (not only with assert()) so that buf is
    // never read uninitialized when assertions are compiled out.
    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(TestError,
                     "buf.len must be equal to len(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    // The byte right after the 5 bytes of "hello" must be a NUL.
    assert(((const char*)buf.buf)[5] == '\0');

    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    // While the view is alive it must hold exactly one reference to str.
    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Releasing the view must give that reference back.
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        Py_DECREF(str);
        return NULL;
    }

    Py_DECREF(str);

    // Test 2: non-ASCII string

    // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
    if (str == NULL) {
        return NULL;
    }
    refcnt = Py_REFCNT(str);
    assert(PyUnicode_GET_LENGTH(str) == 5);

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    // Here the view is expected to be backed by a separate bytes object,
    // not by the string.
    if (!PyBytes_CheckExact(buf.obj)) {
        PyErr_Format(TestError,
                     "buf.obj must be a bytes object, got %R (%s:%d)",
                     buf.obj, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str) must not be changed. (%s:%d)",
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Test 3: There is a UTF-8 cache
    // Reuse str of the previous test.

    const char *cache = PyUnicode_AsUTF8(str);
    if (cache == NULL) {
        // Drop our reference to str before bailing out; the reference
        // counts were verified above, so this is safe.
        Py_DECREF(str);
        return NULL;
    }

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    // With the cache present, the view must be backed by str itself.
    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.buf != cache) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    Py_DECREF(str);
    Py_RETURN_NONE;
}
2179+
19702180
static PyObject *
19712181
unicode_findchar(PyObject *self, PyObject *args)
19722182
{
@@ -5392,6 +5602,8 @@ static PyMethodDef TestMethods[] = {
53925602
{"unicode_asucs4", unicode_asucs4, METH_VARARGS},
53935603
{"unicode_asutf8", unicode_asutf8, METH_VARARGS},
53945604
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS},
5605+
{"unicode_getutf8buffer", unicode_getutf8buffer, METH_VARARGS},
5606+
{"unicode_test_getutf8buffer", unicode_test_getutf8buffer, METH_NOARGS},
53955607
{"unicode_findchar", unicode_findchar, METH_VARARGS},
53965608
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS},
53975609
{"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS},

Objects/unicodeobject.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3991,6 +3991,41 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
39913991
}
39923992

39933993

3994+
int
3995+
_PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors,
3996+
Py_buffer *view)
3997+
{
3998+
if (!PyUnicode_Check(unicode)) {
3999+
PyErr_BadArgument();
4000+
return -1;
4001+
}
4002+
if (PyUnicode_READY(unicode) == -1) {
4003+
return -1;
4004+
}
4005+
4006+
if (PyUnicode_UTF8(unicode) != NULL
4007+
&& Py_TYPE(unicode)->tp_as_buffer == NULL) {
4008+
return PyBuffer_FillInfo(view, unicode,
4009+
PyUnicode_UTF8(unicode),
4010+
PyUnicode_UTF8_LENGTH(unicode),
4011+
/* readonly */ 1, PyBUF_SIMPLE);
4012+
}
4013+
4014+
// Unlike PyUnicode_AsUTF8AndSize(), this function doesn't
4015+
// create a UTF-8 cache for speed and efficiency.
4016+
PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors);
4017+
if (bytes == NULL) {
4018+
return -1;
4019+
}
4020+
assert(PyBytes_CheckExact(bytes));
4021+
if (PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE) < 0) {
4022+
Py_DECREF(bytes);
4023+
return -1;
4024+
}
4025+
return 0;
4026+
}
4027+
4028+
39944029
static int unicode_fill_utf8(PyObject *unicode);
39954030

39964031
const char *

0 commit comments

Comments
 (0)