-
-
Notifications
You must be signed in to change notification settings - Fork 32.3k
bpo-39087: Add _PyUnicode_GetUTF8Buffer() #17659
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b923398
3fb6235
3bac143
ec18bac
2f1f8ac
3a27b8b
7cef9a1
d92ed64
a32837f
f8f8a91
98ec45f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2830,6 +2830,28 @@ def test_asucs4(self): | |
self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') | ||
self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') | ||
|
||
# Test _PyUnicode_GetUTF8Buffer()
@support.cpython_only
def test_getutf8buffer(self):
    """Check the _testcapi wrappers around _PyUnicode_GetUTF8Buffer()."""
    from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer

    # Run the tests written in C. An error is raised when one of them fails.
    unicode_test_getutf8buffer()

    # (input string, expected UTF-8 bytes) for strings that encode cleanly.
    encodable = (
        ("foo", b'foo'),                         # ASCII
        ('\u0100', b'\xc4\x80'),                 # BMP
        ('\uffff', b'\xef\xbf\xbf'),             # last BMP code point
        (chr(0x10ffff), b'\xf4\x8f\xbf\xbf'),    # non-BMP
    )
    for text, expected in encodable:
        self.assertEqual(unicode_getutf8buffer(text), expected)

    # Lone surrogates are rejected unless "surrogatepass" is requested.
    lone_surrogates = 'a\ud800b\udfffc'
    self.assertRaises(UnicodeEncodeError,
                      unicode_getutf8buffer, lone_surrogates)
    self.assertEqual(
        unicode_getutf8buffer(lone_surrogates, "surrogatepass"),
        b'a\xed\xa0\x80b\xed\xbf\xbfc')
|
||
# Test PyUnicode_AsUTF8() | ||
@support.cpython_only | ||
def test_asutf8(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Add new ``_PyUnicode_GetUTF8Buffer`` private API to get the UTF-8 encoding of a | ||
unicode object without caching or extra allocation. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1965,6 +1965,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args) | |
return Py_BuildValue("(Nn)", result, utf8_len); | ||
} | ||
|
||
/* unicode_getutf8buffer(unicode[, errors]) -> bytes
 *
 * Test wrapper around _PyUnicode_GetUTF8Buffer(): requests a buffer for
 * the given object, copies the buffer contents into a new bytes object,
 * releases the buffer and returns the copy.
 *
 * Returns NULL when argument parsing, the buffer request or the bytes
 * creation fails.
 */
static PyObject *
unicode_getutf8buffer(PyObject *self, PyObject *args)
{
    PyObject *text;
    const char *errors = NULL;   /* optional; stays NULL when omitted */
    Py_buffer view;
    PyObject *copy;

    if (!PyArg_ParseTuple(args, "O|s", &text, &errors)) {
        return NULL;
    }

    if (_PyUnicode_GetUTF8Buffer(text, errors, &view) < 0) {
        return NULL;
    }

    /* The buffer is owned either by the argument itself or by an exact
       bytes object. */
    assert(view.obj != NULL);
    assert(view.obj == text || PyBytes_CheckExact(view.obj));

    /* Copy first, then release: the copy may be NULL, but the buffer must
       be released either way. */
    copy = PyBytes_FromStringAndSize(view.buf, view.len);
    PyBuffer_Release(&view);
    return copy;
}
|
||
/* unicode_test_getutf8buffer() -> None
 *
 * Self-test for _PyUnicode_GetUTF8Buffer(), written in C so that it can
 * inspect the filled Py_buffer and the reference count of the string.
 *
 * Three scenarios are checked in order:
 *   1. ASCII string: the buffer must be owned by the string, point at
 *      PyUnicode_1BYTE_DATA(str) and hold exactly one extra reference
 *      until it is released.
 *   2. Non-ASCII string: the buffer must be owned by an exact bytes
 *      object of 15 bytes and must not change the string's refcount.
 *   3. Same string after PyUnicode_AsUTF8() was called on it: the buffer
 *      must be owned by the string and point at the pointer returned by
 *      PyUnicode_AsUTF8().
 *
 * Returns None when every check passes. On a failed check, TestError is
 * set (with file and line) and NULL is returned.
 */
static PyObject *
unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    Py_buffer buf;

    // Test 1: ASCII string

    PyObject *str = PyUnicode_FromString("hello");
    if (str == NULL) {
        return NULL;
    }
    Py_ssize_t refcnt = Py_REFCNT(str);

    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
    // This is checked explicitly rather than with assert(), so that a
    // failure is reported as an exception and buf is never read unfilled.
    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) != 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(TestError,
                     "buf.len must be equal to len(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    // The byte right after the 5 bytes of "hello" must be the terminator.
    assert(((const char*)buf.buf)[5] == '\0');

    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    // While the buffer is held, it owns one reference to str.
    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Releasing the buffer must give that reference back.
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        Py_DECREF(str);
        return NULL;
    }

    Py_DECREF(str);

    // Test 2: non-ASCII string

    // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
    if (str == NULL) {
        return NULL;
    }
    refcnt = Py_REFCNT(str);
    assert(PyUnicode_GET_LENGTH(str) == 5);

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (!PyBytes_CheckExact(buf.obj)) {
        PyErr_Format(TestError,
                     "buf.obj must be a bytes object, got %R (%s:%d)",
                     buf.obj, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    // The buffer is owned by the bytes object, so str must be untouched.
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str) must not be changed. (%s:%d)",
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Test 3: There is a UTF-8 cache
    // Reuse str of the previous test. Test 2 ran on this same string
    // before PyUnicode_AsUTF8() was called on it and required a separate
    // bytes object as the buffer owner.

    const char *cache = PyUnicode_AsUTF8(str);
    if (cache == NULL) {
        // str is still owned by this function; do not leak it.
        Py_DECREF(str);
        return NULL;
    }

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.buf != cache) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    Py_DECREF(str);
    Py_RETURN_NONE;
}
|
||
static PyObject * | ||
unicode_findchar(PyObject *self, PyObject *args) | ||
{ | ||
|
@@ -5399,6 +5609,8 @@ static PyMethodDef TestMethods[] = { | |
{"unicode_asucs4", unicode_asucs4, METH_VARARGS}, | ||
{"unicode_asutf8", unicode_asutf8, METH_VARARGS}, | ||
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS}, | ||
{"unicode_getutf8buffer", unicode_getutf8buffer, METH_VARARGS}, | ||
{"unicode_test_getutf8buffer", unicode_test_getutf8buffer, METH_NOARGS}, | ||
{"unicode_findchar", unicode_findchar, METH_VARARGS}, | ||
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, | ||
{"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS}, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not inline these variables?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.