Skip to content

Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer()" #18985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -734,19 +734,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);

/* --- Manage the default encoding ---------------------------------------- */

/* Get a buffer to the UTF-8 encoding of the Unicode object unicode.

unicode: object to encode; passing a non-Unicode object is an error.
errors: error handler name forwarded to the UTF-8 encoder when the
string has to be encoded; NULL is accepted.
view: (out) filled in with a buffer over the UTF-8 bytes. view->obj is
either unicode itself (when its existing UTF-8 representation can be
exposed directly) or a bytes object holding the encoded data.

Returns 0 on success.
Returns -1 on error.

Successful calls must be paired to
calls to PyBuffer_Release.
*/

PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer(
PyObject *unicode, /* Unicode object */
const char *errors, /* error handling */
Py_buffer *view /* (out) buffer to the UTF-8 encoding */
);

/* Returns a pointer to the default encoding (UTF-8) of the
Unicode object unicode and the size of the encoded representation
in bytes stored in *size.
Expand Down
22 changes: 0 additions & 22 deletions Lib/test/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2830,28 +2830,6 @@ def test_asucs4(self):
self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')

# Exercise _PyUnicode_GetUTF8Buffer() through the _testcapi wrappers.
@support.cpython_only
def test_getutf8buffer(self):
    from _testcapi import (unicode_getutf8buffer as getbuf,
                           unicode_test_getutf8buffer as run_c_checks)

    # The checks written in C come first; they raise an error on failure.
    run_c_checks()

    # (input string, expected UTF-8 bytes): ASCII, two BMP code points,
    # and the highest non-BMP code point.
    encodable = [
        ("foo", b'foo'),
        ('\u0100', b'\xc4\x80'),
        ('\uffff', b'\xef\xbf\xbf'),
        (chr(0x10ffff), b'\xf4\x8f\xbf\xbf'),
    ]
    lone_surrogates = 'a\ud800b\udfffc'

    for text, expected in encodable:
        self.assertEqual(getbuf(text), expected)

    # Lone surrogates are rejected with the default error handler ...
    self.assertRaises(UnicodeEncodeError, getbuf, lone_surrogates)
    # ... and encoded as-is when "surrogatepass" is requested.
    self.assertEqual(getbuf(lone_surrogates, "surrogatepass"),
                     b'a\xed\xa0\x80b\xed\xbf\xbfc')

# Test PyUnicode_AsUTF8()
@support.cpython_only
def test_asutf8(self):
Expand Down

This file was deleted.

212 changes: 0 additions & 212 deletions Modules/_testcapimodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -1967,216 +1967,6 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
return Py_BuildValue("(Nn)", result, utf8_len);
}

/* _testcapi.unicode_getutf8buffer(unicode[, errors]) -> bytes
 *
 * Thin wrapper used by the Python-level tests: obtains the UTF-8 buffer of
 * `unicode` via _PyUnicode_GetUTF8Buffer(), copies its contents into a new
 * bytes object, releases the buffer and returns the copy.
 * Returns NULL when argument parsing or the buffer request fails.
 */
static PyObject *
unicode_getutf8buffer(PyObject *self, PyObject *args)
{
    PyObject *text;
    const char *handler = NULL;   /* optional second argument */
    Py_buffer utf8;
    PyObject *copy;

    if (PyArg_ParseTuple(args, "O|s", &text, &handler) == 0) {
        return NULL;
    }
    if (_PyUnicode_GetUTF8Buffer(text, handler, &utf8) < 0) {
        return NULL;
    }

    /* The exporter is either the string itself or an exact bytes object. */
    assert(utf8.obj != NULL);
    assert(utf8.obj == text || PyBytes_CheckExact(utf8.obj));

    /* Copy first, then release: the copy does not depend on the buffer. */
    copy = PyBytes_FromStringAndSize(utf8.buf, utf8.len);
    PyBuffer_Release(&utf8);
    return copy;
}

/* _testcapi.unicode_test_getutf8buffer() -> None
 *
 * C-level checks for _PyUnicode_GetUTF8Buffer().  Three scenarios are
 * exercised in order:
 *   1. an ASCII string: the buffer must be exported by the string itself
 *      and point at its 1-byte data;
 *   2. a non-ASCII string: the buffer must be exported by a bytes object
 *      and the string's reference count must be unchanged;
 *   3. the same string after PyUnicode_AsUTF8() was called on it: the
 *      buffer must be exported by the string and point at the pointer
 *      PyUnicode_AsUTF8() returned.
 *
 * Returns None when every check passes.  On the first failed check a
 * TestError is set (or the exception raised by the failing call is
 * propagated) and NULL is returned.
 */
static PyObject *
unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
{
    Py_buffer buf;

    // Test 1: ASCII string
    PyObject *str = PyUnicode_FromString("hello");
    if (str == NULL) {
        return NULL;
    }
    Py_ssize_t refcnt = Py_REFCNT(str);

    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
    // Check the result explicitly instead of only asserting, so that a
    // failure is reported (and buf is never read uninitialized) even when
    // assertions are compiled out.
    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(TestError,
                     "buf.len must be equal to len(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    // The exported data must be NUL-terminated right after the 5 bytes.
    assert(((const char*)buf.buf)[5] == '\0');

    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    // While the buffer is held, it owns one extra reference to str.
    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Releasing the buffer must give that reference back.
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        Py_DECREF(str);
        return NULL;
    }

    Py_DECREF(str);

    // Test 2: non-ASCII string

    // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
    if (str == NULL) {
        return NULL;
    }
    refcnt = Py_REFCNT(str);
    assert(PyUnicode_GET_LENGTH(str) == 5);

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (!PyBytes_CheckExact(buf.obj)) {
        PyErr_Format(TestError,
                     "buf.obj must be a bytes object, got %R (%s:%d)",
                     buf.obj, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str) must not be changed. (%s:%d)",
                     __FILE__, __LINE__);
        // buf.obj was verified above to be a bytes object, not str, so
        // releasing the buffer is safe and avoids leaking it.
        PyBuffer_Release(&buf);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    // Test 3: There is a UTF-8 cache
    // Reuse str of the previous test.

    const char *cache = PyUnicode_AsUTF8(str);
    if (cache == NULL) {
        // str is still owned by this function; do not leak it.
        Py_DECREF(str);
        return NULL;
    }

    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }

    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.buf != cache) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }

    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');

    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    PyBuffer_Release(&buf);

    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }

    Py_DECREF(str);
    Py_RETURN_NONE;
}

static PyObject *
unicode_findchar(PyObject *self, PyObject *args)
{
Expand Down Expand Up @@ -5602,8 +5392,6 @@ static PyMethodDef TestMethods[] = {
{"unicode_asucs4", unicode_asucs4, METH_VARARGS},
{"unicode_asutf8", unicode_asutf8, METH_VARARGS},
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS},
{"unicode_getutf8buffer", unicode_getutf8buffer, METH_VARARGS},
{"unicode_test_getutf8buffer", unicode_test_getutf8buffer, METH_NOARGS},
{"unicode_findchar", unicode_findchar, METH_VARARGS},
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS},
{"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS},
Expand Down
35 changes: 0 additions & 35 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3991,41 +3991,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
}


/* Fill `view` with a buffer over the UTF-8 encoding of `unicode`.
 *
 * unicode: must be a Unicode object, otherwise PyErr_BadArgument() is
 *          raised.
 * errors:  error handler name forwarded to _PyUnicode_AsUTF8String() when
 *          the string has to be encoded.
 * view:    (out) buffer to fill in.
 *
 * When the string already has a UTF-8 representation (and its type does not
 * define tp_as_buffer), that representation is exposed directly and
 * view->obj is the string itself.  Otherwise the string is encoded into a
 * temporary bytes object and view->obj is that bytes object.
 *
 * Returns 0 on success and -1 on error.  A successful call must be paired
 * with PyBuffer_Release(view).
 */
int
_PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors,
                         Py_buffer *view)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return -1;
    }

    if (PyUnicode_UTF8(unicode) != NULL
        && Py_TYPE(unicode)->tp_as_buffer == NULL) {
        return PyBuffer_FillInfo(view, unicode,
                                 PyUnicode_UTF8(unicode),
                                 PyUnicode_UTF8_LENGTH(unicode),
                                 /* readonly */ 1, PyBUF_SIMPLE);
    }

    // Unlike PyUnicode_AsUTF8AndSize(), this function doesn't
    // create a UTF-8 cache for speed and efficiency.
    PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors);
    if (bytes == NULL) {
        return -1;
    }
    assert(PyBytes_CheckExact(bytes));

    int ret = PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE);
    // On success view->obj holds its own new reference to bytes, so the
    // reference obtained above must be dropped in both outcomes; keeping
    // it would leak the bytes object after PyBuffer_Release().
    Py_DECREF(bytes);
    if (ret < 0) {
        return -1;
    }
    return 0;
}


static int unicode_fill_utf8(PyObject *unicode);

const char *
Expand Down