@@ -1967,6 +1967,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
1967
1967
return Py_BuildValue ("(Nn)" , result , utf8_len );
1968
1968
}
1969
1969
1970
+ static PyObject *
1971
+ unicode_getutf8buffer (PyObject * self , PyObject * args )
1972
+ {
1973
+ PyObject * unicode ;
1974
+ const char * errors = NULL ;
1975
+ if (!PyArg_ParseTuple (args , "O|s" , & unicode , & errors )) {
1976
+ return NULL ;
1977
+ }
1978
+
1979
+ Py_buffer buffer ;
1980
+ if (_PyUnicode_GetUTF8Buffer (unicode , errors , & buffer ) < 0 ) {
1981
+ return NULL ;
1982
+ }
1983
+
1984
+ assert (buffer .obj != NULL );
1985
+ assert (buffer .obj == unicode || PyBytes_CheckExact (buffer .obj ));
1986
+
1987
+ PyObject * result = PyBytes_FromStringAndSize (buffer .buf , buffer .len );
1988
+ PyBuffer_Release (& buffer );
1989
+ return result ;
1990
+ }
1991
+
1992
+ static PyObject *
1993
+ unicode_test_getutf8buffer (PyObject * self , PyObject * Py_UNUSED (ignored ))
1994
+ {
1995
+ Py_buffer buf ;
1996
+
1997
+ // Test 1: ASCII string
1998
+ PyObject * str = PyUnicode_FromString ("hello" );
1999
+ if (str == NULL ) {
2000
+ return NULL ;
2001
+ }
2002
+ Py_ssize_t refcnt = Py_REFCNT (str );
2003
+
2004
+ // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
2005
+ int ret = _PyUnicode_GetUTF8Buffer (str , NULL , & buf );
2006
+ assert (ret == 0 );
2007
+
2008
+ if (buf .obj != str ) {
2009
+ PyErr_Format (TestError ,
2010
+ "buf.obj must be equal to str. (%s:%d)" ,
2011
+ __FILE__ , __LINE__ );
2012
+ PyBuffer_Release (& buf );
2013
+ Py_DECREF (str );
2014
+ return NULL ;
2015
+ }
2016
+
2017
+ if (buf .len != PyUnicode_GET_LENGTH (str )) {
2018
+ PyErr_Format (TestError ,
2019
+ "buf.len must be equal to len(str). (%s:%d)" ,
2020
+ __FILE__ , __LINE__ );
2021
+ PyBuffer_Release (& buf );
2022
+ Py_DECREF (str );
2023
+ return NULL ;
2024
+ }
2025
+ assert (((const char * )buf .buf )[5 ] == '\0' );
2026
+
2027
+ if ((Py_UCS1 * )buf .buf != PyUnicode_1BYTE_DATA (str )) {
2028
+ PyErr_Format (TestError ,
2029
+ "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)" ,
2030
+ __FILE__ , __LINE__ );
2031
+ PyBuffer_Release (& buf );
2032
+ Py_DECREF (str );
2033
+ return NULL ;
2034
+ }
2035
+
2036
+ if (refcnt + 1 != Py_REFCNT (str )) {
2037
+ PyErr_Format (TestError ,
2038
+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2039
+ refcnt + 1 , Py_REFCNT (str ),
2040
+ __FILE__ , __LINE__ );
2041
+ PyBuffer_Release (& buf );
2042
+ Py_DECREF (str );
2043
+ return NULL ;
2044
+ }
2045
+
2046
+ PyBuffer_Release (& buf );
2047
+
2048
+ if (refcnt != Py_REFCNT (str )) {
2049
+ PyErr_Format (TestError ,
2050
+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2051
+ refcnt , Py_REFCNT (str ),
2052
+ __FILE__ , __LINE__ );
2053
+ Py_DECREF (str );
2054
+ return NULL ;
2055
+ }
2056
+
2057
+ Py_DECREF (str );
2058
+
2059
+ // Test 2: non-ASCII string
2060
+
2061
+ // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
2062
+ str = PyUnicode_FromString ("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf" );
2063
+ if (str == NULL ) {
2064
+ return NULL ;
2065
+ }
2066
+ refcnt = Py_REFCNT (str );
2067
+ assert (PyUnicode_GET_LENGTH (str ) == 5 );
2068
+
2069
+ if (_PyUnicode_GetUTF8Buffer (str , NULL , & buf ) < 0 ) {
2070
+ Py_DECREF (str );
2071
+ if (!PyErr_Occurred ()) {
2072
+ PyErr_Format (TestError ,
2073
+ "_PyUnicode_GetUTF8Buffer() returned nonzero "
2074
+ "without exception set. (%s:%d)" ,
2075
+ __FILE__ , __LINE__ );
2076
+ }
2077
+ return NULL ;
2078
+ }
2079
+
2080
+ if (!PyBytes_CheckExact (buf .obj )) {
2081
+ PyErr_Format (TestError ,
2082
+ "buf.obj must be a bytes object, got %R (%s:%d)" ,
2083
+ buf .obj , __FILE__ , __LINE__ );
2084
+ PyBuffer_Release (& buf );
2085
+ Py_DECREF (str );
2086
+ return NULL ;
2087
+ }
2088
+
2089
+ if (buf .len != 15 ) {
2090
+ PyErr_Format (TestError ,
2091
+ "Expected buf.len == 15, actual %zd (%s:%d)" ,
2092
+ buf .len , __FILE__ , __LINE__ );
2093
+ PyBuffer_Release (& buf );
2094
+ Py_DECREF (str );
2095
+ return NULL ;
2096
+ }
2097
+ assert (((const char * )buf .buf )[15 ] == '\0' );
2098
+
2099
+ if (refcnt != Py_REFCNT (str )) {
2100
+ PyErr_Format (TestError ,
2101
+ "Py_REFCNT(str) must not be changed. (%s:%d)" ,
2102
+ __FILE__ , __LINE__ );
2103
+ // Do not DECREF here because refcnt is broken.
2104
+ return NULL ;
2105
+ }
2106
+
2107
+ PyBuffer_Release (& buf );
2108
+
2109
+ // Test 3: There is a UTF-8 cache
2110
+ // Reuse str of the previoss test.
2111
+
2112
+ const char * cache = PyUnicode_AsUTF8 (str );
2113
+ if (cache == NULL ) {
2114
+ return NULL ;
2115
+ }
2116
+
2117
+ if (_PyUnicode_GetUTF8Buffer (str , NULL , & buf ) < 0 ) {
2118
+ Py_DECREF (str );
2119
+ if (!PyErr_Occurred ()) {
2120
+ PyErr_Format (TestError ,
2121
+ "_PyUnicode_GetUTF8Buffer() returned nonzero "
2122
+ "without exception set. (%s:%d)" ,
2123
+ __FILE__ , __LINE__ );
2124
+ }
2125
+ return NULL ;
2126
+ }
2127
+
2128
+ if (buf .obj != str ) {
2129
+ PyErr_Format (TestError ,
2130
+ "buf.obj must be equal to str. (%s:%d)" ,
2131
+ __FILE__ , __LINE__ );
2132
+ PyBuffer_Release (& buf );
2133
+ Py_DECREF (str );
2134
+ return NULL ;
2135
+ }
2136
+
2137
+ if (buf .buf != cache ) {
2138
+ PyErr_Format (TestError ,
2139
+ "buf.buf must be equal to the UTF-8 cache (%s:%d)" ,
2140
+ __FILE__ , __LINE__ );
2141
+ PyBuffer_Release (& buf );
2142
+ Py_DECREF (str );
2143
+ return NULL ;
2144
+ }
2145
+
2146
+ if (buf .len != 15 ) {
2147
+ PyErr_Format (TestError ,
2148
+ "Expected buf.len == 15, actual %zd (%s:%d)" ,
2149
+ buf .len , __FILE__ , __LINE__ );
2150
+ PyBuffer_Release (& buf );
2151
+ Py_DECREF (str );
2152
+ return NULL ;
2153
+ }
2154
+ assert (((const char * )buf .buf )[15 ] == '\0' );
2155
+
2156
+ if (refcnt + 1 != Py_REFCNT (str )) {
2157
+ PyErr_Format (TestError ,
2158
+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2159
+ refcnt + 1 , Py_REFCNT (str ),
2160
+ __FILE__ , __LINE__ );
2161
+ // Do not DECREF here because refcnt is broken.
2162
+ return NULL ;
2163
+ }
2164
+
2165
+ PyBuffer_Release (& buf );
2166
+
2167
+ if (refcnt != Py_REFCNT (str )) {
2168
+ PyErr_Format (TestError ,
2169
+ "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)" ,
2170
+ refcnt , Py_REFCNT (str ),
2171
+ __FILE__ , __LINE__ );
2172
+ // Do not DECREF here because refcnt is broken.
2173
+ return NULL ;
2174
+ }
2175
+
2176
+ Py_DECREF (str );
2177
+ Py_RETURN_NONE ;
2178
+ }
2179
+
1970
2180
static PyObject *
1971
2181
unicode_findchar (PyObject * self , PyObject * args )
1972
2182
{
@@ -5392,6 +5602,8 @@ static PyMethodDef TestMethods[] = {
5392
5602
{"unicode_asucs4" , unicode_asucs4 , METH_VARARGS },
5393
5603
{"unicode_asutf8" , unicode_asutf8 , METH_VARARGS },
5394
5604
{"unicode_asutf8andsize" , unicode_asutf8andsize , METH_VARARGS },
5605
+ {"unicode_getutf8buffer" , unicode_getutf8buffer , METH_VARARGS },
5606
+ {"unicode_test_getutf8buffer" , unicode_test_getutf8buffer , METH_NOARGS },
5395
5607
{"unicode_findchar" , unicode_findchar , METH_VARARGS },
5396
5608
{"unicode_copycharacters" , unicode_copycharacters , METH_VARARGS },
5397
5609
{"unicode_encodedecimal" , unicode_encodedecimal , METH_VARARGS },
0 commit comments