@@ -3996,11 +3996,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
3996
3996
}
3997
3997
3998
3998
3999
+ static int unicode_fill_utf8 (PyObject * unicode );
4000
+
3999
4001
const char *
4000
4002
PyUnicode_AsUTF8AndSize (PyObject * unicode , Py_ssize_t * psize )
4001
4003
{
4002
- PyObject * bytes ;
4003
-
4004
4004
if (!PyUnicode_Check (unicode )) {
4005
4005
PyErr_BadArgument ();
4006
4006
return NULL ;
@@ -4009,21 +4009,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4009
4009
return NULL ;
4010
4010
4011
4011
if (PyUnicode_UTF8 (unicode ) == NULL ) {
4012
- assert (!PyUnicode_IS_COMPACT_ASCII (unicode ));
4013
- bytes = _PyUnicode_AsUTF8String (unicode , NULL );
4014
- if (bytes == NULL )
4015
- return NULL ;
4016
- _PyUnicode_UTF8 (unicode ) = PyObject_MALLOC (PyBytes_GET_SIZE (bytes ) + 1 );
4017
- if (_PyUnicode_UTF8 (unicode ) == NULL ) {
4018
- PyErr_NoMemory ();
4019
- Py_DECREF (bytes );
4012
+ if (unicode_fill_utf8 (unicode ) == -1 ) {
4020
4013
return NULL ;
4021
4014
}
4022
- _PyUnicode_UTF8_LENGTH (unicode ) = PyBytes_GET_SIZE (bytes );
4023
- memcpy (_PyUnicode_UTF8 (unicode ),
4024
- PyBytes_AS_STRING (bytes ),
4025
- _PyUnicode_UTF8_LENGTH (unicode ) + 1 );
4026
- Py_DECREF (bytes );
4027
4015
}
4028
4016
4029
4017
if (psize )
@@ -5386,10 +5374,6 @@ static PyObject *
5386
5374
unicode_encode_utf8 (PyObject * unicode , _Py_error_handler error_handler ,
5387
5375
const char * errors )
5388
5376
{
5389
- enum PyUnicode_Kind kind ;
5390
- void * data ;
5391
- Py_ssize_t size ;
5392
-
5393
5377
if (!PyUnicode_Check (unicode )) {
5394
5378
PyErr_BadArgument ();
5395
5379
return NULL ;
@@ -5402,22 +5386,85 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5402
5386
return PyBytes_FromStringAndSize (PyUnicode_UTF8 (unicode ),
5403
5387
PyUnicode_UTF8_LENGTH (unicode ));
5404
5388
5405
- kind = PyUnicode_KIND (unicode );
5406
- data = PyUnicode_DATA (unicode );
5407
- size = PyUnicode_GET_LENGTH (unicode );
5389
+ enum PyUnicode_Kind kind = PyUnicode_KIND (unicode );
5390
+ void * data = PyUnicode_DATA (unicode );
5391
+ Py_ssize_t size = PyUnicode_GET_LENGTH (unicode );
5392
+
5393
+ _PyBytesWriter writer ;
5394
+ char * end ;
5408
5395
5409
5396
switch (kind ) {
5410
5397
default :
5411
5398
Py_UNREACHABLE ();
5412
5399
case PyUnicode_1BYTE_KIND :
5413
5400
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5414
5401
assert (!PyUnicode_IS_ASCII (unicode ));
5415
- return ucs1lib_utf8_encoder (unicode , data , size , error_handler , errors );
5402
+ end = ucs1lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5403
+ break ;
5404
+ case PyUnicode_2BYTE_KIND :
5405
+ end = ucs2lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5406
+ break ;
5407
+ case PyUnicode_4BYTE_KIND :
5408
+ end = ucs4lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5409
+ break ;
5410
+ }
5411
+
5412
+ if (end == NULL ) {
5413
+ return NULL ;
5414
+ }
5415
+ return _PyBytesWriter_Finish (& writer , end );
5416
+ }
5417
+
5418
+ static int
5419
+ unicode_fill_utf8 (PyObject * unicode )
5420
+ {
5421
+ /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5422
+ assert (!PyUnicode_IS_ASCII (unicode ));
5423
+
5424
+ enum PyUnicode_Kind kind = PyUnicode_KIND (unicode );
5425
+ void * data = PyUnicode_DATA (unicode );
5426
+ Py_ssize_t size = PyUnicode_GET_LENGTH (unicode );
5427
+
5428
+ _PyBytesWriter writer ;
5429
+ char * end ;
5430
+
5431
+ switch (kind ) {
5432
+ default :
5433
+ Py_UNREACHABLE ();
5434
+ case PyUnicode_1BYTE_KIND :
5435
+ end = ucs1lib_utf8_encoder (& writer , unicode , data , size ,
5436
+ _Py_ERROR_STRICT , NULL );
5437
+ break ;
5416
5438
case PyUnicode_2BYTE_KIND :
5417
- return ucs2lib_utf8_encoder (unicode , data , size , error_handler , errors );
5439
+ end = ucs2lib_utf8_encoder (& writer , unicode , data , size ,
5440
+ _Py_ERROR_STRICT , NULL );
5441
+ break ;
5418
5442
case PyUnicode_4BYTE_KIND :
5419
- return ucs4lib_utf8_encoder (unicode , data , size , error_handler , errors );
5443
+ end = ucs4lib_utf8_encoder (& writer , unicode , data , size ,
5444
+ _Py_ERROR_STRICT , NULL );
5445
+ break ;
5446
+ }
5447
+ if (end == NULL ) {
5448
+ // _PyBytesWriter_Dealloc is called in encoder
5449
+ return -1 ;
5450
+ }
5451
+
5452
+ char * start = writer .use_small_buffer ? writer .small_buffer :
5453
+ PyBytes_AS_STRING (writer .buffer );
5454
+ Py_ssize_t len = end - start ;
5455
+
5456
+ char * cache = PyObject_MALLOC (len + 1 );
5457
+ if (cache == NULL ) {
5458
+ _PyBytesWriter_Dealloc (& writer );
5459
+ PyErr_NoMemory ();
5460
+ return -1 ;
5420
5461
}
5462
+ _PyUnicode_UTF8 (unicode ) = cache ;
5463
+ _PyUnicode_UTF8_LENGTH (unicode ) = len ;
5464
+ memcpy (cache , start , len );
5465
+ cache [len ] = '\0' ;
5466
+ _PyBytesWriter_Dealloc (& writer );
5467
+ return 0 ;
5421
5468
}
5422
5469
5423
5470
PyObject *
0 commit comments