@@ -1305,6 +1305,45 @@ _PyUnicode_Dump(PyObject *op)
1305
1305
}
1306
1306
#endif
1307
1307
1308
+ // Simplified version of PyUnicode_New() that only creates ASCII strings.
1309
+ // This function does not test if size == 0.
1310
+ static PyObject *
1311
+ ascii_new (Py_ssize_t size )
1312
+ {
1313
+ PyObject * obj ;
1314
+ void * data ;
1315
+ Py_ssize_t struct_size = sizeof (PyASCIIObject );
1316
+
1317
+ if (size > ((PY_SSIZE_T_MAX - struct_size ) - 1 ))
1318
+ return PyErr_NoMemory ();
1319
+
1320
+ /* Duplicated allocation code from _PyObject_New() instead of a call to
1321
+ * PyObject_New() so we are able to allocate space for the object and
1322
+ * it's data buffer.
1323
+ */
1324
+ obj = (PyObject * ) PyObject_Malloc (struct_size + (size + 1 ));
1325
+ if (obj == NULL ) {
1326
+ return PyErr_NoMemory ();
1327
+ }
1328
+ _PyObject_Init (obj , & PyUnicode_Type );
1329
+
1330
+ data = ((PyASCIIObject * )obj ) + 1 ;
1331
+
1332
+ _PyUnicode_LENGTH (obj ) = size ;
1333
+ _PyUnicode_HASH (obj ) = -1 ;
1334
+ _PyUnicode_STATE (obj ).interned = 0 ;
1335
+ _PyUnicode_STATE (obj ).kind = PyUnicode_1BYTE_KIND ;
1336
+ _PyUnicode_STATE (obj ).compact = 1 ;
1337
+ _PyUnicode_STATE (obj ).ascii = 1 ;
1338
+ _PyUnicode_STATE (obj ).statically_allocated = 0 ;
1339
+ ((char * )data )[size ] = 0 ;
1340
+
1341
+ #ifdef Py_DEBUG
1342
+ unicode_fill_invalid ((PyObject * )unicode , 0 );
1343
+ #endif
1344
+ assert (_PyUnicode_CheckConsistency (obj , 0 ));
1345
+ return obj ;
1346
+ }
1308
1347
1309
1348
PyObject *
1310
1349
PyUnicode_New (Py_ssize_t size , Py_UCS4 maxchar )
@@ -2208,13 +2247,16 @@ _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2208
2247
{
2209
2248
const unsigned char * s = (const unsigned char * )buffer ;
2210
2249
PyObject * unicode ;
2250
+ if (size == 0 ) {
2251
+ return unicode_get_empty ();
2252
+ }
2211
2253
if (size == 1 ) {
2212
2254
#ifdef Py_DEBUG
2213
2255
assert ((unsigned char )s [0 ] < 128 );
2214
2256
#endif
2215
2257
return get_latin1_char (s [0 ]);
2216
2258
}
2217
- unicode = PyUnicode_New (size , 127 );
2259
+ unicode = ascii_new (size );
2218
2260
if (!unicode )
2219
2261
return NULL ;
2220
2262
memcpy (PyUnicode_1BYTE_DATA (unicode ), s , size );
@@ -5297,11 +5339,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5297
5339
5298
5340
Py_ssize_t pos = find_first_nonascii (starts , end );
5299
5341
if (pos == size ) { // fast path: ASCII string.
5300
- PyObject * u = PyUnicode_New (size , 127 );
5342
+ PyObject * u = ascii_new (size );
5301
5343
if (u == NULL ) {
5302
5344
return NULL ;
5303
5345
}
5304
- memcpy (PyUnicode_1BYTE_DATA (u ), s , size );
5346
+ // memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5347
+ // bypass iscompact & isascii checks.
5348
+ memcpy (_Py_STATIC_CAST (void * , (_PyASCIIObject_CAST (u ) + 1 )), s , size );
5305
5349
if (consumed ) {
5306
5350
* consumed = size ;
5307
5351
}
@@ -5338,7 +5382,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5338
5382
_PyUnicodeWriter writer ;
5339
5383
_PyUnicodeWriter_InitWithBuffer (& writer , u );
5340
5384
if (maxchr <= 255 ) {
5341
- memcpy (PyUnicode_1BYTE_DATA (u ), s , pos );
5385
+ memcpy (_PyUnicode_COMPACT_DATA (u ), s , pos );
5342
5386
s += pos ;
5343
5387
size -= pos ;
5344
5388
writer .pos = pos ;
@@ -7419,7 +7463,7 @@ PyUnicode_DecodeASCII(const char *s,
7419
7463
}
7420
7464
7421
7465
// Shortcut for simple case
7422
- PyObject * u = PyUnicode_New (size , 127 );
7466
+ PyObject * u = ascii_new (size );
7423
7467
if (u == NULL ) {
7424
7468
return NULL ;
7425
7469
}
0 commit comments