@@ -202,6 +202,11 @@ static PyObject *
202
202
unicode_decode_utf8 (const char * s , Py_ssize_t size ,
203
203
_Py_error_handler error_handler , const char * errors ,
204
204
Py_ssize_t * consumed );
205
/* Forward declaration of the writer-based UTF-8 decoder.
 *
 * The definition appears further down in this file; it is declared here
 * because unicode_fromformat_write_utf8() calls it before that point.
 * It decodes `size` bytes starting at `s` into `writer` and returns 0 on
 * success or -1 on failure.  `consumed` may be NULL.
 */
+ static int
206
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
207
+ const char * s , Py_ssize_t size ,
208
+ _Py_error_handler error_handler , const char * errors ,
209
+ Py_ssize_t * consumed );
205
210
#ifdef Py_DEBUG
206
211
static inline int unicode_is_finalizing (void );
207
212
static int unicode_is_singleton (PyObject * unicode );
@@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2377
2382
}
2378
2383
2379
2384
static int
2380
- unicode_fromformat_write_cstr (_PyUnicodeWriter * writer , const char * str ,
2385
+ unicode_fromformat_write_utf8 (_PyUnicodeWriter * writer , const char * str ,
2381
2386
Py_ssize_t width , Py_ssize_t precision , int flags )
2382
2387
{
2383
2388
/* UTF-8 */
2384
2389
Py_ssize_t length ;
2385
- PyObject * unicode ;
2386
- int res ;
2387
-
2388
2390
if (precision == -1 ) {
2389
2391
length = strlen (str );
2390
2392
}
@@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2394
2396
length ++ ;
2395
2397
}
2396
2398
}
2397
- unicode = PyUnicode_DecodeUTF8Stateful (str , length , "replace" , NULL );
2399
+
2400
+ if (width < 0 ) {
2401
+ return unicode_decode_utf8_writer (writer , str , length ,
2402
+ _Py_ERROR_REPLACE , "replace" , NULL );
2403
+ }
2404
+
2405
+ PyObject * unicode = PyUnicode_DecodeUTF8Stateful (str , length ,
2406
+ "replace" , NULL );
2398
2407
if (unicode == NULL )
2399
2408
return -1 ;
2400
2409
2401
- res = unicode_fromformat_write_str (writer , unicode , width , -1 , flags );
2410
+ int res = unicode_fromformat_write_str (writer , unicode ,
2411
+ width , -1 , flags );
2402
2412
Py_DECREF (unicode );
2403
2413
return res ;
2404
2414
}
@@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2700
2710
else {
2701
2711
/* UTF-8 */
2702
2712
const char * s = va_arg (* vargs , const char * );
2703
- if (unicode_fromformat_write_cstr (writer , s , width , precision , flags ) < 0 )
2713
+ if (unicode_fromformat_write_utf8 (writer , s , width , precision , flags ) < 0 )
2704
2714
return NULL ;
2705
2715
}
2706
2716
break ;
@@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2739
2749
}
2740
2750
else {
2741
2751
assert (str != NULL );
2742
- if (unicode_fromformat_write_cstr (writer , str , width , precision , flags ) < 0 )
2752
+ if (unicode_fromformat_write_utf8 (writer , str , width , precision , flags ) < 0 )
2743
2753
return NULL ;
2744
2754
}
2745
2755
break ;
@@ -4737,65 +4747,33 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4737
4747
return p - start ;
4738
4748
}
4739
4749
4740
- static PyObject *
4741
- unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4742
- _Py_error_handler error_handler , const char * errors ,
4743
- Py_ssize_t * consumed )
4744
- {
4745
- if (size == 0 ) {
4746
- if (consumed )
4747
- * consumed = 0 ;
4748
- _Py_RETURN_UNICODE_EMPTY ();
4749
- }
4750
-
4751
- /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4752
- if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4753
- if (consumed ) {
4754
- * consumed = 1 ;
4755
- }
4756
- return get_latin1_char ((unsigned char )s [0 ]);
4757
- }
4758
-
4759
- const char * starts = s ;
4760
- const char * end = s + size ;
4761
-
4762
- // fast path: try ASCII string.
4763
- PyObject * u = PyUnicode_New (size , 127 );
4764
- if (u == NULL ) {
4765
- return NULL ;
4766
- }
4767
- s += ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4768
- if (s == end ) {
4769
- if (consumed ) {
4770
- * consumed = size ;
4771
- }
4772
- return u ;
4773
- }
4774
-
4775
- // Use _PyUnicodeWriter after fast path is failed.
4776
- _PyUnicodeWriter writer ;
4777
- _PyUnicodeWriter_InitWithBuffer (& writer , u );
4778
- writer .pos = s - starts ;
4779
4750
4751
+ static int
4752
+ unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
4753
+ const char * starts , const char * s , const char * end ,
4754
+ _Py_error_handler error_handler ,
4755
+ const char * errors ,
4756
+ Py_ssize_t * consumed )
4757
+ {
4780
4758
Py_ssize_t startinpos , endinpos ;
4781
4759
const char * errmsg = "" ;
4782
4760
PyObject * error_handler_obj = NULL ;
4783
4761
PyObject * exc = NULL ;
4784
4762
4785
4763
while (s < end ) {
4786
4764
Py_UCS4 ch ;
4787
- int kind = writer . kind ;
4765
+ int kind = writer -> kind ;
4788
4766
4789
4767
if (kind == PyUnicode_1BYTE_KIND ) {
4790
- if (PyUnicode_IS_ASCII (writer . buffer ))
4791
- ch = asciilib_utf8_decode (& s , end , writer . data , & writer . pos );
4768
+ if (PyUnicode_IS_ASCII (writer -> buffer ))
4769
+ ch = asciilib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4792
4770
else
4793
- ch = ucs1lib_utf8_decode (& s , end , writer . data , & writer . pos );
4771
+ ch = ucs1lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4794
4772
} else if (kind == PyUnicode_2BYTE_KIND ) {
4795
- ch = ucs2lib_utf8_decode (& s , end , writer . data , & writer . pos );
4773
+ ch = ucs2lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4796
4774
} else {
4797
4775
assert (kind == PyUnicode_4BYTE_KIND );
4798
- ch = ucs4lib_utf8_decode (& s , end , writer . data , & writer . pos );
4776
+ ch = ucs4lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4799
4777
}
4800
4778
4801
4779
switch (ch ) {
@@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4826
4804
endinpos = startinpos + ch - 1 ;
4827
4805
break ;
4828
4806
default :
4829
- if (_PyUnicodeWriter_WriteCharInline (& writer , ch ) < 0 )
4807
+ // ch doesn't fit into kind, so change the buffer kind to write
4808
+ // the character
4809
+ if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
4830
4810
goto onError ;
4831
4811
continue ;
4832
4812
}
@@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4840
4820
break ;
4841
4821
4842
4822
case _Py_ERROR_REPLACE :
4843
- if (_PyUnicodeWriter_WriteCharInline (& writer , 0xfffd ) < 0 )
4823
+ if (_PyUnicodeWriter_WriteCharInline (writer , 0xfffd ) < 0 )
4844
4824
goto onError ;
4845
4825
s += (endinpos - startinpos );
4846
4826
break ;
@@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4849
4829
{
4850
4830
Py_ssize_t i ;
4851
4831
4852
- if (_PyUnicodeWriter_PrepareKind (& writer , PyUnicode_2BYTE_KIND ) < 0 )
4832
+ if (_PyUnicodeWriter_PrepareKind (writer , PyUnicode_2BYTE_KIND ) < 0 )
4853
4833
goto onError ;
4854
4834
for (i = startinpos ; i < endinpos ; i ++ ) {
4855
4835
ch = (Py_UCS4 )(unsigned char )(starts [i ]);
4856
- PyUnicode_WRITE (writer . kind , writer . data , writer . pos ,
4836
+ PyUnicode_WRITE (writer -> kind , writer -> data , writer -> pos ,
4857
4837
ch + 0xdc00 );
4858
- writer . pos ++ ;
4838
+ writer -> pos ++ ;
4859
4839
}
4860
4840
s += (endinpos - startinpos );
4861
4841
break ;
@@ -4866,8 +4846,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4866
4846
errors , & error_handler_obj ,
4867
4847
"utf-8" , errmsg ,
4868
4848
& starts , & end , & startinpos , & endinpos , & exc , & s ,
4869
- & writer ))
4849
+ writer )) {
4870
4850
goto onError ;
4851
+ }
4852
+
4853
+ if (_PyUnicodeWriter_Prepare (writer , end - s , 127 ) < 0 ) {
4854
+ return -1 ;
4855
+ }
4871
4856
}
4872
4857
}
4873
4858
@@ -4877,13 +4862,107 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4877
4862
4878
4863
Py_XDECREF (error_handler_obj );
4879
4864
Py_XDECREF (exc );
4880
- return _PyUnicodeWriter_Finish ( & writer ) ;
4865
+ return 0 ;
4881
4866
4882
4867
onError :
4883
4868
Py_XDECREF (error_handler_obj );
4884
4869
Py_XDECREF (exc );
4885
- _PyUnicodeWriter_Dealloc (& writer );
4886
- return NULL ;
4870
+ return -1 ;
4871
+ }
4872
+
4873
+
4874
+ static PyObject *
4875
+ unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4876
+ _Py_error_handler error_handler , const char * errors ,
4877
+ Py_ssize_t * consumed )
4878
+ {
4879
+ if (size == 0 ) {
4880
+ if (consumed ) {
4881
+ * consumed = 0 ;
4882
+ }
4883
+ _Py_RETURN_UNICODE_EMPTY ();
4884
+ }
4885
+
4886
+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4887
+ if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4888
+ if (consumed ) {
4889
+ * consumed = 1 ;
4890
+ }
4891
+ return get_latin1_char ((unsigned char )s [0 ]);
4892
+ }
4893
+
4894
+ // fast path: try ASCII string.
4895
+ const char * starts = s ;
4896
+ const char * end = s + size ;
4897
+ PyObject * u = PyUnicode_New (size , 127 );
4898
+ if (u == NULL ) {
4899
+ return NULL ;
4900
+ }
4901
+ Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4902
+ if (decoded == size ) {
4903
+ if (consumed ) {
4904
+ * consumed = size ;
4905
+ }
4906
+ return u ;
4907
+ }
4908
+ s += decoded ;
4909
+ size -= decoded ;
4910
+
4911
+ // Use _PyUnicodeWriter after fast path is failed.
4912
+ _PyUnicodeWriter writer ;
4913
+ _PyUnicodeWriter_InitWithBuffer (& writer , u );
4914
+ writer .pos = decoded ;
4915
+
4916
+ if (unicode_decode_utf8_impl (& writer , starts , s , end ,
4917
+ error_handler , errors ,
4918
+ consumed ) < 0 ) {
4919
+ _PyUnicodeWriter_Dealloc (& writer );
4920
+ return NULL ;
4921
+ }
4922
+ return _PyUnicodeWriter_Finish (& writer );
4923
+ }
4924
+
4925
+
4926
+ static int
4927
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4928
+ const char * s , Py_ssize_t size ,
4929
+ _Py_error_handler error_handler , const char * errors ,
4930
+ Py_ssize_t * consumed )
4931
+ {
4932
+ if (size == 0 ) {
4933
+ if (consumed ) {
4934
+ * consumed = 0 ;
4935
+ }
4936
+ return 0 ;
4937
+ }
4938
+
4939
+ // fast path: try ASCII string.
4940
+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4941
+ return -1 ;
4942
+ }
4943
+
4944
+ const char * starts = s ;
4945
+ const char * end = s + size ;
4946
+ Py_ssize_t decoded = 0 ;
4947
+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4948
+ if (writer -> kind == PyUnicode_1BYTE_KIND
4949
+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4950
+ {
4951
+ decoded = ascii_decode (s , end , dest );
4952
+ writer -> pos += decoded ;
4953
+
4954
+ if (decoded == size ) {
4955
+ if (consumed ) {
4956
+ * consumed = size ;
4957
+ }
4958
+ return 0 ;
4959
+ }
4960
+ s += decoded ;
4961
+ size -= decoded ;
4962
+ }
4963
+
4964
+ return unicode_decode_utf8_impl (writer , starts , s , end ,
4965
+ error_handler , errors , consumed );
4887
4966
}
4888
4967
4889
4968
0 commit comments