1
1
#![ unstable( issue = "none" , feature = "windows_stdio" ) ]
2
2
3
- use core:: str:: utf8_char_width;
4
-
5
3
use super :: api:: { self , WinError } ;
6
4
use crate :: mem:: MaybeUninit ;
7
5
use crate :: os:: windows:: io:: { FromRawHandle , IntoRawHandle } ;
8
6
use crate :: sys:: handle:: Handle ;
9
7
use crate :: sys:: { c, cvt} ;
10
- use crate :: { cmp, io, ptr, str } ;
8
+ use crate :: { cmp, io, ptr} ;
11
9
12
10
#[ cfg( test) ]
13
11
mod tests;
@@ -19,13 +17,9 @@ pub struct Stdin {
19
17
incomplete_utf8 : IncompleteUtf8 ,
20
18
}
21
19
22
/// Handle type for the process's standard output.
///
/// The type has no fields: it carries no state between writes and only
/// serves as the receiver for the `io::Write` implementation in this file.
pub struct Stdout {}
25
21
26
/// Handle type for the process's standard error.
///
/// The type has no fields: it carries no state between writes and only
/// serves as the receiver for the `io::Write` implementation in this file.
pub struct Stderr {}
29
23
30
24
struct IncompleteUtf8 {
31
25
bytes : [ u8 ; 4 ] ,
@@ -98,7 +92,7 @@ fn is_utf8_console() -> bool {
98
92
false
99
93
}
100
94
101
- fn write ( handle_id : u32 , data : & [ u8 ] , incomplete_utf8 : & mut IncompleteUtf8 ) -> io:: Result < usize > {
95
+ fn write ( handle_id : u32 , data : & [ u8 ] ) -> io:: Result < usize > {
102
96
if data. is_empty ( ) {
103
97
return Ok ( 0 ) ;
104
98
}
@@ -112,134 +106,41 @@ fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> i
112
106
return ret;
113
107
}
114
108
} else {
115
- write_console_utf16 ( data, incomplete_utf8 , handle)
109
+ write_console_utf16 ( data, handle)
116
110
}
117
111
}
118
112
119
- fn write_console_utf16 (
120
- data : & [ u8 ] ,
121
- incomplete_utf8 : & mut IncompleteUtf8 ,
122
- handle : c:: HANDLE ,
123
- ) -> io:: Result < usize > {
124
- if incomplete_utf8. len > 0 {
125
- assert ! (
126
- incomplete_utf8. len < 4 ,
127
- "Unexpected number of bytes for incomplete UTF-8 codepoint."
128
- ) ;
129
- if data[ 0 ] >> 6 != 0b10 {
130
- // not a continuation byte - reject
131
- incomplete_utf8. len = 0 ;
132
- return Err ( io:: const_error!(
133
- io:: ErrorKind :: InvalidData ,
134
- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
135
- ) ) ;
136
- }
137
- incomplete_utf8. bytes [ incomplete_utf8. len as usize ] = data[ 0 ] ;
138
- incomplete_utf8. len += 1 ;
139
- let char_width = utf8_char_width ( incomplete_utf8. bytes [ 0 ] ) ;
140
- if ( incomplete_utf8. len as usize ) < char_width {
141
- // more bytes needed
142
- return Ok ( 1 ) ;
143
- }
144
- let s = str:: from_utf8 ( & incomplete_utf8. bytes [ 0 ..incomplete_utf8. len as usize ] ) ;
145
- incomplete_utf8. len = 0 ;
146
- match s {
147
- Ok ( s) => {
148
- assert_eq ! ( char_width, s. len( ) ) ;
149
- let written = write_valid_utf8_to_console ( handle, s) ?;
150
- assert_eq ! ( written, s. len( ) ) ; // guaranteed by write_valid_utf8_to_console() for single codepoint writes
151
- return Ok ( 1 ) ;
152
- }
153
- Err ( _) => {
154
- return Err ( io:: const_error!(
155
- io:: ErrorKind :: InvalidData ,
156
- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
157
- ) ) ;
158
- }
159
- }
160
- }
161
-
162
- // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
163
- // which needs to be encoded as UTF-16.
164
- //
165
- // If the data is not valid UTF-8 we write out as many bytes as are valid.
166
- // If the first byte is invalid it is either first byte of a multi-byte sequence but the
167
- // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
168
- let len = cmp:: min ( data. len ( ) , MAX_BUFFER_SIZE / 2 ) ;
169
- let utf8 = match str:: from_utf8 ( & data[ ..len] ) {
170
- Ok ( s) => s,
171
- Err ( ref e) if e. valid_up_to ( ) == 0 => {
172
- let first_byte_char_width = utf8_char_width ( data[ 0 ] ) ;
173
- if first_byte_char_width > 1 && data. len ( ) < first_byte_char_width {
174
- incomplete_utf8. bytes [ 0 ] = data[ 0 ] ;
175
- incomplete_utf8. len = 1 ;
176
- return Ok ( 1 ) ;
177
- } else {
178
- return Err ( io:: const_error!(
179
- io:: ErrorKind :: InvalidData ,
180
- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
181
- ) ) ;
182
- }
183
- }
184
- Err ( e) => str:: from_utf8 ( & data[ ..e. valid_up_to ( ) ] ) . unwrap ( ) ,
185
- } ;
186
-
187
- write_valid_utf8_to_console ( handle, utf8)
113
+ fn write_console_utf16 ( data : & [ u8 ] , handle : c:: HANDLE ) -> io:: Result < usize > {
114
+ let mut buffer = [ MaybeUninit :: < u16 > :: uninit ( ) ; MAX_BUFFER_SIZE / 2 ] ;
115
+ let data = & data[ ..data. len ( ) . min ( buffer. len ( ) ) ] ;
116
+
117
+ // Split off any trailing incomplete UTF-8 from the end of the input.
118
+ let utf8 = trim_last_char_boundary ( data) ;
119
+ let utf16 = utf8_to_utf16_lossy ( utf8, & mut buffer) ;
120
+ debug_assert ! ( !utf16. is_empty( ) ) ;
121
+
122
+ // Write the UTF-16 chars to the console.
123
+ // This will succeed in one write so long as our [u16] slice is smaller than the console's buffer,
124
+ // which we've ensured by truncating the input (see `MAX_BUFFER_SIZE`).
125
+ let written = write_u16s ( handle, & utf16) ?;
126
+ debug_assert_eq ! ( written, utf16. len( ) ) ;
127
+ Ok ( utf8. len ( ) )
188
128
}
189
129
190
- fn write_valid_utf8_to_console ( handle : c:: HANDLE , utf8 : & str ) -> io:: Result < usize > {
191
- debug_assert ! ( !utf8. is_empty( ) ) ;
192
-
193
- let mut utf16 = [ MaybeUninit :: < u16 > :: uninit ( ) ; MAX_BUFFER_SIZE / 2 ] ;
194
- let utf8 = & utf8[ ..utf8. floor_char_boundary ( utf16. len ( ) ) ] ;
195
-
196
- let utf16: & [ u16 ] = unsafe {
197
- // Note that this theoretically checks validity twice in the (most common) case
198
- // where the underlying byte sequence is valid utf-8 (given the check in `write()`).
130
+ fn utf8_to_utf16_lossy < ' a > ( utf8 : & [ u8 ] , utf16 : & ' a mut [ MaybeUninit < u16 > ] ) -> & ' a [ u16 ] {
131
+ unsafe {
199
132
let result = c:: MultiByteToWideChar (
200
133
c:: CP_UTF8 , // CodePage
201
- c :: MB_ERR_INVALID_CHARS , // dwFlags
134
+ 0 , // dwFlags
202
135
utf8. as_ptr ( ) , // lpMultiByteStr
203
136
utf8. len ( ) as i32 , // cbMultiByte
204
137
utf16. as_mut_ptr ( ) as * mut c:: WCHAR , // lpWideCharStr
205
138
utf16. len ( ) as i32 , // cchWideChar
206
139
) ;
207
- assert ! ( result != 0 , "Unexpected error in MultiByteToWideChar" ) ;
208
-
140
+ // The only way an error can happen here is if we've messed up.
141
+ debug_assert ! ( result != 0 , "Unexpected error in MultiByteToWideChar" ) ;
209
142
// Safety: MultiByteToWideChar initializes `result` values.
210
143
MaybeUninit :: slice_assume_init_ref ( & utf16[ ..result as usize ] )
211
- } ;
212
-
213
- let mut written = write_u16s ( handle, utf16) ?;
214
-
215
- // Figure out how many bytes of as UTF-8 were written away as UTF-16.
216
- if written == utf16. len ( ) {
217
- Ok ( utf8. len ( ) )
218
- } else {
219
- // Make sure we didn't end up writing only half of a surrogate pair (even though the chance
220
- // is tiny). Because it is not possible for user code to re-slice `data` in such a way that
221
- // a missing surrogate can be produced (and also because of the UTF-8 validation above),
222
- // write the missing surrogate out now.
223
- // Buffering it would mean we have to lie about the number of bytes written.
224
- let first_code_unit_remaining = utf16[ written] ;
225
- if matches ! ( first_code_unit_remaining, 0xDCEE ..=0xDFFF ) {
226
- // low surrogate
227
- // We just hope this works, and give up otherwise
228
- let _ = write_u16s ( handle, & utf16[ written..written + 1 ] ) ;
229
- written += 1 ;
230
- }
231
- // Calculate the number of bytes of `utf8` that were actually written.
232
- let mut count = 0 ;
233
- for ch in utf16[ ..written] . iter ( ) {
234
- count += match ch {
235
- 0x0000 ..=0x007F => 1 ,
236
- 0x0080 ..=0x07FF => 2 ,
237
- 0xDCEE ..=0xDFFF => 1 , // Low surrogate. We already counted 3 bytes for the other.
238
- _ => 3 ,
239
- } ;
240
- }
241
- debug_assert ! ( String :: from_utf16( & utf16[ ..written] ) . unwrap( ) == utf8[ ..count] ) ;
242
- Ok ( count)
243
144
}
244
145
}
245
146
@@ -432,13 +333,13 @@ impl IncompleteUtf8 {
432
333
433
334
impl Stdout {
434
335
pub const fn new ( ) -> Stdout {
435
- Stdout { incomplete_utf8 : IncompleteUtf8 :: new ( ) }
336
+ Stdout { }
436
337
}
437
338
}
438
339
439
340
impl io:: Write for Stdout {
440
341
fn write ( & mut self , buf : & [ u8 ] ) -> io:: Result < usize > {
441
- write ( c:: STD_OUTPUT_HANDLE , buf, & mut self . incomplete_utf8 )
342
+ write ( c:: STD_OUTPUT_HANDLE , buf)
442
343
}
443
344
444
345
fn flush ( & mut self ) -> io:: Result < ( ) > {
@@ -448,13 +349,13 @@ impl io::Write for Stdout {
448
349
449
350
impl Stderr {
450
351
pub const fn new ( ) -> Stderr {
451
- Stderr { incomplete_utf8 : IncompleteUtf8 :: new ( ) }
352
+ Stderr { }
452
353
}
453
354
}
454
355
455
356
impl io:: Write for Stderr {
456
357
fn write ( & mut self , buf : & [ u8 ] ) -> io:: Result < usize > {
457
- write ( c:: STD_ERROR_HANDLE , buf, & mut self . incomplete_utf8 )
358
+ write ( c:: STD_ERROR_HANDLE , buf)
458
359
}
459
360
460
361
fn flush ( & mut self ) -> io:: Result < ( ) > {
@@ -469,3 +370,50 @@ pub fn is_ebadf(err: &io::Error) -> bool {
469
370
pub fn panic_output ( ) -> Option < impl io:: Write > {
470
371
Some ( Stderr :: new ( ) )
471
372
}
373
+
374
/// Trim one incomplete UTF-8 char from the end of a byte slice.
///
/// Only the last one to three bytes are inspected. If they look like the
/// start of a multi-byte sequence whose remaining bytes are missing, the
/// returned slice ends just before that sequence; otherwise `bytes` is
/// returned unchanged. No other validation of the input is performed.
///
/// If trimming would lead to an empty slice then it returns `bytes` instead.
///
/// Note: This function is optimized for size rather than speed.
pub fn trim_last_char_boundary(bytes: &[u8]) -> &[u8] {
    // UTF-8's multiple-byte encoding uses the leading bits to encode the length of a code point.
    // The bits of a multi-byte sequence are (where `n` is a placeholder for any bit):
    //
    // 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
    // 1110nnnn 10nnnnnn 10nnnnnn
    // 110nnnnn 10nnnnnn
    //
    // So it follows that an incomplete sequence is one of these:
    // 11110nnn 10nnnnnn 10nnnnnn
    // 11110nnn 10nnnnnn
    // 1110nnnn 10nnnnnn
    // 11110nnn
    // 1110nnnn
    // 110nnnnn

    // Get up to three bytes from the end of the slice and encode them as a u32
    // because it turns out the compiler is very good at optimizing numbers.
    let u = match bytes {
        [.., b1, b2, b3] => (*b1 as u32) << 16 | (*b2 as u32) << 8 | *b3 as u32,
        [.., b1, b2] => (*b1 as u32) << 8 | *b2 as u32,
        // If it's just a single byte or empty then we return the full slice
        _ => return bytes,
    };
    // Each branch's length check keeps at least one byte in the result.
    if (u & 0b_11111000_11000000_11000000 == 0b_11110000_10000000_10000000) && bytes.len() >= 4 {
        // Four-byte lead followed by two continuation bytes.
        &bytes[..bytes.len() - 3]
    } else if (u & 0b_11111000_11000000 == 0b_11110000_10000000
        || u & 0b_11110000_11000000 == 0b_11100000_10000000)
        && bytes.len() >= 3
    {
        // Four- or three-byte lead followed by one continuation byte.
        &bytes[..bytes.len() - 2]
    } else if (u & 0b_1111_1000 == 0b_1111_0000
        || u & 0b_11110000 == 0b_11100000
        || u & 0b_11100000 == 0b_11000000)
        && bytes.len() >= 2
    {
        // A lone lead byte of any multi-byte length.
        &bytes[..bytes.len() - 1]
    } else {
        bytes
    }
}
0 commit comments