Skip to content

Commit bd64bcb

Browse files
committed
Don't error on incomplete UTF-8
1 parent 85eabcd commit bd64bcb

File tree

1 file changed

+76
-128
lines changed

1 file changed

+76
-128
lines changed

library/std/src/sys/pal/windows/stdio.rs

Lines changed: 76 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
#![unstable(issue = "none", feature = "windows_stdio")]
22

3-
use core::str::utf8_char_width;
4-
53
use super::api::{self, WinError};
64
use crate::mem::MaybeUninit;
75
use crate::os::windows::io::{FromRawHandle, IntoRawHandle};
86
use crate::sys::handle::Handle;
97
use crate::sys::{c, cvt};
10-
use crate::{cmp, io, ptr, str};
8+
use crate::{cmp, io, ptr};
119

1210
#[cfg(test)]
1311
mod tests;
@@ -19,13 +17,9 @@ pub struct Stdin {
1917
incomplete_utf8: IncompleteUtf8,
2018
}
2119

22-
pub struct Stdout {
23-
incomplete_utf8: IncompleteUtf8,
24-
}
20+
pub struct Stdout {}
2521

26-
pub struct Stderr {
27-
incomplete_utf8: IncompleteUtf8,
28-
}
22+
pub struct Stderr {}
2923

3024
struct IncompleteUtf8 {
3125
bytes: [u8; 4],
@@ -98,7 +92,7 @@ fn is_utf8_console() -> bool {
9892
false
9993
}
10094

101-
fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> io::Result<usize> {
95+
fn write(handle_id: u32, data: &[u8]) -> io::Result<usize> {
10296
if data.is_empty() {
10397
return Ok(0);
10498
}
@@ -112,134 +106,41 @@ fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> i
112106
return ret;
113107
}
114108
} else {
115-
write_console_utf16(data, incomplete_utf8, handle)
109+
write_console_utf16(data, handle)
116110
}
117111
}
118112

119-
fn write_console_utf16(
120-
data: &[u8],
121-
incomplete_utf8: &mut IncompleteUtf8,
122-
handle: c::HANDLE,
123-
) -> io::Result<usize> {
124-
if incomplete_utf8.len > 0 {
125-
assert!(
126-
incomplete_utf8.len < 4,
127-
"Unexpected number of bytes for incomplete UTF-8 codepoint."
128-
);
129-
if data[0] >> 6 != 0b10 {
130-
// not a continuation byte - reject
131-
incomplete_utf8.len = 0;
132-
return Err(io::const_error!(
133-
io::ErrorKind::InvalidData,
134-
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
135-
));
136-
}
137-
incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0];
138-
incomplete_utf8.len += 1;
139-
let char_width = utf8_char_width(incomplete_utf8.bytes[0]);
140-
if (incomplete_utf8.len as usize) < char_width {
141-
// more bytes needed
142-
return Ok(1);
143-
}
144-
let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]);
145-
incomplete_utf8.len = 0;
146-
match s {
147-
Ok(s) => {
148-
assert_eq!(char_width, s.len());
149-
let written = write_valid_utf8_to_console(handle, s)?;
150-
assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes
151-
return Ok(1);
152-
}
153-
Err(_) => {
154-
return Err(io::const_error!(
155-
io::ErrorKind::InvalidData,
156-
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
157-
));
158-
}
159-
}
160-
}
161-
162-
// As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
163-
// which needs to be encoded as UTF-16.
164-
//
165-
// If the data is not valid UTF-8 we write out as many bytes as are valid.
166-
// If the first byte is invalid it is either first byte of a multi-byte sequence but the
167-
// provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
168-
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2);
169-
let utf8 = match str::from_utf8(&data[..len]) {
170-
Ok(s) => s,
171-
Err(ref e) if e.valid_up_to() == 0 => {
172-
let first_byte_char_width = utf8_char_width(data[0]);
173-
if first_byte_char_width > 1 && data.len() < first_byte_char_width {
174-
incomplete_utf8.bytes[0] = data[0];
175-
incomplete_utf8.len = 1;
176-
return Ok(1);
177-
} else {
178-
return Err(io::const_error!(
179-
io::ErrorKind::InvalidData,
180-
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
181-
));
182-
}
183-
}
184-
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
185-
};
186-
187-
write_valid_utf8_to_console(handle, utf8)
113+
fn write_console_utf16(data: &[u8], handle: c::HANDLE) -> io::Result<usize> {
114+
let mut buffer = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
115+
let data = &data[..data.len().min(buffer.len())];
116+
117+
// Split off any trailing incomplete UTF-8 from the end of the input; only `utf8.len()` bytes are reported as written below.
118+
let utf8 = trim_last_char_boundary(data);
119+
let utf16 = utf8_to_utf16_lossy(utf8, &mut buffer);
120+
debug_assert!(!utf16.is_empty());
121+
122+
// Write the UTF-16 code units to the console.
123+
// This will succeed in one write so long as our [u16] slice is smaller than the console's buffer,
124+
// which we've ensured by truncating the input (see `MAX_BUFFER_SIZE`).
125+
let written = write_u16s(handle, &utf16)?;
126+
debug_assert_eq!(written, utf16.len());
127+
Ok(utf8.len())
188128
}
189129

190-
fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
191-
debug_assert!(!utf8.is_empty());
192-
193-
let mut utf16 = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
194-
let utf8 = &utf8[..utf8.floor_char_boundary(utf16.len())];
195-
196-
let utf16: &[u16] = unsafe {
197-
// Note that this theoretically checks validity twice in the (most common) case
198-
// where the underlying byte sequence is valid utf-8 (given the check in `write()`).
130+
fn utf8_to_utf16_lossy<'a>(utf8: &[u8], utf16: &'a mut [MaybeUninit<u16>]) -> &'a [u16] {
131+
unsafe {
199132
let result = c::MultiByteToWideChar(
200133
c::CP_UTF8, // CodePage
201-
c::MB_ERR_INVALID_CHARS, // dwFlags
134+
0, // dwFlags
202135
utf8.as_ptr(), // lpMultiByteStr
203136
utf8.len() as i32, // cbMultiByte
204137
utf16.as_mut_ptr() as *mut c::WCHAR, // lpWideCharStr
205138
utf16.len() as i32, // cchWideChar
206139
);
207-
assert!(result != 0, "Unexpected error in MultiByteToWideChar");
208-
140+
// The only way an error can happen here is if we've messed up.
141+
debug_assert!(result != 0, "Unexpected error in MultiByteToWideChar");
209142
// Safety: MultiByteToWideChar initializes `result` values.
210143
MaybeUninit::slice_assume_init_ref(&utf16[..result as usize])
211-
};
212-
213-
let mut written = write_u16s(handle, utf16)?;
214-
215-
// Figure out how many bytes of as UTF-8 were written away as UTF-16.
216-
if written == utf16.len() {
217-
Ok(utf8.len())
218-
} else {
219-
// Make sure we didn't end up writing only half of a surrogate pair (even though the chance
220-
// is tiny). Because it is not possible for user code to re-slice `data` in such a way that
221-
// a missing surrogate can be produced (and also because of the UTF-8 validation above),
222-
// write the missing surrogate out now.
223-
// Buffering it would mean we have to lie about the number of bytes written.
224-
let first_code_unit_remaining = utf16[written];
225-
if matches!(first_code_unit_remaining, 0xDCEE..=0xDFFF) {
226-
// low surrogate
227-
// We just hope this works, and give up otherwise
228-
let _ = write_u16s(handle, &utf16[written..written + 1]);
229-
written += 1;
230-
}
231-
// Calculate the number of bytes of `utf8` that were actually written.
232-
let mut count = 0;
233-
for ch in utf16[..written].iter() {
234-
count += match ch {
235-
0x0000..=0x007F => 1,
236-
0x0080..=0x07FF => 2,
237-
0xDCEE..=0xDFFF => 1, // Low surrogate. We already counted 3 bytes for the other.
238-
_ => 3,
239-
};
240-
}
241-
debug_assert!(String::from_utf16(&utf16[..written]).unwrap() == utf8[..count]);
242-
Ok(count)
243144
}
244145
}
245146

@@ -432,13 +333,13 @@ impl IncompleteUtf8 {
432333

433334
impl Stdout {
434335
pub const fn new() -> Stdout {
435-
Stdout { incomplete_utf8: IncompleteUtf8::new() }
336+
Stdout {}
436337
}
437338
}
438339

439340
impl io::Write for Stdout {
440341
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
441-
write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8)
342+
write(c::STD_OUTPUT_HANDLE, buf)
442343
}
443344

444345
fn flush(&mut self) -> io::Result<()> {
@@ -448,13 +349,13 @@ impl io::Write for Stdout {
448349

449350
impl Stderr {
450351
pub const fn new() -> Stderr {
451-
Stderr { incomplete_utf8: IncompleteUtf8::new() }
352+
Stderr {}
452353
}
453354
}
454355

455356
impl io::Write for Stderr {
456357
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
457-
write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8)
358+
write(c::STD_ERROR_HANDLE, buf)
458359
}
459360

460361
fn flush(&mut self) -> io::Result<()> {
@@ -469,3 +370,50 @@ pub fn is_ebadf(err: &io::Error) -> bool {
469370
pub fn panic_output() -> Option<impl io::Write> {
470371
Some(Stderr::new())
471372
}
373+
374+
/// Trim one incomplete UTF-8 char from the end of a byte slice.
375+
///
376+
/// If trimming would lead to an empty slice then it returns `bytes` instead.
377+
///
378+
/// Note: This function is optimized for size rather than speed.
379+
pub fn trim_last_char_boundary(bytes: &[u8]) -> &[u8] {
380+
// UTF-8's multiple-byte encoding uses the leading bits to encode the length of a code point.
381+
// The bits of a multi-byte sequence are (where `n` is a placeholder for any bit):
382+
//
383+
// 11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
384+
// 1110nnnn 10nnnnnn 10nnnnnn
385+
// 110nnnnn 10nnnnnn
386+
//
387+
// So it follows that an incomplete sequence is one of these:
388+
// 11110nnn 10nnnnnn 10nnnnnn
389+
// 11110nnn 10nnnnnn
390+
// 1110nnnn 10nnnnnn
391+
// 11110nnn
392+
// 1110nnnn
393+
// 110nnnnn
394+
395+
// Pack the last three bytes of the slice (or both bytes of a two-byte slice) into a u32, final byte lowest,
396+
// because it turns out the compiler is very good at optimizing numbers.
397+
let u = match bytes {
398+
[.., b1, b2, b3] => (*b1 as u32) << 16 | (*b2 as u32) << 8 | *b3 as u32,
399+
[.., b1, b2] => (*b1 as u32) << 8 | *b2 as u32,
400+
// If it's just a single byte or empty then we return the full slice
401+
_ => return bytes,
402+
};
403+
if (u & 0b_11111000_11000000_11000000 == 0b_11110000_10000000_10000000) && bytes.len() >= 4 {
404+
&bytes[..bytes.len() - 3] // 4-byte lead plus two continuation bytes; the length check keeps at least one byte
405+
} else if (u & 0b_11111000_11000000 == 0b_11110000_10000000
406+
|| u & 0b_11110000_11000000 == 0b_11100000_10000000)
407+
&& bytes.len() >= 3
408+
{
409+
&bytes[..bytes.len() - 2] // 3- or 4-byte lead plus one continuation byte
410+
} else if (u & 0b_1111_1000 == 0b_1111_0000
411+
|| u & 0b_11110000 == 0b_11100000
412+
|| u & 0b_11100000 == 0b_11000000)
413+
&& bytes.len() >= 2
414+
{
415+
&bytes[..bytes.len() - 1] // lone lead byte of a 2-, 3- or 4-byte sequence
416+
} else {
417+
bytes // no incomplete tail matched, or trimming would have emptied the slice
418+
}
419+
}

0 commit comments

Comments
 (0)