Skip to content

Commit 27393d5

Browse files
committed
fix incomplete UTF-8 writes in Windows console stdio
1 parent db492ec commit 27393d5

File tree

1 file changed

+102
-14
lines changed

1 file changed

+102
-14
lines changed

library/std/src/sys/windows/stdio.rs

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,18 @@ use crate::sys::handle::Handle;
1414
pub struct Stdin {
1515
surrogate: u16,
1616
}
17-
pub struct Stdout;
18-
pub struct Stderr;
17+
// Standard output stream state. Carries the bytes of a UTF-8 sequence that an earlier console
// write received only partially, so the next write can complete it.
pub struct Stdout {
18+
incomplete_utf8: IncompleteUtf8,
19+
}
20+
21+
// Standard error stream state. Carries the bytes of a UTF-8 sequence that an earlier console
// write received only partially, so the next write can complete it.
pub struct Stderr {
22+
incomplete_utf8: IncompleteUtf8,
23+
}
24+
25+
// Leading bytes of a UTF-8 sequence that was split across separate `write` calls.
struct IncompleteUtf8 {
26+
// Bytes buffered so far; only the first `len` entries are meaningful.
bytes: [u8; 4],
27+
// Number of buffered bytes; 0 means no partial sequence is pending.
len: u8,
28+
}
1929

2030
// Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see
2131
// #13304 for details).
@@ -50,7 +60,27 @@ fn is_console(handle: c::HANDLE) -> bool {
5060
unsafe { c::GetConsoleMode(handle, &mut mode) != 0 }
5161
}
5262

53-
fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
63+
// Simple reimplementation of std::str::utf8_char_width() which is feature-gated
//
// Given the first byte `b` of a UTF-8 sequence, returns the total length in bytes of that
// sequence (1 to 4), or 0 if `b` cannot start a valid sequence.
64+
fn utf8_char_width(b: u8) -> usize {
65+
match b {
66+
// ASCII: a complete one-byte sequence.
0x00..=0x7F => 1,
67+
// Continuation bytes (0x80..=0xBF) and the overlong lead bytes 0xC0/0xC1: never a valid start.
0x80..=0xC1 => 0,
68+
0xC2..=0xDF => 2,
69+
0xE0..=0xEF => 3,
70+
0xF0..=0xF4 => 4,
71+
// Lead bytes that would encode values beyond U+10FFFF: never a valid start.
0xF5..=0xFF => 0,
72+
}
73+
}
74+
75+
fn write(
76+
handle_id: c::DWORD,
77+
data: &[u8],
78+
incomplete_utf8: &mut IncompleteUtf8,
79+
) -> io::Result<usize> {
80+
if data.is_empty() {
81+
return Ok(0);
82+
}
83+
5484
let handle = get_handle(handle_id)?;
5585
if !is_console(handle) {
5686
let handle = Handle::new(handle);
@@ -59,22 +89,74 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
5989
return ret;
6090
}
6191

62-
// As the console is meant for presenting text, we assume bytes of `data` come from a string
63-
// and are encoded as UTF-8, which needs to be encoded as UTF-16.
92+
match incomplete_utf8.len {
93+
0 => {}
94+
1..=3 => {
95+
if data[0] >> 6 != 0b10 {
96+
incomplete_utf8.len = 0;
97+
// not a continuation byte - reject
98+
return Err(io::Error::new(
99+
io::ErrorKind::InvalidData,
100+
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
101+
));
102+
}
103+
incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0];
104+
incomplete_utf8.len += 1;
105+
let char_width = utf8_char_width(incomplete_utf8.bytes[0]);
106+
if (incomplete_utf8.len as usize) < char_width {
107+
// more bytes needed
108+
return Ok(1);
109+
}
110+
let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]);
111+
incomplete_utf8.len = 0;
112+
match s {
113+
Ok(s) => {
114+
assert_eq!(char_width, s.len());
115+
let written = write_valid_utf8(handle, s)?;
116+
assert_eq!(written, s.len()); // guaranteed by write0() for single codepoint writes
117+
return Ok(1);
118+
}
119+
Err(_) => {
120+
return Err(io::Error::new(
121+
io::ErrorKind::InvalidData,
122+
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
123+
));
124+
}
125+
}
126+
}
127+
_ => {
128+
panic!("Unexpected number of incomplete UTF-8 chars.");
129+
}
130+
}
131+
132+
// As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
133+
// which needs to be encoded as UTF-16.
64134
//
65135
// If the data is not valid UTF-8 we write out as many bytes as are valid.
66-
// Only when there are no valid bytes (which will happen on the next call), return an error.
136+
// If the first byte is invalid, it is either the first byte of a multi-byte sequence for which
137+
// the provided byte slice is too short, or the first byte of an invalid multi-byte sequence.
67138
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2);
68139
let utf8 = match str::from_utf8(&data[..len]) {
69140
Ok(s) => s,
70141
Err(ref e) if e.valid_up_to() == 0 => {
71-
return Err(io::Error::new_const(
72-
io::ErrorKind::InvalidData,
73-
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
74-
));
142+
if data.len() < utf8_char_width(data[0]) {
143+
incomplete_utf8.bytes[0] = data[0];
144+
incomplete_utf8.len = 1;
145+
return Ok(1);
146+
} else {
147+
return Err(io::Error::new_const(
148+
io::ErrorKind::InvalidData,
149+
&"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
150+
));
151+
}
75152
}
76153
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
77154
};
155+
156+
write_valid_utf8(handle, utf8)
157+
}
158+
159+
fn write_valid_utf8(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
78160
let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2];
79161
let mut len_utf16 = 0;
80162
for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) {
@@ -254,15 +336,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> {
254336
Ok(written)
255337
}
256338

339+
impl IncompleteUtf8 {
340+
// Creates an empty buffer: zeroed bytes and `len == 0`, i.e. no partial sequence pending.
pub const fn new() -> IncompleteUtf8 {
341+
IncompleteUtf8 { bytes: [0; 4], len: 0 }
342+
}
343+
}
344+
257345
impl Stdout {
258346
// Creates the stdout state with an empty incomplete-UTF-8 buffer.
pub const fn new() -> Stdout {
259-
Stdout
347+
Stdout { incomplete_utf8: IncompleteUtf8::new() }
260348
}
261349
}
262350

263351
impl io::Write for Stdout {
264352
// Writes `buf` to standard output through the shared `write` helper, passing this stream's own
// incomplete-UTF-8 buffer so a sequence split across calls can be completed.
// Returns the number of bytes consumed, or the error reported by the helper.
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
265-
write(c::STD_OUTPUT_HANDLE, buf)
353+
// Must target the standard *output* handle; using the error handle here would send all of
// stdout to stderr.
write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8)
266354
}
267355

268356
fn flush(&mut self) -> io::Result<()> {
@@ -272,13 +360,13 @@ impl io::Write for Stdout {
272360

273361
impl Stderr {
274362
// Creates the stderr state with an empty incomplete-UTF-8 buffer.
pub const fn new() -> Stderr {
275-
Stderr
363+
Stderr { incomplete_utf8: IncompleteUtf8::new() }
276364
}
277365
}
278366

279367
impl io::Write for Stderr {
280368
// Writes `buf` to standard error through the shared `write` helper, passing this stream's own
// incomplete-UTF-8 buffer so a sequence split across calls can be completed.
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
281-
write(c::STD_ERROR_HANDLE, buf)
369+
write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8)
282370
}
283371

284372
fn flush(&mut self) -> io::Result<()> {

0 commit comments

Comments
 (0)