Skip to content

Commit bcc2563

Browse files
author
Grahame Bowland
committed
add new read_chars method, fix bug in read_char
Having a read_chars method is convenient and more efficient. The old read_char method had a bug: it re-used the width variable 'w' as a loop counter, so it was broken for wide (multi-byte) characters. This patch fixes that.
1 parent e3afc78 commit bcc2563

File tree

1 file changed

+60
-20
lines changed

1 file changed

+60
-20
lines changed

src/libstd/io.rs

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ type reader =
5050
fn unread_byte(int);
5151
fn read_bytes(uint) -> [u8];
5252
fn read_char() -> char;
53+
fn read_chars(uint) -> [char];
5354
fn eof() -> bool;
5455
fn read_line() -> str;
5556
fn read_c_str() -> str;
@@ -101,29 +102,68 @@ obj new_reader(rdr: buf_reader) {
101102
fn read_byte() -> int { ret rdr.read_byte(); }
102103
fn unread_byte(byte: int) { ret rdr.unread_byte(byte); }
103104
fn read_bytes(len: uint) -> [u8] { ret rdr.read(len); }
105+
// Reads characters from the underlying reader by decoding UTF-8 from the
// bytes returned by self.read_bytes, and returns them as a vector.
// The loop stops once no further bytes are wanted (normally because n
// characters were decoded) or when read_bytes returns an empty vector, so
// the result may hold fewer than n characters.
// Malformed input is not reported as an error: the asserts in the nested
// helper fail instead.
fn read_chars(n: uint) -> [char] {
106+
// returns the (consumed offset, n_req), appends characters to &chars
// Decodes every complete character found in buf. The result is either
// (vec::len(buf), 0u) when the whole buffer was consumed, or
// (offset of the lead byte of a trailing partial character, number of
// additional bytes required to complete it).
107+
fn chars_from_buf(buf: [u8], &chars: [char]) -> (uint, uint) {
108+
let i = 0u;
109+
while i < vec::len(buf) {
110+
let b0 = buf[i];
111+
let w = str::utf8_char_width(b0);
112+
let end = i + w;
113+
i += 1u;
114+
assert (w > 0u);
115+
if w == 1u {
116+
chars += [ b0 as char ];
117+
cont;
118+
}
119+
// can't satisfy this char with the existing data
120+
if end > vec::len(buf) {
// i has already been advanced past the lead byte, so i - 1u is the
// position of the lead byte of the incomplete character.
121+
ret (i - 1u, end - vec::len(buf));
122+
}
123+
let val = 0u;
// Fold the continuation bytes into val: each must have its top two bits
// equal to binary 10 (next & 192 == 128).
124+
while i < end {
125+
let next = buf[i] as int;
126+
i += 1u;
// NOTE(review): next comes from a u8 element, so this check looks like a
// carry-over from the byte-at-a-time version where read_byte could
// return -1 - confirm.
127+
assert (next > -1);
128+
assert (next & 192 == 128);
129+
val <<= 6u;
// NOTE(review): this line and the lead-byte line below depend on the
// precedence of `as` relative to `&` and `<<` - confirm against
// str::char_at.
130+
val += next & 63 as uint;
131+
}
132+
// See str::char_at
133+
val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u;
134+
chars += [ val as char ];
135+
}
136+
ret (i, 0u);
137+
}
// buf holds bytes read so far that have not yet been consumed by the decoder.
138+
let buf: [u8] = [];
139+
let chars: [char] = [];
140+
let nbread = n; // might need more bytes, but reading n will never over-read
141+
while nbread > 0u {
142+
let data = self.read_bytes(nbread);
143+
if vec::len(data) == 0u {
144+
// eof - FIXME should we do something if we're split in a unicode char?
145+
break;
146+
}
147+
buf += data;
148+
let (offset, nbreq) = chars_from_buf(buf, chars);
// ncreq: characters still needed; nbreq: bytes needed to finish a partial
// character (0u if there is none).
149+
let ncreq = n - vec::len(chars);
150+
// again we either know we need a certain number of bytes to complete a
151+
// character, or we make sure we don't over-read by reading 1-byte per char
152+
// needed
153+
nbread = if ncreq > nbreq { ncreq } else { nbreq };
154+
if nbread > 0u {
// Drop the bytes already decoded; only the undecoded tail is kept for the
// next round.
155+
buf = vec::slice(buf, offset, vec::len(buf));
156+
}
157+
}
158+
ret chars;
159+
}
104160
// Reads a single character from the reader.
// This is a diff hunk: lines preceded by a marker ending in '-' are the
// removed implementation, lines preceded by a marker ending in '+' are the
// replacement, and lines with a double number are unchanged context.
// New version: delegates to self.read_chars(1u); returns `-1 as char` when
// no character was read, otherwise asserts that exactly one character came
// back and returns it.
// Removed version: decoded byte-by-byte via rdr.read_byte(), but counted w
// down to 1u in its loop before using w in the final lead-byte shift, so
// that expression no longer saw the character's real width.
fn read_char() -> char {
105-
let c0 = rdr.read_byte();
106-
if c0 == -1 {
161+
let c = self.read_chars(1u);
162+
if vec::len(c) == 0u {
107163
ret -1 as char; // FIXME will this stay valid?
108-
109164
}
110-
let b0 = c0 as u8;
111-
let w = str::utf8_char_width(b0);
112-
assert (w > 0u);
113-
if w == 1u { ret b0 as char; }
114-
let val = 0u;
115-
while w > 1u {
116-
w -= 1u;
117-
let next = rdr.read_byte();
118-
assert (next > -1);
119-
assert (next & 192 == 128);
120-
val <<= 6u;
121-
val += next & 63 as uint;
122-
}
123-
// See str::char_at
124-
125-
val += (b0 << (w + 1u as u8) as uint) << (w - 1u) * 6u - w - 1u;
126-
ret val as char;
165+
assert(vec::len(c) == 1u);
166+
ret c[0];
127167
}
128168
fn eof() -> bool { ret rdr.eof(); }
129169
fn read_line() -> str {

0 commit comments

Comments
 (0)