Skip to content

Commit 182176e

Browse files
marijnh authored and graydon committed
---
yaml --- r: 1852 b: refs/heads/master c: a045514 h: refs/heads/master v: v3
1 parent 32ca924 commit 182176e

File tree

11 files changed

+288
-46
lines changed

11 files changed

+288
-46
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
---
2-
refs/heads/master: d3b49f5aab6a9e9efc2ab1d6713cc0d2bde94f4e
2+
refs/heads/master: a0455144774de6c9dc0ff0e87fe4352f8a70cac3

trunk/Makefile.in

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -712,6 +712,7 @@ TEST_XFAILS_STAGE0 := $(FLOAT_XFAILS) \
712712
use-import-export.rs \
713713
user.rs \
714714
utf8.rs \
715+
utf8_chars.rs \
715716
vec-alloc-append.rs \
716717
vec-append.rs \
717718
vec-slice.rs \

trunk/src/comp/front/lexer.rs

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
7676
col += 1u;
7777
}
7878

79-
n = rdr.read_char() as char;
79+
n = rdr.read_byte() as char;
8080
}
8181

8282
fn mark() {
@@ -204,8 +204,8 @@ impure fn new_reader(io.reader rdr, str filename) -> reader
204204
reserved.insert("m128", ()); // IEEE 754-2008 'decimal128'
205205
reserved.insert("dec", ()); // One of m32, m64, m128
206206

207-
ret reader(rdr, filename, rdr.read_char() as char,
208-
rdr.read_char() as char, 1u, 0u, 1u, 0u, keywords, reserved);
207+
ret reader(rdr, filename, rdr.read_byte() as char,
208+
rdr.read_byte() as char, 1u, 0u, 1u, 0u, keywords, reserved);
209209
}
210210

211211

trunk/src/lib/_str.rs

Lines changed: 164 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ native "rust" mod rustrt {
1010
fn str_from_vec(vec[mutable? u8] b) -> str;
1111
fn str_from_cstr(sbuf cstr) -> str;
1212
fn str_from_buf(sbuf buf, uint len) -> str;
13+
fn str_push_byte(str s, uint byte) -> str;
1314
fn refcount[T](str s) -> uint;
1415
}
1516

@@ -65,15 +66,42 @@ fn hash(&str s) -> uint {
6566
ret u;
6667
}
6768

69+
// UTF-8 tags and ranges
70+
const u8 tag_cont_u8 = 0x80_u8;
71+
const uint tag_cont = 0x80_u;
72+
const uint max_one_b = 0x80_u;
73+
const uint tag_two_b = 0xc0_u;
74+
const uint max_two_b = 0x800_u;
75+
const uint tag_three_b = 0xe0_u;
76+
const uint max_three_b = 0x10000_u;
77+
const uint tag_four_b = 0xf0_u;
78+
const uint max_four_b = 0x200000_u;
79+
const uint tag_five_b = 0xf8_u;
80+
const uint max_five_b = 0x4000000_u;
81+
const uint tag_six_b = 0xfc_u;
82+
6883
fn is_utf8(vec[u8] v) -> bool {
69-
fail; // FIXME
84+
auto i = 0u;
85+
auto total = _vec.len[u8](v);
86+
while (i < total) {
87+
auto chsize = utf8_char_width(v.(i));
88+
if (chsize == 0u) {ret false;}
89+
if (i + chsize > total) {ret false;}
90+
i += 1u;
91+
while (chsize > 1u) {
92+
if (v.(i) & 0xc0_u8 != tag_cont_u8) {ret false;}
93+
i += 1u;
94+
chsize -= 1u;
95+
}
96+
}
97+
ret true;
7098
}
7199

72100
fn is_ascii(str s) -> bool {
73101
let uint i = byte_len(s);
74102
while (i > 0u) {
75103
i -= 1u;
76-
if ((s.(i) & 0x80u8) != 0u8) {
104+
if ((s.(i) & 0x80_u8) != 0u8) {
77105
ret false;
78106
}
79107
}
@@ -134,6 +162,139 @@ unsafe fn str_from_buf(sbuf buf, uint len) -> str {
134162
ret rustrt.str_from_buf(buf, len);
135163
}
136164

165+
fn push_utf8_bytes(&mutable str s, char ch) {
166+
auto code = ch as uint;
167+
if (code < max_one_b) {
168+
s = rustrt.str_push_byte(s, code);
169+
} else if (code < max_two_b) {
170+
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x1f_u) | tag_two_b);
171+
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
172+
} else if (code < max_three_b) {
173+
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x0f_u) | tag_three_b);
174+
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
175+
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
176+
} else if (code < max_four_b) {
177+
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x07_u) | tag_four_b);
178+
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
179+
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
180+
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
181+
} else if (code < max_five_b) {
182+
s = rustrt.str_push_byte(s, ((code >> 24u) & 0x03_u) | tag_five_b);
183+
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont);
184+
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
185+
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
186+
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
187+
} else {
188+
s = rustrt.str_push_byte(s, ((code >> 30u) & 0x01_u) | tag_six_b);
189+
s = rustrt.str_push_byte(s, ((code >> 24u) & 0x3f_u) | tag_cont);
190+
s = rustrt.str_push_byte(s, ((code >> 18u) & 0x3f_u) | tag_cont);
191+
s = rustrt.str_push_byte(s, ((code >> 12u) & 0x3f_u) | tag_cont);
192+
s = rustrt.str_push_byte(s, ((code >> 6u) & 0x3f_u) | tag_cont);
193+
s = rustrt.str_push_byte(s, (code & 0x3f_u) | tag_cont);
194+
}
195+
}
196+
197+
fn from_char(char ch) -> str {
198+
auto buf = "";
199+
push_utf8_bytes(buf, ch);
200+
ret buf;
201+
}
202+
203+
fn from_chars(vec[char] chs) -> str {
204+
auto buf = "";
205+
for (char ch in chs) {push_utf8_bytes(buf, ch);}
206+
ret buf;
207+
}
208+
209+
fn utf8_char_width(u8 b) -> uint {
210+
let uint byte = b as uint;
211+
if (byte < 0x80_u) {ret 1u;}
212+
if (byte < 0xc0_u) {ret 0u;} // Not a valid start byte
213+
if (byte < 0xe0_u) {ret 2u;}
214+
if (byte < 0xf0_u) {ret 3u;}
215+
if (byte < 0xf8_u) {ret 4u;}
216+
if (byte < 0xfc_u) {ret 5u;}
217+
ret 6u;
218+
}
219+
220+
fn char_range_at(str s, uint i) -> tup(char, uint) {
221+
auto b0 = s.(i);
222+
auto w = utf8_char_width(b0);
223+
check(w != 0u);
224+
if (w == 1u) {ret tup(b0 as char, i + 1u);}
225+
auto val = 0u;
226+
auto end = i + w;
227+
i += 1u;
228+
while (i < end) {
229+
auto byte = s.(i);
230+
check(byte & 0xc0_u8 == tag_cont_u8);
231+
val <<= 6u;
232+
val += (byte & 0x3f_u8) as uint;
233+
i += 1u;
234+
}
235+
// Clunky way to get the right bits from the first byte. Uses two shifts,
236+
// the first to clip off the marker bits at the left of the byte, and then
237+
// a second (as uint) to get it to the right position.
238+
val += ((b0 << ((w + 1u) as u8)) as uint) << ((w - 1u) * 6u - w - 1u);
239+
ret tup(val as char, i);
240+
}
241+
242+
fn char_at(str s, uint i) -> char {
243+
ret char_range_at(s, i)._0;
244+
}
245+
246+
fn char_len(str s) -> uint {
247+
auto i = 0u;
248+
auto len = 0u;
249+
auto total = byte_len(s);
250+
while (i < total) {
251+
auto chsize = utf8_char_width(s.(i));
252+
check(chsize > 0u);
253+
len += 1u;
254+
i += chsize;
255+
}
256+
check(i == total);
257+
ret len;
258+
}
259+
260+
fn to_chars(str s) -> vec[char] {
261+
let vec[char] buf = vec();
262+
auto i = 0u;
263+
auto len = byte_len(s);
264+
while (i < len) {
265+
auto cur = char_range_at(s, i);
266+
_vec.push[char](buf, cur._0);
267+
i = cur._1;
268+
}
269+
ret buf;
270+
}
271+
272+
fn push_char(&mutable str s, char ch) {
273+
s += from_char(ch);
274+
}
275+
276+
fn pop_char(&mutable str s) -> char {
277+
auto end = byte_len(s);
278+
while (end > 0u && s.(end - 1u) & 0xc0_u8 == tag_cont_u8) {end -= 1u;}
279+
check(end > 0u);
280+
auto ch = char_at(s, end - 1u);
281+
s = substr(s, 0u, end - 1u);
282+
ret ch;
283+
}
284+
285+
fn shift_char(&mutable str s) -> char {
286+
auto r = char_range_at(s, 0u);
287+
s = substr(s, r._1, byte_len(s) - r._1);
288+
ret r._0;
289+
}
290+
291+
fn unshift_char(&mutable str s, char ch) {
292+
// Workaround for rustboot order-of-evaluation issue -- if I put s
293+
// directly after the +, the string ends up containing (only) the
294+
// character, twice.
295+
auto x = s;
296+
s = from_char(ch) + x;
297+
}
137298

138299
fn refcount(str s) -> uint {
139300
auto r = rustrt.refcount[u8](s);
@@ -256,7 +417,7 @@ fn pop_byte(&mutable str s) -> u8 {
256417
}
257418

258419
fn push_byte(&mutable str s, u8 b) {
259-
s += unsafe_from_byte(b);
420+
s = rustrt.str_push_byte(s, b as uint);
260421
}
261422

262423
fn unshift_byte(&mutable str s, u8 b) {

trunk/src/lib/ebml.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,18 +21,18 @@ type reader = rec(
2121

2222
// TODO: eventually use u64 or big here
2323
impure fn read_vint(&io.reader reader) -> uint {
24-
auto a = reader.read_byte();
24+
auto a = reader.read_byte() as u8;
2525
if (a & 0x80u8 != 0u8) { ret (a & 0x7fu8) as uint; }
26-
auto b = reader.read_byte();
26+
auto b = reader.read_byte() as u8;
2727
if (a & 0x40u8 != 0u8) {
2828
ret (((a & 0x3fu8) as uint) << 8u) | (b as uint);
2929
}
30-
auto c = reader.read_byte();
30+
auto c = reader.read_byte() as u8;
3131
if (a & 0x20u8 != 0u8) {
3232
ret (((a & 0x1fu8) as uint) << 16u) | ((b as uint) << 8u) |
3333
(c as uint);
3434
}
35-
auto d = reader.read_byte();
35+
auto d = reader.read_byte() as u8;
3636
if (a & 0x10u8 != 0u8) {
3737
ret (((a & 0x0fu8) as uint) << 24u) | ((b as uint) << 16u) |
3838
((c as uint) << 8u) | (d as uint);

trunk/src/lib/fs.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ native "rust" mod rustrt {
33
}
44

55
fn path_sep() -> str {
6-
ret _str.unsafe_from_bytes(vec(os_fs.path_sep as u8));
6+
ret _str.from_char(os_fs.path_sep);
77
}
88

99
type path = str;

0 commit comments

Comments
 (0)