Skip to content

Commit cb9c1e0

Browse files
committed
auto merge of #16498 : Kimundi/rust/inline-utf-encoding, r=alexcrichton
The first commit improves code generation through a few changes: - The `#[inline]` attributes allow LLVM to constant-fold the encoding step away in certain situations. For example, code like this changes from a call to `encode_utf8` in an inner loop to the pushing of a byte constant: ```rust let mut s = String::new(); for _ in range(0u, 21) { s.push_char('a'); } ``` - Both methods changed their semantics from causing run-time failure if the target buffer is not large enough to returning `None` instead. This makes LLVM no longer emit code for causing failure for these methods. - A few debug `assert!()` calls got removed because they affected code generation due to unwinding, and were basically unnecessary with today's sound handling of `char` as a Unicode scalar value. ~~The second commit is optional. It changes the methods from regular indexing with the `dst[i]` syntax to unsafe indexing with `dst.unsafe_mut_ref(i)`. This does not change code generation directly - in both cases LLVM is smart enough to see that there can never be an out-of-bounds access. But it makes it emit a `nounwind` attribute for the function. However, I'm not sure whether that is a real improvement, so if there is any objection to this I'll remove the commit.~~ This changes how the methods behave on a too-small buffer, so this is a [breaking-change].
2 parents 4a5654f + 13079c1 commit cb9c1e0

File tree

7 files changed

+47
-42
lines changed

7 files changed

+47
-42
lines changed

src/libcollections/str.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,7 +1682,7 @@ mod tests {
16821682
fn test_chars_decoding() {
16831683
let mut bytes = [0u8, ..4];
16841684
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
1685-
let len = c.encode_utf8(bytes);
1685+
let len = c.encode_utf8(bytes).unwrap_or(0);
16861686
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
16871687
if Some(c) != s.chars().next() {
16881688
fail!("character {:x}={} does not decode correctly", c as u32, c);
@@ -1694,7 +1694,7 @@ mod tests {
16941694
fn test_chars_rev_decoding() {
16951695
let mut bytes = [0u8, ..4];
16961696
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
1697-
let len = c.encode_utf8(bytes);
1697+
let len = c.encode_utf8(bytes).unwrap_or(0);
16981698
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
16991699
if Some(c) != s.chars().rev().next() {
17001700
fail!("character {:x}={} does not decode correctly", c as u32, c);

src/libcollections/string.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ impl String {
502502
data: self.vec.as_ptr().offset(cur_len as int),
503503
len: 4,
504504
};
505-
let used = ch.encode_utf8(mem::transmute(slice));
505+
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
506506
self.vec.set_len(cur_len + used);
507507
}
508508
}

src/libcore/char.rs

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use mem::transmute;
1919
use option::{None, Option, Some};
2020
use iter::range_step;
21+
use collections::Collection;
2122

2223
// UTF-8 ranges and tags for encoding characters
2324
static TAG_CONT: u8 = 0b1000_0000u8;
@@ -27,7 +28,6 @@ static TAG_FOUR_B: u8 = 0b1111_0000u8;
2728
static MAX_ONE_B: u32 = 0x80u32;
2829
static MAX_TWO_B: u32 = 0x800u32;
2930
static MAX_THREE_B: u32 = 0x10000u32;
30-
static MAX_FOUR_B: u32 = 0x200000u32;
3131

3232
/*
3333
Lu Uppercase_Letter an uppercase letter
@@ -217,14 +217,14 @@ pub fn escape_default(c: char, f: |char|) {
217217
}
218218

219219
/// Returns the amount of bytes this `char` would need if encoded in UTF-8
220+
#[inline]
220221
pub fn len_utf8_bytes(c: char) -> uint {
221222
let code = c as u32;
222223
match () {
223224
_ if code < MAX_ONE_B => 1u,
224225
_ if code < MAX_TWO_B => 2u,
225226
_ if code < MAX_THREE_B => 3u,
226-
_ if code < MAX_FOUR_B => 4u,
227-
_ => fail!("invalid character!"),
227+
_ => 4u,
228228
}
229229
}
230230

@@ -297,21 +297,19 @@ pub trait Char {
297297
/// UTF-8.
298298
fn len_utf8_bytes(&self) -> uint;
299299

300-
/// Encodes this character as UTF-8 into the provided byte buffer.
301-
///
302-
/// The buffer must be at least 4 bytes long or a runtime failure may
303-
/// occur.
300+
/// Encodes this character as UTF-8 into the provided byte buffer,
301+
/// and then returns the number of bytes written.
304302
///
305-
/// This will then return the number of bytes written to the slice.
306-
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
303+
/// If the buffer is not large enough, nothing will be written into it
304+
/// and a `None` will be returned.
305+
fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;
307306

308-
/// Encodes this character as UTF-16 into the provided `u16` buffer.
307+
/// Encodes this character as UTF-16 into the provided `u16` buffer,
308+
/// and then returns the number of `u16`s written.
309309
///
310-
/// The buffer must be at least 2 elements long or a runtime failure may
311-
/// occur.
312-
///
313-
/// This will then return the number of `u16`s written to the slice.
314-
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
310+
/// If the buffer is not large enough, nothing will be written into it
311+
/// and a `None` will be returned.
312+
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
315313
}
316314

317315
impl Char for char {
@@ -325,45 +323,52 @@ impl Char for char {
325323

326324
fn escape_default(&self, f: |char|) { escape_default(*self, f) }
327325

326+
#[inline]
328327
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
329328

330-
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
329+
#[inline]
330+
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> Option<uint> {
331+
// Marked #[inline] to allow llvm optimizing it away
331332
let code = *self as u32;
332-
if code < MAX_ONE_B {
333+
if code < MAX_ONE_B && dst.len() >= 1 {
333334
dst[0] = code as u8;
334-
1
335-
} else if code < MAX_TWO_B {
335+
Some(1)
336+
} else if code < MAX_TWO_B && dst.len() >= 2 {
336337
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
337338
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
338-
2
339-
} else if code < MAX_THREE_B {
339+
Some(2)
340+
} else if code < MAX_THREE_B && dst.len() >= 3 {
340341
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
341342
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
342343
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
343-
3
344-
} else {
344+
Some(3)
345+
} else if dst.len() >= 4 {
345346
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
346347
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
347348
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
348349
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
349-
4
350+
Some(4)
351+
} else {
352+
None
350353
}
351354
}
352355

353-
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
356+
#[inline]
357+
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint> {
358+
// Marked #[inline] to allow llvm optimizing it away
354359
let mut ch = *self as u32;
355-
if (ch & 0xFFFF_u32) == ch {
360+
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
356361
// The BMP falls through (assuming non-surrogate, as it should)
357-
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
358362
dst[0] = ch as u16;
359-
1
360-
} else {
363+
Some(1)
364+
} else if dst.len() >= 2 {
361365
// Supplementary planes break into surrogates.
362-
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
363366
ch -= 0x1_0000_u32;
364367
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
365368
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
366-
2
369+
Some(2)
370+
} else {
371+
None
367372
}
368373
}
369374
}

src/libcore/fmt/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ impl<'a> Formatter<'a> {
366366
let write_prefix = |f: &mut Formatter| {
367367
for c in sign.move_iter() {
368368
let mut b = [0, ..4];
369-
let n = c.encode_utf8(b);
369+
let n = c.encode_utf8(b).unwrap_or(0);
370370
try!(f.buf.write(b.slice_to(n)));
371371
}
372372
if prefixed { f.buf.write(prefix.as_bytes()) }
@@ -467,7 +467,7 @@ impl<'a> Formatter<'a> {
467467
try!(f(self));
468468
}
469469
let mut fill = [0u8, ..4];
470-
let len = self.fill.encode_utf8(fill);
470+
let len = self.fill.encode_utf8(fill).unwrap_or(0);
471471
for _ in range(0, padding) {
472472
try!(self.buf.write(fill.slice_to(len)));
473473
}
@@ -545,7 +545,7 @@ impl Char for char {
545545
use char::Char;
546546

547547
let mut utf8 = [0u8, ..4];
548-
let amt = self.encode_utf8(utf8);
548+
let amt = self.encode_utf8(utf8).unwrap_or(0);
549549
let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };
550550
secret_string(&s, f)
551551
}

src/libcore/str.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use iter::range;
3030
use num::{CheckedMul, Saturating};
3131
use option::{Option, None, Some};
3232
use raw::Repr;
33-
use slice::ImmutableSlice;
33+
use slice::{ImmutableSlice, MutableSlice};
3434
use slice;
3535
use uint;
3636

@@ -646,7 +646,7 @@ impl<'a> Iterator<u16> for Utf16CodeUnits<'a> {
646646

647647
let mut buf = [0u16, ..2];
648648
self.chars.next().map(|ch| {
649-
let n = ch.encode_utf16(buf /* as mut slice! */);
649+
let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
650650
if n == 2 { self.extra = buf[1]; }
651651
buf[0]
652652
})

src/libcoretest/char.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ fn test_escape_unicode() {
173173
fn test_encode_utf8() {
174174
fn check(input: char, expect: &[u8]) {
175175
let mut buf = [0u8, ..4];
176-
let n = input.encode_utf8(buf /* as mut slice! */);
176+
let n = input.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
177177
assert_eq!(buf.slice_to(n), expect);
178178
}
179179

@@ -187,7 +187,7 @@ fn test_encode_utf8() {
187187
fn test_encode_utf16() {
188188
fn check(input: char, expect: &[u16]) {
189189
let mut buf = [0u16, ..2];
190-
let n = input.encode_utf16(buf /* as mut slice! */);
190+
let n = input.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
191191
assert_eq!(buf.slice_to(n), expect);
192192
}
193193

src/libstd/io/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1110,7 +1110,7 @@ pub trait Writer {
11101110
#[inline]
11111111
fn write_char(&mut self, c: char) -> IoResult<()> {
11121112
let mut buf = [0u8, ..4];
1113-
let n = c.encode_utf8(buf.as_mut_slice());
1113+
let n = c.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
11141114
self.write(buf.slice_to(n))
11151115
}
11161116

0 commit comments

Comments
 (0)