Skip to content

Improved code generation for chars UTF-8/UTF-16 encoding methods #16498

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1680,7 +1680,7 @@ mod tests {
fn test_chars_decoding() {
let mut bytes = [0u8, ..4];
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
let len = c.encode_utf8(bytes);
let len = c.encode_utf8(bytes).unwrap_or(0);
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
if Some(c) != s.chars().next() {
fail!("character {:x}={} does not decode correctly", c as u32, c);
Expand All @@ -1692,7 +1692,7 @@ mod tests {
fn test_chars_rev_decoding() {
let mut bytes = [0u8, ..4];
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
let len = c.encode_utf8(bytes);
let len = c.encode_utf8(bytes).unwrap_or(0);
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
if Some(c) != s.chars().rev().next() {
fail!("character {:x}={} does not decode correctly", c as u32, c);
Expand Down
2 changes: 1 addition & 1 deletion src/libcollections/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ impl String {
data: self.vec.as_ptr().offset(cur_len as int),
len: 4,
};
let used = ch.encode_utf8(mem::transmute(slice));
let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
self.vec.set_len(cur_len + used);
}
}
Expand Down
67 changes: 36 additions & 31 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use mem::transmute;
use option::{None, Option, Some};
use iter::range_step;
use collections::Collection;

// UTF-8 ranges and tags for encoding characters
static TAG_CONT: u8 = 0b1000_0000u8;
Expand All @@ -27,7 +28,6 @@ static TAG_FOUR_B: u8 = 0b1111_0000u8;
static MAX_ONE_B: u32 = 0x80u32;
static MAX_TWO_B: u32 = 0x800u32;
static MAX_THREE_B: u32 = 0x10000u32;
static MAX_FOUR_B: u32 = 0x200000u32;

/*
Lu Uppercase_Letter an uppercase letter
Expand Down Expand Up @@ -217,14 +217,14 @@ pub fn escape_default(c: char, f: |char|) {
}

/// Returns the number of bytes this `char` would need if encoded in UTF-8.
///
/// The code point is compared against `MAX_ONE_B`, `MAX_TWO_B` and
/// `MAX_THREE_B` in ascending order; the first bound it falls under selects
/// a length of 1, 2 or 3 bytes.
// NOTE(review): this listing appears to be a rendered diff, so the
// `MAX_FOUR_B` / `fail!` arms and the unconditional `_ => 4u` arm are shown
// together; presumably only one of the two endings exists in the real file
// -- confirm.
#[inline]
pub fn len_utf8_bytes(c: char) -> uint {
let code = c as u32;
// `match ()` with guard-only arms is used as an if / else-if chain.
match () {
_ if code < MAX_ONE_B => 1u,
_ if code < MAX_TWO_B => 2u,
_ if code < MAX_THREE_B => 3u,
_ if code < MAX_FOUR_B => 4u,
_ => fail!("invalid character!"),
_ => 4u,
}
}

Expand Down Expand Up @@ -297,21 +297,19 @@ pub trait Char {
/// UTF-8.
fn len_utf8_bytes(&self) -> uint;

/// Encodes this character as UTF-8 into the provided byte buffer.
///
/// The buffer must be at least 4 bytes long or a runtime failure may
/// occur.
/// Encodes this character as UTF-8 into the provided byte buffer,
/// and then returns the number of bytes written.
///
/// This will then return the number of bytes written to the slice.
fn encode_utf8(&self, dst: &mut [u8]) -> uint;
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;

/// Encodes this character as UTF-16 into the provided `u16` buffer.
/// Encodes this character as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// The buffer must be at least 2 elements long or a runtime failure may
/// occur.
///
/// This will then return the number of `u16`s written to the slice.
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
}

impl Char for char {
Expand All @@ -325,45 +323,52 @@ impl Char for char {

/// Forwards to the free function `escape_default`, passing the character by
/// value together with the caller's `f` closure.
fn escape_default(&self, f: |char|) { escape_default(*self, f) }

/// Forwards to the free function `len_utf8_bytes`, passing the character by
/// value.
#[inline]
fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }

/// Encodes this character as UTF-8 into `dst`, starting at `dst[0]`.
///
/// The code point is compared against `MAX_ONE_B`, `MAX_TWO_B` and
/// `MAX_THREE_B` to pick a 1-, 2-, 3- or 4-byte form.
// NOTE(review): this listing appears to be a rendered diff. Two signatures,
// two conditions per branch and two result expressions (`1` / `Some(1)`, ...)
// are shown side by side; presumably the `uint` forms are the removed lines
// and the `Option<uint>` forms (with the `dst.len()` checks and the final
// `None`) are the added ones -- confirm against the real file.
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
#[inline]
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> Option<uint> {
// Marked #[inline] to allow LLVM to optimize it away
let code = *self as u32;
if code < MAX_ONE_B {
if code < MAX_ONE_B && dst.len() >= 1 {
// One byte: the code point is stored unchanged.
dst[0] = code as u8;
1
} else if code < MAX_TWO_B {
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
// Two bytes: bits 6..10 go in the lead byte (tagged with TAG_TWO_B),
// the low 6 bits go in a continuation byte (tagged with TAG_CONT).
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
2
} else if code < MAX_THREE_B {
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
// Three bytes: 4 bits in the lead byte, then two 6-bit continuation bytes.
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
3
} else {
Some(3)
} else if dst.len() >= 4 {
// Four bytes: 3 bits in the lead byte, then three 6-bit continuation bytes.
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
4
Some(4)
} else {
// With the `dst.len()` checks, a buffer too short for the required form
// also fails every later branch (each one needs an even longer `dst`),
// so a failed length check always ends here without writing anything.
None
}
}

/// Encodes this character as UTF-16 into `dst`, starting at `dst[0]`.
///
/// A code point that fits in 16 bits is written as a single unit; anything
/// larger is split into a pair of surrogate units.
// NOTE(review): this listing appears to be a rendered diff. Two signatures,
// two conditions per branch and two result expressions (`1` / `Some(1)`, ...)
// are shown side by side; presumably the `uint` forms are the removed lines
// and the `Option<uint>` forms (with the `dst.len()` checks and the final
// `None`) are the added ones -- confirm against the real file.
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
#[inline]
fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint> {
// Marked #[inline] to allow LLVM to optimize it away
let mut ch = *self as u32;
if (ch & 0xFFFF_u32) == ch {
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
// The BMP falls through (assuming non-surrogate, as it should be)
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
dst[0] = ch as u16;
1
} else {
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
// With the `dst.len()` checks, a 16-bit code point can only miss the
// first branch when `dst` is empty, which also fails this branch's
// length check, so it never reaches the assert below.
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
// Subtract the 0x1_0000 offset, then put the high bits (above bit 9)
// in the 0xD800 unit and the low 10 bits in the 0xDC00 unit.
ch -= 0x1_0000_u32;
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2
Some(2)
} else {
// `dst` was too short for the required number of units; nothing was written.
None
}
}
}
6 changes: 3 additions & 3 deletions src/libcore/fmt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ impl<'a> Formatter<'a> {
let write_prefix = |f: &mut Formatter| {
for c in sign.move_iter() {
let mut b = [0, ..4];
let n = c.encode_utf8(b);
let n = c.encode_utf8(b).unwrap_or(0);
try!(f.buf.write(b.slice_to(n)));
}
if prefixed { f.buf.write(prefix.as_bytes()) }
Expand Down Expand Up @@ -464,7 +464,7 @@ impl<'a> Formatter<'a> {
try!(f(self));
}
let mut fill = [0u8, ..4];
let len = self.fill.encode_utf8(fill);
let len = self.fill.encode_utf8(fill).unwrap_or(0);
for _ in range(0, padding) {
try!(self.buf.write(fill.slice_to(len)));
}
Expand Down Expand Up @@ -540,7 +540,7 @@ impl<'a, T: str::Str> String for T {
impl Char for char {
/// Formats the character by encoding it as UTF-8 into a 4-byte stack buffer
/// and handing the written prefix, viewed as a `&str`, to `secret_string`.
// NOTE(review): both the plain and the `.unwrap_or(0)` `let amt` lines are
// shown (this listing looks like a rendered diff); confirm which one the
// real file keeps.
fn fmt(&self, f: &mut Formatter) -> Result {
let mut utf8 = [0u8, ..4];
let amt = self.encode_utf8(utf8);
// A `None` from `encode_utf8` becomes a length of 0, i.e. an empty prefix.
let amt = self.encode_utf8(utf8).unwrap_or(0);
// Reinterprets the first `amt` bytes as a `&str` without validation;
// presumably sound because `encode_utf8` just wrote them -- confirm.
let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };
secret_string(&s, f)
}
Expand Down
4 changes: 2 additions & 2 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use iter::range;
use num::{CheckedMul, Saturating};
use option::{Option, None, Some};
use raw::Repr;
use slice::ImmutableSlice;
use slice::{ImmutableSlice, MutableSlice};
use slice;
use uint;

Expand Down Expand Up @@ -646,7 +646,7 @@ impl<'a> Iterator<u16> for Utf16CodeUnits<'a> {

let mut buf = [0u16, ..2];
self.chars.next().map(|ch| {
let n = ch.encode_utf16(buf /* as mut slice! */);
let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
if n == 2 { self.extra = buf[1]; }
buf[0]
})
Expand Down
4 changes: 2 additions & 2 deletions src/libcoretest/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ fn test_escape_unicode() {
fn test_encode_utf8() {
// Test helper: encodes `input` as UTF-8 into a 4-byte buffer and asserts
// that the bytes written equal `expect`.
// NOTE(review): both the old and the `.unwrap_or(0)` `let n` lines are shown
// (this listing looks like a rendered diff); confirm which one the real file
// keeps.
fn check(input: char, expect: &[u8]) {
let mut buf = [0u8, ..4];
let n = input.encode_utf8(buf /* as mut slice! */);
// A `None` result becomes 0, so the comparison is against an empty slice.
let n = input.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
assert_eq!(buf.slice_to(n), expect);
}

Expand All @@ -187,7 +187,7 @@ fn test_encode_utf8() {
fn test_encode_utf16() {
// Test helper: encodes `input` as UTF-16 into a 2-element buffer and asserts
// that the units written equal `expect`.
// NOTE(review): both the old and the `.unwrap_or(0)` `let n` lines are shown
// (this listing looks like a rendered diff); confirm which one the real file
// keeps.
fn check(input: char, expect: &[u16]) {
let mut buf = [0u16, ..2];
let n = input.encode_utf16(buf /* as mut slice! */);
// A `None` result becomes 0, so the comparison is against an empty slice.
let n = input.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
assert_eq!(buf.slice_to(n), expect);
}

Expand Down
2 changes: 1 addition & 1 deletion src/libstd/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,7 @@ pub trait Writer {
/// Writes `c` to this writer by encoding it as UTF-8 into a 4-byte stack
/// buffer and passing the encoded prefix to `self.write`.
///
/// Returns whatever `self.write` returns.
// NOTE(review): both the plain and the `.unwrap_or(0)` `let n` lines are
// shown (this listing looks like a rendered diff); confirm which one the
// real file keeps.
#[inline]
fn write_char(&mut self, c: char) -> IoResult<()> {
let mut buf = [0u8, ..4];
let n = c.encode_utf8(buf.as_mut_slice());
// A `None` from `encode_utf8` becomes 0, so an empty slice would be written.
let n = c.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
self.write(buf.slice_to(n))
}

Expand Down