auto merge of #16498 : Kimundi/rust/inline-utf-encoding, r=alexcrichton

bors · bors · commit cb9c1e0e702f · 2014-08-17T04:42:32.000Z
The first commit improves code generation through a few changes:
- The `#[inline]` attributes allow LLVM to constant-fold the encoding step away in certain situations. For example, code like this changes from a call to `encode_utf8` in an inner loop to the pushing of a byte constant:

 ```rust
let mut s = String::new();
for _ in range(0u, 21) {
        s.push_char('a');
}
```
- Both methods changed their semantics from causing a run-time failure if the target buffer is not large enough to returning `None` instead. As a result, LLVM no longer emits failure-handling code for these methods.
- A few debug `assert!()` calls got removed because they affected code generation due to unwinding, and were basically unnecessary with today's sound handling of `char` as a Unicode scalar value.

~~The second commit is optional. It changes the methods from regular indexing with the `dst[i]` syntax to unsafe indexing with `dst.unsafe_mut_ref(i)`. This does not change code generation directly - in both cases llvm is smart enough to see that there can never be an out-of-bounds access. But it makes it emit a `nounwind` attribute for the function. 
However, I'm not sure whether that is a real improvement, so if there is any objection to this I'll remove the commit.~~

This changes how the methods behave on a too small buffer, so this is a 

[breaking-change]
diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs
@@ -1682,7 +1682,7 @@ mod tests {
     fn test_chars_decoding() {
         let mut bytes = [0u8, ..4];
         for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
-            let len = c.encode_utf8(bytes);
+            let len = c.encode_utf8(bytes).unwrap_or(0);
             let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
             if Some(c) != s.chars().next() {
                 fail!("character {:x}={} does not decode correctly", c as u32, c);
@@ -1694,7 +1694,7 @@ mod tests {
     fn test_chars_rev_decoding() {
         let mut bytes = [0u8, ..4];
         for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
-            let len = c.encode_utf8(bytes);
+            let len = c.encode_utf8(bytes).unwrap_or(0);
             let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
             if Some(c) != s.chars().rev().next() {
                 fail!("character {:x}={} does not decode correctly", c as u32, c);
diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs
@@ -502,7 +502,7 @@ impl String {
                 data: self.vec.as_ptr().offset(cur_len as int),
                 len: 4,
             };
-            let used = ch.encode_utf8(mem::transmute(slice));
+            let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);
             self.vec.set_len(cur_len + used);
         }
     }
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -18,6 +18,7 @@
 use mem::transmute;
 use option::{None, Option, Some};
 use iter::range_step;
+use collections::Collection;
 
 // UTF-8 ranges and tags for encoding characters
 static TAG_CONT: u8    = 0b1000_0000u8;
@@ -27,7 +28,6 @@ static TAG_FOUR_B: u8  = 0b1111_0000u8;
 static MAX_ONE_B: u32   =     0x80u32;
 static MAX_TWO_B: u32   =    0x800u32;
 static MAX_THREE_B: u32 =  0x10000u32;
-static MAX_FOUR_B:  u32 = 0x200000u32;
 
 /*
     Lu  Uppercase_Letter        an uppercase letter
@@ -217,14 +217,14 @@ pub fn escape_default(c: char, f: |char|) {
 }
 
 /// Returns the amount of bytes this `char` would need if encoded in UTF-8
+#[inline]
 pub fn len_utf8_bytes(c: char) -> uint {
     let code = c as u32;
     match () {
         _ if code < MAX_ONE_B   => 1u,
         _ if code < MAX_TWO_B   => 2u,
         _ if code < MAX_THREE_B => 3u,
-        _ if code < MAX_FOUR_B  => 4u,
-        _                       => fail!("invalid character!"),
+        _  => 4u,
     }
 }
 
@@ -297,21 +297,19 @@ pub trait Char {
     /// UTF-8.
     fn len_utf8_bytes(&self) -> uint;
 
-    /// Encodes this character as UTF-8 into the provided byte buffer.
-    ///
-    /// The buffer must be at least 4 bytes long or a runtime failure may
-    /// occur.
+    /// Encodes this character as UTF-8 into the provided byte buffer,
+    /// and then returns the number of bytes written.
     ///
-    /// This will then return the number of bytes written to the slice.
-    fn encode_utf8(&self, dst: &mut [u8]) -> uint;
+    /// If the buffer is not large enough, nothing will be written into it
+    /// and a `None` will be returned.
+    fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;
 
-    /// Encodes this character as UTF-16 into the provided `u16` buffer.
+    /// Encodes this character as UTF-16 into the provided `u16` buffer,
+    /// and then returns the number of `u16`s written.
     ///
-    /// The buffer must be at least 2 elements long or a runtime failure may
-    /// occur.
-    ///
-    /// This will then return the number of `u16`s written to the slice.
-    fn encode_utf16(&self, dst: &mut [u16]) -> uint;
+    /// If the buffer is not large enough, nothing will be written into it
+    /// and a `None` will be returned.
+    fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
 }
 
 impl Char for char {
@@ -325,45 +323,52 @@ impl Char for char {
 
     fn escape_default(&self, f: |char|) { escape_default(*self, f) }
 
+    #[inline]
     fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
 
-    fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
+    #[inline]
+    fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> Option<uint> {
+        // Marked #[inline] to allow llvm optimizing it away
         let code = *self as u32;
-        if code < MAX_ONE_B {
+        if code < MAX_ONE_B && dst.len() >= 1 {
             dst[0] = code as u8;
-            1
-        } else if code < MAX_TWO_B {
+            Some(1)
+        } else if code < MAX_TWO_B && dst.len() >= 2 {
             dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
             dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            2
-        } else if code < MAX_THREE_B {
+            Some(2)
+        } else if code < MAX_THREE_B && dst.len() >= 3  {
             dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
             dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
             dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            3
-        } else {
+            Some(3)
+        } else if dst.len() >= 4 {
             dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
             dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
             dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
             dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
-            4
+            Some(4)
+        } else {
+            None
         }
     }
 
-    fn encode_utf16(&self, dst: &mut [u16]) -> uint {
+    #[inline]
+    fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint> {
+        // Marked #[inline] to allow llvm optimizing it away
         let mut ch = *self as u32;
-        if (ch & 0xFFFF_u32) == ch {
+        if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
             // The BMP falls through (assuming non-surrogate, as it should)
-            assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
             dst[0] = ch as u16;
-            1
-        } else {
+            Some(1)
+        } else if dst.len() >= 2 {
             // Supplementary planes break into surrogates.
-            assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
             ch -= 0x1_0000_u32;
             dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
             dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
-            2
+            Some(2)
+        } else {
+            None
         }
     }
 }
diff --git a/src/libcore/fmt/mod.rs b/src/libcore/fmt/mod.rs
@@ -366,7 +366,7 @@ impl<'a> Formatter<'a> {
         let write_prefix = |f: &mut Formatter| {
             for c in sign.move_iter() {
                 let mut b = [0, ..4];
-                let n = c.encode_utf8(b);
+                let n = c.encode_utf8(b).unwrap_or(0);
                 try!(f.buf.write(b.slice_to(n)));
             }
             if prefixed { f.buf.write(prefix.as_bytes()) }
@@ -467,7 +467,7 @@ impl<'a> Formatter<'a> {
             try!(f(self));
         }
         let mut fill = [0u8, ..4];
-        let len = self.fill.encode_utf8(fill);
+        let len = self.fill.encode_utf8(fill).unwrap_or(0);
         for _ in range(0, padding) {
             try!(self.buf.write(fill.slice_to(len)));
         }
@@ -545,7 +545,7 @@ impl Char for char {
         use char::Char;
 
         let mut utf8 = [0u8, ..4];
-        let amt = self.encode_utf8(utf8);
+        let amt = self.encode_utf8(utf8).unwrap_or(0);
         let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };
         secret_string(&s, f)
     }
diff --git a/src/libcore/str.rs b/src/libcore/str.rs
@@ -30,7 +30,7 @@ use iter::range;
 use num::{CheckedMul, Saturating};
 use option::{Option, None, Some};
 use raw::Repr;
-use slice::ImmutableSlice;
+use slice::{ImmutableSlice, MutableSlice};
 use slice;
 use uint;
 
@@ -646,7 +646,7 @@ impl<'a> Iterator<u16> for Utf16CodeUnits<'a> {
 
         let mut buf = [0u16, ..2];
         self.chars.next().map(|ch| {
-            let n = ch.encode_utf16(buf /* as mut slice! */);
+            let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
             if n == 2 { self.extra = buf[1]; }
             buf[0]
         })
diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
@@ -173,7 +173,7 @@ fn test_escape_unicode() {
 fn test_encode_utf8() {
     fn check(input: char, expect: &[u8]) {
         let mut buf = [0u8, ..4];
-        let n = input.encode_utf8(buf /* as mut slice! */);
+        let n = input.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
         assert_eq!(buf.slice_to(n), expect);
     }
 
@@ -187,7 +187,7 @@ fn test_encode_utf8() {
 fn test_encode_utf16() {
     fn check(input: char, expect: &[u16]) {
         let mut buf = [0u16, ..2];
-        let n = input.encode_utf16(buf /* as mut slice! */);
+        let n = input.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
         assert_eq!(buf.slice_to(n), expect);
     }
 
diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs
@@ -1110,7 +1110,7 @@ pub trait Writer {
     #[inline]
     fn write_char(&mut self, c: char) -> IoResult<()> {
         let mut buf = [0u8, ..4];
-        let n = c.encode_utf8(buf.as_mut_slice());
+        let n = c.encode_utf8(buf.as_mut_slice()).unwrap_or(0);
         self.write(buf.slice_to(n))
     }
 

Original file line number	Diff line number	Diff line change
`@@ -502,7 +502,7 @@ impl String {`
`502`	`502`	`data: self.vec.as_ptr().offset(cur_len as int),`
`503`	`503`	`len: 4,`
`504`	`504`	`};`
`505`		`- let used = ch.encode_utf8(mem::transmute(slice));`
	`505`	`+ let used = ch.encode_utf8(mem::transmute(slice)).unwrap_or(0);`
`506`	`506`	`self.vec.set_len(cur_len + used);`
`507`	`507`	`}`
`508`	`508`	`}`
Original file line number	Diff line number	Diff line change
`@@ -366,7 +366,7 @@ impl<'a> Formatter<'a> {`
`366`	`366`	`let write_prefix = \|f: &mut Formatter\| {`
`367`	`367`	`for c in sign.move_iter() {`
`368`	`368`	`let mut b = [0, ..4];`
`369`		`- let n = c.encode_utf8(b);`
	`369`	`+ let n = c.encode_utf8(b).unwrap_or(0);`
`370`	`370`	`try!(f.buf.write(b.slice_to(n)));`
`371`	`371`	`}`
`372`	`372`	`if prefixed { f.buf.write(prefix.as_bytes()) }`
`@@ -467,7 +467,7 @@ impl<'a> Formatter<'a> {`
`467`	`467`	`try!(f(self));`
`468`	`468`	`}`
`469`	`469`	`let mut fill = [0u8, ..4];`
`470`		`- let len = self.fill.encode_utf8(fill);`
	`470`	`+ let len = self.fill.encode_utf8(fill).unwrap_or(0);`
`471`	`471`	`for _ in range(0, padding) {`
`472`	`472`	`try!(self.buf.write(fill.slice_to(len)));`
`473`	`473`	`}`
`@@ -545,7 +545,7 @@ impl Char for char {`
`545`	`545`	`use char::Char;`
`546`	`546`
`547`	`547`	`let mut utf8 = [0u8, ..4];`
`548`		`- let amt = self.encode_utf8(utf8);`
	`548`	`+ let amt = self.encode_utf8(utf8).unwrap_or(0);`
`549`	`549`	`let s: &str = unsafe { mem::transmute(utf8.slice_to(amt)) };`
`550`	`550`	`secret_string(&s, f)`
`551`	`551`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1110,7 +1110,7 @@ pub trait Writer {`
`1110`	`1110`	`#[inline]`
`1111`	`1111`	`fn write_char(&mut self, c: char) -> IoResult<()> {`
`1112`	`1112`	`let mut buf = [0u8, ..4];`
`1113`		`- let n = c.encode_utf8(buf.as_mut_slice());`
	`1113`	`+ let n = c.encode_utf8(buf.as_mut_slice()).unwrap_or(0);`
`1114`	`1114`	`self.write(buf.slice_to(n))`
`1115`	`1115`	`}`
`1116`	`1116`