std: remove substr & str::count_*, methodise char_len, implement slice_chars.

huonw · huonw · commit 3ac00a94895f · 2013-06-12T12:21:04.000+10:00
The confusing mixture of byte index and character count meant that every
use of .substr was incorrect; replaced by slice_chars which only uses
character indices. The old behaviour of `.substr(start, n)` can be emulated
via `.slice_from(start).slice_chars(0, n)`.
diff --git a/doc/tutorial.md b/doc/tutorial.md
@@ -1410,7 +1410,7 @@ let new_favorite_crayon_name = favorite_crayon_name.trim();
 
 if favorite_crayon_name.len() > 5 {
    // Create a substring
-   println(favorite_crayon_name.substr(0, 5));
+   println(favorite_crayon_name.slice_chars(0, 5));
 }
 ~~~
 
diff --git a/src/libextra/rope.rs b/src/libextra/rope.rs
@@ -84,9 +84,9 @@ pub fn of_str(str: @~str) -> Rope {
  *
  * # Return value
  *
- * A rope representing the same string as `str.substr(byte_offset,
- * byte_len)`.  Depending on `byte_len`, this rope may be empty, flat
- * or complex.
+ * A rope representing the same string as `str.slice(byte_offset,
+ * byte_offset + byte_len)`.  Depending on `byte_len`, this rope may
+ * be empty, flat or complex.
  *
  * # Performance note
  *
@@ -588,7 +588,7 @@ pub mod node {
      * * char_len - The number of chars in the leaf.
      * * content - Contents of the leaf.
      *
-     *     Note that we can have `char_len < str::char_len(content)`, if
+     *     Note that we can have `char_len < content.char_len()`, if
      *     this leaf is only a subset of the string. Also note that the
      *     string can be shared between several ropes, e.g. for indexing
      *     purposes.
@@ -680,7 +680,7 @@ pub mod node {
      */
     pub fn of_substr(str: @~str, byte_start: uint, byte_len: uint) -> @Node {
         return of_substr_unsafer(str, byte_start, byte_len,
-                              str::count_chars(*str, byte_start, byte_len));
+                                 str.slice(byte_start, byte_start + byte_len).char_len());
     }
 
     /**
@@ -734,7 +734,7 @@ pub mod node {
                     if i == 0u  { first_leaf_char_len }
                     else { hint_max_leaf_char_len };
                 let chunk_byte_len =
-                    str::count_bytes(*str, offset, chunk_char_len);
+                    str.slice_from(offset).slice_chars(0, chunk_char_len).len();
                 nodes[i] = @Leaf(Leaf {
                     byte_offset: offset,
                     byte_len: chunk_byte_len,
@@ -938,7 +938,7 @@ pub mod node {
             match (*node) {
               node::Leaf(x) => {
                 let char_len =
-                    str::count_chars(*x.content, byte_offset, byte_len);
+                    x.content.slice(byte_offset, byte_offset + byte_len).char_len();
                 return @Leaf(Leaf {
                     byte_offset: byte_offset,
                     byte_len: byte_len,
@@ -1002,9 +1002,9 @@ pub mod node {
                     return node;
                 }
                 let byte_offset =
-                    str::count_bytes(*x.content, 0u, char_offset);
+                    x.content.slice_chars(0, char_offset).len();
                 let byte_len    =
-                    str::count_bytes(*x.content, byte_offset, char_len);
+                    x.content.slice_from(byte_offset).slice_chars(0, char_len).len();
                 return @Leaf(Leaf {
                     byte_offset: byte_offset,
                     byte_len: byte_len,
@@ -1312,7 +1312,7 @@ mod tests {
         let sample = @~"0123456789ABCDE";
         let r      = of_str(sample);
 
-        assert_eq!(char_len(r), str::char_len(*sample));
+        assert_eq!(char_len(r), sample.char_len());
         assert!(rope_to_string(r) == *sample);
     }
 
@@ -1328,7 +1328,7 @@ mod tests {
         }
         let sample = @copy *buf;
         let r      = of_str(sample);
-        assert!(char_len(r) == str::char_len(*sample));
+        assert_eq!(char_len(r), sample.char_len());
         assert!(rope_to_string(r) == *sample);
 
         let mut string_iter = 0u;
@@ -1374,7 +1374,7 @@ mod tests {
             }
         }
 
-        assert_eq!(len, str::char_len(*sample));
+        assert_eq!(len, sample.char_len());
     }
 
     #[test]
diff --git a/src/libextra/terminfo/searcher.rs b/src/libextra/terminfo/searcher.rs
@@ -27,7 +27,7 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> {
     let homedir = os::homedir();
 
     let mut dirs_to_search = ~[];
-    let first_char = term.substr(0, 1);
+    let first_char = term.char_at(0);
 
     // Find search directory
     match getenv("TERMINFO") {
@@ -57,12 +57,12 @@ pub fn get_dbpath_for_term(term: &str) -> Option<~path> {
 
     // Look for the terminal in all of the search directories
     for dirs_to_search.each |p| {
-        let newp = ~p.push_many(&[first_char.to_owned(), term.to_owned()]);
+        let newp = ~p.push_many(&[str::from_char(first_char), term.to_owned()]);
         if os::path_exists(p) && os::path_exists(newp) {
             return Some(newp);
         }
         // on some installations the dir is named after the hex of the char (e.g. OS X)
-        let newp = ~p.push_many(&[fmt!("%x", first_char[0] as uint), term.to_owned()]);
+        let newp = ~p.push_many(&[fmt!("%x", first_char as uint), term.to_owned()]);
         if os::path_exists(p) && os::path_exists(newp) {
             return Some(newp);
         }
diff --git a/src/librustc/driver/driver.rs b/src/librustc/driver/driver.rs
@@ -590,7 +590,7 @@ pub fn build_session_options(binary: @~str,
 
         // FIXME: #4318 Instead of to_ascii and to_str_ascii, could use
         // to_ascii_consume and to_str_consume to not do a unnecessary copy.
-        let level_short = level_name.substr(0,1);
+        let level_short = level_name.slice_chars(0, 1);
         let level_short = level_short.to_ascii().to_upper().to_str_ascii();
         let flags = vec::append(getopts::opt_strs(matches, level_short),
                                 getopts::opt_strs(matches, level_name));
diff --git a/src/librustc/middle/resolve.rs b/src/librustc/middle/resolve.rs
@@ -2678,14 +2678,14 @@ impl Resolver {
         match module_prefix_result {
             Failed => {
                 let mpath = self.idents_to_str(module_path);
-                match self.idents_to_str(module_path).rfind(':') {
+                match mpath.rfind(':') {
                     Some(idx) => {
                         self.session.span_err(span, fmt!("unresolved import: could not find `%s` \
-                                                         in `%s`", mpath.substr(idx,
-                                                                                mpath.len() - idx),
-                                                         // idx - 1 to account for the extra
-                                                         // colon
-                                                         mpath.substr(0, idx - 1)));
+                                                         in `%s`",
+                                                         // idx +- 1 to account for the colons
+                                                         // on either side
+                                                         mpath.slice_from(idx + 1),
+                                                         mpath.slice_to(idx - 1)));
                     },
                     None => (),
                 };
diff --git a/src/librustc/middle/trans/expr.rs b/src/librustc/middle/trans/expr.rs
@@ -1704,5 +1704,5 @@ fn trans_assign_op(bcx: block,
 }
 
 fn shorten(x: ~str) -> ~str {
-    if x.len() > 60 { x.substr(0, 60).to_owned() } else { x }
+    if x.char_len() > 60 { x.slice_chars(0, 60).to_owned() } else { x }
 }
diff --git a/src/librusti/rusti.rc b/src/librusti/rusti.rc
@@ -284,7 +284,7 @@ fn run_cmd(repl: &mut Repl, _in: @io::Reader, _out: @io::Writer,
             for args.each |arg| {
                 let (crate, filename) =
                     if arg.ends_with(".rs") || arg.ends_with(".rc") {
-                    (arg.substr(0, arg.len() - 3).to_owned(), copy *arg)
+                    (arg.slice_to(arg.len() - 3).to_owned(), copy *arg)
                 } else {
                     (copy *arg, arg + ".rs")
                 };
@@ -342,7 +342,8 @@ pub fn run_line(repl: &mut Repl, in: @io::Reader, out: @io::Writer, line: ~str,
         // FIXME #5898: conflicts with Cell.take(), so can't be at the top level
         use core::iterator::IteratorUtil;
 
-        let full = line.substr(1, line.len() - 1);
+        // drop the : and the \n (one byte each)
+        let full = line.slice(1, line.len() - 1);
         let split: ~[~str] = full.word_iter().transform(|s| s.to_owned()).collect();
         let len = split.len();
 
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
@@ -848,15 +848,6 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
     return true;
 }
 
-
-/*
-Section: String properties
-*/
-
-/// Returns the number of characters that a string holds
-#[inline(always)]
-pub fn char_len(s: &str) -> uint { count_chars(s, 0u, s.len()) }
-
 /*
 Section: Misc
 */
@@ -974,46 +965,6 @@ pub fn with_capacity(capacity: uint) -> ~str {
     buf
 }
 
-/**
- * As char_len but for a slice of a string
- *
- * # Arguments
- *
- * * s - A valid string
- * * start - The position inside `s` where to start counting in bytes
- * * end - The position where to stop counting
- *
- * # Return value
- *
- * The number of Unicode characters in `s` between the given indices.
- */
-pub fn count_chars(s: &str, start: uint, end: uint) -> uint {
-    assert!(s.is_char_boundary(start));
-    assert!(s.is_char_boundary(end));
-    let mut (i, len) = (start, 0u);
-    while i < end {
-        let next = s.char_range_at(i).next;
-        len += 1u;
-        i = next;
-    }
-    return len;
-}
-
-/// Counts the number of bytes taken by the first `n` chars in `s`
-/// starting from `start`.
-pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
-    assert!(s.is_char_boundary(start));
-    let mut (end, cnt) = (start, n);
-    let l = s.len();
-    while cnt > 0u {
-        assert!(end < l);
-        let next = s.char_range_at(end).next;
-        cnt -= 1u;
-        end = next;
-    }
-    end - start
-}
-
 /// Given a first byte, determine how many bytes are in this UTF-8 character
 pub fn utf8_char_width(b: u8) -> uint {
     let byte: uint = b as uint;
@@ -1394,11 +1345,14 @@ pub trait StrSlice<'self> {
     fn is_alphanumeric(&self) -> bool;
     fn len(&self) -> uint;
     fn char_len(&self) -> uint;
+
     fn slice(&self, begin: uint, end: uint) -> &'self str;
     fn slice_from(&self, begin: uint) -> &'self str;
     fn slice_to(&self, end: uint) -> &'self str;
+
+    fn slice_chars(&self, begin: uint, end: uint) -> &'self str;
+
     fn starts_with(&self, needle: &str) -> bool;
-    fn substr(&self, begin: uint, n: uint) -> &'self str;
     fn escape_default(&self) -> ~str;
     fn escape_unicode(&self) -> ~str;
     fn trim(&self) -> &'self str;
@@ -1595,7 +1549,8 @@ impl<'self> StrSlice<'self> for &'self str {
     }
     /// Returns the number of characters that a string holds
     #[inline]
-    fn char_len(&self) -> uint { char_len(*self) }
+    fn char_len(&self) -> uint { self.iter().count() }
+
     /**
      * Returns a slice of the given string from the byte range
      * [`begin`..`end`)
@@ -1626,6 +1581,32 @@ impl<'self> StrSlice<'self> for &'self str {
     fn slice_to(&self, end: uint) -> &'self str {
         self.slice(0, end)
     }
+
+    /// Returns a slice of the string from the char range
+    /// [`begin`..`end`).
+    ///
+    /// Fails if `begin` > `end` or the either `begin` or `end` are
+    /// beyond the last character of the string.
+    fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
+        assert!(begin <= end);
+        // not sure how to use the iterators for this nicely.
+        let mut (position, count) = (0, 0);
+        let l = self.len();
+        while count < begin && position < l {
+            position = self.char_range_at(position).next;
+            count += 1;
+        }
+        if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
+        let start_byte = position;
+        while count < end && position < l {
+            position = self.char_range_at(position).next;
+            count += 1;
+        }
+        if count < end { fail!("Attempted to end slice_chars beyond end of string") }
+
+        self.slice(start_byte, position)
+    }
+
     /// Returns true if `needle` is a prefix of the string.
     fn starts_with<'a>(&self, needle: &'a str) -> bool {
         let (self_len, needle_len) = (self.len(), needle.len());
@@ -1641,16 +1622,6 @@ impl<'self> StrSlice<'self> for &'self str {
         else { match_at(*self, needle, self_len - needle_len) }
     }
 
-    /**
-     * Take a substring of another.
-     *
-     * Returns a string containing `n` characters starting at byte offset
-     * `begin`.
-     */
-    #[inline]
-    fn substr(&self, begin: uint, n: uint) -> &'self str {
-        self.slice(begin, begin + count_bytes(*self, begin, n))
-    }
     /// Escape each char in `s` with char::escape_default.
     #[inline]
     fn escape_default(&self) -> ~str { escape_default(*self) }
@@ -2367,14 +2338,14 @@ mod tests {
         assert_eq!("\u2620".len(), 3u);
         assert_eq!("\U0001d11e".len(), 4u);
 
-        assert_eq!(char_len(""), 0u);
-        assert_eq!(char_len("hello world"), 11u);
-        assert_eq!(char_len("\x63"), 1u);
-        assert_eq!(char_len("\xa2"), 1u);
-        assert_eq!(char_len("\u03c0"), 1u);
-        assert_eq!(char_len("\u2620"), 1u);
-        assert_eq!(char_len("\U0001d11e"), 1u);
-        assert_eq!(char_len("ประเทศไทย中华Việt Nam"), 19u);
+        assert_eq!("".char_len(), 0u);
+        assert_eq!("hello world".char_len(), 11u);
+        assert_eq!("\x63".char_len(), 1u);
+        assert_eq!("\xa2".char_len(), 1u);
+        assert_eq!("\u03c0".char_len(), 1u);
+        assert_eq!("\u2620".char_len(), 1u);
+        assert_eq!("\U0001d11e".char_len(), 1u);
+        assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
     }
 
     #[test]
@@ -2509,13 +2480,13 @@ mod tests {
     }
 
     #[test]
-    fn test_substr() {
-        fn t(a: &str, b: &str, start: int) {
-            assert_eq!(a.substr(start as uint, b.len()), b);
+    fn test_slice_chars() {
+        fn t(a: &str, b: &str, start: uint) {
+            assert_eq!(a.slice_chars(start, start + b.char_len()), b);
         }
         t("hello", "llo", 2);
         t("hello", "el", 1);
-        assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".substr(6u, 6u));
+        assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
     }
 
     #[test]
diff --git a/src/libstd/unstable/extfmt.rs b/src/libstd/unstable/extfmt.rs
@@ -325,7 +325,7 @@ pub mod ct {
             'o' => TyOctal,
             'f' => TyFloat,
             '?' => TyPoly,
-            _ => err(~"unknown type in conversion: " + s.substr(i, 1))
+            _ => err(fmt!("unknown type in conversion: %c", s.char_at(i)))
         };
 
         Parsed::new(t, i + 1)
@@ -546,7 +546,7 @@ pub mod rt {
         // displayed
         let unpadded = match cv.precision {
           CountImplied => s,
-          CountIs(max) => if (max as uint) < str::char_len(s) {
+          CountIs(max) => if (max as uint) < s.char_len() {
             s.slice(0, max as uint)
           } else {
             s
@@ -584,7 +584,7 @@ pub mod rt {
                 ~""
             } else {
                 let s = uint::to_str_radix(num, radix);
-                let len = str::char_len(s);
+                let len = s.char_len();
                 if len < prec {
                     let diff = prec - len;
                     let pad = str::from_chars(vec::from_elem(diff, '0'));
@@ -614,7 +614,7 @@ pub mod rt {
             }
             CountIs(width) => { width as uint }
         };
-        let strlen = str::char_len(s) + headsize;
+        let strlen = s.char_len() + headsize;
         if uwidth <= strlen {
             for head.iter().advance |&c| {
                 buf.push_char(c);
diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs

Original file line number	Diff line number	Diff line change
`@@ -1410,7 +1410,7 @@ let new_favorite_crayon_name = favorite_crayon_name.trim();`
`1410`	`1410`
`1411`	`1411`	`if favorite_crayon_name.len() > 5 {`
`1412`	`1412`	`// Create a substring`
`1413`		`- println(favorite_crayon_name.substr(0, 5));`
	`1413`	`+ println(favorite_crayon_name.slice_chars(0, 5));`
`1414`	`1414`	`}`
`1415`	`1415`	`~~~`
`1416`	`1416`
Original file line number	Diff line number	Diff line change
`@@ -1704,5 +1704,5 @@ fn trans_assign_op(bcx: block,`
`1704`	`1704`	`}`
`1705`	`1705`
`1706`	`1706`	`fn shorten(x: ~str) -> ~str {`
`1707`		`- if x.len() > 60 { x.substr(0, 60).to_owned() } else { x }`
	`1707`	`+ if x.char_len() > 60 { x.slice_chars(0, 60).to_owned() } else { x }`
`1708`	`1708`	`}`