Fix #3961: use char range methods instead of byte offsets to detect whitespace.

pnkfelix · pnkfelix · commit 1deebeef7d18 · 2013-06-13T23:40:24.000+02:00
diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/comments.rs
@@ -198,26 +198,35 @@ fn read_line_comments(rdr: @mut StringReader, code_to_the_left: bool,
     }
 }
 
-// FIXME #3961: This is not the right way to convert string byte
-// offsets to characters.
-fn all_whitespace(s: &str, begin: uint, end: uint) -> bool {
-    let mut i: uint = begin;
-    while i != end {
-        if !is_whitespace(s[i] as char) { return false; } i += 1u;
+// Returns None if any of the first col chars of s is not whitespace.
+// Otherwise returns Some(k), where k is the byte offset just past that leading
+// whitespace.  Note k may be out of bounds for s (e.g. k == s.len()).
+fn all_whitespace(s: &str, col: CharPos) -> Option<uint> {
+    let len = s.len();
+    let mut col = col.to_uint(); // chars still to inspect
+    let mut cursor: uint = 0; // offset in s of the next char to inspect
+    while col > 0 && cursor < len {
+        let r: str::CharRange = str::char_range_at(s, cursor);
+        if !r.ch.is_whitespace() {
+            return None;
+        }
+        cursor = r.next;
+        col -= 1;
     }
-    return true;
+    return Some(cursor);
 }
 
 fn trim_whitespace_prefix_and_push_line(lines: &mut ~[~str],
                                         s: ~str, col: CharPos) {
     let len = s.len();
-    // FIXME #3961: Doing bytewise comparison and slicing with CharPos
-    let col = col.to_uint();
-    let s1 = if all_whitespace(s, 0, uint::min(len, col)) {
-        if col < len {
-            s.slice(col, len).to_owned()
-        } else {  ~"" }
-    } else { s };
+    let s1 = match all_whitespace(s, col) { // trim prefix only if all whitespace
+        Some(col) => { // shadows the CharPos: col is now the returned offset
+            if col < len {
+                s.slice(col, len).to_owned()
+            } else {  ~"" } // nothing remains after the whitespace prefix
+        }
+        None => s, // non-whitespace inside the prefix: keep the line intact
+    };
     debug!("pushing line: %s", s1);
     lines.push(s1);
 }
diff --git a/src/test/pretty/block-comment-wchar.rs b/src/test/pretty/block-comment-wchar.rs
@@ -0,0 +1,109 @@
+// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// This is meant as a test case for Issue 3961.
+//
+// Test via: rustc --pretty normal src/test/pretty/block-comment-wchar.rs
+
+fn f() {
+    fn nested() {
+        /*
+  Spaced2
+        */
+        /*
+          Spaced10
+        */
+        /*
+								  Tabbed8+2
+        */
+        /*
+  CR8+2
+        */
+    }
+    /*
+  Spaced2:                       (prefixed so start of space aligns with comment)
+    */
+    /*
+		Tabbed2: (more indented b/c *start* of space will align with comment)
+    */
+    /*
+      Spaced6:                       (Alignment removed and realigning spaces inserted)
+    */
+    /*
+				  Tabbed4+2:                     (Alignment removed and realigning spaces inserted)
+    */
+
+    /*
+  VT4+2:                         (should align)
+    */
+    /*
+  FF4+2:                         (should align)
+    */
+    /*
+  CR4+2:                         (should align)
+    */
+    /*
+    // (NEL deliberately omitted)
+    */
+    /*
+      Ogham Space Mark 4+2:          (should align)
+    */
+    /*
+᠎᠎᠎᠎  Mongolian Vowel Separator 4+2: (should align)
+    */
+    /*
+      Four-per-em space 4+2:         (should align)
+    */
+
+    /*
+   ᠎  Mongolian Vowel Sep   count 1: (should align)
+  ᠎   Mongolian Vowel Sep   count 2: (should align)
+  ᠎᠎  Mongolian Vowel Sep   count 3: (should align)
+ ᠎    Mongolian Vowel Sep   count 4: (should align)
+ ᠎ ᠎  Mongolian Vowel Sep   count 5: (should align)
+ ᠎᠎   Mongolian Vowel Sep   count 6: (should align)
+ ᠎᠎᠎  Mongolian Vowel Sep   count 7: (should align)
+᠎     Mongolian Vowel Sep   count 8: (should align)
+᠎  ᠎  Mongolian Vowel Sep   count 9: (should align)
+᠎ ᠎   Mongolian Vowel Sep   count A: (should align)
+᠎ ᠎᠎  Mongolian Vowel Sep   count B: (should align)
+᠎᠎    Mongolian Vowel Sep   count C: (should align)
+᠎᠎ ᠎  Mongolian Vowel Sep   count D: (should align)
+᠎᠎᠎   Mongolian Vowel Sep   count E: (should align)
+᠎᠎᠎᠎  Mongolian Vowel Sep   count F: (should align)
+    */
+
+/* */ /*
+        Hello from offset 6
+        Space 6+2:                     compare A
+᠎᠎᠎᠎᠎᠎  Mongolian Vowel Separator 6+2: compare B
+      */
+/*᠎*/ /*
+        Hello from another offset 6 with wchars establishing column offset
+        Space 6+2:                     compare C
+᠎᠎᠎᠎᠎᠎  Mongolian Vowel Separator 6+2: compare D
+      */
+}
+
+fn main() {
+    // Taken from http://en.wikipedia.org/wiki/Whitespace_character
+    let chars = [ '\x0A', '\x0B', '\x0C', '\x0D', '\x20',
+                 // '\x85', // for some reason Rust thinks NEL isn't whitespace
+                 '\xA0', '\u1680', '\u180E',
+                 '\u2000', '\u2001', '\u2002', '\u2003',
+                 '\u2004', '\u2005', '\u2006', '\u2007',
+                 '\u2008', '\u2009', '\u200A',
+                 '\u2028', '\u2029', '\u202F', '\u205F',
+                 '\u3000'
+                ];
+    for vec::each(chars) |c| {
+        io::println(fmt!("%? %?", c, c.is_whitespace()));
+    }
+}