---

emberian · emberian · commit 749613412baa · 2014-07-09T00:06:29.000-07:00
yaml --- r: 123707 b: refs/heads/try c: f512779 h: refs/heads/master i: 123705: 36f075b 123703: 1b1cdee v: v3
diff --git a/[refs] b/[refs]
@@ -2,7 +2,7 @@
 refs/heads/master: da4e4e4e0a7778a85748aa4a303b13f603e96b4b
 refs/heads/snap-stage1: e33de59e47c5076a89eadeb38f4934f58a3618a6
 refs/heads/snap-stage3: 8ddd286ea4ba4384a0dc9eae393ed515460a986e
-refs/heads/try: cc4213418e3ab225867d8e3911f592481b1bbffc
+refs/heads/try: f512779554a436d11dd9ffde4c198da6241dfd58
 refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
 refs/heads/ndm: f3868061cd7988080c30d6d5bf352a5a5fe2460b
 refs/heads/try2: 147ecfdd8221e4a4d4e090486829a06da1e0ca3c
diff --git a/branches/try/src/librustdoc/html/highlight.rs b/branches/try/src/librustdoc/html/highlight.rs
@@ -18,7 +18,6 @@ use std::io;
 
 use syntax::parse;
 use syntax::parse::lexer;
-use syntax::codemap::{BytePos, Span};
 
 use html::escape::Escape;
 
@@ -59,38 +58,30 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
         None => {}
     }
     try!(write!(out, "class='rust {}'>\n", class.unwrap_or("")));
-    let mut last = BytePos(0);
     let mut is_attribute = false;
     let mut is_macro = false;
     let mut is_macro_nonterminal = false;
     loop {
         let next = lexer.next_token();
-        let test = if next.tok == t::EOF {lexer.pos} else {next.sp.lo};
-
-        // The lexer consumes all whitespace and non-doc-comments when iterating
-        // between tokens. If this token isn't directly adjacent to our last
-        // token, then we need to emit the whitespace/comment.
-        //
-        // If the gap has any '/' characters then we consider the whole thing a
-        // comment. This will classify some whitespace as a comment, but that
-        // doesn't matter too much for syntax highlighting purposes.
-        if test > last {
-            let snip = sess.span_diagnostic.cm.span_to_snippet(Span {
-                lo: last,
-                hi: test,
-                expn_info: None,
-            }).unwrap();
-            if snip.as_slice().contains("/") {
-                try!(write!(out, "<span class='comment'>{}</span>",
-                              Escape(snip.as_slice())));
-            } else {
-                try!(write!(out, "{}", Escape(snip.as_slice())));
-            }
-        }
-        last = next.sp.hi;
+
+        let snip = |sp| sess.span_diagnostic.cm.span_to_snippet(sp).unwrap();
+
         if next.tok == t::EOF { break }
 
         let klass = match next.tok {
+            t::WS => {
+                try!(write!(out, "{}", Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::COMMENT => {
+                try!(write!(out, "<span class='comment'>{}</span>",
+                            Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::SHEBANG(s) => {
+                try!(write!(out, "{}", Escape(s.as_str())));
+                continue
+            },
             // If this '&' token is directly adjacent to another token, assume
             // that it's the address-of operator instead of the and-operator.
             // This allows us to give all pointers their own class (`Box` and
diff --git a/branches/try/src/libsyntax/parse/attr.rs b/branches/try/src/libsyntax/parse/attr.rs
@@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> {
     fn parse_outer_attributes(&mut self) -> Vec<ast::Attribute> {
         let mut attrs: Vec<ast::Attribute> = Vec::new();
         loop {
-            debug!("parse_outer_attributes: self.token={:?}",
+            debug!("parse_outer_attributes: self.token={}",
                    self.token);
             match self.token {
               token::POUND => {
diff --git a/branches/try/src/libsyntax/parse/lexer/comments.rs b/branches/try/src/libsyntax/parse/lexer/comments.rs
@@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
 use diagnostic;
 use parse::lexer::{is_whitespace, Reader};
 use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
+use parse::lexer::is_block_doc_comment;
 use parse::lexer;
 use parse::token;
 
@@ -42,9 +42,9 @@ pub struct Comment {
 }
 
 pub fn is_doc_comment(s: &str) -> bool {
-    (s.starts_with("///") && !is_line_non_doc_comment(s)) ||
+    (s.starts_with("///") && super::is_doc_comment(s)) ||
     s.starts_with("//!") ||
-    (s.starts_with("/**") && !is_block_non_doc_comment(s)) ||
+    (s.starts_with("/**") && is_block_doc_comment(s)) ||
     s.starts_with("/*!")
 }
 
@@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader,
             rdr.bump();
             rdr.bump();
         }
-        if !is_block_non_doc_comment(curr_line.as_slice()) {
+        if is_block_doc_comment(curr_line.as_slice()) {
             return
         }
         assert!(!curr_line.as_slice().contains_char('\n'));
diff --git a/branches/try/src/libsyntax/parse/lexer/mod.rs b/branches/try/src/libsyntax/parse/lexer/mod.rs
@@ -187,7 +187,7 @@ impl<'a> StringReader<'a> {
     /// Advance peek_tok and peek_span to refer to the next token, and
     /// possibly update the interner.
     fn advance_token(&mut self) {
-        match self.consume_whitespace_and_comments() {
+        match self.scan_whitespace_or_comment() {
             Some(comment) => {
                 self.peek_span = comment.sp;
                 self.peek_tok = comment.tok;
@@ -339,8 +339,7 @@ impl<'a> StringReader<'a> {
 
     /// PRECONDITION: self.curr is not whitespace
     /// Eats any kind of comment.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise
-    fn consume_any_line_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
         match self.curr {
             Some(c) => {
                 if c.is_whitespace() {
@@ -375,28 +374,32 @@ impl<'a> StringReader<'a> {
                             }
                             self.bump();
                         }
-                        let ret = self.with_str_from(start_bpos, |string| {
+                        return self.with_str_from(start_bpos, |string| {
                             // but comments with only more "/"s are not
-                            if !is_line_non_doc_comment(string) {
-                                Some(TokenAndSpan{
-                                    tok: token::DOC_COMMENT(str_to_ident(string)),
-                                    sp: codemap::mk_sp(start_bpos, self.last_pos)
-                                })
+                            let tok = if is_doc_comment(string) {
+                                token::DOC_COMMENT(str_to_ident(string))
                             } else {
-                                None
-                            }
-                        });
+                                token::COMMENT
+                            };
 
-                        if ret.is_some() {
-                            return ret;
-                        }
+                            return Some(TokenAndSpan{
+                                tok: tok,
+                                sp: codemap::mk_sp(start_bpos, self.last_pos)
+                            });
+                        });
                     } else {
+                        let start_bpos = self.last_pos - BytePos(2);
                         while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
+                        return Some(TokenAndSpan {
+                            tok: token::COMMENT,
+                            sp: codemap::mk_sp(start_bpos, self.last_pos)
+                        });
                     }
-                    // Restart whitespace munch.
-                    self.consume_whitespace_and_comments()
                 }
-                Some('*') => { self.bump(); self.bump(); self.consume_block_comment() }
+                Some('*') => {
+                    self.bump(); self.bump();
+                    self.scan_block_comment()
+                }
                 _ => None
             }
         } else if self.curr_is('#') {
@@ -412,9 +415,15 @@ impl<'a> StringReader<'a> {
                 let cmap = CodeMap::new();
                 cmap.files.borrow_mut().push(self.filemap.clone());
                 let loc = cmap.lookup_char_pos_adj(self.last_pos);
+                debug!("Skipping a shebang");
                 if loc.line == 1u && loc.col == CharPos(0u) {
+                    // Return the shebang line (up to the newline or EOF) as a SHEBANG token.
+                    let start = self.last_pos;
                     while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
-                    return self.consume_whitespace_and_comments();
+                    return Some(TokenAndSpan {
+                        tok: token::SHEBANG(self.ident_from(start)),
+                        sp: codemap::mk_sp(start, self.last_pos)
+                    });
                 }
             }
             None
@@ -423,15 +432,33 @@ impl<'a> StringReader<'a> {
         }
     }
 
-    /// EFFECT: eats whitespace and comments.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise.
-    fn consume_whitespace_and_comments(&mut self) -> Option<TokenAndSpan> {
-        while is_whitespace(self.curr) { self.bump(); }
-        return self.consume_any_line_comment();
+    /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
+    /// return None.
+    fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
+        match self.curr.unwrap_or('\0') {
+            // `#` is matched too, to handle a shebang at the start of the file --
+            // this function is the entry point for scanning all "junk"
+            '/' | '#' => {
+                let c = self.scan_comment();
+                debug!("scanning a comment {}", c);
+                c
+            },
+            c if is_whitespace(Some(c)) => {
+                let start_bpos = self.last_pos;
+                while is_whitespace(self.curr) { self.bump(); }
+                let c = Some(TokenAndSpan {
+                    tok: token::WS,
+                    sp: codemap::mk_sp(start_bpos, self.last_pos)
+                });
+                debug!("scanning whitespace: {}", c);
+                c
+            },
+            _ => None
+        }
     }
 
     /// Might return a sugared-doc-attr
-    fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
         // block comments starting with "/**" or "/*!" are doc-comments
         let is_doc_comment = self.curr_is('*') || self.curr_is('!');
         let start_bpos = self.last_pos - BytePos(2);
@@ -466,28 +493,23 @@ impl<'a> StringReader<'a> {
             self.bump();
         }
 
-        let res = if is_doc_comment {
-            self.with_str_from(start_bpos, |string| {
-                // but comments with only "*"s between two "/"s are not
-                if !is_block_non_doc_comment(string) {
-                    let string = if has_cr {
-                        self.translate_crlf(start_bpos, string,
-                                            "bare CR not allowed in block doc-comment")
-                    } else { string.into_maybe_owned() };
-                    Some(TokenAndSpan{
-                            tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
-                            sp: codemap::mk_sp(start_bpos, self.last_pos)
-                        })
-                } else {
-                    None
-                }
-            })
-        } else {
-            None
-        };
+        self.with_str_from(start_bpos, |string| {
+            // but comments with only "*"s between two "/"s are not
+            let tok = if is_block_doc_comment(string) {
+                let string = if has_cr {
+                    self.translate_crlf(start_bpos, string,
+                                        "bare CR not allowed in block doc-comment")
+                } else { string.into_maybe_owned() };
+                token::DOC_COMMENT(str_to_ident(string.as_slice()))
+            } else {
+                token::COMMENT
+            };
 
-        // restart whitespace munch.
-        if res.is_some() { res } else { self.consume_whitespace_and_comments() }
+            Some(TokenAndSpan{
+                tok: tok,
+                sp: codemap::mk_sp(start_bpos, self.last_pos)
+            })
+        })
     }
 
     /// Scan through any digits (base `radix`) or underscores, and return how
@@ -1242,12 +1264,18 @@ fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
 
 fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
 
-pub fn is_line_non_doc_comment(s: &str) -> bool {
-    s.starts_with("////")
+pub fn is_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/')
+              || s.starts_with("//!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
-pub fn is_block_non_doc_comment(s: &str) -> bool {
-    s.starts_with("/***")
+pub fn is_block_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*')
+              || s.starts_with("/*!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
 fn ident_start(c: Option<char>) -> bool {
@@ -1383,9 +1411,9 @@ mod test {
     }
 
     #[test] fn line_doc_comments() {
-        assert!(!is_line_non_doc_comment("///"));
-        assert!(!is_line_non_doc_comment("/// blah"));
-        assert!(is_line_non_doc_comment("////"));
+        assert!(is_doc_comment("///"));
+        assert!(is_doc_comment("/// blah"));
+        assert!(!is_doc_comment("////"));
     }
 
     #[test] fn nested_block_comments() {
diff --git a/branches/try/src/libsyntax/parse/parser.rs b/branches/try/src/libsyntax/parse/parser.rs
@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
     is_plain_ident(t) || *t == token::UNDERSCORE
 }
 
+/// Get the next token the parser cares about, skipping WS, COMMENT and SHEBANG tokens
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+    let mut t = rdr.next_token();
+    loop {
+        match t.tok {
+            token::WS | token::COMMENT | token::SHEBANG(_) => {
+                t = rdr.next_token();
+            },
+            _ => break
+        }
+    }
+    t
+}
+
 impl<'a> Parser<'a> {
     pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
                mut rdr: Box<Reader>) -> Parser<'a> {
-        let tok0 = rdr.next_token();
+        let tok0 = real_token(rdr);
         let span = tok0.sp;
         let placeholder = TokenAndSpan {
             tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
             None
         };
         let next = if self.buffer_start == self.buffer_end {
-            self.reader.next_token()
+            real_token(self.reader)
         } else {
             // Avoid token copies with `replace`.
             let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
                       -> R {
         let dist = distance as int;
         while self.buffer_length() < dist {
-            self.buffer[self.buffer_end as uint] = self.reader.next_token();
+            self.buffer[self.buffer_end as uint] = real_token(self.reader);
             self.buffer_end = (self.buffer_end + 1) & 3;
         }
         f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
diff --git a/branches/try/src/libsyntax/parse/token.rs b/branches/try/src/libsyntax/parse/token.rs
@@ -97,8 +97,18 @@ pub enum Token {
 
     /* For interpolation */
     INTERPOLATED(Nonterminal),
-
     DOC_COMMENT(Ident),
+
+    // Junk. WS and COMMENT carry no data because we don't really care about
+    // the data they *would* carry and don't want to allocate a new ident for
+    // them; users can extract it from the associated span. SHEBANG keeps an Ident.
+
+    /// Whitespace
+    WS,
+    /// Comment
+    COMMENT,
+    SHEBANG(Ident),
+
     EOF,
 }
 
@@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String {
       /* Other */
       DOC_COMMENT(s) => get_ident(s).get().to_string(),
       EOF => "<eof>".to_string(),
+      WS => " ".to_string(),
+      COMMENT => "/* */".to_string(),
+      SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()),
+
       INTERPOLATED(ref nt) => {
         match nt {
             &NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),