
Commit f512779

lexer: lex WS/COMMENT/SHEBANG rather than skipping

Now the lexer categorizes every byte in its input according to the grammar. The parser skips over the resulting WS, COMMENT, and SHEBANG tokens while parsing, keeping them out of the input to syntax extensions.

1 parent cc42134 commit f512779
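
In effect, the token stream now covers the whole input: trivia is produced as real tokens and filtered out at the parser boundary instead of being discarded inside the lexer. A minimal sketch of the idea in modern Rust (the Token enum and real_token below are illustrative stand-ins, not the compiler's 2014-era types):

    // Illustrative sketch: the lexer emits trivia tokens instead of
    // skipping bytes, and the parser filters them out at its boundary.
    #[derive(Debug, PartialEq)]
    enum Token {
        Ident(String),
        Ws,              // whitespace; text is recoverable from the span
        Comment,         // non-doc comment
        Shebang(String), // `#!...` on the first line of a file
        Eof,
    }

    /// Analogue of this commit's `real_token`: pull tokens until one the
    /// parser actually cares about shows up.
    fn real_token(tokens: &mut impl Iterator<Item = Token>) -> Token {
        loop {
            match tokens.next().unwrap_or(Token::Eof) {
                Token::Ws | Token::Comment | Token::Shebang(_) => continue,
                t => return t,
            }
        }
    }

    fn main() {
        let mut toks = vec![
            Token::Shebang("#!/usr/bin/env run-rust".into()),
            Token::Ws,
            Token::Comment,
            Token::Ident("foo".into()),
        ]
        .into_iter();
        // Anything fed through this filter never sees the trivia.
        assert_eq!(real_token(&mut toks), Token::Ident("foo".into()));
    }

Because the trivia survives lexing, consumers such as the syntax highlighter below can recover it from token spans instead of re-deriving it from byte gaps.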

File tree

6 files changed: +134 -87 lines changed

src/librustdoc/html/highlight.rs

Lines changed: 16 additions & 25 deletions

@@ -18,7 +18,6 @@ use std::io;
 
 use syntax::parse;
 use syntax::parse::lexer;
-use syntax::codemap::{BytePos, Span};
 
 use html::escape::Escape;
 
@@ -59,38 +58,30 @@ fn doit(sess: &parse::ParseSess, mut lexer: lexer::StringReader,
         None => {}
     }
     try!(write!(out, "class='rust {}'>\n", class.unwrap_or("")));
-    let mut last = BytePos(0);
     let mut is_attribute = false;
     let mut is_macro = false;
     let mut is_macro_nonterminal = false;
     loop {
         let next = lexer.next_token();
-        let test = if next.tok == t::EOF {lexer.pos} else {next.sp.lo};
-
-        // The lexer consumes all whitespace and non-doc-comments when iterating
-        // between tokens. If this token isn't directly adjacent to our last
-        // token, then we need to emit the whitespace/comment.
-        //
-        // If the gap has any '/' characters then we consider the whole thing a
-        // comment. This will classify some whitespace as a comment, but that
-        // doesn't matter too much for syntax highlighting purposes.
-        if test > last {
-            let snip = sess.span_diagnostic.cm.span_to_snippet(Span {
-                lo: last,
-                hi: test,
-                expn_info: None,
-            }).unwrap();
-            if snip.as_slice().contains("/") {
-                try!(write!(out, "<span class='comment'>{}</span>",
-                            Escape(snip.as_slice())));
-            } else {
-                try!(write!(out, "{}", Escape(snip.as_slice())));
-            }
-        }
-        last = next.sp.hi;
+
+        let snip = |sp| sess.span_diagnostic.cm.span_to_snippet(sp).unwrap();
+
         if next.tok == t::EOF { break }
 
         let klass = match next.tok {
+            t::WS => {
+                try!(write!(out, "{}", Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::COMMENT => {
+                try!(write!(out, "<span class='comment'>{}</span>",
+                            Escape(snip(next.sp).as_slice())));
+                continue
+            },
+            t::SHEBANG(s) => {
+                try!(write!(out, "{}", Escape(s.as_str())));
+                continue
+            },
             // If this '&' token is directly adjacent to another token, assume
             // that it's the address-of operator instead of the and-operator.
             // This allows us to give all pointers their own class (`Box` and
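
This drops the old gap heuristic, where any inter-token gap containing a '/' was styled as a comment. Each WS/COMMENT token now carries its exact span, so the highlighter simply prints the snippet for that span. A rough sketch of that dispatch in modern Rust; Span, Tok, and emit are hypothetical stand-ins for rustdoc's types, and HTML escaping is omitted:

    // Hypothetical stand-ins for the highlighter's trivia dispatch.
    struct Span { lo: usize, hi: usize }

    enum Tok { Ws(Span), Comment(Span), Other(Span) }

    fn emit(src: &str, tok: &Tok, out: &mut String) {
        match tok {
            // Whitespace is copied through verbatim.
            Tok::Ws(sp) => out.push_str(&src[sp.lo..sp.hi]),
            // Comments get a CSS class; the exact text comes from the
            // token's span instead of a heuristic over the byte gap.
            Tok::Comment(sp) => {
                out.push_str("<span class='comment'>");
                out.push_str(&src[sp.lo..sp.hi]);
                out.push_str("</span>");
            }
            Tok::Other(sp) => out.push_str(&src[sp.lo..sp.hi]),
        }
    }

    fn main() {
        let src = "x // note";
        let mut out = String::new();
        emit(src, &Tok::Other(Span { lo: 0, hi: 1 }), &mut out);
        emit(src, &Tok::Ws(Span { lo: 1, hi: 2 }), &mut out);
        emit(src, &Tok::Comment(Span { lo: 2, hi: 9 }), &mut out);
        assert_eq!(out, "x <span class='comment'>// note</span>");
    }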

src/libsyntax/parse/attr.rs

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> {
     fn parse_outer_attributes(&mut self) -> Vec<ast::Attribute> {
         let mut attrs: Vec<ast::Attribute> = Vec::new();
         loop {
-            debug!("parse_outer_attributes: self.token={:?}",
+            debug!("parse_outer_attributes: self.token={}",
                    self.token);
             match self.token {
                 token::POUND => {

src/libsyntax/parse/lexer/comments.rs

Lines changed: 4 additions & 4 deletions

@@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
 use diagnostic;
 use parse::lexer::{is_whitespace, Reader};
 use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
+use parse::lexer::is_block_doc_comment;
 use parse::lexer;
 use parse::token;
 
@@ -42,9 +42,9 @@ pub struct Comment {
 }
 
 pub fn is_doc_comment(s: &str) -> bool {
-    (s.starts_with("///") && !is_line_non_doc_comment(s)) ||
+    (s.starts_with("///") && super::is_doc_comment(s)) ||
     s.starts_with("//!") ||
-    (s.starts_with("/**") && !is_block_non_doc_comment(s)) ||
+    (s.starts_with("/**") && is_block_doc_comment(s)) ||
     s.starts_with("/*!")
 }
 
@@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader,
         rdr.bump();
         rdr.bump();
     }
-    if !is_block_non_doc_comment(curr_line.as_slice()) {
+    if is_block_doc_comment(curr_line.as_slice()) {
         return
     }
     assert!(!curr_line.as_slice().contains_char('\n'));
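
comments.rs now delegates to the lexer's single definition of what counts as a doc comment. The rules are easy to check in isolation: "///" and "/**" open doc comments unless followed by a fourth '/' or another '*', while "//!" and "/*!" always do. A standalone restatement in modern Rust (hypothetical helper names), in the spirit of the commit's test cases:

    // Standalone restatement of the doc-comment rules (hypothetical names).
    fn is_line_doc(s: &str) -> bool {
        // `///` opens a doc comment unless a fourth `/` follows.
        (s.starts_with("///") && s.as_bytes().get(3) != Some(&b'/'))
            || s.starts_with("//!")
    }

    fn is_block_doc(s: &str) -> bool {
        // `/**` opens a doc comment unless a third `*` follows.
        (s.starts_with("/**") && s.as_bytes().get(3) != Some(&b'*'))
            || s.starts_with("/*!")
    }

    fn main() {
        assert!(is_line_doc("/// blah"));
        assert!(is_line_doc("//! inner"));
        assert!(!is_line_doc("//// banner"));
        assert!(is_block_doc("/** docs */"));
        assert!(!is_block_doc("/*** banner ***/"));
    }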

src/libsyntax/parse/lexer/mod.rs

Lines changed: 81 additions & 53 deletions

@@ -187,7 +187,7 @@ impl<'a> StringReader<'a> {
     /// Advance peek_tok and peek_span to refer to the next token, and
     /// possibly update the interner.
     fn advance_token(&mut self) {
-        match self.consume_whitespace_and_comments() {
+        match self.scan_whitespace_or_comment() {
             Some(comment) => {
                 self.peek_span = comment.sp;
                 self.peek_tok = comment.tok;
@@ -339,8 +339,7 @@ impl<'a> StringReader<'a> {
 
     /// PRECONDITION: self.curr is not whitespace
     /// Eats any kind of comment.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise
-    fn consume_any_line_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
         match self.curr {
             Some(c) => {
                 if c.is_whitespace() {
@@ -375,28 +374,32 @@ impl<'a> StringReader<'a> {
                            }
                            self.bump();
                        }
-                       let ret = self.with_str_from(start_bpos, |string| {
+                       return self.with_str_from(start_bpos, |string| {
                            // but comments with only more "/"s are not
-                           if !is_line_non_doc_comment(string) {
-                               Some(TokenAndSpan{
-                                   tok: token::DOC_COMMENT(str_to_ident(string)),
-                                   sp: codemap::mk_sp(start_bpos, self.last_pos)
-                               })
+                           let tok = if is_doc_comment(string) {
+                               token::DOC_COMMENT(str_to_ident(string))
                            } else {
-                               None
-                           }
-                       });
+                               token::COMMENT
+                           };
 
-                       if ret.is_some() {
-                           return ret;
-                       }
+                           return Some(TokenAndSpan{
+                               tok: tok,
+                               sp: codemap::mk_sp(start_bpos, self.last_pos)
+                           });
+                       });
                    } else {
+                       let start_bpos = self.last_pos - BytePos(2);
                        while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
+                       return Some(TokenAndSpan {
+                           tok: token::COMMENT,
+                           sp: codemap::mk_sp(start_bpos, self.last_pos)
+                       });
                    }
-                   // Restart whitespace munch.
-                   self.consume_whitespace_and_comments()
                }
-               Some('*') => { self.bump(); self.bump(); self.consume_block_comment() }
+               Some('*') => {
+                   self.bump(); self.bump();
+                   self.scan_block_comment()
+               }
                _ => None
            }
        } else if self.curr_is('#') {
@@ -412,9 +415,15 @@ impl<'a> StringReader<'a> {
            let cmap = CodeMap::new();
            cmap.files.borrow_mut().push(self.filemap.clone());
            let loc = cmap.lookup_char_pos_adj(self.last_pos);
+           debug!("Skipping a shebang");
            if loc.line == 1u && loc.col == CharPos(0u) {
+               // FIXME: Add shebang "token", return it
+               let start = self.last_pos;
                while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
-               return self.consume_whitespace_and_comments();
+               return Some(TokenAndSpan {
+                   tok: token::SHEBANG(self.ident_from(start)),
+                   sp: codemap::mk_sp(start, self.last_pos)
+               });
            }
        }
        None
@@ -423,15 +432,33 @@ impl<'a> StringReader<'a> {
        }
    }
 
-    /// EFFECT: eats whitespace and comments.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise.
-    fn consume_whitespace_and_comments(&mut self) -> Option<TokenAndSpan> {
-        while is_whitespace(self.curr) { self.bump(); }
-        return self.consume_any_line_comment();
+    /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
+    /// return None.
+    fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
+        match self.curr.unwrap_or('\0') {
+            // # to handle shebang at start of file -- this is the entry point
+            // for skipping over all "junk"
+            '/' | '#' => {
+                let c = self.scan_comment();
+                debug!("scanning a comment {}", c);
+                c
+            },
+            c if is_whitespace(Some(c)) => {
+                let start_bpos = self.last_pos;
+                while is_whitespace(self.curr) { self.bump(); }
+                let c = Some(TokenAndSpan {
+                    tok: token::WS,
+                    sp: codemap::mk_sp(start_bpos, self.last_pos)
+                });
+                debug!("scanning whitespace: {}", c);
+                c
+            },
+            _ => None
+        }
    }
 
    /// Might return a sugared-doc-attr
-    fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
        // block comments starting with "/**" or "/*!" are doc-comments
        let is_doc_comment = self.curr_is('*') || self.curr_is('!');
        let start_bpos = self.last_pos - BytePos(2);
@@ -466,28 +493,23 @@ impl<'a> StringReader<'a> {
            self.bump();
        }
 
-        let res = if is_doc_comment {
-            self.with_str_from(start_bpos, |string| {
-                // but comments with only "*"s between two "/"s are not
-                if !is_block_non_doc_comment(string) {
-                    let string = if has_cr {
-                        self.translate_crlf(start_bpos, string,
-                                            "bare CR not allowed in block doc-comment")
-                    } else { string.into_maybe_owned() };
-                    Some(TokenAndSpan{
-                        tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
-                        sp: codemap::mk_sp(start_bpos, self.last_pos)
-                    })
-                } else {
-                    None
-                }
-            })
-        } else {
-            None
-        };
+        self.with_str_from(start_bpos, |string| {
+            // but comments with only "*"s between two "/"s are not
+            let tok = if is_block_doc_comment(string) {
+                let string = if has_cr {
+                    self.translate_crlf(start_bpos, string,
+                                        "bare CR not allowed in block doc-comment")
+                } else { string.into_maybe_owned() };
+                token::DOC_COMMENT(str_to_ident(string.as_slice()))
+            } else {
+                token::COMMENT
+            };
 
-        // restart whitespace munch.
-        if res.is_some() { res } else { self.consume_whitespace_and_comments() }
+            Some(TokenAndSpan{
+                tok: tok,
+                sp: codemap::mk_sp(start_bpos, self.last_pos)
+            })
+        })
    }
 
    /// Scan through any digits (base `radix`) or underscores, and return how
@@ -1242,12 +1264,18 @@ fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
 
 fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
 
-pub fn is_line_non_doc_comment(s: &str) -> bool {
-    s.starts_with("////")
+pub fn is_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/')
+              || s.starts_with("//!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
-pub fn is_block_non_doc_comment(s: &str) -> bool {
-    s.starts_with("/***")
+pub fn is_block_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*')
+              || s.starts_with("/*!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
 fn ident_start(c: Option<char>) -> bool {
@@ -1383,9 +1411,9 @@ mod test {
    }
 
    #[test] fn line_doc_comments() {
-        assert!(!is_line_non_doc_comment("///"));
-        assert!(!is_line_non_doc_comment("/// blah"));
-        assert!(is_line_non_doc_comment("////"));
+        assert!(is_doc_comment("///"));
+        assert!(is_doc_comment("/// blah"));
+        assert!(!is_doc_comment("////"));
    }
 
    #[test] fn nested_block_comments() {
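
scan_whitespace_or_comment is now the lexer's single entry point for trivia: a run of whitespace becomes one WS token, while '/' and '#' route to the comment and shebang scanners. A minimal sketch of that shape outside the compiler (modern Rust; Trivia and scan_trivia are hypothetical):

    // Sketch of the trivia entry point over a plain byte index (hypothetical).
    #[derive(Debug, PartialEq)]
    enum Trivia {
        Ws { lo: usize, hi: usize },
        LineComment { lo: usize, hi: usize },
    }

    fn scan_trivia(src: &str, pos: usize) -> Option<Trivia> {
        let bytes = src.as_bytes();
        match bytes.get(pos) {
            // A whitespace run becomes one WS token spanning the whole run.
            Some(c) if c.is_ascii_whitespace() => {
                let mut hi = pos;
                while hi < bytes.len() && bytes[hi].is_ascii_whitespace() {
                    hi += 1;
                }
                Some(Trivia::Ws { lo: pos, hi })
            }
            // `//` comments run to end of line and become one COMMENT token.
            Some(b'/') if bytes.get(pos + 1) == Some(&b'/') => {
                let hi = src[pos..].find('\n').map_or(bytes.len(), |n| pos + n);
                Some(Trivia::LineComment { lo: pos, hi })
            }
            // Anything else is a real token; the caller lexes it normally.
            _ => None,
        }
    }

    fn main() {
        assert_eq!(scan_trivia("  x", 0), Some(Trivia::Ws { lo: 0, hi: 2 }));
        assert_eq!(
            scan_trivia("// hi\nx", 0),
            Some(Trivia::LineComment { lo: 0, hi: 5 })
        );
        assert_eq!(scan_trivia("x", 0), None);
    }

Returning the token rather than recursing into "restart whitespace munch" is what lets every byte of the input end up covered by exactly one token.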

src/libsyntax/parse/parser.rs

Lines changed: 17 additions & 3 deletions

@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
     is_plain_ident(t) || *t == token::UNDERSCORE
 }
 
+/// Get a token the parser cares about
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+    let mut t = rdr.next_token();
+    loop {
+        match t.tok {
+            token::WS | token::COMMENT | token::SHEBANG(_) => {
+                t = rdr.next_token();
+            },
+            _ => break
+        }
+    }
+    t
+}
+
 impl<'a> Parser<'a> {
     pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
                mut rdr: Box<Reader>) -> Parser<'a> {
-        let tok0 = rdr.next_token();
+        let tok0 = real_token(rdr);
         let span = tok0.sp;
         let placeholder = TokenAndSpan {
             tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
             None
         };
         let next = if self.buffer_start == self.buffer_end {
-            self.reader.next_token()
+            real_token(self.reader)
         } else {
             // Avoid token copies with `replace`.
             let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
                   -> R {
         let dist = distance as int;
         while self.buffer_length() < dist {
-            self.buffer[self.buffer_end as uint] = self.reader.next_token();
+            self.buffer[self.buffer_end as uint] = real_token(self.reader);
             self.buffer_end = (self.buffer_end + 1) & 3;
         }
         f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
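
Note that real_token has to be used at every point where the parser pulls from the lexer, including when filling the four-slot lookahead ring indexed with & 3; otherwise trivia would leak into lookahead decisions. A self-contained sketch of that pattern (modern Rust, hypothetical types):

    // Sketch: every fetch goes through the trivia filter, including the
    // fixed four-slot lookahead ring (hypothetical types).
    #[derive(Clone, Debug, PartialEq)]
    enum Tok { Ws, Comment, Ident(&'static str), Eof }

    struct Parser {
        input: std::vec::IntoIter<Tok>,
        buffer: [Tok; 4], // lookahead ring, indexed modulo 4 via `& 3`
        start: usize,
        end: usize,
    }

    impl Parser {
        /// Analogue of `real_token`: skip trivia at every fetch site.
        fn real_token(&mut self) -> Tok {
            loop {
                match self.input.next().unwrap_or(Tok::Eof) {
                    Tok::Ws | Tok::Comment => continue,
                    t => return t,
                }
            }
        }

        /// Look `dist` tokens ahead (1-based), filling the ring as needed.
        fn look_ahead(&mut self, dist: usize) -> Tok {
            while self.end.wrapping_sub(self.start) & 3 < dist {
                let t = self.real_token();
                let i = self.end & 3;
                self.buffer[i] = t;
                self.end = (self.end + 1) & 3;
            }
            self.buffer[(self.start + dist - 1) & 3].clone()
        }
    }

    fn main() {
        let mut p = Parser {
            input: vec![Tok::Ws, Tok::Ident("a"), Tok::Comment, Tok::Ident("b")]
                .into_iter(),
            buffer: [Tok::Eof, Tok::Eof, Tok::Eof, Tok::Eof],
            start: 0,
            end: 0,
        };
        // Trivia never occupies a lookahead slot.
        assert_eq!(p.look_ahead(1), Tok::Ident("a"));
        assert_eq!(p.look_ahead(2), Tok::Ident("b"));
    }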

src/libsyntax/parse/token.rs

Lines changed: 15 additions & 1 deletion

@@ -97,8 +97,18 @@ pub enum Token {
 
     /* For interpolation */
     INTERPOLATED(Nonterminal),
-
     DOC_COMMENT(Ident),
+
+    // Junk. These carry no data because we don't really care about the data
+    // they *would* carry, and don't really want to allocate a new ident for
+    // them. Instead, users could extract that from the associated span.
+
+    /// Whitespace
+    WS,
+    /// Comment
+    COMMENT,
+    SHEBANG(Ident),
+
     EOF,
 }
 
@@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String {
         /* Other */
         DOC_COMMENT(s) => get_ident(s).get().to_string(),
         EOF => "<eof>".to_string(),
+        WS => " ".to_string(),
+        COMMENT => "/* */".to_string(),
+        SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()),
+
         INTERPOLATED(ref nt) => {
             match nt {
                 &NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),
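
As the new comment says, WS and COMMENT deliberately carry no text: a consumer that wants the original bytes slices them back out of the source with the token's span. A minimal sketch (hypothetical Span and Token types):

    // Hypothetical types: trivia tokens carry no text; the span suffices.
    struct Span { lo: usize, hi: usize }

    #[allow(dead_code)]
    enum Token { Ws, Comment, Shebang(String), Eof }

    /// Recover a trivia token's original text from the source and its span.
    fn snippet<'a>(src: &'a str, sp: &Span) -> &'a str {
        &src[sp.lo..sp.hi]
    }

    fn main() {
        let src = "// a comment\nfn main() {}";
        let sp = Span { lo: 0, hi: 12 }; // span of the COMMENT token
        assert_eq!(snippet(src, &sp), "// a comment");
    }

The placeholder strings in to_string (a single space for WS, "/* */" for COMMENT) exist only so a token stream can still be pretty-printed when no source text is at hand.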
