Skip to content

Commit 9f5e21d

Browse files
committed
syntax: don't process string/char/byte/binary lits
This shuffles things around a bit so that LIT_CHAR and co store an Ident which is the original, unaltered literal in the source. When creating the AST, unescape and postprocess them. This changes how syntax extensions can work, slightly, but otherwise poses no visible changes. To get a useful value out of one of these tokens, call `parse::{char_lit, byte_lit, bin_lit, str_lit}`. [breaking-change]
1 parent bf04a7c commit 9f5e21d

File tree

7 files changed

+327
-81
lines changed

7 files changed

+327
-81
lines changed

src/libsyntax/ext/base.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -579,9 +579,9 @@ pub fn get_single_str_from_tts(cx: &ExtCtxt,
579579
cx.span_err(sp, format!("{} takes 1 argument.", name).as_slice());
580580
} else {
581581
match tts[0] {
582-
ast::TTTok(_, token::LIT_STR(ident))
583-
| ast::TTTok(_, token::LIT_STR_RAW(ident, _)) => {
584-
return Some(token::get_ident(ident).get().to_string())
582+
ast::TTTok(_, token::LIT_STR(ident)) => return Some(parse::str_lit(ident.as_str())),
583+
ast::TTTok(_, token::LIT_STR_RAW(ident, _)) => {
584+
return Some(parse::raw_str_lit(ident.as_str()))
585585
}
586586
_ => {
587587
cx.span_err(sp,

src/libsyntax/ext/quote.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -401,13 +401,13 @@ fn mk_token(cx: &ExtCtxt, sp: Span, tok: &token::Token) -> Gc<ast::Expr> {
401401
}
402402

403403
LIT_BYTE(i) => {
404-
let e_byte = cx.expr_lit(sp, ast::LitByte(i));
404+
let e_byte = mk_ident(cx, sp, i);
405405

406406
return cx.expr_call(sp, mk_token_path(cx, sp, "LIT_BYTE"), vec!(e_byte));
407407
}
408408

409409
LIT_CHAR(i) => {
410-
let e_char = cx.expr_lit(sp, ast::LitChar(i));
410+
let e_char = mk_ident(cx, sp, i);
411411

412412
return cx.expr_call(sp, mk_token_path(cx, sp, "LIT_CHAR"), vec!(e_char));
413413
}

src/libsyntax/parse/lexer/mod.rs

Lines changed: 53 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ impl<'a> StringReader<'a> {
685685
}
686686

687687

688-
fn scan_numeric_escape(&mut self, n_hex_digits: uint, delim: char) -> char {
688+
fn scan_numeric_escape(&mut self, n_hex_digits: uint, delim: char) -> bool {
689689
let mut accum_int = 0u32;
690690
let start_bpos = self.last_pos;
691691
for _ in range(0, n_hex_digits) {
@@ -709,20 +709,22 @@ impl<'a> StringReader<'a> {
709709
}
710710

711711
match char::from_u32(accum_int) {
712-
Some(x) => x,
712+
Some(_) => true,
713713
None => {
714714
let last_bpos = self.last_pos;
715715
self.err_span_(start_bpos, last_bpos, "illegal numeric character escape");
716-
'?'
716+
false
717717
}
718718
}
719719
}
720720

721721
/// Scan for a single (possibly escaped) byte or char
722722
/// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
723723
/// `start` is the position of `first_source_char`, which is already consumed.
724+
///
725+
/// Returns true if there was a valid char/byte, false otherwise.
724726
fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char,
725-
ascii_only: bool, delim: char) -> Option<char> {
727+
ascii_only: bool, delim: char) -> bool {
726728
match first_source_char {
727729
'\\' => {
728730
// '\X' for some X must be a character constant:
@@ -732,24 +734,18 @@ impl<'a> StringReader<'a> {
732734
match escaped {
733735
None => {}, // EOF here is an error that will be checked later.
734736
Some(e) => {
735-
return Some(match e {
736-
'n' => '\n',
737-
'r' => '\r',
738-
't' => '\t',
739-
'\\' => '\\',
740-
'\'' => '\'',
741-
'"' => '"',
742-
'0' => '\x00',
737+
return match e {
738+
'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
743739
'x' => self.scan_numeric_escape(2u, delim),
744740
'u' if !ascii_only => self.scan_numeric_escape(4u, delim),
745741
'U' if !ascii_only => self.scan_numeric_escape(8u, delim),
746742
'\n' if delim == '"' => {
747743
self.consume_whitespace();
748-
return None
744+
true
749745
},
750746
'\r' if delim == '"' && self.curr_is('\n') => {
751747
self.consume_whitespace();
752-
return None
748+
true
753749
}
754750
c => {
755751
let last_pos = self.last_pos;
@@ -758,9 +754,9 @@ impl<'a> StringReader<'a> {
758754
if ascii_only { "unknown byte escape" }
759755
else { "unknown character escape" },
760756
c);
761-
c
757+
false
762758
}
763-
})
759+
}
764760
}
765761
}
766762
}
@@ -771,14 +767,16 @@ impl<'a> StringReader<'a> {
771767
if ascii_only { "byte constant must be escaped" }
772768
else { "character constant must be escaped" },
773769
first_source_char);
770+
return false;
774771
}
775772
'\r' => {
776773
if self.curr_is('\n') {
777774
self.bump();
778-
return Some('\n');
775+
return true;
779776
} else {
780777
self.err_span_(start, self.last_pos,
781778
"bare CR not allowed in string, use \\r instead");
779+
return false;
782780
}
783781
}
784782
_ => if ascii_only && first_source_char > '\x7F' {
@@ -787,9 +785,10 @@ impl<'a> StringReader<'a> {
787785
start, last_pos,
788786
"byte constant must be ASCII. \
789787
Use a \\xHH escape for a non-ASCII byte", first_source_char);
788+
return false;
790789
}
791790
}
792-
Some(first_source_char)
791+
true
793792
}
794793

795794
fn binop(&mut self, op: token::BinOp) -> token::Token {
@@ -924,7 +923,7 @@ impl<'a> StringReader<'a> {
924923
let start = self.last_pos;
925924

926925
// the eof will be picked up by the final `'` check below
927-
let mut c2 = self.curr.unwrap_or('\x00');
926+
let c2 = self.curr.unwrap_or('\x00');
928927
self.bump();
929928

930929
// If the character is an ident start not followed by another single
@@ -967,7 +966,7 @@ impl<'a> StringReader<'a> {
967966
}
968967

969968
// Otherwise it is a character constant:
970-
c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap();
969+
let valid = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'');
971970
if !self.curr_is('\'') {
972971
let last_bpos = self.last_pos;
973972
self.fatal_span_verbose(
@@ -977,8 +976,9 @@ impl<'a> StringReader<'a> {
977976
start - BytePos(1), last_bpos,
978977
"unterminated character constant".to_string());
979978
}
979+
let id = if valid { self.ident_from(start) } else { str_to_ident("0") };
980980
self.bump(); // advance curr past token
981-
return token::LIT_CHAR(c2);
981+
return token::LIT_CHAR(id);
982982
}
983983
'b' => {
984984
self.bump();
@@ -991,8 +991,8 @@ impl<'a> StringReader<'a> {
991991

992992
}
993993
'"' => {
994-
let mut accum_str = String::new();
995994
let start_bpos = self.last_pos;
995+
let mut valid = true;
996996
self.bump();
997997
while !self.curr_is('"') {
998998
if self.is_eof() {
@@ -1003,11 +1003,13 @@ impl<'a> StringReader<'a> {
10031003
let ch_start = self.last_pos;
10041004
let ch = self.curr.unwrap();
10051005
self.bump();
1006-
self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"')
1007-
.map(|ch| accum_str.push_char(ch));
1006+
valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"');
10081007
}
1008+
// adjust for the ASCII " at the start of the literal
1009+
let id = if valid { self.ident_from(start_bpos + BytePos(1)) }
1010+
else { str_to_ident("??") };
10091011
self.bump();
1010-
return token::LIT_STR(str_to_ident(accum_str.as_slice()));
1012+
return token::LIT_STR(id);
10111013
}
10121014
'r' => {
10131015
let start_bpos = self.last_pos;
@@ -1032,7 +1034,7 @@ impl<'a> StringReader<'a> {
10321034
self.bump();
10331035
let content_start_bpos = self.last_pos;
10341036
let mut content_end_bpos;
1035-
let mut has_cr = false;
1037+
let mut valid = true;
10361038
'outer: loop {
10371039
if self.is_eof() {
10381040
let last_bpos = self.last_pos;
@@ -1055,23 +1057,26 @@ impl<'a> StringReader<'a> {
10551057
}
10561058
}
10571059
break;
1058-
}
1060+
},
10591061
'\r' => {
1060-
has_cr = true;
1062+
if !self.nextch_is('\n') {
1063+
let last_bpos = self.last_pos;
1064+
self.err_span_(start_bpos, last_bpos, "bare CR not allowed in raw \
1065+
string, use \\r instead");
1066+
valid = false;
1067+
}
10611068
}
10621069
_ => ()
10631070
}
10641071
self.bump();
10651072
}
10661073
self.bump();
1067-
let str_content = self.with_str_from_to(content_start_bpos, content_end_bpos, |string| {
1068-
let string = if has_cr {
1069-
self.translate_crlf(content_start_bpos, string,
1070-
"bare CR not allowed in raw string")
1071-
} else { string.into_maybe_owned() };
1072-
str_to_ident(string.as_slice())
1073-
});
1074-
return token::LIT_STR_RAW(str_content, hash_count);
1074+
let id = if valid {
1075+
self.ident_from_to(content_start_bpos, content_end_bpos)
1076+
} else {
1077+
str_to_ident("??")
1078+
};
1079+
return token::LIT_STR_RAW(id, hash_count);
10751080
}
10761081
'-' => {
10771082
if self.nextch_is('>') {
@@ -1145,10 +1150,10 @@ impl<'a> StringReader<'a> {
11451150
let start = self.last_pos;
11461151

11471152
// the eof will be picked up by the final `'` check below
1148-
let mut c2 = self.curr.unwrap_or('\x00');
1153+
let c2 = self.curr.unwrap_or('\x00');
11491154
self.bump();
11501155

1151-
c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap();
1156+
let valid = self.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'');
11521157
if !self.curr_is('\'') {
11531158
// Byte offsetting here is okay because the
11541159
// character before position `start` are an
@@ -1158,14 +1163,17 @@ impl<'a> StringReader<'a> {
11581163
start - BytePos(2), last_pos,
11591164
"unterminated byte constant".to_string());
11601165
}
1166+
1167+
let id = if valid { self.ident_from(start) } else { str_to_ident("??") };
11611168
self.bump(); // advance curr past token
1162-
return token::LIT_BYTE(c2 as u8);
1169+
return token::LIT_BYTE(id);
11631170
}
11641171

11651172
fn scan_byte_string(&mut self) -> token::Token {
11661173
self.bump();
11671174
let start = self.last_pos;
1168-
let mut value = Vec::new();
1175+
let mut valid = true;
1176+
11691177
while !self.curr_is('"') {
11701178
if self.is_eof() {
11711179
let last_pos = self.last_pos;
@@ -1176,11 +1184,11 @@ impl<'a> StringReader<'a> {
11761184
let ch_start = self.last_pos;
11771185
let ch = self.curr.unwrap();
11781186
self.bump();
1179-
self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"')
1180-
.map(|ch| value.push(ch as u8));
1187+
valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"');
11811188
}
1189+
let id = if valid { self.ident_from(start) } else { str_to_ident("??") };
11821190
self.bump();
1183-
return token::LIT_BINARY(Rc::new(value));
1191+
return token::LIT_BINARY(id);
11841192
}
11851193

11861194
fn scan_raw_byte_string(&mut self) -> token::Token {
@@ -1231,10 +1239,8 @@ impl<'a> StringReader<'a> {
12311239
self.bump();
12321240
}
12331241
self.bump();
1234-
let bytes = self.with_str_from_to(content_start_bpos,
1235-
content_end_bpos,
1236-
|s| s.as_bytes().to_owned());
1237-
return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count);
1242+
return token::LIT_BINARY_RAW(self.ident_from_to(content_start_bpos, content_end_bpos),
1243+
hash_count);
12381244
}
12391245
}
12401246

0 commit comments

Comments
 (0)