|
| 1 | +//! Lexing `&str` into a sequence of Rust tokens. |
| 2 | +//! |
| 3 | +//! Note that strictly speaking the parser in this crate is not required to work |
| 4 | +//! on tokens which originated from text. Macros, e.g., can synthesize tokens out |
| 5 | +//! of thin air. So, ideally, lexer should be an orthogonal crate. It is however |
| 6 | +//! convenient to include a text-based lexer here! |
| 7 | +
|
| 8 | +use crate::{ |
| 9 | + SyntaxKind::{self, *}, |
| 10 | + T, |
| 11 | +}; |
| 12 | + |
| 13 | +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] |
| 14 | +pub struct LexerToken { |
| 15 | + pub kind: SyntaxKind, |
| 16 | + pub len: usize, |
| 17 | + pub error: Option<String>, |
| 18 | +} |
| 19 | + |
| 20 | +impl LexerToken { |
| 21 | + pub fn new(kind: SyntaxKind, len: usize) -> Self { |
| 22 | + Self { kind, len, error: None } |
| 23 | + } |
| 24 | + |
| 25 | + /// Lexes text as a sequence of tokens. |
| 26 | + pub fn tokenize(text: &str) -> Vec<LexerToken> { |
| 27 | + let mut res = Vec::new(); |
| 28 | + let mut offset = 0; |
| 29 | + |
| 30 | + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { |
| 31 | + res.push(LexerToken::new(SHEBANG, shebang_len)); |
| 32 | + offset = shebang_len |
| 33 | + }; |
| 34 | + |
| 35 | + for token in rustc_lexer::tokenize(&text[offset..]) { |
| 36 | + let token_text = &text[offset..][..token.len]; |
| 37 | + offset += token.len; |
| 38 | + |
| 39 | + let (kind, err) = from_rustc(&token.kind, token_text); |
| 40 | + let mut token = LexerToken::new(kind, token.len); |
| 41 | + token.error = err.map(|it| it.to_string()); |
| 42 | + res.push(token); |
| 43 | + } |
| 44 | + |
| 45 | + res |
| 46 | + } |
| 47 | + /// Lexes text as a single token. Returns `None` if there's leftover text. |
| 48 | + pub fn from_str(text: &str) -> Option<LexerToken> { |
| 49 | + if text.is_empty() { |
| 50 | + return None; |
| 51 | + } |
| 52 | + |
| 53 | + let token = rustc_lexer::first_token(text); |
| 54 | + if token.len != text.len() { |
| 55 | + return None; |
| 56 | + } |
| 57 | + |
| 58 | + let (kind, err) = from_rustc(&token.kind, text); |
| 59 | + |
| 60 | + let mut token = LexerToken::new(kind, token.len); |
| 61 | + token.error = err.map(|it| it.to_string()); |
| 62 | + Some(token) |
| 63 | + } |
| 64 | +} |
| 65 | + |
| 66 | +/// Returns `SyntaxKind` and an optional tokenize error message. |
| 67 | +fn from_rustc( |
| 68 | + kind: &rustc_lexer::TokenKind, |
| 69 | + token_text: &str, |
| 70 | +) -> (SyntaxKind, Option<&'static str>) { |
| 71 | + // A note on an intended tradeoff: |
| 72 | + // We drop some useful information here (see patterns with double dots `..`) |
| 73 | + // Storing that info in `SyntaxKind` is not possible due to its layout requirements of |
| 74 | + // being `u16` that come from `rowan::SyntaxKind`. |
| 75 | + let mut err = ""; |
| 76 | + |
| 77 | + let syntax_kind = { |
| 78 | + match kind { |
| 79 | + rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT, |
| 80 | + rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => { |
| 81 | + if !terminated { |
| 82 | + err = "Missing trailing `*/` symbols to terminate the block comment"; |
| 83 | + } |
| 84 | + COMMENT |
| 85 | + } |
| 86 | + |
| 87 | + rustc_lexer::TokenKind::Whitespace => WHITESPACE, |
| 88 | + |
| 89 | + rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE, |
| 90 | + rustc_lexer::TokenKind::Ident => SyntaxKind::from_keyword(token_text).unwrap_or(IDENT), |
| 91 | + |
| 92 | + rustc_lexer::TokenKind::RawIdent => IDENT, |
| 93 | + rustc_lexer::TokenKind::Literal { kind, .. } => return from_rustc_literal(kind), |
| 94 | + |
| 95 | + rustc_lexer::TokenKind::Lifetime { starts_with_number } => { |
| 96 | + if *starts_with_number { |
| 97 | + err = "Lifetime name cannot start with a number"; |
| 98 | + } |
| 99 | + LIFETIME_IDENT |
| 100 | + } |
| 101 | + |
| 102 | + rustc_lexer::TokenKind::Semi => T![;], |
| 103 | + rustc_lexer::TokenKind::Comma => T![,], |
| 104 | + rustc_lexer::TokenKind::Dot => T![.], |
| 105 | + rustc_lexer::TokenKind::OpenParen => T!['('], |
| 106 | + rustc_lexer::TokenKind::CloseParen => T![')'], |
| 107 | + rustc_lexer::TokenKind::OpenBrace => T!['{'], |
| 108 | + rustc_lexer::TokenKind::CloseBrace => T!['}'], |
| 109 | + rustc_lexer::TokenKind::OpenBracket => T!['['], |
| 110 | + rustc_lexer::TokenKind::CloseBracket => T![']'], |
| 111 | + rustc_lexer::TokenKind::At => T![@], |
| 112 | + rustc_lexer::TokenKind::Pound => T![#], |
| 113 | + rustc_lexer::TokenKind::Tilde => T![~], |
| 114 | + rustc_lexer::TokenKind::Question => T![?], |
| 115 | + rustc_lexer::TokenKind::Colon => T![:], |
| 116 | + rustc_lexer::TokenKind::Dollar => T![$], |
| 117 | + rustc_lexer::TokenKind::Eq => T![=], |
| 118 | + rustc_lexer::TokenKind::Bang => T![!], |
| 119 | + rustc_lexer::TokenKind::Lt => T![<], |
| 120 | + rustc_lexer::TokenKind::Gt => T![>], |
| 121 | + rustc_lexer::TokenKind::Minus => T![-], |
| 122 | + rustc_lexer::TokenKind::And => T![&], |
| 123 | + rustc_lexer::TokenKind::Or => T![|], |
| 124 | + rustc_lexer::TokenKind::Plus => T![+], |
| 125 | + rustc_lexer::TokenKind::Star => T![*], |
| 126 | + rustc_lexer::TokenKind::Slash => T![/], |
| 127 | + rustc_lexer::TokenKind::Caret => T![^], |
| 128 | + rustc_lexer::TokenKind::Percent => T![%], |
| 129 | + rustc_lexer::TokenKind::Unknown => ERROR, |
| 130 | + } |
| 131 | + }; |
| 132 | + |
| 133 | + let err = if err.is_empty() { None } else { Some(err) }; |
| 134 | + (syntax_kind, err) |
| 135 | +} |
| 136 | + |
| 137 | +fn from_rustc_literal(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) { |
| 138 | + let mut err = ""; |
| 139 | + |
| 140 | + let syntax_kind = match *kind { |
| 141 | + rustc_lexer::LiteralKind::Int { empty_int, base: _ } => { |
| 142 | + if empty_int { |
| 143 | + err = "Missing digits after the integer base prefix"; |
| 144 | + } |
| 145 | + INT_NUMBER |
| 146 | + } |
| 147 | + rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => { |
| 148 | + if empty_exponent { |
| 149 | + err = "Missing digits after the exponent symbol"; |
| 150 | + } |
| 151 | + FLOAT_NUMBER |
| 152 | + } |
| 153 | + rustc_lexer::LiteralKind::Char { terminated } => { |
| 154 | + if !terminated { |
| 155 | + err = "Missing trailing `'` symbol to terminate the character literal"; |
| 156 | + } |
| 157 | + CHAR |
| 158 | + } |
| 159 | + rustc_lexer::LiteralKind::Byte { terminated } => { |
| 160 | + if !terminated { |
| 161 | + err = "Missing trailing `'` symbol to terminate the byte literal"; |
| 162 | + } |
| 163 | + BYTE |
| 164 | + } |
| 165 | + rustc_lexer::LiteralKind::Str { terminated } => { |
| 166 | + if !terminated { |
| 167 | + err = "Missing trailing `\"` symbol to terminate the string literal"; |
| 168 | + } |
| 169 | + STRING |
| 170 | + } |
| 171 | + rustc_lexer::LiteralKind::ByteStr { terminated } => { |
| 172 | + if !terminated { |
| 173 | + err = "Missing trailing `\"` symbol to terminate the byte string literal"; |
| 174 | + } |
| 175 | + BYTE_STRING |
| 176 | + } |
| 177 | + rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => { |
| 178 | + if let Some(raw_str_err) = raw_str_err { |
| 179 | + err = match raw_str_err { |
| 180 | + rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal", |
| 181 | + rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found { |
| 182 | + "Missing trailing `\"` to terminate the raw string literal" |
| 183 | + } else { |
| 184 | + "Missing trailing `\"` with `#` symbols to terminate the raw string literal" |
| 185 | + }, |
| 186 | + rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols", |
| 187 | + }; |
| 188 | + }; |
| 189 | + STRING |
| 190 | + } |
| 191 | + rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => { |
| 192 | + if let Some(raw_str_err) = raw_str_err { |
| 193 | + err = match raw_str_err { |
| 194 | + rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal", |
| 195 | + rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found { |
| 196 | + "Missing trailing `\"` to terminate the raw byte string literal" |
| 197 | + } else { |
| 198 | + "Missing trailing `\"` with `#` symbols to terminate the raw byte string literal" |
| 199 | + }, |
| 200 | + rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols", |
| 201 | + }; |
| 202 | + }; |
| 203 | + |
| 204 | + BYTE_STRING |
| 205 | + } |
| 206 | + }; |
| 207 | + |
| 208 | + let err = if err.is_empty() { None } else { Some(err) }; |
| 209 | + (syntax_kind, err) |
| 210 | +} |
0 commit comments