Skip to content

Commit 7e99864

Browse files
committed
move lexing to the parser crate
1 parent 958f20f commit 7e99864

File tree

5 files changed

+289
-2
lines changed

5 files changed

+289
-2
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/parser/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,8 @@ doctest = false
1111

1212
[dependencies]
1313
drop_bomb = "0.1.4"
14-
14+
rustc_lexer = { version = "725.0.0", package = "rustc-ap-rustc_lexer" }
1515
limit = { path = "../limit", version = "0.0.0" }
16+
17+
[dev-dependencies]
18+
expect-test = "1.2"

crates/parser/src/lexer_token.rs

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
//! Lexing `&str` into a sequence of Rust tokens.
2+
//!
3+
//! Note that strictly speaking the parser in this crate is not required to work
4+
//! on tokens which originated from text. Macros, e.g., can synthesize tokens out
5+
//! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
6+
//! convenient to include a text-based lexer here!
7+
8+
use crate::{
9+
SyntaxKind::{self, *},
10+
T,
11+
};
12+
13+
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
14+
pub struct LexerToken {
15+
pub kind: SyntaxKind,
16+
pub len: usize,
17+
pub error: Option<String>,
18+
}
19+
20+
impl LexerToken {
21+
pub fn new(kind: SyntaxKind, len: usize) -> Self {
22+
Self { kind, len, error: None }
23+
}
24+
25+
/// Lexes text as a sequence of tokens.
26+
pub fn tokenize(text: &str) -> Vec<LexerToken> {
27+
let mut res = Vec::new();
28+
let mut offset = 0;
29+
30+
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
31+
res.push(LexerToken::new(SHEBANG, shebang_len));
32+
offset = shebang_len
33+
};
34+
35+
for token in rustc_lexer::tokenize(&text[offset..]) {
36+
let token_text = &text[offset..][..token.len];
37+
offset += token.len;
38+
39+
let (kind, err) = from_rustc(&token.kind, token_text);
40+
let mut token = LexerToken::new(kind, token.len);
41+
token.error = err.map(|it| it.to_string());
42+
res.push(token);
43+
}
44+
45+
res
46+
}
47+
/// Lexes text as a single token. Returns `None` if there's leftover text.
48+
pub fn from_str(text: &str) -> Option<LexerToken> {
49+
if text.is_empty() {
50+
return None;
51+
}
52+
53+
let token = rustc_lexer::first_token(text);
54+
if token.len != text.len() {
55+
return None;
56+
}
57+
58+
let (kind, err) = from_rustc(&token.kind, text);
59+
60+
let mut token = LexerToken::new(kind, token.len);
61+
token.error = err.map(|it| it.to_string());
62+
Some(token)
63+
}
64+
}
65+
66+
/// Returns `SyntaxKind` and an optional tokenize error message.
67+
fn from_rustc(
68+
kind: &rustc_lexer::TokenKind,
69+
token_text: &str,
70+
) -> (SyntaxKind, Option<&'static str>) {
71+
// A note on an intended tradeoff:
72+
// We drop some useful information here (see patterns with double dots `..`)
73+
// Storing that info in `SyntaxKind` is not possible due to its layout requirements of
74+
// being `u16` that come from `rowan::SyntaxKind`.
75+
let mut err = "";
76+
77+
let syntax_kind = {
78+
match kind {
79+
rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
80+
rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
81+
if !terminated {
82+
err = "Missing trailing `*/` symbols to terminate the block comment";
83+
}
84+
COMMENT
85+
}
86+
87+
rustc_lexer::TokenKind::Whitespace => WHITESPACE,
88+
89+
rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
90+
rustc_lexer::TokenKind::Ident => SyntaxKind::from_keyword(token_text).unwrap_or(IDENT),
91+
92+
rustc_lexer::TokenKind::RawIdent => IDENT,
93+
rustc_lexer::TokenKind::Literal { kind, .. } => return from_rustc_literal(kind),
94+
95+
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
96+
if *starts_with_number {
97+
err = "Lifetime name cannot start with a number";
98+
}
99+
LIFETIME_IDENT
100+
}
101+
102+
rustc_lexer::TokenKind::Semi => T![;],
103+
rustc_lexer::TokenKind::Comma => T![,],
104+
rustc_lexer::TokenKind::Dot => T![.],
105+
rustc_lexer::TokenKind::OpenParen => T!['('],
106+
rustc_lexer::TokenKind::CloseParen => T![')'],
107+
rustc_lexer::TokenKind::OpenBrace => T!['{'],
108+
rustc_lexer::TokenKind::CloseBrace => T!['}'],
109+
rustc_lexer::TokenKind::OpenBracket => T!['['],
110+
rustc_lexer::TokenKind::CloseBracket => T![']'],
111+
rustc_lexer::TokenKind::At => T![@],
112+
rustc_lexer::TokenKind::Pound => T![#],
113+
rustc_lexer::TokenKind::Tilde => T![~],
114+
rustc_lexer::TokenKind::Question => T![?],
115+
rustc_lexer::TokenKind::Colon => T![:],
116+
rustc_lexer::TokenKind::Dollar => T![$],
117+
rustc_lexer::TokenKind::Eq => T![=],
118+
rustc_lexer::TokenKind::Bang => T![!],
119+
rustc_lexer::TokenKind::Lt => T![<],
120+
rustc_lexer::TokenKind::Gt => T![>],
121+
rustc_lexer::TokenKind::Minus => T![-],
122+
rustc_lexer::TokenKind::And => T![&],
123+
rustc_lexer::TokenKind::Or => T![|],
124+
rustc_lexer::TokenKind::Plus => T![+],
125+
rustc_lexer::TokenKind::Star => T![*],
126+
rustc_lexer::TokenKind::Slash => T![/],
127+
rustc_lexer::TokenKind::Caret => T![^],
128+
rustc_lexer::TokenKind::Percent => T![%],
129+
rustc_lexer::TokenKind::Unknown => ERROR,
130+
}
131+
};
132+
133+
let err = if err.is_empty() { None } else { Some(err) };
134+
(syntax_kind, err)
135+
}
136+
137+
fn from_rustc_literal(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
138+
let mut err = "";
139+
140+
let syntax_kind = match *kind {
141+
rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
142+
if empty_int {
143+
err = "Missing digits after the integer base prefix";
144+
}
145+
INT_NUMBER
146+
}
147+
rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
148+
if empty_exponent {
149+
err = "Missing digits after the exponent symbol";
150+
}
151+
FLOAT_NUMBER
152+
}
153+
rustc_lexer::LiteralKind::Char { terminated } => {
154+
if !terminated {
155+
err = "Missing trailing `'` symbol to terminate the character literal";
156+
}
157+
CHAR
158+
}
159+
rustc_lexer::LiteralKind::Byte { terminated } => {
160+
if !terminated {
161+
err = "Missing trailing `'` symbol to terminate the byte literal";
162+
}
163+
BYTE
164+
}
165+
rustc_lexer::LiteralKind::Str { terminated } => {
166+
if !terminated {
167+
err = "Missing trailing `\"` symbol to terminate the string literal";
168+
}
169+
STRING
170+
}
171+
rustc_lexer::LiteralKind::ByteStr { terminated } => {
172+
if !terminated {
173+
err = "Missing trailing `\"` symbol to terminate the byte string literal";
174+
}
175+
BYTE_STRING
176+
}
177+
rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => {
178+
if let Some(raw_str_err) = raw_str_err {
179+
err = match raw_str_err {
180+
rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal",
181+
rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
182+
"Missing trailing `\"` to terminate the raw string literal"
183+
} else {
184+
"Missing trailing `\"` with `#` symbols to terminate the raw string literal"
185+
},
186+
rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
187+
};
188+
};
189+
STRING
190+
}
191+
rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => {
192+
if let Some(raw_str_err) = raw_str_err {
193+
err = match raw_str_err {
194+
rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal",
195+
rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
196+
"Missing trailing `\"` to terminate the raw byte string literal"
197+
} else {
198+
"Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"
199+
},
200+
rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols",
201+
};
202+
};
203+
204+
BYTE_STRING
205+
}
206+
};
207+
208+
let err = if err.is_empty() { None } else { Some(err) };
209+
(syntax_kind, err)
210+
}

crates/parser/src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,20 @@
1818
//! [`Parser`]: crate::parser::Parser
1919
#![allow(rustdoc::private_intra_doc_links)]
2020

21+
mod lexer_token;
2122
mod token_set;
2223
mod syntax_kind;
2324
mod event;
2425
mod parser;
2526
mod grammar;
2627
mod tokens;
2728

29+
#[cfg(test)]
30+
mod tests;
31+
2832
pub(crate) use token_set::TokenSet;
2933

30-
pub use crate::{syntax_kind::SyntaxKind, tokens::Tokens};
34+
pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
3135

3236
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3337
pub struct ParseError(pub Box<String>);

crates/parser/src/tests.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
use std::{
2+
fmt::Write,
3+
fs,
4+
path::{Path, PathBuf},
5+
};
6+
7+
use expect_test::expect_file;
8+
9+
use crate::LexerToken;
10+
11+
/// Every file in `test_data/lexer/ok` must lex to its expected `.txt` dump.
#[test]
fn valid_lexes_input() {
    for case in TestCase::list("lexer/ok") {
        expect_file![case.txt].assert_eq(&lex(&case.text));
    }
}
18+
19+
/// Every file in `test_data/lexer/err` must lex to its expected `.txt` dump,
/// including the attached error messages.
#[test]
fn invalid_lexes_input() {
    for case in TestCase::list("lexer/err") {
        expect_file![case.txt].assert_eq(&lex(&case.text));
    }
}
26+
27+
fn lex(text: &str) -> String {
28+
let mut res = String::new();
29+
let mut offset = 0;
30+
for token in LexerToken::tokenize(text) {
31+
let token_text = &text[offset..][..token.len];
32+
offset += token.len;
33+
let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
34+
writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
35+
}
36+
res
37+
}
38+
39+
/// One lexer test case: the `.rs` input file, the sibling `.txt` file holding
/// the expected output, and the input's text.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
struct TestCase {
    rs: PathBuf,
    txt: PathBuf,
    text: String,
}
45+
46+
impl TestCase {
47+
fn list(path: &'static str) -> Vec<TestCase> {
48+
let crate_root_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
49+
let test_data_dir = crate_root_dir.join("test_data");
50+
let dir = test_data_dir.join(path);
51+
52+
let mut res = Vec::new();
53+
let read_dir = fs::read_dir(&dir)
54+
.unwrap_or_else(|err| panic!("can't `read_dir` {}: {}", dir.display(), err));
55+
for file in read_dir {
56+
let file = file.unwrap();
57+
let path = file.path();
58+
if path.extension().unwrap_or_default() == "rs" {
59+
let rs = path;
60+
let txt = rs.with_extension("txt");
61+
let text = fs::read_to_string(&rs).unwrap();
62+
res.push(TestCase { rs, txt, text });
63+
}
64+
}
65+
res.sort();
66+
res
67+
}
68+
}

0 commit comments

Comments
 (0)