Commit 8b9d145

soa all the things

1 parent 799941e commit 8b9d145

File tree

crates/parser/src/
    lexer_token.rs → lexed_str.rs (renamed)
    lib.rs
    tests.rs
    tokens.rs

4 files changed: +75 −34 lines changed
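The "soa" in the commit message is struct-of-arrays: instead of lexing into a `Vec<LexerToken>` where every element carries its own kind, length, and optional error, the token data is split into parallel vectors indexed by token position. A minimal sketch of the two layouts, with names taken from the diffs below (the comments are illustrative, not part of the commit):

    // Before: array-of-structs. Each token owns an Option<String> error
    // slot (24 bytes on a 64-bit target) even though errors are rare.
    pub struct LexerToken {
        pub kind: SyntaxKind,
        pub len: usize,
        pub error: Option<String>,
    }
    // tokenize(text) -> Vec<LexerToken>

    // After: struct-of-arrays. Dense, cache-friendly parallel vectors for
    // the common data; errors live in a sparse side table.
    pub struct LexedStr<'a> {
        text: &'a str,
        kind: Vec<SyntaxKind>, // kind[i] is the kind of token i
        start: Vec<u32>,       // token i spans start[i]..start[i + 1]
        error: Vec<LexError>,  // (message, token index), sorted by index
    }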

crates/parser/src/lexer_token.rs renamed to crates/parser/src/lexed_str.rs

Lines changed: 61 additions & 24 deletions
@@ -4,48 +4,55 @@
 //! on tokens which originated from text. Macros, eg, can synthesize tokes out
 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
 //! convenient to include a text-based lexer here!
+//!
+//! Note that these tokens, unlike the tokens we feed into the parser, do
+//! include info about comments and whitespace.
 
 use crate::{
     SyntaxKind::{self, *},
     T,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct LexerToken {
-    pub kind: SyntaxKind,
-    pub len: usize,
-    pub error: Option<String>,
+pub struct LexedStr<'a> {
+    text: &'a str,
+    kind: Vec<SyntaxKind>,
+    start: Vec<u32>,
+    error: Vec<LexError>,
 }
 
-impl LexerToken {
-    pub fn new(kind: SyntaxKind, len: usize) -> Self {
-        Self { kind, len, error: None }
-    }
+struct LexError {
+    msg: String,
+    token: u32,
+}
 
-    /// Lexes text as a sequence of tokens.
-    pub fn tokenize(text: &str) -> Vec<LexerToken> {
-        let mut res = Vec::new();
-        let mut offset = 0;
+impl<'a> LexedStr<'a> {
+    pub fn new(text: &'a str) -> LexedStr<'a> {
+        let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
 
+        let mut offset = 0;
         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            res.push(LexerToken::new(SHEBANG, shebang_len));
+            res.push(SHEBANG, offset);
             offset = shebang_len
         };
-
         for token in rustc_lexer::tokenize(&text[offset..]) {
             let token_text = &text[offset..][..token.len];
-            offset += token.len;
 
             let (kind, err) = from_rustc(&token.kind, token_text);
-            let mut token = LexerToken::new(kind, token.len);
-            token.error = err.map(|it| it.to_string());
-            res.push(token);
+            res.push(kind, offset);
+            offset += token.len;
+
+            if let Some(err) = err {
+                let token = res.len() as u32;
+                let msg = err.to_string();
+                res.error.push(LexError { msg, token });
+            }
         }
+        res.push(EOF, offset);
 
         res
     }
-    /// Lexes text as a single token. Returns `None` if there's leftover text.
-    pub fn from_str(text: &str) -> Option<LexerToken> {
+
+    pub fn single_token(text: &'a str) -> Option<SyntaxKind> {
         if text.is_empty() {
             return None;
         }
@@ -56,10 +63,40 @@ impl LexerToken {
         }
 
         let (kind, err) = from_rustc(&token.kind, text);
+        if err.is_some() {
+            return None;
+        }
+
+        Some(kind)
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.text
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    pub fn kind(&self, i: usize) -> SyntaxKind {
+        assert!(i < self.len());
+        self.kind[i]
+    }
+    pub fn text(&self, i: usize) -> &str {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        &self.text[lo..hi]
+    }
+    pub fn error(&self, i: usize) -> Option<&str> {
+        assert!(i < self.len());
+        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
+        Some(self.error[err].msg.as_str())
+    }
 
-        let mut token = LexerToken::new(kind, token.len);
-        token.error = err.map(|it| it.to_string());
-        Some(token)
+    fn push(&mut self, kind: SyntaxKind, offset: usize) {
+        self.kind.push(kind);
+        self.start.push(offset as u32);
    }
 }
 
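With the rename, the old offset-tracking loop over `Vec<LexerToken>` becomes index-based access. A minimal usage sketch of the new API (the `dump` function is hypothetical; the accessors are the ones defined in the diff above):

    // Hypothetical caller of the LexedStr API.
    fn dump(text: &str) {
        let lexed = parser::LexedStr::new(text);
        // len() is kind.len() - 1 because new() appends a trailing EOF
        // sentinel; the sentinel's start offset is what lets text(i)
        // slice start[i]..start[i + 1] without storing per-token lengths.
        for i in 0..lexed.len() {
            match lexed.error(i) {
                Some(msg) => println!("{:?} {:?} error: {}", lexed.kind(i), lexed.text(i), msg),
                None => println!("{:?} {:?}", lexed.kind(i), lexed.text(i)),
            }
        }
    }

Note that `error(i)` can use `binary_search_by_key` because `new()` records errors in the order tokens are lexed, so the side table is always sorted by token index; at the push site, `res.len()` equals the index of the token just pushed, since `len()` excludes the EOF sentinel.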

crates/parser/src/lib.rs

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 //! [`Parser`]: crate::parser::Parser
 #![allow(rustdoc::private_intra_doc_links)]
 
-mod lexer_token;
+mod lexed_str;
 mod token_set;
 mod syntax_kind;
 mod event;
@@ -31,7 +31,7 @@ mod tests;
 
 pub(crate) use token_set::TokenSet;
 
-pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
+pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);

crates/parser/src/tests.rs

Lines changed: 10 additions & 7 deletions
@@ -6,7 +6,7 @@ use std::{
 
 use expect_test::expect_file;
 
-use crate::LexerToken;
+use crate::LexedStr;
 
 #[test]
 fn valid_lexes_input() {
@@ -25,13 +25,16 @@ fn invalid_lexes_input() {
 }
 
 fn lex(text: &str) -> String {
+    let lexed = LexedStr::new(text);
+
     let mut res = String::new();
-    let mut offset = 0;
-    for token in LexerToken::tokenize(text) {
-        let token_text = &text[offset..][..token.len];
-        offset += token.len;
-        let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
-        writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
+    for i in 0..lexed.len() {
+        let kind = lexed.kind(i);
+        let text = lexed.text(i);
+        let error = lexed.error(i);
+
+        let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
+        writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
     }
     res
 }
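The `lex` helper feeds the expect-test files checked by `valid_lexes_input` and `invalid_lexes_input`, printing one token per line. For a hypothetical input such as `fn main() {}`, the output would look roughly like this (kind names assume rust-analyzer's usual `SyntaxKind` debug formatting; the EOF sentinel is excluded by `len()`):

    FN_KW "fn"
    WHITESPACE " "
    IDENT "main"
    L_PAREN "("
    R_PAREN ")"
    WHITESPACE " "
    L_CURLY "{"
    R_CURLY "}"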

crates/parser/src/tokens.rs

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,8 @@
 //! Input for the parser -- a sequence of tokens.
 //!
 //! As of now, parser doesn't have access to the *text* of the tokens, and makes
-//! decisions based solely on their classification.
+//! decisions based solely on their classification. Unlike `LexerToken`, the
+//! `Tokens` doesn't include whitespace and comments.
 
 use crate::SyntaxKind;
 