Skip to content

Commit 9b5bbab

Browse files
committed
Add character literal parsing and validation
1 parent 19c6cbd commit 9b5bbab

File tree

6 files changed

+397
-2
lines changed

6 files changed

+397
-2
lines changed

crates/ra_syntax/src/ast/generated.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,40 @@ impl<'a> AstNode<'a> for CastExpr<'a> {
409409

410410
impl<'a> CastExpr<'a> {}
411411

412+
// Char
413+
414+
#[derive(Debug, Clone)]
415+
pub struct CharNode(SyntaxNode);
416+
417+
impl CharNode {
418+
pub fn ast(&self) -> Char {
419+
Char::cast(self.0.borrowed()).unwrap()
420+
}
421+
}
422+
423+
impl<'a> From<Char<'a>> for CharNode {
424+
fn from(ast: Char<'a>) -> CharNode {
425+
let syntax = ast.syntax().owned();
426+
CharNode(syntax)
427+
}
428+
}
429+
#[derive(Debug, Clone, Copy)]
430+
pub struct Char<'a> {
431+
syntax: SyntaxNodeRef<'a>,
432+
}
433+
434+
impl<'a> AstNode<'a> for Char<'a> {
435+
fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> {
436+
match syntax.kind() {
437+
CHAR => Some(Char { syntax }),
438+
_ => None,
439+
}
440+
}
441+
fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax }
442+
}
443+
444+
impl<'a> Char<'a> {}
445+
412446
// Comment
413447

414448
#[derive(Debug, Clone)]

crates/ra_syntax/src/ast/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ impl<'a> Lifetime<'a> {
123123
}
124124
}
125125

126+
impl<'a> Char<'a> {
127+
pub fn text(&self) -> &SmolStr {
128+
&self.syntax().leaf_text().unwrap()
129+
}
130+
}
131+
126132
impl<'a> Comment<'a> {
127133
pub fn text(&self) -> &SmolStr {
128134
self.syntax().leaf_text().unwrap()

crates/ra_syntax/src/grammar.ron

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ Grammar(
406406
"PrefixExpr": (),
407407
"RangeExpr": (),
408408
"BinExpr": (),
409+
"Char": (),
409410
"Literal": (),
410411

411412
"Expr": (

crates/ra_syntax/src/lib.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@ mod grammar;
3939
mod parser_api;
4040
mod parser_impl;
4141
mod reparsing;
42-
42+
mod string_lexing;
4343
mod syntax_kinds;
4444
pub mod text_utils;
4545
/// Utilities for simple uses of the parser.
4646
pub mod utils;
47+
mod validation;
4748
mod yellow;
4849

4950
pub use crate::{
@@ -98,6 +99,8 @@ impl File {
9899
self.root.borrowed()
99100
}
100101
/// Returns the errors stored in the root data, followed by the errors
/// reported by `validation::validate` for this file.
pub fn errors(&self) -> Vec<SyntaxError> {
    let mut all_errors = self.root.root_data().clone();
    let validation_errors = validation::validate(self);
    all_errors.extend(validation_errors);
    all_errors
}
103106
}
Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
use self::CharComponentKind::*;
2+
use rowan::{TextRange, TextUnit};
3+
4+
pub fn parse_char_literal(src: &str) -> CharComponentIterator {
5+
CharComponentIterator {
6+
parser: Parser::new(src),
7+
has_closing_quote: false,
8+
}
9+
}
10+
11+
#[derive(Debug, Eq, PartialEq, Clone)]
12+
pub struct CharComponent {
13+
pub range: TextRange,
14+
pub kind: CharComponentKind,
15+
}
16+
17+
impl CharComponent {
18+
fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {
19+
CharComponent { range, kind }
20+
}
21+
}
22+
23+
/// Classification of a single component of a char literal.
///
/// The parser only classifies; whether an escape is actually well-formed is
/// checked elsewhere.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub enum CharComponentKind {
    /// A plain character that does not start with a backslash.
    CodePoint,
    /// A backslash followed by a single character other than `x` or `u`
    /// (for example `\n`), or a backslash at the very end of the input.
    AsciiEscape,
    /// A `\x` escape such as `\x55`.
    AsciiCodeEscape,
    /// A `\u` escape such as `\u{FF}`.
    UnicodeEscape,
}
30+
31+
pub struct CharComponentIterator<'a> {
32+
parser: Parser<'a>,
33+
pub has_closing_quote: bool,
34+
}
35+
36+
impl<'a> Iterator for CharComponentIterator<'a> {
37+
type Item = CharComponent;
38+
fn next(&mut self) -> Option<CharComponent> {
39+
if self.parser.pos == 0 {
40+
assert!(
41+
self.parser.advance() == '\'',
42+
"char literal should start with a quote"
43+
);
44+
}
45+
46+
if let Some(component) = self.parser.parse_char_component() {
47+
return Some(component);
48+
}
49+
50+
// We get here when there are no char components left to parse
51+
if self.parser.peek() == Some('\'') {
52+
self.parser.advance();
53+
self.has_closing_quote = true;
54+
}
55+
56+
assert!(
57+
self.parser.peek() == None,
58+
"char literal should leave no unparsed input: src = {}, pos = {}, length = {}",
59+
self.parser.src,
60+
self.parser.pos,
61+
self.parser.src.len()
62+
);
63+
64+
None
65+
}
66+
}
67+
68+
pub struct Parser<'a> {
69+
src: &'a str,
70+
pos: usize,
71+
}
72+
73+
impl<'a> Parser<'a> {
74+
pub fn new(src: &'a str) -> Parser<'a> {
75+
Parser { src, pos: 0 }
76+
}
77+
78+
// Utility methods
79+
80+
pub fn peek(&self) -> Option<char> {
81+
if self.pos == self.src.len() {
82+
return None;
83+
}
84+
85+
self.src[self.pos..].chars().next()
86+
}
87+
88+
pub fn advance(&mut self) -> char {
89+
let next = self
90+
.peek()
91+
.expect("cannot advance if end of input is reached");
92+
self.pos += next.len_utf8();
93+
next
94+
}
95+
96+
pub fn get_pos(&self) -> TextUnit {
97+
(self.pos as u32).into()
98+
}
99+
100+
// Char parsing methods
101+
102+
fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {
103+
// Note: validation of UnicodeEscape will be done elsewhere:
104+
// * Only hex digits or underscores allowed
105+
// * Max 6 chars
106+
// * Within allowed range (must be at most 10FFFF)
107+
match self.peek() {
108+
Some('{') => {
109+
self.advance();
110+
111+
// Parse anything until we reach `}`
112+
while let Some(next) = self.peek() {
113+
self.advance();
114+
if next == '}' {
115+
break;
116+
}
117+
}
118+
119+
let end = self.get_pos();
120+
CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
121+
}
122+
Some(_) | None => {
123+
let end = self.get_pos();
124+
CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
125+
}
126+
}
127+
}
128+
129+
fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {
130+
// Note: validation of AsciiCodeEscape will be done elsewhere:
131+
// * First digit is octal
132+
// * Second digit is hex
133+
let code_start = self.get_pos();
134+
while let Some(next) = self.peek() {
135+
if next == '\'' || (self.get_pos() - code_start == 2.into()) {
136+
break;
137+
}
138+
139+
self.advance();
140+
}
141+
142+
let end = self.get_pos();
143+
CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
144+
}
145+
146+
fn parse_escape(&mut self, start: TextUnit) -> CharComponent {
147+
// Note: validation of AsciiEscape will be done elsewhere:
148+
// * The escape sequence is non-empty
149+
// * The escape sequence is valid
150+
if self.peek().is_none() {
151+
return CharComponent::new(TextRange::from_to(start, start), AsciiEscape);
152+
}
153+
154+
let next = self.advance();
155+
let end = self.get_pos();
156+
let range = TextRange::from_to(start, end);
157+
match next {
158+
'x' => self.parse_ascii_code_escape(start),
159+
'u' => self.parse_unicode_escape(start),
160+
_ => CharComponent::new(range, AsciiEscape),
161+
}
162+
}
163+
164+
pub fn parse_char_component(&mut self) -> Option<CharComponent> {
165+
let next = self.peek()?;
166+
167+
// Ignore character close
168+
if next == '\'' {
169+
return None;
170+
}
171+
172+
let start = self.get_pos();
173+
self.advance();
174+
175+
if next == '\\' {
176+
Some(self.parse_escape(start))
177+
} else {
178+
let end = self.get_pos();
179+
Some(CharComponent::new(
180+
TextRange::from_to(start, end),
181+
CodePoint,
182+
))
183+
}
184+
}
185+
}
186+
187+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the parser to completion, returning whether a closing quote was
    /// seen together with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut component_iterator = super::parse_char_literal(src);
        // `by_ref` keeps the iterator alive so the flag can be read afterwards.
        let components: Vec<_> = component_iterator.by_ref().collect();
        (component_iterator.has_closing_quote, components)
    }

    /// Parses a literal expected to have exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(!has_closing_quote, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components[0].clone()
    }

    /// Parses a literal expected to have exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        assert_eq!(
            components.len(),
            1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components[0].clone()
    }

    /// Parses a literal expected to have a closing quote, returning all components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        components
    }

    /// Range between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
    }

    /// Range from just after the opening quote to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32).into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            let expected_range = range_closed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            let expected_range = range_unclosed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_empty_char() {
        let (has_closing_quote, components) = parse("''");
        assert!(has_closing_quote, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::AsciiEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CharComponentKind::CodePoint);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }
}

0 commit comments

Comments
 (0)