Skip to content

Commit 2e24454

Browse files
bors[bot] and aochagavia committed
Merge #207
207: Finish implementing char validation r=aochagavia a=aochagavia The only things missing right now are good integration tests (and maybe more descriptive error messages). Co-authored-by: Adolfo Ochagavía <[email protected]>
2 parents a46a07e + 433a806 commit 2e24454

File tree

8 files changed

+235
-13
lines changed

8 files changed

+235
-13
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/ra_syntax/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ description = "Comment and whitespace preserving parser for the Rust langauge"
88
repository = "https://github.com/rust-analyzer/rust-analyzer"
99

1010
[dependencies]
11+
arrayvec = "0.4.7"
1112
unicode-xid = "0.1.0"
1213
itertools = "0.7.8"
1314
drop_bomb = "0.1.4"

crates/ra_syntax/src/lexer/ptr.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@ impl<'s> Ptr<'s> {
3030
/// Gets the nth character from the current.
3131
/// For example, 0 will return the current character, 1 will return the next, etc.
3232
pub fn nth(&self, n: u32) -> Option<char> {
33-
let mut chars = self.chars().peekable();
34-
chars.by_ref().nth(n as usize)
33+
self.chars().nth(n as usize)
3534
}
3635

3736
/// Checks whether the current character is `c`.

crates/ra_syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#![allow(missing_docs)]
2121
//#![warn(unreachable_pub)] // rust-lang/rust#47816
2222

23+
extern crate arrayvec;
2324
extern crate drop_bomb;
2425
extern crate itertools;
2526
extern crate parking_lot;

crates/ra_syntax/src/string_lexing/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ mod tests {
219219

220220
#[test]
221221
fn test_unicode_escapes() {
222-
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""];
222+
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
223223
for escape in unicode_escapes {
224224
let escape_sequence = format!(r"'\u{}'", escape);
225225
let component = closed_char_component(&escape_sequence);

crates/ra_syntax/src/utils.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent};
22
use std::fmt::Write;
3+
use std::str;
34

45
/// Parse a file and create a string representation of the resulting parse tree.
56
pub fn dump_tree(syntax: SyntaxNodeRef) -> String {

crates/ra_syntax/src/validation.rs

Lines changed: 203 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
use std::u32;
2+
3+
use arrayvec::ArrayString;
4+
15
use crate::{
26
algo::visit::{visitor_ctx, VisitorCtx},
37
ast::{self, AstNode},
@@ -42,18 +46,90 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {
4246
}
4347
}
4448
AsciiCodeEscape => {
45-
// TODO:
46-
// * First digit is octal
47-
// * Second digit is hex
49+
// An AsciiCodeEscape has 4 chars, example: `\xDD`
50+
if text.len() < 4 {
51+
errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
52+
} else {
53+
assert!(
54+
text.chars().count() == 4,
55+
"AsciiCodeEscape cannot be longer than 4 chars"
56+
);
57+
58+
match u8::from_str_radix(&text[2..], 16) {
59+
Ok(code) if code < 128 => { /* Escape code is valid */ }
60+
Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
61+
Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
62+
}
63+
}
4864
}
4965
UnicodeEscape => {
50-
// TODO:
51-
// * Only hex digits or underscores allowed
52-
// * Max 6 chars
53-
// * Within allowed range (must be at most 10FFFF)
66+
assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
67+
68+
if text.len() == 2 {
69+
// No starting `{`
70+
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
71+
return;
72+
}
73+
74+
if text.len() == 3 {
75+
// Only starting `{`
76+
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
77+
return;
78+
}
79+
80+
let mut code = ArrayString::<[_; 6]>::new();
81+
let mut closed = false;
82+
for c in text[3..].chars() {
83+
assert!(!closed, "no characters after escape is closed");
84+
85+
if c.is_digit(16) {
86+
if code.len() == 6 {
87+
errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
88+
return;
89+
}
90+
91+
code.push(c);
92+
} else if c == '_' {
93+
// Reject leading _
94+
if code.len() == 0 {
95+
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
96+
return;
97+
}
98+
} else if c == '}' {
99+
closed = true;
100+
} else {
101+
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
102+
return;
103+
}
104+
}
105+
106+
if !closed {
107+
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
108+
}
109+
110+
if code.len() == 0 {
111+
errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
112+
return;
113+
}
114+
115+
match u32::from_str_radix(&code, 16) {
116+
Ok(code_u32) if code_u32 > 0x10FFFF => {
117+
errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
118+
}
119+
Ok(_) => {
120+
// Valid escape code
121+
}
122+
Err(_) => {
123+
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
124+
}
125+
}
126+
}
127+
CodePoint => {
128+
// These code points must always be escaped
129+
if text == "\t" || text == "\r" {
130+
errors.push(SyntaxError::new(UnescapedCodepoint, range));
131+
}
54132
}
55-
// Code points are always valid
56-
CodePoint => (),
57133
}
58134
}
59135

@@ -72,7 +148,124 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {
72148

73149
fn is_ascii_escape(code: char) -> bool {
74150
match code {
75-
'\'' | '"' | 'n' | 'r' | 't' | '0' => true,
151+
'\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
76152
_ => false,
77153
}
78154
}
155+
156+
#[cfg(test)]
157+
mod test {
158+
use crate::File;
159+
160+
fn build_file(literal: &str) -> File {
161+
let src = format!("const C: char = '{}';", literal);
162+
File::parse(&src)
163+
}
164+
165+
fn assert_valid_char(literal: &str) {
166+
let file = build_file(literal);
167+
assert!(
168+
file.errors().len() == 0,
169+
"Errors for literal '{}': {:?}",
170+
literal,
171+
file.errors()
172+
);
173+
}
174+
175+
fn assert_invalid_char(literal: &str) {
176+
let file = build_file(literal);
177+
assert!(file.errors().len() > 0);
178+
}
179+
180+
#[test]
181+
fn test_ansi_codepoints() {
182+
for byte in 0..=255u8 {
183+
match byte {
184+
b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
185+
b'\'' | b'\\' => { /* Ignore character close and backslash */ }
186+
_ => assert_valid_char(&(byte as char).to_string()),
187+
}
188+
}
189+
}
190+
191+
#[test]
192+
fn test_unicode_codepoints() {
193+
let valid = ["Ƒ", "バ", "メ", "﷽"];
194+
for c in &valid {
195+
assert_valid_char(c);
196+
}
197+
}
198+
199+
#[test]
200+
fn test_unicode_multiple_codepoints() {
201+
let invalid = ["नी", "👨‍👨‍"];
202+
for c in &invalid {
203+
assert_invalid_char(c);
204+
}
205+
}
206+
207+
#[test]
208+
fn test_valid_ascii_escape() {
209+
let valid = [
210+
r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
211+
];
212+
for c in &valid {
213+
assert_valid_char(c);
214+
}
215+
}
216+
217+
#[test]
218+
fn test_invalid_ascii_escape() {
219+
let invalid = [r"\a", r"\?", r"\"];
220+
for c in &invalid {
221+
assert_invalid_char(c);
222+
}
223+
}
224+
225+
#[test]
226+
fn test_valid_ascii_code_escape() {
227+
let valid = [r"\x00", r"\x7F", r"\x55"];
228+
for c in &valid {
229+
assert_valid_char(c);
230+
}
231+
}
232+
233+
#[test]
234+
fn test_invalid_ascii_code_escape() {
235+
let invalid = [r"\x", r"\x7", r"\xF0"];
236+
for c in &invalid {
237+
assert_invalid_char(c);
238+
}
239+
}
240+
241+
#[test]
242+
fn test_valid_unicode_escape() {
243+
let valid = [
244+
r"\u{FF}",
245+
r"\u{0}",
246+
r"\u{F}",
247+
r"\u{10FFFF}",
248+
r"\u{1_0__FF___FF_____}",
249+
];
250+
for c in &valid {
251+
assert_valid_char(c);
252+
}
253+
}
254+
255+
#[test]
256+
fn test_invalid_unicode_escape() {
257+
let invalid = [
258+
r"\u",
259+
r"\u{}",
260+
r"\u{",
261+
r"\u{FF",
262+
r"\u{FFFFFF}",
263+
r"\u{_F}",
264+
r"\u{00FFFFF}",
265+
r"\u{110000}",
266+
];
267+
for c in &invalid {
268+
assert_invalid_char(c);
269+
}
270+
}
271+
}

crates/ra_syntax/src/yellow/syntax_error.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ impl SyntaxError {
3434
}
3535
}
3636

37+
pub fn kind(&self) -> SyntaxErrorKind {
38+
self.kind.clone()
39+
}
40+
3741
pub fn location(&self) -> Location {
3842
self.location.clone()
3943
}
@@ -64,11 +68,20 @@ impl fmt::Display for SyntaxError {
6468
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
6569
pub enum SyntaxErrorKind {
6670
ParseError(ParseError),
71+
UnescapedCodepoint,
6772
EmptyChar,
6873
UnclosedChar,
6974
LongChar,
7075
EmptyAsciiEscape,
7176
InvalidAsciiEscape,
77+
TooShortAsciiCodeEscape,
78+
AsciiCodeEscapeOutOfRange,
79+
MalformedAsciiCodeEscape,
80+
UnclosedUnicodeEscape,
81+
MalformedUnicodeEscape,
82+
EmptyUnicodeEcape,
83+
OverlongUnicodeEscape,
84+
UnicodeEscapeOutOfRange,
7285
}
7386

7487
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
@@ -78,11 +91,24 @@ impl fmt::Display for SyntaxErrorKind {
7891
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
7992
use self::SyntaxErrorKind::*;
8093
match self {
94+
UnescapedCodepoint => write!(f, "This codepoint should always be escaped"),
8195
EmptyAsciiEscape => write!(f, "Empty escape sequence"),
8296
InvalidAsciiEscape => write!(f, "Invalid escape sequence"),
8397
EmptyChar => write!(f, "Empty char literal"),
8498
UnclosedChar => write!(f, "Unclosed char literal"),
8599
LongChar => write!(f, "Char literal should be one character long"),
100+
TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"),
101+
AsciiCodeEscapeOutOfRange => {
102+
write!(f, "Escape sequence should be between \\x00 and \\x7F")
103+
}
104+
MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
105+
UnclosedUnicodeEscape => write!(f, "Missing `}}`"),
106+
MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"),
107+
EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"),
108+
OverlongUnicodeEscape => {
109+
write!(f, "Unicode escape sequence should have at most 6 digits")
110+
}
111+
UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"),
86112
ParseError(msg) => write!(f, "{}", msg.0),
87113
}
88114
}

0 commit comments

Comments
 (0)