Skip to content

Finish implementing char validation #207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/ra_syntax/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ description = "Comment and whitespace preserving parser for the Rust language"
repository = "https://github.com/rust-analyzer/rust-analyzer"

[dependencies]
arrayvec = "0.4.7"
unicode-xid = "0.1.0"
itertools = "0.7.8"
drop_bomb = "0.1.4"
Expand Down
3 changes: 1 addition & 2 deletions crates/ra_syntax/src/lexer/ptr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ impl<'s> Ptr<'s> {
/// Gets the nth character, counting from the current one.
/// For example, 0 will return the current character, 1 will return the next, etc.
///
/// Presumably yields `None` when fewer than `n + 1` characters remain (the
/// behaviour of `Iterator::nth`, assuming `self.chars()` is an iterator) — confirm.
// NOTE(review): the three body lines below look like the removed and the added
// side of a diff shown together; only one tail expression can remain in the
// real file — confirm which before relying on this listing.
pub fn nth(&self, n: u32) -> Option<char> {
let mut chars = self.chars().peekable();
chars.by_ref().nth(n as usize)
self.chars().nth(n as usize)
}

/// Checks whether the current character is `c`.
Expand Down
1 change: 1 addition & 0 deletions crates/ra_syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#![allow(missing_docs)]
//#![warn(unreachable_pub)] // rust-lang/rust#47816

extern crate arrayvec;
extern crate drop_bomb;
extern crate itertools;
extern crate parking_lot;
Expand Down
2 changes: 1 addition & 1 deletion crates/ra_syntax/src/string_lexing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ mod tests {

#[test]
fn test_unicode_escapes() {
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""];
let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
for escape in unicode_escapes {
let escape_sequence = format!(r"'\u{}'", escape);
let component = closed_char_component(&escape_sequence);
Expand Down
1 change: 1 addition & 0 deletions crates/ra_syntax/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent};
use std::fmt::Write;
use std::str;

/// Parse a file and create a string representation of the resulting parse tree.
pub fn dump_tree(syntax: SyntaxNodeRef) -> String {
Expand Down
213 changes: 203 additions & 10 deletions crates/ra_syntax/src/validation.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
use std::u32;

use arrayvec::ArrayString;

use crate::{
algo::visit::{visitor_ctx, VisitorCtx},
ast::{self, AstNode},
Expand Down Expand Up @@ -42,18 +46,90 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {
}
}
AsciiCodeEscape => {
// TODO:
// * First digit is octal
// * Second digit is hex
// An AsciiCodeEscape has 4 chars, example: `\xDD`
if text.len() < 4 {
errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
} else {
assert!(
text.chars().count() == 4,
"AsciiCodeEscape cannot be longer than 4 chars"
);

match u8::from_str_radix(&text[2..], 16) {
Ok(code) if code < 128 => { /* Escape code is valid */ }
Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
}
}
}
UnicodeEscape => {
// TODO:
// * Only hex digits or underscores allowed
// * Max 6 chars
// * Within allowed range (must be at most 10FFFF)
assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");

if text.len() == 2 {
// No starting `{`
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}

if text.len() == 3 {
// Only starting `{`
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
return;
}

let mut code = ArrayString::<[_; 6]>::new();
let mut closed = false;
for c in text[3..].chars() {
assert!(!closed, "no characters after escape is closed");

if c.is_digit(16) {
if code.len() == 6 {
errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
return;
}

code.push(c);
} else if c == '_' {
// Reject leading _
if code.len() == 0 {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}
} else if c == '}' {
closed = true;
} else {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
return;
}
}

if !closed {
errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
}

if code.len() == 0 {
errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
return;
}

match u32::from_str_radix(&code, 16) {
Ok(code_u32) if code_u32 > 0x10FFFF => {
errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
}
Ok(_) => {
// Valid escape code
}
Err(_) => {
errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
}
}
}
CodePoint => {
// These code points must always be escaped
if text == "\t" || text == "\r" {
errors.push(SyntaxError::new(UnescapedCodepoint, range));
}
}
// Code points are always valid
CodePoint => (),
}
}

Expand All @@ -72,7 +148,124 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {

/// Reports whether `code` is one of the single characters accepted as a simple
/// escape: backslash, single quote, double quote, `n`, `r`, `t` or `0`.
///
/// Presumably `code` is the character that follows the backslash in a char
/// literal — verify against the caller.
fn is_ascii_escape(code: char) -> bool {
    match code {
        // One arm lists every accepted character; a second arm repeating a
        // subset of these would be dead code.
        '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
        _ => false,
    }
}

#[cfg(test)]
mod test {
    use crate::File;

    /// Parses a source file consisting of a single `char` constant whose
    /// initialiser is `literal` placed verbatim between single quotes.
    fn build_file(literal: &str) -> File {
        let src = format!("const C: char = '{}';", literal);
        File::parse(&src)
    }

    /// Panics, listing the reported errors, unless `literal` parses without any error.
    fn assert_valid_char(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Panics, naming the offending literal, unless `literal` produces at least one error.
    fn assert_invalid_char(literal: &str) {
        let file = build_file(literal);
        // Most callers run this in a loop, so the message has to identify the
        // literal that was unexpectedly accepted.
        assert!(
            !file.errors().is_empty(),
            "Expected errors for literal '{}', but none were reported",
            literal
        );
    }

    /// Every byte value 0..=255, taken as a `char`, is accepted unescaped except
    /// newline, carriage return and tab; the quote and the backslash are skipped.
    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                _ => assert_valid_char(&(byte as char).to_string()),
            }
        }
    }

    /// Single non-ASCII code points are valid char literals.
    #[test]
    fn test_unicode_codepoints() {
        let valid = ["Ƒ", "バ", "メ", "﷽"];
        for c in &valid {
            assert_valid_char(c);
        }
    }

    /// Text made of several code points is rejected.
    #[test]
    fn test_unicode_multiple_codepoints() {
        let invalid = ["नी", "👨‍👨‍"];
        for c in &invalid {
            assert_invalid_char(c);
        }
    }

    /// Simple escapes, plus a few plain characters, are accepted.
    #[test]
    fn test_valid_ascii_escape() {
        let valid = [
            r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
        ];
        for c in &valid {
            assert_valid_char(c);
        }
    }

    /// Unknown escapes and a lone backslash are rejected.
    #[test]
    fn test_invalid_ascii_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        for c in &invalid {
            assert_invalid_char(c);
        }
    }

    /// Two-digit `\x` escapes up to `\x7F` are accepted.
    #[test]
    fn test_valid_ascii_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55"];
        for c in &valid {
            assert_valid_char(c);
        }
    }

    /// `\x` escapes that are too short or above `\x7F` are rejected.
    #[test]
    fn test_invalid_ascii_code_escape() {
        let invalid = [r"\x", r"\x7", r"\xF0"];
        for c in &invalid {
            assert_invalid_char(c);
        }
    }

    /// Well-formed `\u{...}` escapes, including ones with non-leading underscores, are accepted.
    #[test]
    fn test_valid_unicode_escape() {
        let valid = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        for c in &valid {
            assert_valid_char(c);
        }
    }

    /// Missing braces, empty, overlong, underscore-led and out-of-range `\u` escapes are rejected.
    #[test]
    fn test_invalid_unicode_escape() {
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        for c in &invalid {
            assert_invalid_char(c);
        }
    }
}
26 changes: 26 additions & 0 deletions crates/ra_syntax/src/yellow/syntax_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ impl SyntaxError {
}
}

/// Returns a clone of the kind of this error.
pub fn kind(&self) -> SyntaxErrorKind {
self.kind.clone()
}

/// Returns a clone of the location of this error.
pub fn location(&self) -> Location {
self.location.clone()
}
Expand Down Expand Up @@ -64,11 +68,20 @@ impl fmt::Display for SyntaxError {
/// The kind of a `SyntaxError`; its `Display` impl provides the user-facing message.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SyntaxErrorKind {
/// An error wrapping a `ParseError`.
ParseError(ParseError),
/// A code point that should always be written as an escape.
UnescapedCodepoint,
/// Empty char literal.
EmptyChar,
/// Unclosed char literal.
UnclosedChar,
/// Char literal that is not exactly one character long.
LongChar,
/// Empty escape sequence.
EmptyAsciiEscape,
/// Invalid escape sequence.
InvalidAsciiEscape,
/// Ascii code escape without its two digits.
TooShortAsciiCodeEscape,
/// Ascii code escape outside `\x00`..=`\x7F`.
AsciiCodeEscapeOutOfRange,
/// Ascii code escape that is not a hexadecimal number.
MalformedAsciiCodeEscape,
/// Unicode escape missing its closing `}`.
UnclosedUnicodeEscape,
/// Malformed unicode escape sequence.
MalformedUnicodeEscape,
/// Empty unicode escape sequence.
// NOTE(review): the name is misspelled (`Ecape`); fixing it needs a coordinated
// rename at every use site.
EmptyUnicodeEcape,
/// Unicode escape with more than 6 digits.
OverlongUnicodeEscape,
/// Unicode escape whose code is above 0x10FFFF.
UnicodeEscapeOutOfRange,
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
Expand All @@ -78,11 +91,24 @@ impl fmt::Display for SyntaxErrorKind {
/// Writes the user-facing message for this error kind to `f`.
///
/// A `ParseError` is rendered through its wrapped message; every other kind
/// maps to a fixed piece of text.
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    use self::SyntaxErrorKind::*;

    let text = match self {
        // The only kind whose text is not static, so it is written directly.
        ParseError(msg) => return write!(f, "{}", msg.0),
        UnescapedCodepoint => "This codepoint should always be escaped",
        EmptyChar => "Empty char literal",
        UnclosedChar => "Unclosed char literal",
        LongChar => "Char literal should be one character long",
        EmptyAsciiEscape => "Empty escape sequence",
        InvalidAsciiEscape => "Invalid escape sequence",
        TooShortAsciiCodeEscape => "Escape sequence should have two digits",
        AsciiCodeEscapeOutOfRange => "Escape sequence should be between \\x00 and \\x7F",
        MalformedAsciiCodeEscape => "Escape sequence should be a hexadecimal number",
        // Plain string here, so the brace needs no format-string escaping.
        UnclosedUnicodeEscape => "Missing `}`",
        MalformedUnicodeEscape => "Malformed unicode escape sequence",
        EmptyUnicodeEcape => "Empty unicode escape sequence",
        OverlongUnicodeEscape => "Unicode escape sequence should have at most 6 digits",
        UnicodeEscapeOutOfRange => "Unicode escape code should be at most 0x10FFFF",
    };
    f.write_str(text)
}
Expand Down