Skip to content

Commit f998cb6

Browse files
committed
Diagnose invalid characters and escape sequences in string literals
a
1 parent 6e8b0b1 commit f998cb6

File tree

4 files changed

+215
-99
lines changed

4 files changed

+215
-99
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 45 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,7 +1479,7 @@ extension Lexer.Cursor {
14791479
case endOfString
14801480

14811481
/// The character could not be lexed. The associated `LexerError.Kind` describes why, e.g. invalid UTF-8, an invalid escape sequence or a nul character.
1482-
case error
1482+
case error(LexerError.Kind)
14831483
}
14841484

14851485
/// Lexes a single character in a string literal, handling escape sequences
@@ -1524,10 +1524,8 @@ extension Lexer.Cursor {
15241524
return .success(Unicode.Scalar(character))
15251525
}
15261526
case 0:
1527-
// if (EmitDiagnostics)
1528-
// diagnose(CurPtr-1, diag::lex_nul_character)
1529-
let character = self.advance()!
1530-
return .success(Unicode.Scalar(character))
1527+
_ = self.advance()
1528+
return .error(.nulCharacter)
15311529
case UInt8(ascii: "\n"), UInt8(ascii: "\r"): // String literals cannot have \n or \r in them.
15321530
let character = self.advance()!
15331531
assert(stringLiteralKind == .multiLine, "Caller must handle newlines in non-multiline")
@@ -1538,16 +1536,17 @@ extension Lexer.Cursor {
15381536
if !self.advanceIfStringDelimiter(delimiterLength: delimiterLength) {
15391537
return .success(Unicode.Scalar("\\"))
15401538
}
1541-
guard let escapedCharacterCode = self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) else {
1542-
return .error
1543-
}
1544-
1545-
// Check to see if the encoding is valid.
1546-
guard let validatedScalar = Unicode.Scalar(escapedCharacterCode) else {
1547-
return .error
1539+
switch self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) {
1540+
case .success(let escapedCharacterCode):
1541+
// Check to see if the encoding is valid.
1542+
if let validatedScalar = Unicode.Scalar(escapedCharacterCode) {
1543+
return .validatedEscapeSequence(Character(validatedScalar))
1544+
} else {
1545+
return .error(.invalidEscapeSequenceInStringLiteral)
1546+
}
1547+
case .error(let kind):
1548+
return .error(kind)
15481549
}
1549-
1550-
return .validatedEscapeSequence(Character(validatedScalar))
15511550
default:
15521551
_ = self.advance()
15531552
// Normal characters are part of the string.
@@ -1561,71 +1560,68 @@ extension Lexer.Cursor {
15611560
// }
15621561
self = charStart
15631562
guard let charValue = self.advanceValidatingUTF8Character() else {
1564-
// if (EmitDiagnostics)
1565-
// diagnose(CharStart, diag::lex_invalid_utf8)
1566-
return .error
1563+
return .error(.invalidUtf8)
15671564
}
15681565
return .success(charValue)
15691566
}
15701567
}
15711568

1569+
enum EscapedCharacterLex {
1570+
// Successfully lexed an escape sequence that represents the Unicode character
1571+
// at the given codepoint
1572+
case success(UInt32)
1573+
case error(LexerError.Kind)
1574+
}
1575+
15721576
/// Assuming that we are in a string literal and have already consumed a `\`,
15731577
/// consume the escaped characters and return the Unicode character code
15741578
/// (i.e. UTF-32 value) that the escaped character represents.
15751579
///
15761580
/// If the character is not a valid escape sequence, return `.error` with the kind of lexer error that was encountered.
1577-
private mutating func lexEscapedCharacter(isMultilineString: Bool) -> UInt32? {
1581+
private mutating func lexEscapedCharacter(isMultilineString: Bool) -> EscapedCharacterLex {
15781582
assert(self.previous == UInt8(ascii: "\\") || self.previous == UInt8(ascii: "#"))
15791583
// Escape processing. We already ate the "\".
15801584
switch self.peek() {
15811585
// Simple single-character escapes.
1582-
case UInt8(ascii: "0"): _ = self.advance(); return UInt32(UInt8(ascii: "\0"))
1583-
case UInt8(ascii: "n"): _ = self.advance(); return UInt32(UInt8(ascii: "\n"))
1584-
case UInt8(ascii: "r"): _ = self.advance(); return UInt32(UInt8(ascii: "\r"))
1585-
case UInt8(ascii: "t"): _ = self.advance(); return UInt32(UInt8(ascii: "\t"))
1586-
case UInt8(ascii: #"""#): _ = self.advance(); return UInt32(UInt8(ascii: #"""#))
1587-
case UInt8(ascii: "'"): _ = self.advance(); return UInt32(UInt8(ascii: "'"))
1588-
case UInt8(ascii: "\\"): _ = self.advance(); return UInt32(UInt8(ascii: "\\"))
1586+
case UInt8(ascii: "0"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\0")))
1587+
case UInt8(ascii: "n"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\n")))
1588+
case UInt8(ascii: "r"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\r")))
1589+
case UInt8(ascii: "t"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\t")))
1590+
case UInt8(ascii: #"""#): _ = self.advance(); return .success(UInt32(UInt8(ascii: #"""#)))
1591+
case UInt8(ascii: "'"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "'")))
1592+
case UInt8(ascii: "\\"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\\")))
15891593

15901594
case UInt8(ascii: "u"): // e.g. \u{1234}
15911595
_ = self.advance()
15921596

15931597
guard self.is(at: "{") else {
1594-
// if (EmitDiagnostics)
1595-
// diagnose(CurPtr-1, diag::lex_unicode_escape_braces)
1596-
return nil
1598+
return .error(.expectedHexCodeInUnicodeEscape)
15971599
}
15981600

1599-
guard let cv = self.lexUnicodeEscape() else {
1600-
return nil
1601-
}
1602-
return cv
1603-
1601+
return self.lexUnicodeEscape()
16041602
case UInt8(ascii: "\n"), UInt8(ascii: "\r"):
16051603
if isMultilineString && self.maybeConsumeNewlineEscape() {
1606-
return UInt32(UInt8(ascii: "\n"))
1604+
return .success(UInt32(UInt8(ascii: "\n")))
16071605
}
1608-
return nil
1606+
return .error(.invalidEscapeSequenceInStringLiteral)
16091607
case nil:
1610-
return nil
1608+
return .error(.invalidEscapeSequenceInStringLiteral)
16111609
case .some(let peekedValue): // Invalid escape.
1612-
// if (EmitDiagnostics)
1613-
// diagnose(CurPtr, diag::lex_invalid_escape)
16141610
// If this looks like a plausible escape character, recover as though this
16151611
// is an invalid escape.
16161612
let c = Unicode.Scalar(peekedValue)
16171613
if c.isDigit || c.isLetter {
16181614
_ = self.advance()
16191615
}
1620-
return nil
1616+
return .error(.invalidEscapeSequenceInStringLiteral)
16211617
}
16221618
}
16231619

16241620
/// Lex the contents of a `\u{1234}` escape sequence, assuming that we are
16251621
/// placed at the opening `{`.
16261622
///
16271623
/// If this is not a valid unicode escape, return `.error` with the kind of lexer error that was encountered.
1628-
private mutating func lexUnicodeEscape() -> UInt32? {
1624+
private mutating func lexUnicodeEscape() -> EscapedCharacterLex {
16291625
let quoteConsumed = self.advance(matching: "{")
16301626
assert(quoteConsumed)
16311627

@@ -1636,18 +1632,18 @@ extension Lexer.Cursor {
16361632
}
16371633

16381634
guard self.advance(matching: "}") else {
1639-
// if (Diags)
1640-
// Diags->diagnose(CurPtr, diag::lex_invalid_u_escape_rbrace)
1641-
return nil
1635+
return .error(.excpectedClosingBraceInUnicodeEscape)
16421636
}
16431637

16441638
if numDigits == 0 || numDigits > 8 {
1645-
// if (Diags)
1646-
// Diags->diagnose(CurPtr, diag::lex_invalid_u_escape)
1647-
return nil
1639+
return .error(.invalidNumberOfHexDigitsInUnicodeEscape)
16481640
}
16491641

1650-
return UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16)
1642+
if let codePoint = UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16) {
1643+
return .success(codePoint)
1644+
} else {
1645+
return .error(.invalidEscapeSequenceInStringLiteral)
1646+
}
16511647
}
16521648

16531649
private mutating func maybeConsumeNewlineEscape() -> Bool {
@@ -1835,8 +1831,8 @@ extension Lexer.Cursor {
18351831
// validate the multi-line string literal's indentation.
18361832
return Lexer.Result(.stringSegment, error: error)
18371833
}
1838-
case .error:
1839-
error = (.invalidEscapeSequenceInStringLiteral, self)
1834+
case .error(let errorKind):
1835+
error = (errorKind, self)
18401836
self = clone
18411837
case .endOfString:
18421838
return Lexer.Result(

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,14 @@ public extension LexerError {
3939
/// Please order the cases in this enum alphabetically by case name.
4040
public enum StaticLexerError: String, DiagnosticMessage {
4141
case expectedBinaryExponentInHexFloatLiteral = "hexadecimal floating point literal must end with an exponent"
42+
case excpectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
4243
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
44+
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
4345
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
46+
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
47+
case invalidUtf8 = "invalid UTF-8 found in source file"
4448
case lexerErrorOffsetOverflow = "the lexer dicovered an error in this token but was not able to represent its offset due to overflow; please split the token"
49+
case nulCharacter = "nul character embedded in middle of file"
4550

4651
public var message: String { self.rawValue }
4752

@@ -109,8 +114,12 @@ public extension SwiftSyntax.LexerError {
109114
switch self.kind {
110115
case .expectedBinaryExponentInHexFloatLiteral:
111116
return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
117+
case .excpectedClosingBraceInUnicodeEscape:
118+
return StaticLexerError.excpectedClosingBraceInUnicodeEscape
112119
case .expectedDigitInFloatLiteral:
113120
return StaticLexerError.expectedDigitInFloatLiteral
121+
case .expectedHexCodeInUnicodeEscape:
122+
return StaticLexerError.expectedHexCodeInUnicodeEscape
114123
case .insufficientIndentationInMultilineStringLiteral:
115124
// This should be diagnosed when visiting the `StringLiteralExprSyntax`
116125
// inside `ParseDiagnosticsGenerator` but fall back to an error message
@@ -128,10 +137,16 @@ public extension SwiftSyntax.LexerError {
128137
return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
129138
case .invalidHexDigitInIntegerLiteral:
130139
return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
140+
case .invalidNumberOfHexDigitsInUnicodeEscape:
141+
return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
131142
case .invalidOctalDigitInIntegerLiteral:
132143
return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
144+
case .invalidUtf8:
145+
return StaticLexerError.invalidUtf8
133146
case .lexerErrorOffsetOverflow:
134147
return StaticLexerError.lexerErrorOffsetOverflow
148+
case .nulCharacter:
149+
return StaticLexerError.nulCharacter
135150
}
136151
}
137152

Sources/SwiftSyntax/LexerError.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,22 @@ public struct LexerError: Hashable {
1818
// Please order these alphabetically
1919

2020
case expectedBinaryExponentInHexFloatLiteral
21+
case excpectedClosingBraceInUnicodeEscape
2122
case expectedDigitInFloatLiteral
23+
case expectedHexCodeInUnicodeEscape
2224
case insufficientIndentationInMultilineStringLiteral
2325
case invalidBinaryDigitInIntegerLiteral
2426
case invalidDecimalDigitInIntegerLiteral
2527
case invalidEscapeSequenceInStringLiteral
2628
case invalidFloatingPointExponentCharacter
2729
case invalidFloatingPointExponentDigit
2830
case invalidHexDigitInIntegerLiteral
31+
case invalidNumberOfHexDigitsInUnicodeEscape
2932
case invalidOctalDigitInIntegerLiteral
33+
case invalidUtf8
3034
/// The lexer discovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.
3135
case lexerErrorOffsetOverflow
36+
case nulCharacter
3237
}
3338

3439
public let kind: Kind

0 commit comments

Comments
 (0)