Skip to content

Commit 1463460

Browse files
committed
Diagnose curly quote strings, extraneous block comment close, and invalid identifier starts
1 parent f998cb6 commit 1463460

File tree

9 files changed

+178
-106
lines changed

9 files changed

+178
-106
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 27 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -873,9 +873,13 @@ extension Lexer.Cursor {
873873
return self.lexOperatorIdentifier(sourceBufferStart: sourceBufferStart)
874874
}
875875

876-
let unknownClassification = self.lexUnknown()
877-
assert(unknownClassification == .lexemeContents, "Invalid UTF-8 sequence should be eaten by lexTrivia as LeadingTrivia")
878-
return Lexer.Result(.unknown)
876+
switch self.lexUnknown() {
877+
case .lexemeContents(let result):
878+
return result
879+
case .trivia:
880+
assertionFailure("Invalid UTF-8 sequence should be eaten by lexTrivia as LeadingTrivia")
881+
return Lexer.Result(.unknown, error: (.invalidUtf8, self))
882+
}
879883
}
880884
}
881885

@@ -1125,7 +1129,7 @@ extension Lexer.Cursor {
11251129

11261130
// `lexUnknown` expects that the first character has not been consumed yet.
11271131
self = start
1128-
if self.lexUnknown() == .trivia {
1132+
if case .trivia = self.lexUnknown() {
11291133
continue
11301134
} else {
11311135
break
@@ -1632,7 +1636,7 @@ extension Lexer.Cursor {
16321636
}
16331637

16341638
guard self.advance(matching: "}") else {
1635-
return .error(.excpectedClosingBraceInUnicodeEscape)
1639+
return .error(.expectedClosingBraceInUnicodeEscape)
16361640
}
16371641

16381642
if numDigits == 0 || numDigits > 8 {
@@ -1770,12 +1774,6 @@ extension Lexer.Cursor {
17701774
}
17711775

17721776
mutating func lexInStringLiteral(stringLiteralKind: StringLiteralKind, delimiterLength: Int) -> Lexer.Result {
1773-
/*
1774-
if IsMultilineString && *CurPtr != '\n' && *CurPtr != '\r' {
1775-
diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
1776-
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n")
1777-
}
1778-
*/
17791777
var error: (LexerError.Kind, Lexer.Cursor)? = nil
17801778

17811779
while true {
@@ -1971,15 +1969,6 @@ extension Lexer.Cursor {
19711969
if self.input.baseAddress! - tokStart.input.baseAddress! == 1 {
19721970
switch tokStart.peek() {
19731971
case UInt8(ascii: "="):
1974-
// Refrain from emitting this message in operator name position.
1975-
// if (NextToken.isNot(tok::kw_operator) && leftBound != rightBound) {
1976-
// auto d = diagnose(TokStart, diag::lex_unary_equal)
1977-
// if (leftBound)
1978-
// d.fixItInsert(getSourceLoc(TokStart), " ")
1979-
// else
1980-
// d.fixItInsert(getSourceLoc(TokStart+1), " ")
1981-
// }
1982-
// always emit 'tok::equal' to avoid trickle down parse errors
19831972
return Lexer.Result(.equal)
19841973
case UInt8(ascii: "&"):
19851974
if leftBound == rightBound || leftBound {
@@ -2001,17 +1990,15 @@ extension Lexer.Cursor {
20011990
case (UInt8(ascii: "-"), UInt8(ascii: ">")): // ->
20021991
return Lexer.Result(.arrow)
20031992
case (UInt8(ascii: "*"), UInt8(ascii: "/")): // */
2004-
// diagnose(TokStart, diag::lex_unexpected_block_comment_end)
2005-
return Lexer.Result(.unknown)
1993+
return Lexer.Result(.unknown, error: (.unexpectedBlockCommentEnd, tokStart))
20061994
default:
20071995
break
20081996
}
20091997
} else {
20101998
// Verify there is no "*/" in the middle of the identifier token, we reject
20111999
// it as potentially ending a block comment.
20122000
if tokStart.text(upTo: self).contains("*/") {
2013-
// diagnose(TokStart+Pos, diag::lex_unexpected_block_comment_end)
2014-
return Lexer.Result(.unknown)
2001+
return Lexer.Result(.unknown, error: (.unexpectedBlockCommentEnd, tokStart))
20152002
}
20162003
}
20172004

@@ -2102,24 +2089,15 @@ extension Lexer.Cursor {
21022089

21032090
// Get the next character.
21042091
switch body.lexCharacterInStringLiteral(stringLiteralKind: .singleLine, delimiterLength: 0) {
2105-
case .error, .endOfString:
2092+
case .error:
21062093
// If the character was incorrectly encoded, give up.
21072094
return nil
2108-
case .success(let charValue) where charValue == Unicode.Scalar(UInt8(ascii: #"""#)):
2109-
// If we found a straight-quote, then we're done. Just return the spot
2095+
case .endOfString, .success(Unicode.Scalar(0x201D)):
2096+
// If we found a closing quote, then we're done. Just return the spot
21102097
// to continue.
21112098
return body
2112-
case .validatedEscapeSequence(let charValue) where charValue == Character(Unicode.Scalar(0x0000201D)!):
2113-
// If we found an ending curly quote (common since this thing started with
2114-
// an opening curly quote) diagnose it with a fixit and then return.
2115-
// if (EmitDiagnostics) {
2116-
// diagnose(CharStart, diag::lex_invalid_curly_quote)
2117-
// .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body),
2118-
// "\"")
2119-
// }
2120-
return body
21212099
default:
2122-
continue
2100+
break
21232101
}
21242102
}
21252103
}
@@ -2128,14 +2106,15 @@ extension Lexer.Cursor {
21282106
/// The characters consumed by `lexUnknown` should be classified as trivia
21292107
case trivia
21302108
/// The characters consumed by `lexUnknown` should be classified as the contents of a lexeme
2131-
case lexemeContents
2109+
case lexemeContents(Lexer.Result)
21322110
}
21332111

21342112
/// Assuming the cursor is positioned at neither a valid identifier nor a
21352113
/// valid operator start, advance the cursor by what can be considered a
21362114
/// lexeme.
21372115
mutating func lexUnknown() -> UnknownCharactersClassification {
21382116
assert(!(self.peekScalar()?.isValidIdentifierStartCodePoint ?? false) && !(self.peekScalar()?.isOperatorStartCodePoint ?? false))
2117+
let start = self
21392118
var tmp = self
21402119
if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint }) {
21412120
// If this is a valid identifier continuation, but not a valid identifier
@@ -2145,7 +2124,7 @@ extension Lexer.Cursor {
21452124
// }
21462125
tmp.advance(while: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint })
21472126
self = tmp
2148-
return .lexemeContents
2127+
return .lexemeContents(Lexer.Result(.identifier, error: (.invalidIdentifierStartCharacter, position: start)))
21492128
}
21502129

21512130
// This character isn't allowed in Swift source.
@@ -2155,8 +2134,7 @@ extension Lexer.Cursor {
21552134
self = tmp
21562135
return .trivia
21572136
}
2158-
if codepoint.value == 0x000000A0 {
2159-
// Non-breaking whitespace (U+00A0)
2137+
if codepoint.value == 0xA0 { // Non-breaking whitespace (U+00A0)
21602138
while tmp.is(at: 0xC2) && tmp.is(offset: 1, at: 0xA0) {
21612139
_ = tmp.advance()
21622140
_ = tmp.advance()
@@ -2169,33 +2147,23 @@ extension Lexer.Cursor {
21692147
// Spaces)
21702148
self = tmp
21712149
return .trivia
2172-
} else if (codepoint.value == 0x0000201D) {
2150+
} else if codepoint.value == 0x201D { // Closing curly quote (U+201D)
21732151
// If this is an end curly quote, just diagnose it with a fixit hint.
2174-
// if (EmitDiagnosticsIfToken) {
2175-
// diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
2176-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), "\"")
2177-
// }
21782152
self = tmp
2179-
return .lexemeContents
2180-
} else if (codepoint.value == 0x0000201C) {
2153+
return .lexemeContents(Lexer.Result(.unknown, error: (.unicodeCurlyQuote, position: start)))
2154+
} else if codepoint.value == 0x201C { // Opening curly quote (U+201C)
21812155
// If this is a start curly quote, do a fuzzy match of a string literal
21822156
// to improve recovery.
21832157
if let tmp2 = tmp.findEndOfCurlyQuoteStringLiteral() {
21842158
tmp = tmp2
21852159
}
21862160

2187-
// Note, we intentionally diagnose the end quote before the start quote,
2188-
// so that the IDE suggests fixing the end quote before the start quote.
2189-
// This, in turn, works better with our error recovery because we won't
2190-
// diagnose an end curly quote in the middle of a straight quoted
2191-
// literal.
2192-
// if (EmitDiagnosticsIfToken) {
2193-
// diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
2194-
// .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(EndPtr),
2195-
// "\"")
2196-
// }
21972161
self = tmp
2198-
return .lexemeContents
2162+
2163+
// Identifiers are the closest representation of static string literals
2164+
// we have in the parser. Classify the entire curly string as an identifier
2165+
// for best recovery.
2166+
return .lexemeContents(Lexer.Result(.identifier, error: (.unicodeCurlyQuote, position: start)))
21992167
}
22002168

22012169
// diagnose(CurPtr - 1, diag::lex_invalid_character)

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,18 @@ public extension LexerError {
3939
/// Please order the cases in this enum alphabetically by case name.
4040
public enum StaticLexerError: String, DiagnosticMessage {
4141
case expectedBinaryExponentInHexFloatLiteral = "hexadecimal floating point literal must end with an exponent"
42-
case excpectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
42+
case expectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
4343
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
4444
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
4545
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
46+
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
4647
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
4748
case invalidUtf8 = "invalid UTF-8 found in source file"
4849
case lexerErrorOffsetOverflow = "the lexer dicovered an error in this token but was not able to represent its offset due to overflow; please split the token"
50+
case nonBreakingSpace = "non-breaking space (U+00A0) used instead of regular space"
4951
case nulCharacter = "nul character embedded in middle of file"
52+
case unexpectedBlockCommentEnd = "unexpected end of block comment"
53+
case unicodeCurlyQuote = #"unicode curly quote found; use '"' instead"#
5054

5155
public var message: String { self.rawValue }
5256

@@ -104,53 +108,65 @@ public extension SwiftSyntax.LexerError {
104108
/// `tokenText` is the entire text of the token in which the `LexerError`
105109
/// occurred, including trivia.
106110
@_spi(RawSyntax)
107-
func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
111+
func diagnosticMessage(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
108112
var scalarAtErrorOffset: UnicodeScalar {
109113
// Fall back to the Unicode replacement character U+FFFD in case we can't
110114
// lex the unicode character at `byteOffset`. It's the best we can do
111115
Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("")
112116
}
113117

114118
switch self.kind {
115-
case .expectedBinaryExponentInHexFloatLiteral:
116-
return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
117-
case .excpectedClosingBraceInUnicodeEscape:
118-
return StaticLexerError.excpectedClosingBraceInUnicodeEscape
119-
case .expectedDigitInFloatLiteral:
120-
return StaticLexerError.expectedDigitInFloatLiteral
121-
case .expectedHexCodeInUnicodeEscape:
122-
return StaticLexerError.expectedHexCodeInUnicodeEscape
119+
case .expectedBinaryExponentInHexFloatLiteral: return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
120+
case .expectedClosingBraceInUnicodeEscape: return StaticLexerError.expectedClosingBraceInUnicodeEscape
121+
case .expectedDigitInFloatLiteral: return StaticLexerError.expectedDigitInFloatLiteral
122+
case .expectedHexCodeInUnicodeEscape: return StaticLexerError.expectedHexCodeInUnicodeEscape
123123
case .insufficientIndentationInMultilineStringLiteral:
124124
// This should be diagnosed when visiting the `StringLiteralExprSyntax`
125125
// inside `ParseDiagnosticsGenerator` but fall back to an error message
126126
// here in case the error is not diagnosed.
127127
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
128-
case .invalidBinaryDigitInIntegerLiteral:
129-
return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
130-
case .invalidDecimalDigitInIntegerLiteral:
131-
return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
132-
case .invalidEscapeSequenceInStringLiteral:
133-
return StaticLexerError.invalidEscapeSequenceInStringLiteral
134-
case .invalidFloatingPointExponentCharacter:
135-
return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
136-
case .invalidFloatingPointExponentDigit:
137-
return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
138-
case .invalidHexDigitInIntegerLiteral:
139-
return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
140-
case .invalidNumberOfHexDigitsInUnicodeEscape:
141-
return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
142-
case .invalidOctalDigitInIntegerLiteral:
143-
return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
144-
case .invalidUtf8:
145-
return StaticLexerError.invalidUtf8
146-
case .lexerErrorOffsetOverflow:
147-
return StaticLexerError.lexerErrorOffsetOverflow
148-
case .nulCharacter:
149-
return StaticLexerError.nulCharacter
128+
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
129+
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
130+
case .invalidEscapeSequenceInStringLiteral: return StaticLexerError.invalidEscapeSequenceInStringLiteral
131+
case .invalidFloatingPointExponentCharacter: return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
132+
case .invalidFloatingPointExponentDigit: return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
133+
case .invalidHexDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
134+
case .invalidIdentifierStartCharacter: return StaticLexerError.invalidIdentifierStartCharacter
135+
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
136+
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
137+
case .invalidUtf8: return StaticLexerError.invalidUtf8
138+
case .lexerErrorOffsetOverflow: return StaticLexerError.lexerErrorOffsetOverflow
139+
case .nonBreakingSpace: return StaticLexerError.nonBreakingSpace
140+
case .nulCharacter: return StaticLexerError.nulCharacter
141+
case .unexpectedBlockCommentEnd: return StaticLexerError.unexpectedBlockCommentEnd
142+
case .unicodeCurlyQuote: return StaticLexerError.unicodeCurlyQuote
150143
}
151144
}
152145

153-
func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
154-
return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
146+
func diagnosticMessage(in token: TokenSyntax) -> DiagnosticMessage {
147+
return self.diagnosticMessage(wholeTextBytes: token.syntaxTextBytes)
148+
}
149+
150+
func fixIts(in token: TokenSyntax) -> [FixIt] {
151+
switch self.kind {
152+
case .nonBreakingSpace:
153+
return []
154+
case .unicodeCurlyQuote:
155+
let (rawKind, text) = token.tokenKind.decomposeToRaw()
156+
guard let text = text else {
157+
return []
158+
}
159+
let replacedText =
160+
text
161+
.replaceFirstOccuranceOf("“", with: #"""#)
162+
.replaceLastOccuranceOf("”", with: #"""#)
163+
164+
let fixedToken = token.withKind(TokenKind.fromRaw(kind: rawKind, text: replacedText))
165+
return [
166+
FixIt(message: .replaceCurlyQuoteByNormalQuote, changes: [[.replace(oldNode: Syntax(token), newNode: Syntax(fixedToken))]])
167+
]
168+
default:
169+
return []
170+
}
155171
}
156172
}

Sources/SwiftParserDiagnostics/ParseDiagnosticsGenerator.swift

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,12 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
347347
handleMissingToken(token)
348348
} else {
349349
if let lexerError = token.lexerError {
350-
self.addDiagnostic(token, position: token.position.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))
350+
self.addDiagnostic(
351+
token,
352+
position: token.position.advanced(by: Int(lexerError.byteOffset)),
353+
lexerError.diagnosticMessage(in: token),
354+
fixIts: lexerError.fixIts(in: token)
355+
)
351356
}
352357
}
353358

Sources/SwiftParserDiagnostics/ParserDiagnosticMessages.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,9 @@ extension FixItMessage where Self == StaticParserFixIt {
474474
public static var removeOperatorBody: Self {
475475
.init("remove operator body")
476476
}
477+
public static var replaceCurlyQuoteByNormalQuote: Self {
478+
.init(#"replace curly quotes by '"'"#)
479+
}
477480
public static var wrapInBackticks: Self {
478481
.init("if this name is unavoidable, use backticks to escape it")
479482
}

Sources/SwiftParserDiagnostics/Utils.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ extension String {
1818
return self
1919
}
2020
}
21+
22+
func replaceFirstOccuranceOf(_ character: Character, with replacement: Character) -> String {
23+
guard let match = self.firstIndex(of: character) else {
24+
return self
25+
}
26+
return self[startIndex..<match] + String(replacement) + self[index(after: match)...]
27+
}
28+
29+
func replaceLastOccuranceOf(_ character: Character, with replacement: Character) -> String {
30+
guard let match = self.lastIndex(of: character) else {
31+
return self
32+
}
33+
return self[startIndex..<match] + String(replacement) + self[index(after: match)...]
34+
}
2135
}
2236

2337
extension Collection {

Sources/SwiftSyntax/LexerError.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public struct LexerError: Hashable {
1818
// Please order these alphabetically
1919

2020
case expectedBinaryExponentInHexFloatLiteral
21-
case excpectedClosingBraceInUnicodeEscape
21+
case expectedClosingBraceInUnicodeEscape
2222
case expectedDigitInFloatLiteral
2323
case expectedHexCodeInUnicodeEscape
2424
case insufficientIndentationInMultilineStringLiteral
@@ -28,12 +28,16 @@ public struct LexerError: Hashable {
2828
case invalidFloatingPointExponentCharacter
2929
case invalidFloatingPointExponentDigit
3030
case invalidHexDigitInIntegerLiteral
31+
case invalidIdentifierStartCharacter
3132
case invalidNumberOfHexDigitsInUnicodeEscape
3233
case invalidOctalDigitInIntegerLiteral
3334
case invalidUtf8
3435
/// The lexer discovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.
3536
case lexerErrorOffsetOverflow
37+
case nonBreakingSpace
3638
case nulCharacter
39+
case unexpectedBlockCommentEnd
40+
case unicodeCurlyQuote
3741
}
3842

3943
public let kind: Kind

Tests/SwiftParserTest/Assertions.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ private func AssertTokens(
143143
)
144144
case (let actualError?, let expectedError?):
145145
AssertStringsEqualWithDiff(
146-
actualError.diagnostic(wholeTextBytes: Array(actualLexeme.wholeText)).message,
146+
actualError.diagnosticMessage(wholeTextBytes: Array(actualLexeme.wholeText)).message,
147147
expectedError,
148148
file: expectedLexeme.file,
149149
line: expectedLexeme.line
@@ -182,6 +182,7 @@ func AssertLexemes(
182182
line: UInt = #line
183183
) {
184184
var (markerLocations, source) = extractMarkers(markedSource)
185+
markerLocations["START"] = 0
185186
var expectedLexemes = expectedLexemes
186187
if expectedLexemes.last?.rawTokenKind != .eof {
187188
expectedLexemes.append(LexemeSpec(.eof, text: ""))

0 commit comments

Comments
 (0)