Diagnose curly quote strings, extraneous block comment close, and invalid identifier starts

ahoppen · ahoppen · commit 146346091079 · 2023-02-02T16:50:59.000+01:00
diff --git a/Sources/SwiftParser/Lexer/Cursor.swift b/Sources/SwiftParser/Lexer/Cursor.swift
@@ -873,9 +873,13 @@ extension Lexer.Cursor {
         return self.lexOperatorIdentifier(sourceBufferStart: sourceBufferStart)
       }
 
-      let unknownClassification = self.lexUnknown()
-      assert(unknownClassification == .lexemeContents, "Invalid UTF-8 sequence should be eaten by lexTrivia as LeadingTrivia")
-      return Lexer.Result(.unknown)
+      switch self.lexUnknown() {
+      case .lexemeContents(let result):
+        return result
+      case .trivia:
+        assertionFailure("Invalid UTF-8 sequence should be eaten by lexTrivia as LeadingTrivia")
+        return Lexer.Result(.unknown, error: (.invalidUtf8, self))
+      }
     }
   }
 
@@ -1125,7 +1129,7 @@ extension Lexer.Cursor {
 
         // `lexUnknown` expects that the first character has not been consumed yet.
         self = start
-        if self.lexUnknown() == .trivia {
+        if case .trivia = self.lexUnknown() {
           continue
         } else {
           break
@@ -1632,7 +1636,7 @@ extension Lexer.Cursor {
     }
 
     guard self.advance(matching: "}") else {
-      return .error(.excpectedClosingBraceInUnicodeEscape)
+      return .error(.expectedClosingBraceInUnicodeEscape)
     }
 
     if numDigits == 0 || numDigits > 8 {
@@ -1770,12 +1774,6 @@ extension Lexer.Cursor {
   }
 
   mutating func lexInStringLiteral(stringLiteralKind: StringLiteralKind, delimiterLength: Int) -> Lexer.Result {
-    /*
-    if IsMultilineString && *CurPtr != '\n' && *CurPtr != '\r' {
-      diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
-        .fixItInsert(Lexer::getSourceLoc(CurPtr), "\n")
-    }
-*/
     var error: (LexerError.Kind, Lexer.Cursor)? = nil
 
     while true {
@@ -1971,15 +1969,6 @@ extension Lexer.Cursor {
     if self.input.baseAddress! - tokStart.input.baseAddress! == 1 {
       switch tokStart.peek() {
       case UInt8(ascii: "="):
-        // Refrain from emitting this message in operator name position.
-        //        if (NextToken.isNot(tok::kw_operator) && leftBound != rightBound) {
-        //          auto d = diagnose(TokStart, diag::lex_unary_equal)
-        //          if (leftBound)
-        //            d.fixItInsert(getSourceLoc(TokStart), " ")
-        //          else
-        //            d.fixItInsert(getSourceLoc(TokStart+1), " ")
-        //        }
-        // always emit 'tok::equal' to avoid trickle down parse errors
         return Lexer.Result(.equal)
       case UInt8(ascii: "&"):
         if leftBound == rightBound || leftBound {
@@ -2001,17 +1990,15 @@ extension Lexer.Cursor {
       case (UInt8(ascii: "-"), UInt8(ascii: ">")):  // ->
         return Lexer.Result(.arrow)
       case (UInt8(ascii: "*"), UInt8(ascii: "/")):  // */
-        //        diagnose(TokStart, diag::lex_unexpected_block_comment_end)
-        return Lexer.Result(.unknown)
+        return Lexer.Result(.unknown, error: (.unexpectedBlockCommentEnd, tokStart))
       default:
         break
       }
     } else {
       // Verify there is no "*/" in the middle of the identifier token, we reject
       // it as potentially ending a block comment.
       if tokStart.text(upTo: self).contains("*/") {
-        //        diagnose(TokStart+Pos, diag::lex_unexpected_block_comment_end)
-        return Lexer.Result(.unknown)
+        return Lexer.Result(.unknown, error: (.unexpectedBlockCommentEnd, tokStart))
       }
     }
 
@@ -2102,24 +2089,15 @@ extension Lexer.Cursor {
 
       // Get the next character.
       switch body.lexCharacterInStringLiteral(stringLiteralKind: .singleLine, delimiterLength: 0) {
-      case .error, .endOfString:
+      case .error:
         // If the character was incorrectly encoded, give up.
         return nil
-      case .success(let charValue) where charValue == Unicode.Scalar(UInt8(ascii: #"""#)):
-        // If we found a straight-quote, then we're done.  Just return the spot
+      case .endOfString, .success(Unicode.Scalar(0x201D)):
+        // If we found a closing quote, then we're done.  Just return the spot
         // to continue.
         return body
-      case .validatedEscapeSequence(let charValue) where charValue == Character(Unicode.Scalar(0x0000201D)!):
-        // If we found an ending curly quote (common since this thing started with
-        // an opening curly quote) diagnose it with a fixit and then return.
-        //        if (EmitDiagnostics) {
-        //          diagnose(CharStart, diag::lex_invalid_curly_quote)
-        //              .fixItReplaceChars(getSourceLoc(CharStart), getSourceLoc(Body),
-        //                                 "\"")
-        //        }
-        return body
       default:
-        continue
+        break
       }
     }
   }
@@ -2128,14 +2106,15 @@ extension Lexer.Cursor {
     /// The characters consumed by `lexUnknown` should be classified as trivia
     case trivia
     /// The characters consumed by `lexUnknown` should be classified as the contents of a lexeme
-    case lexemeContents
+    case lexemeContents(Lexer.Result)
   }
 
   /// Assuming the cursor is positioned at neighter a valid identifier nor a
   /// valid operator start, advance the cursor by what can be considered a
   /// lexeme.
   mutating func lexUnknown() -> UnknownCharactersClassification {
     assert(!(self.peekScalar()?.isValidIdentifierStartCodePoint ?? false) && !(self.peekScalar()?.isOperatorStartCodePoint ?? false))
+    let start = self
     var tmp = self
     if tmp.advance(if: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint }) {
       // If this is a valid identifier continuation, but not a valid identifier
@@ -2145,7 +2124,7 @@ extension Lexer.Cursor {
       //      }
       tmp.advance(while: { Unicode.Scalar($0).isValidIdentifierContinuationCodePoint })
       self = tmp
-      return .lexemeContents
+      return .lexemeContents(Lexer.Result(.identifier, error: (.invalidIdentifierStartCharacter, position: start)))
     }
 
     // This character isn't allowed in Swift source.
@@ -2155,8 +2134,7 @@ extension Lexer.Cursor {
       self = tmp
       return .trivia
     }
-    if codepoint.value == 0x000000A0 {
-      // Non-breaking whitespace (U+00A0)
+    if codepoint.value == 0xA0 {  // Non-breaking whitespace (U+00A0)
       while tmp.is(at: 0xC2) && tmp.is(offset: 1, at: 0xA0) {
         _ = tmp.advance()
         _ = tmp.advance()
@@ -2169,33 +2147,23 @@ extension Lexer.Cursor {
       //                           Spaces)
       self = tmp
       return .trivia
-    } else if (codepoint.value == 0x0000201D) {
+    } else if codepoint.value == 0x201D {  // Closing curly quote (U+201D)
       // If this is an end curly quote, just diagnose it with a fixit hint.
-      //      if (EmitDiagnosticsIfToken) {
-      //        diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
-      //            .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), "\"")
-      //      }
       self = tmp
-      return .lexemeContents
-    } else if (codepoint.value == 0x0000201C) {
+      return .lexemeContents(Lexer.Result(.unknown, error: (.unicodeCurlyQuote, position: start)))
+    } else if codepoint.value == 0x201C {  // Opening curly quote (U+201C)
       // If this is a start curly quote, do a fuzzy match of a string literal
       // to improve recovery.
       if let tmp2 = tmp.findEndOfCurlyQuoteStringLiteral() {
         tmp = tmp2
       }
 
-      // Note, we intentionally diagnose the end quote before the start quote,
-      // so that the IDE suggests fixing the end quote before the start quote.
-      // This, in turn, works better with our error recovery because we won't
-      // diagnose an end curly quote in the middle of a straight quoted
-      // literal.
-      //      if (EmitDiagnosticsIfToken) {
-      //        diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
-      //            .fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(EndPtr),
-      //                               "\"")
-      //      }
       self = tmp
-      return .lexemeContents
+
+      // Identifiers are the closest representation of static string literals
+      // we have in the parser. Classify the entire curly string as an identifier
+      // for best recovery.
+      return .lexemeContents(Lexer.Result(.identifier, error: (.unicodeCurlyQuote, position: start)))
     }
 
     //    diagnose(CurPtr - 1, diag::lex_invalid_character)
diff --git a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
@@ -39,14 +39,18 @@ public extension LexerError {
 /// Please order the cases in this enum alphabetically by case name.
 public enum StaticLexerError: String, DiagnosticMessage {
   case expectedBinaryExponentInHexFloatLiteral = "hexadecimal floating point literal must end with an exponent"
-  case excpectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
+  case expectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
   case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
   case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
   case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
+  case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
   case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
   case invalidUtf8 = "invalid UTF-8 found in source file"
   case lexerErrorOffsetOverflow = "the lexer dicovered an error in this token but was not able to represent its offset due to overflow; please split the token"
+  case nonBreakingSpace = "non-breaking space (U+00A0) used instead of regular space"
   case nulCharacter = "nul character embedded in middle of file"
+  case unexpectedBlockCommentEnd = "unexpected end of block comment"
+  case unicodeCurlyQuote = #"unicode curly quote found; use '"' instead"#
 
   public var message: String { self.rawValue }
 
@@ -104,53 +108,65 @@ public extension SwiftSyntax.LexerError {
   /// `tokenText` is the entire text of the token in which the `LexerError`
   /// occurred, including trivia.
   @_spi(RawSyntax)
-  func diagnostic(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
+  func diagnosticMessage(wholeTextBytes: [UInt8]) -> DiagnosticMessage {
     var scalarAtErrorOffset: UnicodeScalar {
       // Fall back to the Unicode replacement character U+FFFD in case we can't
       // lex the unicode character at `byteOffset`. It's the best we can do
       Unicode.Scalar.lexing(from: wholeTextBytes[Int(self.byteOffset)...]) ?? UnicodeScalar("�")
     }
 
     switch self.kind {
-    case .expectedBinaryExponentInHexFloatLiteral:
-      return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
-    case .excpectedClosingBraceInUnicodeEscape:
-      return StaticLexerError.excpectedClosingBraceInUnicodeEscape
-    case .expectedDigitInFloatLiteral:
-      return StaticLexerError.expectedDigitInFloatLiteral
-    case .expectedHexCodeInUnicodeEscape:
-      return StaticLexerError.expectedHexCodeInUnicodeEscape
+    case .expectedBinaryExponentInHexFloatLiteral: return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
+    case .expectedClosingBraceInUnicodeEscape: return StaticLexerError.expectedClosingBraceInUnicodeEscape
+    case .expectedDigitInFloatLiteral: return StaticLexerError.expectedDigitInFloatLiteral
+    case .expectedHexCodeInUnicodeEscape: return StaticLexerError.expectedHexCodeInUnicodeEscape
     case .insufficientIndentationInMultilineStringLiteral:
       // This should be diagnosed when visiting the `StringLiteralExprSyntax`
       // inside `ParseDiagnosticsGenerator` but fall back to an error message
       // here in case the error is not diagnosed.
       return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
-    case .invalidBinaryDigitInIntegerLiteral:
-      return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
-    case .invalidDecimalDigitInIntegerLiteral:
-      return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
-    case .invalidEscapeSequenceInStringLiteral:
-      return StaticLexerError.invalidEscapeSequenceInStringLiteral
-    case .invalidFloatingPointExponentCharacter:
-      return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
-    case .invalidFloatingPointExponentDigit:
-      return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
-    case .invalidHexDigitInIntegerLiteral:
-      return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
-    case .invalidNumberOfHexDigitsInUnicodeEscape:
-      return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
-    case .invalidOctalDigitInIntegerLiteral:
-      return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
-    case .invalidUtf8:
-      return StaticLexerError.invalidUtf8
-    case .lexerErrorOffsetOverflow:
-      return StaticLexerError.lexerErrorOffsetOverflow
-    case .nulCharacter:
-      return StaticLexerError.nulCharacter
+    case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
+    case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
+    case .invalidEscapeSequenceInStringLiteral: return StaticLexerError.invalidEscapeSequenceInStringLiteral
+    case .invalidFloatingPointExponentCharacter: return InvalidFloatingPointExponentDigit(kind: .character(scalarAtErrorOffset))
+    case .invalidFloatingPointExponentDigit: return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
+    case .invalidHexDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
+    case .invalidIdentifierStartCharacter: return StaticLexerError.invalidIdentifierStartCharacter
+    case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
+    case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
+    case .invalidUtf8: return StaticLexerError.invalidUtf8
+    case .lexerErrorOffsetOverflow: return StaticLexerError.lexerErrorOffsetOverflow
+    case .nonBreakingSpace: return StaticLexerError.nonBreakingSpace
+    case .nulCharacter: return StaticLexerError.nulCharacter
+    case .unexpectedBlockCommentEnd: return StaticLexerError.unexpectedBlockCommentEnd
+    case .unicodeCurlyQuote: return StaticLexerError.unicodeCurlyQuote
     }
   }
 
-  func diagnostic(in token: TokenSyntax) -> DiagnosticMessage {
-    return self.diagnostic(wholeTextBytes: token.syntaxTextBytes)
+  func diagnosticMessage(in token: TokenSyntax) -> DiagnosticMessage {
+    return self.diagnosticMessage(wholeTextBytes: token.syntaxTextBytes)
+  }
+
+  func fixIts(in token: TokenSyntax) -> [FixIt] {
+    switch self.kind {
+    case .nonBreakingSpace:
+      return []
+    case .unicodeCurlyQuote:
+      let (rawKind, text) = token.tokenKind.decomposeToRaw()
+      guard let text = text else {
+        return []
+      }
+      let replacedText =
+        text
+        .replaceFirstOccuranceOf("“", with: #"""#)
+        .replaceLastOccuranceOf("”", with: #"""#)
+
+      let fixedToken = token.withKind(TokenKind.fromRaw(kind: rawKind, text: replacedText))
+      return [
+        FixIt(message: .replaceCurlyQuoteByNormalQuote, changes: [[.replace(oldNode: Syntax(token), newNode: Syntax(fixedToken))]])
+      ]
+    default:
+      return []
+    }
   }
 }
diff --git a/Sources/SwiftParserDiagnostics/ParseDiagnosticsGenerator.swift b/Sources/SwiftParserDiagnostics/ParseDiagnosticsGenerator.swift
@@ -347,7 +347,12 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {
       handleMissingToken(token)
     } else {
       if let lexerError = token.lexerError {
-        self.addDiagnostic(token, position: token.position.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))
+        self.addDiagnostic(
+          token,
+          position: token.position.advanced(by: Int(lexerError.byteOffset)),
+          lexerError.diagnosticMessage(in: token),
+          fixIts: lexerError.fixIts(in: token)
+        )
       }
     }
 
diff --git a/Sources/SwiftParserDiagnostics/ParserDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/ParserDiagnosticMessages.swift
@@ -474,6 +474,9 @@ extension FixItMessage where Self == StaticParserFixIt {
   public static var removeOperatorBody: Self {
     .init("remove operator body")
   }
+  public static var replaceCurlyQuoteByNormalQuote: Self {
+    .init(#"replace curly quotes by '"'"#)
+  }
   public static var wrapInBackticks: Self {
     .init("if this name is unavoidable, use backticks to escape it")
   }
diff --git a/Sources/SwiftParserDiagnostics/Utils.swift b/Sources/SwiftParserDiagnostics/Utils.swift
@@ -18,6 +18,20 @@ extension String {
       return self
     }
   }
+
+  func replaceFirstOccuranceOf(_ character: Character, with replacement: Character) -> String {
+    guard let match = self.firstIndex(of: character) else {
+      return self
+    }
+    return self[startIndex..<match] + String(replacement) + self[index(after: match)...]
+  }
+
+  func replaceLastOccuranceOf(_ character: Character, with replacement: Character) -> String {
+    guard let match = self.lastIndex(of: character) else {
+      return self
+    }
+    return self[startIndex..<match] + String(replacement) + self[index(after: match)...]
+  }
 }
 
 extension Collection {
diff --git a/Sources/SwiftSyntax/LexerError.swift b/Sources/SwiftSyntax/LexerError.swift
@@ -18,7 +18,7 @@ public struct LexerError: Hashable {
     // Please order these alphabetically
 
     case expectedBinaryExponentInHexFloatLiteral
-    case excpectedClosingBraceInUnicodeEscape
+    case expectedClosingBraceInUnicodeEscape
     case expectedDigitInFloatLiteral
     case expectedHexCodeInUnicodeEscape
     case insufficientIndentationInMultilineStringLiteral
@@ -28,12 +28,16 @@ public struct LexerError: Hashable {
     case invalidFloatingPointExponentCharacter
     case invalidFloatingPointExponentDigit
     case invalidHexDigitInIntegerLiteral
+    case invalidIdentifierStartCharacter
     case invalidNumberOfHexDigitsInUnicodeEscape
     case invalidOctalDigitInIntegerLiteral
     case invalidUtf8
     /// The lexer dicovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.
     case lexerErrorOffsetOverflow
+    case nonBreakingSpace
     case nulCharacter
+    case unexpectedBlockCommentEnd
+    case unicodeCurlyQuote
   }
 
   public let kind: Kind
diff --git a/Tests/SwiftParserTest/Assertions.swift b/Tests/SwiftParserTest/Assertions.swift
@@ -143,7 +143,7 @@ private func AssertTokens(
       )
     case (let actualError?, let expectedError?):
       AssertStringsEqualWithDiff(
-        actualError.diagnostic(wholeTextBytes: Array(actualLexeme.wholeText)).message,
+        actualError.diagnosticMessage(wholeTextBytes: Array(actualLexeme.wholeText)).message,
         expectedError,
         file: expectedLexeme.file,
         line: expectedLexeme.line
@@ -182,6 +182,7 @@ func AssertLexemes(
   line: UInt = #line
 ) {
   var (markerLocations, source) = extractMarkers(markedSource)
+  markerLocations["START"] = 0
   var expectedLexemes = expectedLexemes
   if expectedLexemes.last?.rawTokenKind != .eof {
     expectedLexemes.append(LexemeSpec(.eof, text: ""))
diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift
diff --git a/Tests/SwiftParserTest/translated/RecoveryTests.swift b/Tests/SwiftParserTest/translated/RecoveryTests.swift

Original file line number	Diff line number	Diff line change
`@@ -347,7 +347,12 @@ public class ParseDiagnosticsGenerator: SyntaxAnyVisitor {`
`347`	`347`	`handleMissingToken(token)`
`348`	`348`	`} else {`
`349`	`349`	`if let lexerError = token.lexerError {`
`350`		`- self.addDiagnostic(token, position: token.position.advanced(by: Int(lexerError.byteOffset)), lexerError.diagnostic(in: token))`
	`350`	`+ self.addDiagnostic(`
	`351`	`+ token,`
	`352`	`+ position: token.position.advanced(by: Int(lexerError.byteOffset)),`
	`353`	`+ lexerError.diagnosticMessage(in: token),`
	`354`	`+ fixIts: lexerError.fixIts(in: token)`
	`355`	`+ )`
`351`	`356`	`}`
`352`	`357`	`}`
`353`	`358`
Original file line number	Diff line number	Diff line change
`@@ -474,6 +474,9 @@ extension FixItMessage where Self == StaticParserFixIt {`
`474`	`474`	`public static var removeOperatorBody: Self {`
`475`	`475`	`.init("remove operator body")`
`476`	`476`	`}`
	`477`	`+ public static var replaceCurlyQuoteByNormalQuote: Self {`
	`478`	`+ .init(#"replace curly quotes by '"'"#)`
	`479`	`+ }`
`477`	`480`	`public static var wrapInBackticks: Self {`
`478`	`481`	`.init("if this name is unavoidable, use backticks to escape it")`
`479`	`482`	`}`