Diagnose invalid characters and escape sequences in string literals

ahoppen · ahoppen · commit f998cb649a7e · 2023-02-02T16:50:59.000+01:00
a
diff --git a/Sources/SwiftParser/Lexer/Cursor.swift b/Sources/SwiftParser/Lexer/Cursor.swift
@@ -1479,7 +1479,7 @@ extension Lexer.Cursor {
     case endOfString
 
-    /// The character could not be lexed because it's not a valid Unicode character.
+    /// The character could not be lexed; the associated value describes the kind of lexer error.
-    case error
+    case error(LexerError.Kind)
   }
 
   /// Lexes a single character in a string literal, handling escape sequences
@@ -1524,10 +1524,8 @@ extension Lexer.Cursor {
         return .success(Unicode.Scalar(character))
       }
     case 0:
-      //      if (EmitDiagnostics)
-      //        diagnose(CurPtr-1, diag::lex_nul_character)
-      let character = self.advance()!
-      return .success(Unicode.Scalar(character))
+      _ = self.advance()
+      return .error(.nulCharacter)
     case UInt8(ascii: "\n"), UInt8(ascii: "\r"):  // String literals cannot have \n or \r in them.
       let character = self.advance()!
       assert(stringLiteralKind == .multiLine, "Caller must handle newlines in non-multiline")
@@ -1538,16 +1536,17 @@ extension Lexer.Cursor {
       if !self.advanceIfStringDelimiter(delimiterLength: delimiterLength) {
         return .success(Unicode.Scalar("\\"))
       }
-      guard let escapedCharacterCode = self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) else {
-        return .error
-      }
-
-      // Check to see if the encoding is valid.
-      guard let validatedScalar = Unicode.Scalar(escapedCharacterCode) else {
-        return .error
+      switch self.lexEscapedCharacter(isMultilineString: stringLiteralKind == .multiLine) {
+      case .success(let escapedCharacterCode):
+        // Check to see if the encoding is valid.
+        if let validatedScalar = Unicode.Scalar(escapedCharacterCode) {
+          return .validatedEscapeSequence(Character(validatedScalar))
+        } else {
+          return .error(.invalidEscapeSequenceInStringLiteral)
+        }
+      case .error(let kind):
+        return .error(kind)
       }
-
-      return .validatedEscapeSequence(Character(validatedScalar))
     default:
       _ = self.advance()
       // Normal characters are part of the string.
@@ -1561,71 +1560,68 @@ extension Lexer.Cursor {
       //      }
       self = charStart
       guard let charValue = self.advanceValidatingUTF8Character() else {
-        //      if (EmitDiagnostics)
-        //        diagnose(CharStart, diag::lex_invalid_utf8)
-        return .error
+        return .error(.invalidUtf8)
       }
       return .success(charValue)
     }
   }
 
+  enum EscapedCharacterLex {
+    // Successfully lexed an escape sequence that represents the Unicode character
+    // at the given codepoint
+    case success(UInt32)
+    case error(LexerError.Kind)
+  }
+
   /// Assuming that we are in a string literal and have already consumed a `\`,
   /// consume the escaped characters and return the Unicode character code
   /// (i.e. UTF-32 value) that the escaped character represents.
   ///
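-  /// If the character is not a valid escape sequence, return `nil`.
+  /// If the character is not a valid escape sequence, return `.error` with the kind of lexer error.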
-  private mutating func lexEscapedCharacter(isMultilineString: Bool) -> UInt32? {
+  private mutating func lexEscapedCharacter(isMultilineString: Bool) -> EscapedCharacterLex {
     assert(self.previous == UInt8(ascii: "\\") || self.previous == UInt8(ascii: "#"))
     // Escape processing.  We already ate the "\".
     switch self.peek() {
     // Simple single-character escapes.
-    case UInt8(ascii: "0"): _ = self.advance(); return UInt32(UInt8(ascii: "\0"))
-    case UInt8(ascii: "n"): _ = self.advance(); return UInt32(UInt8(ascii: "\n"))
-    case UInt8(ascii: "r"): _ = self.advance(); return UInt32(UInt8(ascii: "\r"))
-    case UInt8(ascii: "t"): _ = self.advance(); return UInt32(UInt8(ascii: "\t"))
-    case UInt8(ascii: #"""#): _ = self.advance(); return UInt32(UInt8(ascii: #"""#))
-    case UInt8(ascii: "'"): _ = self.advance(); return UInt32(UInt8(ascii: "'"))
-    case UInt8(ascii: "\\"): _ = self.advance(); return UInt32(UInt8(ascii: "\\"))
+    case UInt8(ascii: "0"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\0")))
+    case UInt8(ascii: "n"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\n")))
+    case UInt8(ascii: "r"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\r")))
+    case UInt8(ascii: "t"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\t")))
+    case UInt8(ascii: #"""#): _ = self.advance(); return .success(UInt32(UInt8(ascii: #"""#)))
+    case UInt8(ascii: "'"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "'")))
+    case UInt8(ascii: "\\"): _ = self.advance(); return .success(UInt32(UInt8(ascii: "\\")))
 
     case UInt8(ascii: "u"):  // e.g. \u{1234}
       _ = self.advance()
 
       guard self.is(at: "{") else {
-        //        if (EmitDiagnostics)
-        //          diagnose(CurPtr-1, diag::lex_unicode_escape_braces)
-        return nil
+        return .error(.expectedHexCodeInUnicodeEscape)
       }
 
-      guard let cv = self.lexUnicodeEscape() else {
-        return nil
-      }
-      return cv
-
+      return self.lexUnicodeEscape()
     case UInt8(ascii: "\n"), UInt8(ascii: "\r"):
       if isMultilineString && self.maybeConsumeNewlineEscape() {
-        return UInt32(UInt8(ascii: "\n"))
+        return .success(UInt32(UInt8(ascii: "\n")))
       }
-      return nil
+      return .error(.invalidEscapeSequenceInStringLiteral)
     case nil:
-      return nil
+      return .error(.invalidEscapeSequenceInStringLiteral)
     case .some(let peekedValue):  // Invalid escape.
-      //     if (EmitDiagnostics)
-      //       diagnose(CurPtr, diag::lex_invalid_escape)
       // If this looks like a plausible escape character, recover as though this
       // is an invalid escape.
       let c = Unicode.Scalar(peekedValue)
       if c.isDigit || c.isLetter {
         _ = self.advance()
       }
-      return nil
+      return .error(.invalidEscapeSequenceInStringLiteral)
     }
   }
 
   /// Lex the contents of a `\u{1234}` escape sequence, assuming that we are
   /// placed at the opening `{`.
   ///
-  /// If this is not a valid unicode escape, return `nil`.
+  /// If this is not a valid unicode escape, return `.error` with the kind of lexer error.
-  private mutating func lexUnicodeEscape() -> UInt32? {
+  private mutating func lexUnicodeEscape() -> EscapedCharacterLex {
     let quoteConsumed = self.advance(matching: "{")
     assert(quoteConsumed)
 
@@ -1636,18 +1632,18 @@ extension Lexer.Cursor {
     }
 
     guard self.advance(matching: "}") else {
-      //      if (Diags)
-      //        Diags->diagnose(CurPtr, diag::lex_invalid_u_escape_rbrace)
-      return nil
+      return .error(.excpectedClosingBraceInUnicodeEscape)
     }
 
     if numDigits == 0 || numDigits > 8 {
-      //      if (Diags)
-      //        Diags->diagnose(CurPtr, diag::lex_invalid_u_escape)
-      return nil
+      return .error(.invalidNumberOfHexDigitsInUnicodeEscape)
     }
 
-    return UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16)
+    if let codePoint = UInt32(String(decoding: digitStart.input[0..<numDigits], as: UTF8.self), radix: 16) {
+      return .success(codePoint)
+    } else {
+      return .error(.invalidEscapeSequenceInStringLiteral)
+    }
   }
 
   private mutating func maybeConsumeNewlineEscape() -> Bool {
@@ -1835,8 +1831,8 @@ extension Lexer.Cursor {
           // validate the multi-line string literal's indentation.
           return Lexer.Result(.stringSegment, error: error)
         }
-      case .error:
-        error = (.invalidEscapeSequenceInStringLiteral, self)
+      case .error(let errorKind):
+        error = (errorKind, self)
         self = clone
       case .endOfString:
         return Lexer.Result(
diff --git a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
@@ -39,9 +39,14 @@ public extension LexerError {
 /// Please order the cases in this enum alphabetically by case name.
 public enum StaticLexerError: String, DiagnosticMessage {
   case expectedBinaryExponentInHexFloatLiteral = "hexadecimal floating point literal must end with an exponent"
+  case excpectedClosingBraceInUnicodeEscape = #"expected '}' in \u{...} escape sequence"#
   case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
+  case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
   case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
+  case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
+  case invalidUtf8 = "invalid UTF-8 found in source file"
   case lexerErrorOffsetOverflow = "the lexer dicovered an error in this token but was not able to represent its offset due to overflow; please split the token"
+  case nulCharacter = "nul character embedded in middle of file"
 
   public var message: String { self.rawValue }
 
@@ -109,8 +114,12 @@ public extension SwiftSyntax.LexerError {
     switch self.kind {
     case .expectedBinaryExponentInHexFloatLiteral:
       return StaticLexerError.expectedBinaryExponentInHexFloatLiteral
+    case .excpectedClosingBraceInUnicodeEscape:
+      return StaticLexerError.excpectedClosingBraceInUnicodeEscape
     case .expectedDigitInFloatLiteral:
       return StaticLexerError.expectedDigitInFloatLiteral
+    case .expectedHexCodeInUnicodeEscape:
+      return StaticLexerError.expectedHexCodeInUnicodeEscape
     case .insufficientIndentationInMultilineStringLiteral:
       // This should be diagnosed when visiting the `StringLiteralExprSyntax`
       // inside `ParseDiagnosticsGenerator` but fall back to an error message
@@ -128,10 +137,16 @@ public extension SwiftSyntax.LexerError {
       return InvalidFloatingPointExponentDigit(kind: .digit(scalarAtErrorOffset))
     case .invalidHexDigitInIntegerLiteral:
       return InvalidDigitInIntegerLiteral(kind: .hex(scalarAtErrorOffset))
+    case .invalidNumberOfHexDigitsInUnicodeEscape:
+      return StaticLexerError.invalidNumberOfHexDigitsInUnicodeEscape
     case .invalidOctalDigitInIntegerLiteral:
       return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
+    case .invalidUtf8:
+      return StaticLexerError.invalidUtf8
     case .lexerErrorOffsetOverflow:
       return StaticLexerError.lexerErrorOffsetOverflow
+    case .nulCharacter:
+      return StaticLexerError.nulCharacter
     }
   }
 
diff --git a/Sources/SwiftSyntax/LexerError.swift b/Sources/SwiftSyntax/LexerError.swift
@@ -18,17 +18,22 @@ public struct LexerError: Hashable {
     // Please order these alphabetically
 
     case expectedBinaryExponentInHexFloatLiteral
+    case excpectedClosingBraceInUnicodeEscape
     case expectedDigitInFloatLiteral
+    case expectedHexCodeInUnicodeEscape
     case insufficientIndentationInMultilineStringLiteral
     case invalidBinaryDigitInIntegerLiteral
     case invalidDecimalDigitInIntegerLiteral
     case invalidEscapeSequenceInStringLiteral
     case invalidFloatingPointExponentCharacter
     case invalidFloatingPointExponentDigit
     case invalidHexDigitInIntegerLiteral
+    case invalidNumberOfHexDigitsInUnicodeEscape
     case invalidOctalDigitInIntegerLiteral
+    case invalidUtf8
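-    /// The lexer dicovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.
+    /// The lexer discovered an error but was not able to represent the offset of the error because it would overflow `LexerErrorOffset`.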
     case lexerErrorOffsetOverflow
+    case nulCharacter
   }
 
   public let kind: Kind
diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift