Skip to content

Commit 1d0edfb

Browse files
committed
Parse raw identifiers.
This is the parser implementation for SE-0451.
1 parent b721fa8 commit 1d0edfb

File tree

7 files changed

+299
-14
lines changed

7 files changed

+299
-14
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2025,14 +2025,51 @@ extension Lexer.Cursor {
20252025
// Check whether we have an identifier followed by another backtick, in which
20262026
// case this is an escaped identifier.
20272027
let identifierStart = self
2028-
if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
2029-
// Keep continuing the identifier.
2030-
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
20312028

2032-
// If we have the terminating "`", it's an escaped identifier.
2033-
if self.advance(matching: "`") {
2034-
return Lexer.Result(.identifier)
2029+
// Scan until we see either a closing backtick or the end of the line. Do
2030+
// additional validation for raw identifiers along the way; if we see
2031+
// characters that aren't allowed (prohibited whitespace or unprintable
2032+
// ASCII characters) or if the identifier is an operator, provide a more
2033+
// precise diagnostic and location, but otherwise keep trying to tokenize
2034+
// it as a raw identifier as long as we see the closing backtick because
2035+
// it more likely represents what the user was trying to do.
2036+
var hasNonOperatorCharacter = false
2037+
var isEmpty = true
2038+
var error: LexingDiagnostic? = nil
2039+
while true {
2040+
let ch = self.peek()
2041+
if ch == nil || ch == "`" || ch == "\n" || ch == "\r" {
2042+
break
2043+
}
2044+
let position = self
2045+
guard let scalar = self.advanceValidatingUTF8Character() else {
2046+
error = LexingDiagnostic(.invalidUtf8, position: position)
2047+
continue
2048+
}
2049+
if error == nil {
2050+
if scalar.isForbiddenRawIdentifierWhitespace {
2051+
error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position)
2052+
} else if scalar == "\\" {
2053+
error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position)
2054+
} else if scalar.isASCII && !scalar.isPrintableASCII {
2055+
error = LexingDiagnostic(.unprintableAsciiCharacter, position: position)
2056+
}
2057+
}
2058+
if isEmpty && !scalar.isOperatorStartCodePoint || !scalar.isOperatorContinuationCodePoint {
2059+
hasNonOperatorCharacter = true
2060+
}
2061+
isEmpty = false
2062+
}
2063+
2064+
// If we have the terminating "`", it's an escaped/raw identifier, unless
2065+
// it contained only operator characters or had other invalid elements.
2066+
if self.advance(matching: "`") {
2067+
if isEmpty {
2068+
error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
2069+
} else if error == nil && !hasNonOperatorCharacter {
2070+
error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
20352071
}
2072+
return Lexer.Result(.identifier, error: error)
20362073
}
20372074

20382075
// Special case; allow '`$`'.

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,24 @@ extension Unicode.Scalar {
9393
return true
9494
}
9595

96+
var isForbiddenRawIdentifierWhitespace: Bool {
97+
let c = self.value
98+
// True for whitespace that may not appear in a raw identifier: every code
99+
// point with the Unicode `White_Space` property except U+0020 (space).
100+
// U+200E/U+200F (LTR/RTL marks) have `Pattern_White_Space` but not
101+
// `White_Space`, so they are not listed here and stay allowed. Non-whitespace
102+
// ASCII control characters are not matched by this property.
103+
return (c >= 0x0009 && c <= 0x000D) as Bool
104+
|| (c == 0x0085) as Bool
105+
|| (c == 0x00A0) as Bool
106+
|| (c == 0x1680) as Bool
107+
|| (c >= 0x2000 && c <= 0x200A) as Bool
108+
|| (c >= 0x2028 && c <= 0x2029) as Bool
109+
|| (c == 0x202F) as Bool
110+
|| (c == 0x205F) as Bool
111+
|| (c == 0x3000) as Bool
112+
}
113+
96114
/// isOperatorStartCodePoint - Return true if the specified code point is a
97115
/// valid start of an operator.
98116
var isOperatorStartCodePoint: Bool {

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,15 @@ public enum StaticTokenError: String, DiagnosticMessage {
7171
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
7272
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
7373
case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
74+
case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash"
7475
case invalidCharacter = "invalid character in source file"
7576
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
7677
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
7778
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
7879
case invalidUtf8 = "invalid UTF-8 found in source file"
80+
case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier"
81+
case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
82+
case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
7983
case tokenDiagnosticOffsetOverflow =
8084
"the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
8185
case sourceConflictMarker = "source control conflict marker in source file"
@@ -211,6 +215,7 @@ extension SwiftSyntax.TokenDiagnostic {
211215
// inside `ParseDiagnosticsGenerator` but fall back to an error message
212216
// here in case the error is not diagnosed.
213217
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
218+
case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier
214219
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
215220
case .invalidCharacter: return StaticTokenError.invalidCharacter
216221
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
@@ -223,9 +228,13 @@ extension SwiftSyntax.TokenDiagnostic {
223228
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape
224229
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
225230
case .invalidUtf8: return StaticTokenError.invalidUtf8
231+
case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier
226232
case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
227233
case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
228234
case .nulCharacter: return StaticTokenWarning.nulCharacter
235+
case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
236+
case .rawIdentifierCannotBeOperator:
237+
return StaticTokenError.rawIdentifierCannotBeOperator
229238
case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
230239
case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
231240
case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral

Sources/SwiftSyntax/TokenDiagnostic.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
3535
case extraneousTrailingWhitespaceError
3636
case extraneousTrailingWhitespaceWarning
3737
case insufficientIndentationInMultilineStringLiteral
38+
case invalidBackslashInRawIdentifier
3839
case invalidBinaryDigitInIntegerLiteral
3940
case invalidCharacter
4041
case invalidDecimalDigitInIntegerLiteral
@@ -46,9 +47,12 @@ public struct TokenDiagnostic: Hashable, Sendable {
4647
case invalidNumberOfHexDigitsInUnicodeEscape
4748
case invalidOctalDigitInIntegerLiteral
4849
case invalidUtf8
50+
case invalidWhitespaceInRawIdentifier
4951
case multilineRegexClosingNotOnNewline
5052
case nonBreakingSpace
5153
case nulCharacter
54+
case rawIdentifierCannotBeEmpty
55+
case rawIdentifierCannotBeOperator
5256
case sourceConflictMarker
5357
case spaceAtEndOfRegexLiteral
5458
case spaceAtStartOfRegexLiteral
@@ -74,6 +78,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
7478
case .extraneousTrailingWhitespaceError: return .error
7579
case .extraneousTrailingWhitespaceWarning: return .warning
7680
case .insufficientIndentationInMultilineStringLiteral: return .error
81+
case .invalidBackslashInRawIdentifier: return .error
7782
case .invalidBinaryDigitInIntegerLiteral: return .error
7883
case .invalidCharacter: return .error
7984
case .invalidDecimalDigitInIntegerLiteral: return .error
@@ -85,9 +90,12 @@ public struct TokenDiagnostic: Hashable, Sendable {
8590
case .invalidNumberOfHexDigitsInUnicodeEscape: return .error
8691
case .invalidOctalDigitInIntegerLiteral: return .error
8792
case .invalidUtf8: return .error
93+
case .invalidWhitespaceInRawIdentifier: return .error
8894
case .multilineRegexClosingNotOnNewline: return .error
8995
case .nonBreakingSpace: return .warning
9096
case .nulCharacter: return .warning
97+
case .rawIdentifierCannotBeEmpty: return .error
98+
case .rawIdentifierCannotBeOperator: return .error
9199
case .sourceConflictMarker: return .error
92100
case .spaceAtEndOfRegexLiteral: return .error
93101
case .spaceAtStartOfRegexLiteral: return .error

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {
7676

7777
func testEscapedIdentifiers() {
7878
assertLexemes(
79-
"`Hello` `World` `$`",
79+
"`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
8080
lexemes: [
8181
LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
8282
LexemeSpec(.identifier, text: "`World`", trailing: " "),
83-
LexemeSpec(.identifier, text: "`$`"),
83+
LexemeSpec(.identifier, text: "`$`", trailing: " "),
84+
LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
85+
LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
86+
LexemeSpec(.identifier, text: "`123`"),
8487
]
8588
)
8689
}
@@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase {
11731176
}
11741177
}
11751178

1179+
func testInvalidUtf8_4() {
1180+
let sourceBytes: [UInt8] = [0x60, 0x41, 0xfd, 0x60] // 0x60 == "`", 0x41 == "A"; 0xfd (byte offset 2) is the invalid UTF-8 byte
1181+
1182+
lex(sourceBytes) { lexemes in
1183+
guard lexemes.count == 2 else {
1184+
return XCTFail("Expected 2 lexemes, got \(lexemes.count)")
1185+
}
1186+
assertRawBytesLexeme(
1187+
lexemes[0],
1188+
kind: .identifier,
1189+
leadingTrivia: [],
1190+
text: [0x60, 0x41, 0xfd, 0x60],
1191+
error: TokenDiagnostic(.invalidUtf8, byteOffset: 2)
1192+
)
1193+
}
1194+
}
1195+
11761196
func testUTF16Surrogates1() {
11771197
// U+D800 <= (UTF16 surrogates code point) <= U+DFFF
11781198
let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80] // The bytes represent the code point U+D800

Tests/SwiftParserTest/translated/DollarIdentifierTests.swift

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
214214
assertParse(
215215
"""
216216
func escapedDollarAnd() {
217-
1️⃣`$0` = 1
217+
`$0` = 1
218218
`$$` = 2
219219
`$abc` = 3
220220
}
221-
""",
222-
diagnostics: [
223-
// FIXME: Bad diagnostic
224-
DiagnosticSpec(message: "unexpected code in function")
225-
]
221+
"""
226222
)
227223
}
228224

0 commit comments

Comments
 (0)