Skip to content

Commit ed79bf5

Browse files
authored
Merge pull request #2857 from allevato/rich-identifiers
Parse raw identifiers.
2 parents 82eb5b9 + 6c7fc5a commit ed79bf5

File tree

7 files changed

+331
-14
lines changed

7 files changed

+331
-14
lines changed

Sources/SwiftParser/Lexer/Cursor.swift

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2025,14 +2025,57 @@ extension Lexer.Cursor {
20252025
// Check whether we have an identifier followed by another backtick, in which
20262026
// case this is an escaped identifier.
20272027
let identifierStart = self
2028-
if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
2029-
// Keep continuing the identifier.
2030-
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
20312028

2032-
// If we have the terminating "`", it's an escaped identifier.
2033-
if self.advance(matching: "`") {
2034-
return Lexer.Result(.identifier)
2029+
// Scan until we see either a closing backtick or the end of the line. Do
2030+
// additional validation for raw identifiers along the way; if we see
2031+
// characters that aren't allowed (prohibited whitespace or unprintable
2032+
// ASCII characters) or if the identifier is an operator, provide a more
2033+
// precise diagnostic and location, but otherwise keep trying to tokenize
2034+
// it as a raw identifier as long as we see the closing backtick because
2035+
// it more likely represents what the user was trying to do.
2036+
var hasNonOperatorCharacter = false
2037+
var hasNonWhitespaceCharacter = false
2038+
var isEmpty = true
2039+
var error: LexingDiagnostic? = nil
2040+
while true {
2041+
let ch = self.peek()
2042+
if ch == nil || ch == "`" || ch == "\n" || ch == "\r" {
2043+
break
2044+
}
2045+
let position = self
2046+
guard let scalar = self.advanceValidatingUTF8Character() else {
2047+
error = LexingDiagnostic(.invalidUtf8, position: position)
2048+
continue
2049+
}
2050+
if error == nil {
2051+
if scalar.isForbiddenRawIdentifierWhitespace {
2052+
error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position)
2053+
} else if scalar == "\\" {
2054+
error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position)
2055+
} else if scalar.isASCII && !scalar.isPrintableASCII {
2056+
error = LexingDiagnostic(.unprintableAsciiCharacter, position: position)
2057+
}
2058+
}
2059+
if !scalar.isPermittedRawIdentifierWhitespace {
2060+
hasNonWhitespaceCharacter = true
2061+
}
2062+
if (isEmpty && !scalar.isOperatorStartCodePoint) || !scalar.isOperatorContinuationCodePoint {
2063+
hasNonOperatorCharacter = true
2064+
}
2065+
isEmpty = false
2066+
}
2067+
2068+
// If we have the terminating "`", lex it as an escaped/raw identifier. If it
2069+
// is empty, all whitespace, all operator characters, or otherwise invalid, attach an error.
2070+
if self.advance(matching: "`") {
2071+
if isEmpty {
2072+
error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
2073+
} else if error == nil && !hasNonWhitespaceCharacter {
2074+
error = LexingDiagnostic(.rawIdentifierCannotBeAllWhitespace, position: quote)
2075+
} else if error == nil && !hasNonOperatorCharacter {
2076+
error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
20352077
}
2078+
return Lexer.Result(.identifier, error: error)
20362079
}
20372080

20382081
// Special case; allow '`$`'.

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,31 @@ extension Unicode.Scalar {
9393
return true
9494
}
9595

96+
var isForbiddenRawIdentifierWhitespace: Bool {
97+
let c = self.value
98+
// This is every code point with the Unicode `White_Space` property except
99+
// U+0020 (space). U+200E/200F (LTR/RTL marks), which have the
100+
// `Pattern_White_Space` property but not `White_Space`, are not listed
101+
// here either. In other words, the only whitespace code points allowed in
102+
// a raw identifier are U+0020 and U+200E/200F.
103+
return (c >= 0x0009 && c <= 0x000D) as Bool
104+
|| (c == 0x0085) as Bool
105+
|| (c == 0x00A0) as Bool
106+
|| (c == 0x1680) as Bool
107+
|| (c >= 0x2000 && c <= 0x200A) as Bool
108+
|| (c >= 0x2028 && c <= 0x2029) as Bool
109+
|| (c == 0x202F) as Bool
110+
|| (c == 0x205F) as Bool
111+
|| (c == 0x3000) as Bool
112+
}
113+
114+
var isPermittedRawIdentifierWhitespace: Bool {
115+
let c = self.value
116+
return (c == 0x0020) as Bool
117+
|| (c == 0x200E) as Bool
118+
|| (c == 0x200F) as Bool
119+
}
120+
96121
/// isOperatorStartCodePoint - Return true if the specified code point is a
97122
/// valid start of an operator.
98123
var isOperatorStartCodePoint: Bool {

Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,16 @@ public enum StaticTokenError: String, DiagnosticMessage {
7171
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
7272
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
7373
case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
74+
case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash"
7475
case invalidCharacter = "invalid character in source file"
7576
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
7677
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
7778
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
7879
case invalidUtf8 = "invalid UTF-8 found in source file"
80+
case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier"
81+
case rawIdentifierCannotBeAllWhitespace = "a raw identifier cannot contain only whitespace characters"
82+
case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
83+
case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
7984
case tokenDiagnosticOffsetOverflow =
8085
"the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
8186
case sourceConflictMarker = "source control conflict marker in source file"
@@ -211,6 +216,7 @@ extension SwiftSyntax.TokenDiagnostic {
211216
// inside `ParseDiagnosticsGenerator` but fall back to an error message
212217
// here in case the error is not diagnosed.
213218
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
219+
case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier
214220
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
215221
case .invalidCharacter: return StaticTokenError.invalidCharacter
216222
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
@@ -223,9 +229,14 @@ extension SwiftSyntax.TokenDiagnostic {
223229
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape
224230
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
225231
case .invalidUtf8: return StaticTokenError.invalidUtf8
232+
case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier
226233
case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
227234
case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
228235
case .nulCharacter: return StaticTokenWarning.nulCharacter
236+
case .rawIdentifierCannotBeAllWhitespace: return StaticTokenError.rawIdentifierCannotBeAllWhitespace
237+
case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
238+
case .rawIdentifierCannotBeOperator:
239+
return StaticTokenError.rawIdentifierCannotBeOperator
229240
case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
230241
case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
231242
case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral

Sources/SwiftSyntax/TokenDiagnostic.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
3535
case extraneousTrailingWhitespaceError
3636
case extraneousTrailingWhitespaceWarning
3737
case insufficientIndentationInMultilineStringLiteral
38+
case invalidBackslashInRawIdentifier
3839
case invalidBinaryDigitInIntegerLiteral
3940
case invalidCharacter
4041
case invalidDecimalDigitInIntegerLiteral
@@ -46,9 +47,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
4647
case invalidNumberOfHexDigitsInUnicodeEscape
4748
case invalidOctalDigitInIntegerLiteral
4849
case invalidUtf8
50+
case invalidWhitespaceInRawIdentifier
4951
case multilineRegexClosingNotOnNewline
5052
case nonBreakingSpace
5153
case nulCharacter
54+
case rawIdentifierCannotBeAllWhitespace
55+
case rawIdentifierCannotBeEmpty
56+
case rawIdentifierCannotBeOperator
5257
case sourceConflictMarker
5358
case spaceAtEndOfRegexLiteral
5459
case spaceAtStartOfRegexLiteral
@@ -74,6 +79,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
7479
case .extraneousTrailingWhitespaceError: return .error
7580
case .extraneousTrailingWhitespaceWarning: return .warning
7681
case .insufficientIndentationInMultilineStringLiteral: return .error
82+
case .invalidBackslashInRawIdentifier: return .error
7783
case .invalidBinaryDigitInIntegerLiteral: return .error
7884
case .invalidCharacter: return .error
7985
case .invalidDecimalDigitInIntegerLiteral: return .error
@@ -85,9 +91,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
8591
case .invalidNumberOfHexDigitsInUnicodeEscape: return .error
8692
case .invalidOctalDigitInIntegerLiteral: return .error
8793
case .invalidUtf8: return .error
94+
case .invalidWhitespaceInRawIdentifier: return .error
8895
case .multilineRegexClosingNotOnNewline: return .error
8996
case .nonBreakingSpace: return .warning
9097
case .nulCharacter: return .warning
98+
case .rawIdentifierCannotBeAllWhitespace: return .error
99+
case .rawIdentifierCannotBeEmpty: return .error
100+
case .rawIdentifierCannotBeOperator: return .error
91101
case .sourceConflictMarker: return .error
92102
case .spaceAtEndOfRegexLiteral: return .error
93103
case .spaceAtStartOfRegexLiteral: return .error

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {
7676

7777
func testEscapedIdentifiers() {
7878
assertLexemes(
79-
"`Hello` `World` `$`",
79+
"`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
8080
lexemes: [
8181
LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
8282
LexemeSpec(.identifier, text: "`World`", trailing: " "),
83-
LexemeSpec(.identifier, text: "`$`"),
83+
LexemeSpec(.identifier, text: "`$`", trailing: " "),
84+
LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
85+
LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
86+
LexemeSpec(.identifier, text: "`123`"),
8487
]
8588
)
8689
}
@@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase {
11731176
}
11741177
}
11751178

1179+
func testInvalidUtf8_4() {
1180+
let sourceBytes: [UInt8] = [0x60, 0x41, 0xfd, 0x60] // 0x41 == "A", 0x60 == "`"
1181+
1182+
lex(sourceBytes) { lexemes in
1183+
guard lexemes.count == 2 else {
1184+
return XCTFail("Expected 2 lexemes, got \(lexemes.count)")
1185+
}
1186+
assertRawBytesLexeme(
1187+
lexemes[0],
1188+
kind: .identifier,
1189+
leadingTrivia: [],
1190+
text: [0x60, 0x41, 0xfd, 0x60],
1191+
error: TokenDiagnostic(.invalidUtf8, byteOffset: 2)
1192+
)
1193+
}
1194+
}
1195+
11761196
func testUTF16Surrogates1() {
11771197
// U+D800 <= (UTF16 surrogates code point) <= U+DFFF
11781198
let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80] // The bytes represent the code point U+D800

Tests/SwiftParserTest/translated/DollarIdentifierTests.swift

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
214214
assertParse(
215215
"""
216216
func escapedDollarAnd() {
217-
1️⃣`$0` = 1
217+
`$0` = 1
218218
`$$` = 2
219219
`$abc` = 3
220220
}
221-
""",
222-
diagnostics: [
223-
// FIXME: Bad diagnostic
224-
DiagnosticSpec(message: "unexpected code in function")
225-
]
221+
"""
226222
)
227223
}
228224

0 commit comments

Comments
 (0)