Skip to content

Commit 4944fbe

Browse files
committed
Lex extended pound delimiters
Start lexing `/.../`, and allow any number of pound signs to surround it.
1 parent 120ffc9 commit 4944fbe

File tree

3 files changed

+143
-46
lines changed

3 files changed

+143
-46
lines changed

Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift

Lines changed: 116 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -11,34 +11,65 @@
1111

1212
// TODO: mock up multi-line soon
1313

14-
enum Delimiter: Hashable, CaseIterable {
15-
case traditional
16-
case experimental
17-
case reSingleQuote
18-
case rxSingleQuote
19-
20-
var openingAndClosing: (opening: String, closing: String) {
21-
switch self {
22-
case .traditional: return ("#/", "/#")
23-
case .experimental: return ("#|", "|#")
24-
case .reSingleQuote: return ("re'", "'")
25-
case .rxSingleQuote: return ("rx'", "'")
26-
}
14+
struct Delimiter: Hashable {
15+
let kind: Kind
16+
let poundCount: Int
17+
18+
init(_ kind: Kind, poundCount: Int) {
19+
precondition(kind.allowsExtendedPoundSyntax || poundCount == 0)
20+
self.kind = kind
21+
self.poundCount = poundCount
22+
}
23+
24+
var opening: String {
25+
String(repeating: "#", count: poundCount) + kind.opening
26+
}
27+
var closing: String {
28+
kind.closing + String(repeating: "#", count: poundCount)
2729
}
28-
var opening: String { openingAndClosing.opening }
29-
var closing: String { openingAndClosing.closing }
3030

3131
/// The default set of syntax options that the delimiter indicates.
3232
var defaultSyntaxOptions: SyntaxOptions {
33-
switch self {
34-
case .traditional, .reSingleQuote:
33+
switch kind {
34+
case .forwardSlash, .reSingleQuote:
3535
return .traditional
3636
case .experimental, .rxSingleQuote:
3737
return .experimental
3838
}
3939
}
4040
}
4141

42+
extension Delimiter {
43+
enum Kind: Hashable, CaseIterable {
44+
case forwardSlash
45+
case experimental
46+
case reSingleQuote
47+
case rxSingleQuote
48+
49+
var openingAndClosing: (opening: String, closing: String) {
50+
switch self {
51+
case .forwardSlash: return ("/", "/")
52+
case .experimental: return ("#|", "|#")
53+
case .reSingleQuote: return ("re'", "'")
54+
case .rxSingleQuote: return ("rx'", "'")
55+
}
56+
}
57+
var opening: String { openingAndClosing.opening }
58+
var closing: String { openingAndClosing.closing }
59+
60+
/// Whether or not extended pound syntax, e.g. `##/.../##`, is allowed with
61+
/// this delimiter.
62+
var allowsExtendedPoundSyntax: Bool {
63+
switch self {
64+
case .forwardSlash:
65+
return true
66+
case .experimental, .reSingleQuote, .rxSingleQuote:
67+
return false
68+
}
69+
}
70+
}
71+
}
72+
4273
struct DelimiterLexError: Error, CustomStringConvertible {
4374
enum Kind: Hashable {
4475
case unterminated
@@ -120,25 +151,34 @@ fileprivate struct DelimiterLexer {
120151
precondition(cursor <= end, "Cannot advance past end")
121152
}
122153

123-
/// Check to see if a UTF-8 sequence can be eaten from the current cursor.
124-
func canEat(_ utf8: String.UTF8View) -> Bool {
125-
guard let slice = slice(utf8.count) else { return false }
126-
return slice.elementsEqual(utf8)
154+
/// Check to see if a byte sequence can be eaten from the current cursor.
155+
func canEat<C : Collection>(_ bytes: C) -> Bool where C.Element == UInt8 {
156+
guard let slice = slice(bytes.count) else { return false }
157+
return slice.elementsEqual(bytes)
158+
}
159+
160+
/// Attempt to eat a byte sequence, returning `true` if successful.
161+
mutating func tryEat<C : Collection>(
162+
_ bytes: C
163+
) -> Bool where C.Element == UInt8 {
164+
guard canEat(bytes) else { return false }
165+
advanceCursor(bytes.count)
166+
return true
127167
}
128168

129-
/// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
130-
mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
131-
guard canEat(utf8) else { return false }
132-
advanceCursor(utf8.count)
169+
/// Attempt to eat an ASCII scalar, returning `true` if successful.
170+
mutating func tryEat(ascii s: Unicode.Scalar) -> Bool {
171+
guard load() == ascii(s) else { return false }
172+
advanceCursor()
133173
return true
134174
}
135175

136176
/// Attempt to skip over a closing delimiter character that is unlikely to be
137177
/// the actual closing delimiter.
138178
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
139179
// Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
140-
switch delimiter {
141-
case .traditional, .experimental:
180+
switch delimiter.kind {
181+
case .forwardSlash, .experimental:
142182
return
143183
case .reSingleQuote, .rxSingleQuote:
144184
break
@@ -272,16 +312,42 @@ fileprivate struct DelimiterLexer {
272312
}
273313
}
274314

315+
mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
316+
for kind in Delimiter.Kind.allCases {
317+
// If the delimiter allows extended pound syntax, or there are no pounds,
318+
// we just need to lex it.
319+
let opening = kind.opening.utf8
320+
if kind.allowsExtendedPoundSyntax || poundCount == 0 {
321+
guard tryEat(opening) else { continue }
322+
return Delimiter(kind, poundCount: poundCount)
323+
}
324+
325+
// The delimiter doesn't allow extended pound syntax, so the pounds must be
326+
// part of the delimiter.
327+
guard
328+
poundCount < opening.count,
329+
opening.prefix(poundCount)
330+
.elementsEqual(repeatElement(ascii("#"), count: poundCount)),
331+
tryEat(opening.dropFirst(poundCount))
332+
else { continue }
333+
334+
return Delimiter(kind, poundCount: 0)
335+
}
336+
return nil
337+
}
338+
275339
/*consuming*/ mutating func lex(
276340
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
341+
// We can consume any number of pound signs.
342+
var poundCount = 0
343+
while tryEat(ascii: "#") {
344+
poundCount += 1
345+
}
277346

278347
// Try to lex the opening delimiter.
279-
guard let delimiter = Delimiter.allCases.first(
280-
where: { tryEat($0.opening.utf8) }
281-
) else {
348+
guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else {
282349
throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
283350
}
284-
285351
let contentsStart = cursor
286352
while true {
287353
// Check to see if we're at a character that looks like a delimiter, but
@@ -302,20 +368,34 @@ fileprivate struct DelimiterLexer {
302368
/// Drop a set of regex delimiters from the input string, returning the contents
303369
/// and the delimiter used. The input string must have valid delimiters.
304370
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
305-
func stripDelimiter(_ delim: Delimiter) -> String? {
371+
func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? {
372+
var slice = str.utf8[...]
373+
374+
// Try to strip any number of opening '#'s.
375+
var poundCount = 0
376+
if kind.allowsExtendedPoundSyntax {
377+
poundCount = slice.prefix(while: {
378+
$0 == UInt8(("#" as UnicodeScalar).value)
379+
}).count
380+
slice = slice.dropFirst(poundCount)
381+
}
382+
306383
// The opening delimiter must match.
307-
guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
384+
guard var slice = slice.tryDropPrefix(kind.opening.utf8)
308385
else { return nil }
309386

310387
// The closing delimiter may optionally match, as it may not be present in
311388
// invalid code.
389+
let delim = Delimiter(kind, poundCount: poundCount)
312390
if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
313391
slice = newSlice
314392
}
315-
return String(slice)
393+
let result = String(decoding: slice, as: UTF8.self)
394+
precondition(result.utf8.elementsEqual(slice))
395+
return (result, delim)
316396
}
317-
for d in Delimiter.allCases {
318-
if let contents = stripDelimiter(d) {
397+
for kind in Delimiter.Kind.allCases {
398+
if let (contents, d) = stripDelimiter(kind) {
319399
return (contents, d)
320400
}
321401
}

Tests/RegexTests/LexTests.swift

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -101,26 +101,31 @@ extension RegexTests {
101101

102102

103103
func testCompilerInterface() {
104+
func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter {
105+
Delimiter(kind, poundCount: poundCount)
106+
}
104107
let testCases: [(String, (String, Delimiter)?)] = [
105-
("#/abc/#", ("abc", .traditional)),
106-
("#|abc|#", ("abc", .experimental)),
108+
("/abc/", ("abc", delim(.forwardSlash))),
109+
("#/abc/#", ("abc", delim(.forwardSlash, poundCount: 1))),
110+
("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))),
111+
("#|abc|#", ("abc", delim(.experimental))),
107112

108113
// TODO: Null characters are lexically valid, similar to string literals,
109114
// but we ought to warn the user about them.
110-
("#|ab\0c|#", ("ab\0c", .experimental)),
115+
("#|ab\0c|#", ("ab\0c", delim(.experimental))),
111116
("'abc'", nil),
112-
("#/abc/def/#", ("abc/def", .traditional)),
113-
("#|abc|def|#", ("abc|def", .experimental)),
114-
("#/abc\\/#def/#", ("abc\\/#def", .traditional)),
115-
("#|abc\\|#def|#", ("abc\\|#def", .experimental)),
116-
("#/abc|#def/#", ("abc|#def", .traditional)),
117-
("#|abc/#def|#", ("abc/#def", .experimental)),
117+
("#/abc/def/#", ("abc/def", delim(.forwardSlash, poundCount: 1))),
118+
("#|abc|def|#", ("abc|def", delim(.experimental))),
119+
("#/abc\\/#def/#", ("abc\\/#def", delim(.forwardSlash, poundCount: 1))),
120+
("#|abc\\|#def|#", ("abc\\|#def", delim(.experimental))),
121+
("#/abc|#def/#", ("abc|#def", delim(.forwardSlash, poundCount: 1))),
122+
("#|abc/#def|#", ("abc/#def", delim(.experimental))),
118123
("#/abc|#def/", nil),
119124
("#|abc/#def#", nil),
120125
("#/abc\n/#", nil),
121126
("#/abc\r/#", nil),
122127

123-
(#"re'abcre\''"#, (#"abcre\'"#, .reSingleQuote)),
128+
(#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))),
124129
(#"re'\'"#, nil)
125130
]
126131

Tests/RegexTests/ParseTests.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1736,7 +1736,9 @@ extension RegexTests {
17361736

17371737
// MARK: Parse with delimiters
17381738

1739+
parseWithDelimitersTest("/a b/", concat("a", " ", "b"))
17391740
parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))
1741+
parseWithDelimitersTest("##/a b/##", concat("a", " ", "b"))
17401742
parseWithDelimitersTest("#|a b|#", concat("a", "b"))
17411743

17421744
parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
@@ -1773,6 +1775,11 @@ extension RegexTests {
17731775
// Printable ASCII characters.
17741776
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
17751777

1778+
// Make sure we can handle a combining accent as first character.
1779+
parseWithDelimitersTest("/\u{301}/", "\u{301}")
1780+
1781+
delimiterLexingTest("/a/#", ignoreTrailing: true)
1782+
17761783
// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
17771784
// if it's clear that it's part of the regex syntax.
17781785

@@ -2302,6 +2309,11 @@ extension RegexTests {
23022309
delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
23032310
delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
23042311
delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
2312+
2313+
// MARK: Unbalanced extended syntax
2314+
delimiterLexingDiagnosticTest("#/a/", .unterminated)
2315+
delimiterLexingDiagnosticTest("##/a/#", .unterminated)
2316+
23052317
}
23062318

23072319
func testlibswiftDiagnostics() {

0 commit comments

Comments
 (0)