Skip to content

Commit 9f42ea4

Browse files
committed
Introduce a multi-line literal mode
When an extended delimiter `#/` is followed by a newline (optionally preceded by spaces or tabs), enter a multi-line mode where the literal may span multiple lines, and extended syntax is enabled by default.
1 parent 4944fbe commit 9f42ea4

File tree

8 files changed

+239
-27
lines changed

8 files changed

+239
-27
lines changed

Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12-
// TODO: mock up multi-line soon
13-
1412
struct Delimiter: Hashable {
1513
let kind: Kind
1614
let poundCount: Int
@@ -28,13 +26,13 @@ struct Delimiter: Hashable {
2826
kind.closing + String(repeating: "#", count: poundCount)
2927
}
3028

31-
/// The default set of syntax options that the delimiter indicates.
32-
var defaultSyntaxOptions: SyntaxOptions {
29+
/// Whether or not multi-line mode is permitted.
30+
var allowsMultiline: Bool {
3331
switch kind {
34-
case .forwardSlash, .reSingleQuote:
35-
return .traditional
36-
case .experimental, .rxSingleQuote:
37-
return .experimental
32+
case .forwardSlash:
33+
return poundCount > 0
34+
case .experimental, .reSingleQuote, .rxSingleQuote:
35+
return false
3836
}
3937
}
4038
}
@@ -76,6 +74,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
7674
case invalidUTF8 // TODO: better range reporting
7775
case unknownDelimiter
7876
case unprintableASCII
77+
case multilineClosingNotOnNewline
7978
}
8079

8180
var kind: Kind
@@ -94,6 +93,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
9493
case .invalidUTF8: return "invalid UTF-8 found in source file"
9594
case .unknownDelimiter: return "unknown regex literal delimiter"
9695
case .unprintableASCII: return "unprintable ASCII character found in source file"
96+
case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line"
9797
}
9898
}
9999
}
@@ -103,6 +103,9 @@ fileprivate struct DelimiterLexer {
103103
var cursor: UnsafeRawPointer
104104
let end: UnsafeRawPointer
105105

106+
var firstNewline: UnsafeRawPointer?
107+
var isMultiline: Bool { firstNewline != nil }
108+
106109
init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
107110
precondition(start <= end)
108111
self.start = start
@@ -262,12 +265,23 @@ fileprivate struct DelimiterLexer {
262265
let contentsEnd = cursor
263266
guard tryEat(delimiter.closing.utf8) else { return nil }
264267

265-
// Form a string from the contents and make sure it's valid UTF-8.
266268
let count = contentsEnd - contentsStart
267269
let contents = UnsafeRawBufferPointer(
268270
start: contentsStart, count: count)
269-
let s = String(decoding: contents, as: UTF8.self)
270271

272+
// In multi-line mode, the closing delimiter must start a new line. So scan
273+
// backwards and make sure only spaces or tabs separate it from the newline.
274+
if isMultiline {
275+
let idx = contents.lastIndex(
276+
where: { $0 == ascii("\n") || $0 == ascii("\r") })! + 1
277+
guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") })
278+
else {
279+
throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor)
280+
}
281+
}
282+
283+
// Form a string from the contents and make sure it's valid UTF-8.
284+
let s = String(decoding: contents, as: UTF8.self)
271285
guard s.utf8.elementsEqual(contents) else {
272286
throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
273287
}
@@ -278,7 +292,10 @@ fileprivate struct DelimiterLexer {
278292
/// the end of the buffer is reached.
279293
mutating func advance(escaped: Bool = false) throws {
280294
guard let next = load() else {
281-
throw DelimiterLexError(.unterminated, resumeAt: cursor)
295+
// We've hit the end of the buffer. In multi-line mode, we don't want to
296+
// skip over what is likely otherwise valid Swift code, so resume from the
297+
// first newline.
298+
throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor)
282299
}
283300
switch UnicodeScalar(next) {
284301
case let next where !next.isASCII:
@@ -289,7 +306,10 @@ fileprivate struct DelimiterLexer {
289306
advanceCursor()
290307

291308
case "\n", "\r":
292-
throw DelimiterLexError(.unterminated, resumeAt: cursor)
309+
guard isMultiline else {
310+
throw DelimiterLexError(.unterminated, resumeAt: cursor)
311+
}
312+
advanceCursor()
293313

294314
case "\0":
295315
// TODO: Warn to match the behavior of String literal lexer? Or should
@@ -301,8 +321,12 @@ fileprivate struct DelimiterLexer {
301321
advanceCursor()
302322
try advance(escaped: true)
303323

304-
case let next where !next.isPrintableASCII:
324+
case let next
325+
where !next.isPrintableASCII && !(isMultiline && next == "\t"):
305326
// Diagnose unprintable ASCII.
327+
// Note that tabs are allowed in multi-line literals.
328+
// TODO: This matches the string literal behavior, but should we allow
329+
// tabs for single-line regex literals too?
306330
// TODO: Ideally we would recover and continue to lex until the ending
307331
// delimiter.
308332
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
@@ -349,6 +373,23 @@ fileprivate struct DelimiterLexer {
349373
throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
350374
}
351375
let contentsStart = cursor
376+
377+
// If the delimiter allows multi-line, try skipping over any whitespace to a
378+
// newline character. If we can do that, we enter multi-line mode.
379+
if delimiter.allowsMultiline {
380+
while let next = load() {
381+
switch next {
382+
case ascii(" "), ascii("\t"):
383+
advanceCursor()
384+
continue
385+
case ascii("\n"), ascii("\r"):
386+
firstNewline = cursor
387+
default:
388+
break
389+
}
390+
break
391+
}
392+
}
352393
while true {
353394
// Check to see if we're at a character that looks like a delimiter, but
354395
// likely isn't. In such a case, we can attempt to skip over it.

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ enum ParseError: Error, Hashable {
7070

7171
case cannotRemoveTextSegmentOptions
7272
case cannotRemoveSemanticsOptions
73+
case cannotRemoveExtendedSyntaxInMultilineMode
74+
7375
case expectedCalloutArgument
7476
}
7577

@@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible {
158160
return "text segment mode cannot be unset, only changed"
159161
case .cannotRemoveSemanticsOptions:
160162
return "semantic level cannot be unset, only changed"
163+
case .cannotRemoveExtendedSyntaxInMultilineMode:
164+
return "extended syntax may not be disabled in multi-line mode"
161165
case .expectedCalloutArgument:
162166
return "expected argument to callout"
163167
}

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,7 @@ extension Source {
657657
/// | MatchingOption* '-' MatchingOption*
658658
///
659659
mutating func lexMatchingOptionSequence(
660+
context: ParsingContext
660661
) throws -> AST.MatchingOptionSequence? {
661662
let ateCaret = recordLoc { $0.tryEat("^") }
662663

@@ -691,6 +692,11 @@ extension Source {
691692
if opt.isSemanticMatchingLevel {
692693
throw ParseError.cannotRemoveSemanticsOptions
693694
}
695+
// Extended syntax may not be removed if in multi-line mode.
696+
if context.syntax.contains(.multilineExtendedSyntax) &&
697+
opt.isAnyExtended {
698+
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
699+
}
694700
removing.append(opt)
695701
}
696702
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
@@ -864,7 +870,7 @@ extension Source {
864870
}
865871

866872
// Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
867-
if let seq = try src.lexMatchingOptionSequence() {
873+
if let seq = try src.lexMatchingOptionSequence(context: context) {
868874
if src.tryEat(":") {
869875
return .changeMatchingOptions(seq, isIsolated: false)
870876
}

Sources/_RegexParser/Regex/Parse/Mocking.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ func libswiftLexRegexLiteral(
6262
curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self)
6363

6464
switch error.kind {
65-
case .unterminated:
66-
// Missing closing delimiter can be recovered from.
65+
case .unterminated, .multilineClosingNotOnNewline:
66+
// These can be recovered from.
6767
return false
6868
case .unprintableASCII, .invalidUTF8:
6969
// We don't currently have good recovery behavior for these.

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -288,22 +288,25 @@ extension Parser {
288288
) throws -> AST.Group {
289289
context.recordGroup(kind.value)
290290

291-
// Check if we're introducing or removing extended syntax.
291+
// Check if we're introducing or removing extended syntax. We skip this for
292+
// multi-line, as extended syntax is always enabled there.
292293
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
293294
// handles non-semantic whitespace in a custom character class. Other
294295
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
295296
// treat (?x) and (?xx) as the same option here. If we ever get a strict
296297
// PCRE mode, we will need to change this to handle that.
297298
let currentSyntax = context.syntax
298-
if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
299-
if c.resetsCurrentOptions {
300-
context.syntax.remove(.extendedSyntax)
301-
}
302-
if c.adding.contains(where: \.isAnyExtended) {
303-
context.syntax.insert(.extendedSyntax)
304-
}
305-
if c.removing.contains(where: \.isAnyExtended) {
306-
context.syntax.remove(.extendedSyntax)
299+
if !context.syntax.contains(.multilineExtendedSyntax) {
300+
if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
301+
if c.resetsCurrentOptions {
302+
context.syntax.remove(.extendedSyntax)
303+
}
304+
if c.adding.contains(where: \.isAnyExtended) {
305+
context.syntax.insert(.extendedSyntax)
306+
}
307+
if c.removing.contains(where: \.isAnyExtended) {
308+
context.syntax.remove(.extendedSyntax)
309+
}
307310
}
308311
}
309312
defer {
@@ -532,11 +535,32 @@ public func parse<S: StringProtocol>(
532535
return try parser.parse()
533536
}
534537

538+
/// Retrieve the default set of syntax options that a delimiter and literal
539+
/// contents indicate.
540+
fileprivate func defaultSyntaxOptions(
541+
_ delim: Delimiter, contents: String
542+
) -> SyntaxOptions {
543+
switch delim.kind {
544+
case .forwardSlash:
545+
// For an extended-delimiter forward slash literal, e.g. #/.../#, extended
546+
// syntax is enabled by default if the contents span multiple lines.
547+
if delim.poundCount > 0 &&
548+
contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
549+
return .multilineExtendedSyntax
550+
}
551+
return .traditional
552+
case .reSingleQuote:
553+
return .traditional
554+
case .experimental, .rxSingleQuote:
555+
return .experimental
556+
}
557+
}
558+
535559
/// Parse a given regex string with delimiters, inferring the syntax options
536560
/// from the delimiter used.
537561
public func parseWithDelimiters<S: StringProtocol>(
538562
_ regex: S
539563
) throws -> AST where S.SubSequence == Substring {
540564
let (contents, delim) = droppingRegexDelimiters(String(regex))
541-
return try parse(contents, delim.defaultSyntaxOptions)
565+
return try parse(contents, defaultSyntaxOptions(delim, contents: contents))
542566
}

Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ public struct SyntaxOptions: OptionSet {
5858
/// `(_: .*)` == `(?:.*)`
5959
public static var experimentalCaptures: Self { Self(1 << 5) }
6060

61+
/// The default syntax for a multi-line regex literal.
62+
public static var multilineExtendedSyntax: Self {
63+
return [Self(1 << 6), .extendedSyntax]
64+
}
65+
6166
/*
6267

6368
/// `<digit>*` == `[[:digit:]]*` == `\d*`

Tests/RegexTests/LexTests.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,11 @@ extension RegexTests {
110110
("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))),
111111
("#|abc|#", ("abc", delim(.experimental))),
112112

113+
// Multiline
114+
("#/\na\nb\n/#", ("\na\nb\n", delim(.forwardSlash, poundCount: 1))),
115+
("#/ \na\nb\n /#", (" \na\nb\n ", delim(.forwardSlash, poundCount: 1))),
116+
("##/ \na\nb\n /##", (" \na\nb\n ", delim(.forwardSlash, poundCount: 2))),
117+
113118
// TODO: Null characters are lexically valid, similar to string literals,
114119
// but we ought to warn the user about them.
115120
("#|ab\0c|#", ("ab\0c", delim(.experimental))),

0 commit comments

Comments
 (0)