Skip to content

Commit 045fb95

Browse files
committed
Allow scoped removal of extended syntax in multi-line literals
Relax the ban on unsetting extended syntax in a multi-line literal such that it does not apply to a scoped unset, e.g. `(?-x:...)`, as long as it does not span multiple lines. This commit also bans the use of `(?^)` in a multi-line literal, unless it is scoped and does not span multiple lines. Instead, `(?^x)` must be written, as PCRE defines `(?^)` to be equivalent to `(?-imnsx)`.
1 parent 15f4238 commit 045fb95

File tree

4 files changed

+116
-27
lines changed

4 files changed

+116
-27
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ enum ParseError: Error, Hashable {
4545
case confusableCharacter(Character)
4646

4747
case quoteMayNotSpanMultipleLines
48+
case unsetExtendedSyntaxMayNotSpanMultipleLines
4849

4950
case cannotReferToWholePattern
5051

@@ -81,6 +82,7 @@ enum ParseError: Error, Hashable {
8182
case cannotRemoveTextSegmentOptions
8283
case cannotRemoveSemanticsOptions
8384
case cannotRemoveExtendedSyntaxInMultilineMode
85+
case cannotResetExtendedSyntaxInMultilineMode
8486

8587
case expectedCalloutArgument
8688

@@ -143,6 +145,8 @@ extension ParseError: CustomStringConvertible {
143145
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
144146
case .quoteMayNotSpanMultipleLines:
145147
return "quoted sequence may not span multiple lines in multi-line literal"
148+
case .unsetExtendedSyntaxMayNotSpanMultipleLines:
149+
return "group that unsets extended syntax may not span multiple lines in multi-line literal"
146150
case .cannotReferToWholePattern:
147151
return "cannot refer to whole pattern here"
148152
case .quantifierRequiresOperand(let q):
@@ -194,6 +198,8 @@ extension ParseError: CustomStringConvertible {
194198
return "semantic level cannot be unset, only changed"
195199
case .cannotRemoveExtendedSyntaxInMultilineMode:
196200
return "extended syntax may not be disabled in multi-line mode"
201+
case .cannotResetExtendedSyntaxInMultilineMode:
202+
return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
197203
case .expectedCalloutArgument:
198204
return "expected argument to callout"
199205
case .unrecognizedScript(let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -840,11 +840,6 @@ extension Source {
840840
if opt.isSemanticMatchingLevel {
841841
throw ParseError.cannotRemoveSemanticsOptions
842842
}
843-
// Extended syntax may not be removed if in multi-line mode.
844-
if context.syntax.contains(.multilineCompilerLiteral) &&
845-
opt.isAnyExtended {
846-
throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
847-
}
848843
removing.append(opt)
849844
}
850845
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,8 @@ extension Parser {
289289
/// Apply the syntax options of a given matching option sequence to the
290290
/// current set of options.
291291
private mutating func applySyntaxOptions(
292-
of opts: AST.MatchingOptionSequence
293-
) {
292+
of opts: AST.MatchingOptionSequence, isScoped: Bool
293+
) throws {
294294
func mapOption(_ option: SyntaxOptions,
295295
_ pred: (AST.MatchingOption) -> Bool) {
296296
if opts.resetsCurrentOptions {
@@ -311,22 +311,41 @@ extension Parser {
311311
mapOption(.namedCapturesOnly, .namedCapturesOnly)
312312

313313
// (?x), (?xx)
314-
// We skip this for multi-line, as extended syntax is always enabled there.
314+
// This cannot be unset in a multi-line literal, unless in a scoped group
315+
// e.g. (?-x:...). We later enforce that such a group does not span multiple
316+
// lines.
315317
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
316318
// handles non-semantic whitespace in a custom character class. Other
317319
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
318320
// treat (?x) and (?xx) as the same option here. If we ever get a strict
319321
// PCRE mode, we will need to change this to handle that.
320-
if !context.syntax.contains(.multilineCompilerLiteral) {
322+
if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
323+
// An unscoped removal of extended syntax is not allowed in a multi-line
324+
// literal.
325+
if let opt = opts.removing.first(where: \.isAnyExtended) {
326+
throw Source.LocatedError(
327+
ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
328+
}
329+
if opts.resetsCurrentOptions {
330+
throw Source.LocatedError(
331+
ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
332+
}
333+
// The only remaning case is an unscoped addition of extended syntax,
334+
// which is a no-op.
335+
} else {
336+
// We either have a scoped change of extended syntax, or this is a
337+
// single-line literal.
321338
mapOption(.extendedSyntax, \.isAnyExtended)
322339
}
323340
}
324341

325342
/// Apply the syntax options of a matching option changing group to the
326343
/// current set of options.
327-
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
344+
private mutating func applySyntaxOptions(
345+
of group: AST.Group.Kind, isScoped: Bool
346+
) throws {
328347
if case .changeMatchingOptions(let seq) = group {
329-
applySyntaxOptions(of: seq)
348+
try applySyntaxOptions(of: seq, isScoped: isScoped)
330349
}
331350
}
332351

@@ -337,14 +356,25 @@ extension Parser {
337356
context.recordGroup(kind.value)
338357

339358
let currentSyntax = context.syntax
340-
applySyntaxOptions(of: kind.value)
359+
try applySyntaxOptions(of: kind.value, isScoped: true)
341360
defer {
342361
context.syntax = currentSyntax
343362
}
344-
363+
let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
364+
!context.syntax.contains(.extendedSyntax)
345365
let child = try parseNode()
346366
try source.expect(")")
347-
return .init(kind, child, loc(start))
367+
let groupLoc = loc(start)
368+
369+
// In multi-line literals, the body of a group that unsets extended syntax
370+
// may not span multiple lines.
371+
if unsetsExtendedSyntax &&
372+
context.syntax.contains(.multilineCompilerLiteral) &&
373+
source[child.location.range].spansMultipleLinesInRegexLiteral {
374+
throw Source.LocatedError(
375+
ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
376+
}
377+
return .init(kind, child, groupLoc)
348378
}
349379

350380
/// Consume the body of an absent function.
@@ -438,7 +468,7 @@ extension Parser {
438468
// If we have a change matching options atom, apply the syntax options. We
439469
// already take care of scoping syntax options within a group.
440470
if case .changeMatchingOptions(let opts) = atom.kind {
441-
applySyntaxOptions(of: opts)
471+
try applySyntaxOptions(of: opts, isScoped: false)
442472
}
443473
// TODO: track source locations
444474
return .atom(atom)
@@ -592,7 +622,7 @@ public func parse<S: StringProtocol>(
592622
return ast
593623
}
594624

595-
extension String {
625+
extension StringProtocol {
596626
/// Whether the given string is considered multi-line for a regex literal.
597627
var spansMultipleLinesInRegexLiteral: Bool {
598628
unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })

Tests/RegexTests/ParseTests.swift

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,13 @@ extension RegexTests {
17761776
" ", "b"
17771777
)
17781778
)
1779+
parseTest(
1780+
"(?x) a (?^: b)", concat(
1781+
changeMatchingOptions(matchingOptions(adding: .extended)),
1782+
"a",
1783+
changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b"))
1784+
)
1785+
)
17791786

17801787
parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c"))
17811788
parseTest("[#]", charClass("#"))
@@ -2184,22 +2191,40 @@ extension RegexTests {
21842191
/#
21852192
""", concat("a", "b"))
21862193

2187-
// Make sure (?^) is ignored.
2194+
// (?x) has no effect.
21882195
parseWithDelimitersTest("""
21892196
#/
2190-
(?^)
2197+
(?x)
21912198
# comment
21922199
/#
2193-
""", changeMatchingOptions(unsetMatchingOptions())
2200+
""", changeMatchingOptions(matchingOptions(adding: .extended))
21942201
)
21952202

2196-
// (?x) has no effect.
2203+
// Scoped removal of extended syntax is allowed as long as it does not span
2204+
// multiple lines.
21972205
parseWithDelimitersTest("""
21982206
#/
2199-
(?x)
2200-
# comment
2207+
(?-x:a b)
22012208
/#
2202-
""", changeMatchingOptions(matchingOptions(adding: .extended))
2209+
""", changeMatchingOptions(
2210+
matchingOptions(removing: .extended),
2211+
concat("a", " ", "b")
2212+
)
2213+
)
2214+
parseWithDelimitersTest("""
2215+
#/
2216+
(?-xx:a b)
2217+
/#
2218+
""", changeMatchingOptions(
2219+
matchingOptions(removing: .extraExtended),
2220+
concat("a", " ", "b")
2221+
)
2222+
)
2223+
parseWithDelimitersTest("""
2224+
#/
2225+
(?^: a b ) # comment
2226+
/#
2227+
""", changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " "))
22032228
)
22042229

22052230
parseWithDelimitersTest(#"""
@@ -2782,17 +2807,50 @@ extension RegexTests {
27822807
/#
27832808
""", .cannotRemoveExtendedSyntaxInMultilineMode
27842809
)
2810+
2811+
// Scoped removal of extended syntax may not span multiple lines
27852812
diagnosticWithDelimitersTest("""
27862813
#/
2787-
(?-x:a b)
2814+
(?-x:a b
2815+
)
27882816
/#
2789-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2817+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
27902818
)
27912819
diagnosticWithDelimitersTest("""
27922820
#/
2793-
(?-xx:a b)
2821+
(?-x:a
2822+
b)
27942823
/#
2795-
""", .cannotRemoveExtendedSyntaxInMultilineMode
2824+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2825+
)
2826+
diagnosticWithDelimitersTest("""
2827+
#/
2828+
(?-xx:
2829+
a b)
2830+
/#
2831+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2832+
)
2833+
diagnosticWithDelimitersTest("""
2834+
#/
2835+
(?x-x:
2836+
a b)
2837+
/#
2838+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
2839+
)
2840+
diagnosticWithDelimitersTest("""
2841+
#/
2842+
(?^)
2843+
# comment
2844+
/#
2845+
""", .cannotResetExtendedSyntaxInMultilineMode
2846+
)
2847+
diagnosticWithDelimitersTest("""
2848+
#/
2849+
(?^:
2850+
# comment
2851+
)
2852+
/#
2853+
""", .unsetExtendedSyntaxMayNotSpanMultipleLines
27962854
)
27972855

27982856
diagnosticWithDelimitersTest(#"""

0 commit comments

Comments
 (0)