Skip to content

Commit 1ea6f20

Browse files
committed
Change matching option scoping behavior to match PCRE
Previously we would always parse a "change matching option" sequence as a group, and for the isolated syntax e.g `(?x)`, we would have it implicitly wrap everything after in the same group by having it do a recursive parse. This matched the Oniguruma behavior for such isolated groups, and was easy to implement, but its behavior is quite unintuitive when it comes to alternations, as e.g `a(?x)b|c` becomes `a(?x:b|c)`, which may not be expected. Instead, let's follow PCRE's behavior by having such isolated cases affect the syntax options for the remainder of the current group, including across alternation branches. This is done by lexing such cases as atoms (as they aren't really group-like anymore), and having them change the syntax options when we encounter them. The existing scoping rules around groups take care of resetting the options when we exit the scope.
1 parent e84c93d commit 1ea6f20

File tree

14 files changed

+404
-346
lines changed

14 files changed

+404
-346
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ extension AST {
7272

7373
// (*ACCEPT), (*FAIL), ...
7474
case backtrackingDirective(BacktrackingDirective)
75+
76+
// (?i), (?i-m), ...
77+
case changeMatchingOptions(MatchingOptionSequence)
7578
}
7679
}
7780
}
@@ -91,6 +94,7 @@ extension AST.Atom {
9194
case .subpattern(let v): return v
9295
case .callout(let v): return v
9396
case .backtrackingDirective(let v): return v
97+
case .changeMatchingOptions(let v): return v
9498
case .any: return nil
9599
case .startOfLine: return nil
96100
case .endOfLine: return nil
@@ -691,7 +695,7 @@ extension AST.Atom {
691695
return nil
692696

693697
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
694-
.callout, .backtrackingDirective:
698+
.callout, .backtrackingDirective, .changeMatchingOptions:
695699
return nil
696700
}
697701
}
@@ -731,7 +735,7 @@ extension AST.Atom {
731735

732736
case .property, .escaped, .any, .startOfLine, .endOfLine,
733737
.backreference, .subpattern, .namedCharacter, .callout,
734-
.backtrackingDirective:
738+
.backtrackingDirective, .changeMatchingOptions:
735739
return nil
736740
}
737741
}
@@ -740,6 +744,8 @@ extension AST.Atom {
740744
switch kind {
741745
case .backtrackingDirective(let b):
742746
return b.isQuantifiable
747+
case .changeMatchingOptions:
748+
return false
743749
// TODO: Are callouts quantifiable?
744750
default:
745751
return true

Sources/_RegexParser/Regex/AST/Group.swift

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,7 @@ extension AST {
6868
case atomicScriptRun
6969

7070
// (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:)
71-
// Isolated options are written as e.g (?i), and implicitly form a group
72-
// containing all the following elements of the current group.
73-
case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool)
71+
case changeMatchingOptions(MatchingOptionSequence)
7472

7573
// NOTE: Comments appear to be groups, but are not parsed
7674
// the same. They parse more like quotes, so are not
@@ -87,21 +85,6 @@ extension AST.Group.Kind {
8785
}
8886
}
8987

90-
/// Whether this is a group with an implicit scope, e.g isolated matching
91-
/// options implicitly become parent groups for the rest of the elements in
92-
/// the current group:
93-
///
94-
/// (a(?i)bc)de -> (a(?i:bc))de
95-
///
96-
public var hasImplicitScope: Bool {
97-
switch self {
98-
case .changeMatchingOptions(_, let isIsolated):
99-
return isIsolated
100-
default:
101-
return false
102-
}
103-
}
104-
10588
/// If this is a named group, its name, `nil` otherwise.
10689
public var name: String? {
10790
switch self {

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 60 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,22 @@ extension Source {
712712
return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: [])
713713
}
714714

715+
/// A matching option changing atom.
716+
///
717+
/// '(?' MatchingOptionSeq ')'
718+
///
719+
mutating func lexChangeMatchingOptionAtom(
720+
context: ParsingContext
721+
) throws -> AST.MatchingOptionSequence? {
722+
try tryEating { src in
723+
guard src.tryEat(sequence: "(?"),
724+
let seq = try src.lexMatchingOptionSequence(context: context)
725+
else { return nil }
726+
try src.expect(")")
727+
return seq
728+
}
729+
}
730+
715731
/// Try to consume explicitly spelled-out PCRE2 group syntax.
716732
mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? {
717733
tryEating { src in
@@ -852,7 +868,7 @@ extension Source {
852868
// otherwise a matching option specifier. Conversely, '(?P' can be the
853869
// start of a matching option sequence, or a reference if it is followed
854870
// by '=' or '<'.
855-
guard !src.shouldLexGroupLikeAtom() else { return nil }
871+
guard !src.shouldLexGroupLikeAtom(context: context) else { return nil }
856872

857873
guard src.tryEat("(") else { return nil }
858874
if src.tryEat("?") {
@@ -877,22 +893,13 @@ extension Source {
877893

878894
// Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
879895
if let seq = try src.lexMatchingOptionSequence(context: context) {
880-
if src.tryEat(":") {
881-
return .changeMatchingOptions(seq, isIsolated: false)
882-
}
883-
// If this isn't start of an explicit group, we should have an
884-
// implicit group that covers the remaining elements of the current
885-
// group.
886-
// TODO: This implicit scoping behavior matches Oniguruma, but PCRE
887-
// also does it across alternations, which will require additional
888-
// handling.
889-
guard src.tryEat(")") else {
896+
guard src.tryEat(":") else {
890897
if let next = src.peek() {
891898
throw ParseError.invalidMatchingOption(next)
892899
}
893900
throw ParseError.expected(")")
894901
}
895-
return .changeMatchingOptions(seq, isIsolated: true)
902+
return .changeMatchingOptions(seq)
896903
}
897904

898905
guard let next = src.peek() else {
@@ -1041,18 +1048,8 @@ extension Source {
10411048
context: ParsingContext
10421049
) throws -> Located<AST.Group.Kind>? {
10431050
try tryEating { src in
1044-
guard src.tryEat(sequence: "(?"),
1045-
let group = try src.lexGroupStart(context: context)
1046-
else { return nil }
1047-
1048-
// Implicitly scoped groups are not supported here.
1049-
guard !group.value.hasImplicitScope else {
1050-
throw LocatedError(
1051-
ParseError.unsupportedCondition("implicitly scoped group"),
1052-
group.location
1053-
)
1054-
}
1055-
return group
1051+
guard src.tryEat(sequence: "(?") else { return nil }
1052+
return try src.lexGroupStart(context: context)
10561053
}
10571054
}
10581055

@@ -1239,17 +1236,19 @@ extension Source {
12391236
allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false
12401237
) throws -> AST.Reference? {
12411238
let kind = try recordLoc { src -> AST.Reference.Kind? in
1242-
// Note this logic should match canLexNumberedReference.
1243-
if src.tryEat("+") {
1244-
return .relative(try src.expectNumber().value)
1245-
}
1246-
if src.tryEat("-") {
1247-
return .relative(try -src.expectNumber().value)
1248-
}
1249-
if let num = try src.lexNumber() {
1250-
return .absolute(num.value)
1239+
try src.tryEating { src in
1240+
// Note this logic should match canLexNumberedReference.
1241+
if src.tryEat("+"), let num = try src.lexNumber() {
1242+
return .relative(num.value)
1243+
}
1244+
if src.tryEat("-"), let num = try src.lexNumber() {
1245+
return .relative(-num.value)
1246+
}
1247+
if let num = try src.lexNumber() {
1248+
return .absolute(num.value)
1249+
}
1250+
return nil
12511251
}
1252-
return nil
12531252
}
12541253
guard let kind = kind else { return nil }
12551254
guard allowWholePatternRef || kind.value != .recurseWholePattern else {
@@ -1478,8 +1477,21 @@ extension Source {
14781477
return src.canLexNumberedReference()
14791478
}
14801479

1480+
private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
1481+
var src = self
1482+
1483+
// See if we can lex a matching option sequence that terminates in ')'. Such
1484+
// a sequence is an atom. If an error is thrown, there are invalid elements
1485+
// of the matching option sequence. In such a case, we can lex as a group
1486+
// and diagnose the invalid group kind.
1487+
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
1488+
return false
1489+
}
1490+
return src.tryEat(")")
1491+
}
1492+
14811493
/// Whether a group specifier should be lexed as an atom instead of a group.
1482-
private func shouldLexGroupLikeAtom() -> Bool {
1494+
private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool {
14831495
var src = self
14841496
guard src.tryEat("(") else { return false }
14851497

@@ -1493,6 +1505,9 @@ extension Source {
14931505
// The start of an Oniguruma 'of-contents' callout.
14941506
if src.tryEat("{") { return true }
14951507

1508+
// A matching option atom (?x), (?i), ...
1509+
if src.canLexMatchingOptionsAsAtom(context: context) { return true }
1510+
14961511
return false
14971512
}
14981513
// The start of a backreference directive or Oniguruma named callout.
@@ -1753,13 +1768,20 @@ extension Source {
17531768
///
17541769
/// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective
17551770
///
1756-
mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind {
1771+
mutating func expectGroupLikeAtom(
1772+
context: ParsingContext
1773+
) throws -> AST.Atom.Kind {
17571774
try recordLoc { src in
17581775
// References that look like groups, e.g (?R), (?1), ...
17591776
if let ref = try src.lexGroupLikeReference() {
17601777
return ref.value
17611778
}
17621779

1780+
// Change matching options atom (?i), (?x-i), ...
1781+
if let seq = try src.lexChangeMatchingOptionAtom(context: context) {
1782+
return .changeMatchingOptions(seq)
1783+
}
1784+
17631785
// (*ACCEPT), (*FAIL), (*MARK), ...
17641786
if let b = try src.lexBacktrackingDirective() {
17651787
return .backtrackingDirective(b)
@@ -1828,8 +1850,8 @@ extension Source {
18281850

18291851
// If we have group syntax that was skipped over in lexGroupStart, we
18301852
// need to handle it as an atom, or throw an error.
1831-
if !customCC && src.shouldLexGroupLikeAtom() {
1832-
return try src.expectGroupLikeAtom()
1853+
if !customCC && src.shouldLexGroupLikeAtom(context: context) {
1854+
return try src.expectGroupLikeAtom(context: context)
18331855
}
18341856

18351857
// A quantifier here is invalid.

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -282,42 +282,53 @@ extension Parser {
282282
loc(start)))
283283
}
284284

285+
/// Apply the syntax options of a given matching option sequence to the
286+
/// current set of options.
287+
private mutating func applySyntaxOptions(
288+
of opts: AST.MatchingOptionSequence
289+
) {
290+
// We skip this for multi-line, as extended syntax is always enabled there.
291+
if context.syntax.contains(.multilineExtendedSyntax) { return }
292+
293+
// Check if we're introducing or removing extended syntax.
294+
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
295+
// handles non-semantic whitespace in a custom character class. Other
296+
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
297+
// treat (?x) and (?xx) as the same option here. If we ever get a strict
298+
// PCRE mode, we will need to change this to handle that.
299+
if opts.resetsCurrentOptions {
300+
context.syntax.remove(.extendedSyntax)
301+
}
302+
if opts.adding.contains(where: \.isAnyExtended) {
303+
context.syntax.insert(.extendedSyntax)
304+
}
305+
if opts.removing.contains(where: \.isAnyExtended) {
306+
context.syntax.remove(.extendedSyntax)
307+
}
308+
}
309+
310+
/// Apply the syntax options of a matching option changing group to the
311+
/// current set of options.
312+
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
313+
if case .changeMatchingOptions(let seq) = group {
314+
applySyntaxOptions(of: seq)
315+
}
316+
}
317+
285318
/// Perform a recursive parse for the body of a group.
286319
mutating func parseGroupBody(
287320
start: Source.Position, _ kind: AST.Located<AST.Group.Kind>
288321
) throws -> AST.Group {
289322
context.recordGroup(kind.value)
290323

291-
// Check if we're introducing or removing extended syntax. We skip this for
292-
// multi-line, as extended syntax is always enabled there.
293-
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
294-
// handles non-semantic whitespace in a custom character class. Other
295-
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
296-
// treat (?x) and (?xx) as the same option here. If we ever get a strict
297-
// PCRE mode, we will need to change this to handle that.
298324
let currentSyntax = context.syntax
299-
if !context.syntax.contains(.multilineExtendedSyntax) {
300-
if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
301-
if c.resetsCurrentOptions {
302-
context.syntax.remove(.extendedSyntax)
303-
}
304-
if c.adding.contains(where: \.isAnyExtended) {
305-
context.syntax.insert(.extendedSyntax)
306-
}
307-
if c.removing.contains(where: \.isAnyExtended) {
308-
context.syntax.remove(.extendedSyntax)
309-
}
310-
}
311-
}
325+
applySyntaxOptions(of: kind.value)
312326
defer {
313327
context.syntax = currentSyntax
314328
}
315329

316330
let child = try parseNode()
317-
// An implicit scoped group has already consumed its closing paren.
318-
if !kind.value.hasImplicitScope {
319-
try source.expect(")")
320-
}
331+
try source.expect(")")
321332
return .init(kind, child, loc(start))
322333
}
323334

@@ -409,6 +420,11 @@ extension Parser {
409420
}
410421

411422
if let atom = try source.lexAtom(context: context) {
423+
// If we have a change matching options atom, apply the syntax options. We
424+
// already take care of scoping syntax options within a group.
425+
if case .changeMatchingOptions(let opts) = atom.kind {
426+
applySyntaxOptions(of: opts)
427+
}
412428
// TODO: track source locations
413429
return .atom(atom)
414430
}

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ extension AST.Atom {
156156

157157
case .backtrackingDirective(let d): return "\(d)"
158158

159+
case .changeMatchingOptions(let opts):
160+
return "changeMatchingOptions<\(opts)>"
161+
159162
case .char, .scalar:
160163
fatalError("Unreachable")
161164
}
@@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable {
225228
extension AST.Group.Kind: _ASTPrintable {
226229
public var _dumpBase: String {
227230
switch self {
228-
case .capture: return "capture"
229-
case .namedCapture(let s): return "capture<\(s.value)>"
230-
case .balancedCapture(let b): return "balanced capture \(b)"
231-
case .nonCapture: return "nonCapture"
232-
case .nonCaptureReset: return "nonCaptureReset"
233-
case .atomicNonCapturing: return "atomicNonCapturing"
234-
case .lookahead: return "lookahead"
235-
case .negativeLookahead: return "negativeLookahead"
236-
case .nonAtomicLookahead: return "nonAtomicLookahead"
237-
case .lookbehind: return "lookbehind"
238-
case .negativeLookbehind: return "negativeLookbehind"
239-
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
240-
case .scriptRun: return "scriptRun"
241-
case .atomicScriptRun: return "atomicScriptRun"
242-
case .changeMatchingOptions(let seq, let isIsolated):
243-
return "changeMatchingOptions<\(seq), \(isIsolated)>"
231+
case .capture: return "capture"
232+
case .namedCapture(let s): return "capture<\(s.value)>"
233+
case .balancedCapture(let b): return "balanced capture \(b)"
234+
case .nonCapture: return "nonCapture"
235+
case .nonCaptureReset: return "nonCaptureReset"
236+
case .atomicNonCapturing: return "atomicNonCapturing"
237+
case .lookahead: return "lookahead"
238+
case .negativeLookahead: return "negativeLookahead"
239+
case .nonAtomicLookahead: return "nonAtomicLookahead"
240+
case .lookbehind: return "lookbehind"
241+
case .negativeLookbehind: return "negativeLookbehind"
242+
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
243+
case .scriptRun: return "scriptRun"
244+
case .atomicScriptRun: return "atomicScriptRun"
245+
case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>"
244246
}
245247
}
246248
}

0 commit comments

Comments
 (0)