Skip to content

Commit 9a088aa

Browse files
committed
Change matching option scoping behavior to match PCRE
Previously we would always parse a "change matching option" sequence as a group, and for the isolated syntax e.g `(?x)`, we would have it implicitly wrap everything after in the same group by having it do a recursive parse. This matched the Oniguruma behavior for such isolated groups, and was easy to implement, but its behavior is quite unintuitive when it comes to alternations, as e.g `a(?x)b|c` becomes `a(?x:b|c)`, which may not be expected. Instead, let's follow PCRE's behavior by having such isolated cases affect the syntax options for the remainder of the current group, including across alternation branches. This is done by lexing such cases as atoms (as they aren't really group-like anymore), and having them change the syntax options when we encounter them. The existing scoping rules around groups take care of resetting the options when we exit the scope.
1 parent 7b43ad0 commit 9a088aa

File tree

14 files changed

+353
-300
lines changed

14 files changed

+353
-300
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ extension AST {
7272

7373
// (*ACCEPT), (*FAIL), ...
7474
case backtrackingDirective(BacktrackingDirective)
75+
76+
// (?i), (?i-m), ...
77+
case changeMatchingOptions(MatchingOptionSequence)
7578
}
7679
}
7780
}
@@ -91,6 +94,7 @@ extension AST.Atom {
9194
case .subpattern(let v): return v
9295
case .callout(let v): return v
9396
case .backtrackingDirective(let v): return v
97+
case .changeMatchingOptions(let v): return v
9498
case .any: return nil
9599
case .startOfLine: return nil
96100
case .endOfLine: return nil
@@ -683,7 +687,7 @@ extension AST.Atom {
683687
return nil
684688

685689
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
686-
.callout, .backtrackingDirective:
690+
.callout, .backtrackingDirective, .changeMatchingOptions:
687691
return nil
688692
}
689693
}
@@ -723,7 +727,7 @@ extension AST.Atom {
723727

724728
case .property, .escaped, .any, .startOfLine, .endOfLine,
725729
.backreference, .subpattern, .namedCharacter, .callout,
726-
.backtrackingDirective:
730+
.backtrackingDirective, .changeMatchingOptions:
727731
return nil
728732
}
729733
}
@@ -732,6 +736,8 @@ extension AST.Atom {
732736
switch kind {
733737
case .backtrackingDirective(let b):
734738
return b.isQuantifiable
739+
case .changeMatchingOptions:
740+
return false
735741
// TODO: Are callouts quantifiable?
736742
default:
737743
return true

Sources/_RegexParser/Regex/AST/Group.swift

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,7 @@ extension AST {
6868
case atomicScriptRun
6969

7070
// (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:)
71-
// Isolated options are written as e.g (?i), and implicitly form a group
72-
// containing all the following elements of the current group.
73-
case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool)
71+
case changeMatchingOptions(MatchingOptionSequence)
7472

7573
// NOTE: Comments appear to be groups, but are not parsed
7674
// the same. They parse more like quotes, so are not
@@ -87,21 +85,6 @@ extension AST.Group.Kind {
8785
}
8886
}
8987

90-
/// Whether this is a group with an implicit scope, e.g isolated matching
91-
/// options implicitly become parent groups for the rest of the elements in
92-
/// the current group:
93-
///
94-
/// (a(?i)bc)de -> (a(?i:bc))de
95-
///
96-
public var hasImplicitScope: Bool {
97-
switch self {
98-
case .changeMatchingOptions(_, let isIsolated):
99-
return isIsolated
100-
default:
101-
return false
102-
}
103-
}
104-
10588
/// If this is a named group, its name, `nil` otherwise.
10689
public var name: String? {
10790
switch self {

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,21 @@ extension Source {
706706
return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: [])
707707
}
708708

709+
/// A matching option changing atom.
710+
///
711+
/// '(?' MatchingOptionSeq ')'
712+
///
713+
mutating func lexChangeMatchingOptionAtom(
714+
) throws -> AST.MatchingOptionSequence? {
715+
try tryEating { src in
716+
guard src.tryEat(sequence: "(?"),
717+
let seq = try src.lexMatchingOptionSequence()
718+
else { return nil }
719+
try src.expect(")")
720+
return seq
721+
}
722+
}
723+
709724
/// Try to consume explicitly spelled-out PCRE2 group syntax.
710725
mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? {
711726
tryEating { src in
@@ -871,22 +886,13 @@ extension Source {
871886

872887
// Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
873888
if let seq = try src.lexMatchingOptionSequence() {
874-
if src.tryEat(":") {
875-
return .changeMatchingOptions(seq, isIsolated: false)
876-
}
877-
// If this isn't start of an explicit group, we should have an
878-
// implicit group that covers the remaining elements of the current
879-
// group.
880-
// TODO: This implicit scoping behavior matches Oniguruma, but PCRE
881-
// also does it across alternations, which will require additional
882-
// handling.
883-
guard src.tryEat(")") else {
889+
guard src.tryEat(":") else {
884890
if let next = src.peek() {
885891
throw ParseError.invalidMatchingOption(next)
886892
}
887893
throw ParseError.expected(")")
888894
}
889-
return .changeMatchingOptions(seq, isIsolated: true)
895+
return .changeMatchingOptions(seq)
890896
}
891897

892898
guard let next = src.peek() else {
@@ -1035,18 +1041,8 @@ extension Source {
10351041
context: ParsingContext
10361042
) throws -> Located<AST.Group.Kind>? {
10371043
try tryEating { src in
1038-
guard src.tryEat(sequence: "(?"),
1039-
let group = try src.lexGroupStart(context: context)
1040-
else { return nil }
1041-
1042-
// Implicitly scoped groups are not supported here.
1043-
guard !group.value.hasImplicitScope else {
1044-
throw LocatedError(
1045-
ParseError.unsupportedCondition("implicitly scoped group"),
1046-
group.location
1047-
)
1048-
}
1049-
return group
1044+
guard src.tryEat(sequence: "(?") else { return nil }
1045+
return try src.lexGroupStart(context: context)
10501046
}
10511047
}
10521048

@@ -1177,17 +1173,19 @@ extension Source {
11771173
allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false
11781174
) throws -> AST.Reference? {
11791175
let kind = try recordLoc { src -> AST.Reference.Kind? in
1180-
// Note this logic should match canLexNumberedReference.
1181-
if src.tryEat("+") {
1182-
return .relative(try src.expectNumber().value)
1183-
}
1184-
if src.tryEat("-") {
1185-
return .relative(try -src.expectNumber().value)
1186-
}
1187-
if let num = try src.lexNumber() {
1188-
return .absolute(num.value)
1176+
try src.tryEating { src in
1177+
// Note this logic should match canLexNumberedReference.
1178+
if src.tryEat("+"), let num = try src.lexNumber() {
1179+
return .relative(num.value)
1180+
}
1181+
if src.tryEat("-"), let num = try src.lexNumber() {
1182+
return .relative(-num.value)
1183+
}
1184+
if let num = try src.lexNumber() {
1185+
return .absolute(num.value)
1186+
}
1187+
return nil
11891188
}
1190-
return nil
11911189
}
11921190
guard let kind = kind else { return nil }
11931191
guard allowWholePatternRef || kind.value != .recurseWholePattern else {
@@ -1431,6 +1429,12 @@ extension Source {
14311429
// The start of an Oniguruma 'of-contents' callout.
14321430
if src.tryEat("{") { return true }
14331431

1432+
// See if we can lex a change-matching-option sequence that terminates in
1433+
// ')'. Such a sequence is an atom.
1434+
if (try? src.lexMatchingOptionSequence()) != nil {
1435+
return src.tryEat(")")
1436+
}
1437+
14341438
return false
14351439
}
14361440
// The start of a backreference directive or Oniguruma named callout.
@@ -1698,6 +1702,11 @@ extension Source {
16981702
return ref.value
16991703
}
17001704

1705+
// Change matching options atom (?i), (?x-i), ...
1706+
if let seq = try src.lexChangeMatchingOptionAtom() {
1707+
return .changeMatchingOptions(seq)
1708+
}
1709+
17011710
// (*ACCEPT), (*FAIL), (*MARK), ...
17021711
if let b = try src.lexBacktrackingDirective() {
17031712
return .backtrackingDirective(b)

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -269,39 +269,50 @@ extension Parser {
269269
loc(start)))
270270
}
271271

272-
/// Perform a recursive parse for the body of a group.
273-
mutating func parseGroupBody(
274-
start: Source.Position, _ kind: AST.Located<AST.Group.Kind>
275-
) throws -> AST.Group {
276-
context.recordGroup(kind.value)
277-
272+
/// Apply the syntax options of a given matching option sequence to the
273+
/// current set of options.
274+
private mutating func applySyntaxOptions(
275+
of opts: AST.MatchingOptionSequence
276+
) {
278277
// Check if we're introducing or removing extended syntax.
279278
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
280279
// handles non-semantic whitespace in a custom character class. Other
281280
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
282281
// treat (?x) and (?xx) as the same option here. If we ever get a strict
283282
// PCRE mode, we will need to change this to handle that.
284-
let currentSyntax = context.syntax
285-
if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
286-
if c.resetsCurrentOptions {
287-
context.syntax.remove(.extendedSyntax)
288-
}
289-
if c.adding.contains(where: \.isAnyExtended) {
290-
context.syntax.insert(.extendedSyntax)
291-
}
292-
if c.removing.contains(where: \.isAnyExtended) {
293-
context.syntax.remove(.extendedSyntax)
294-
}
283+
if opts.resetsCurrentOptions {
284+
context.syntax.remove(.extendedSyntax)
285+
}
286+
if opts.adding.contains(where: \.isAnyExtended) {
287+
context.syntax.insert(.extendedSyntax)
295288
}
289+
if opts.removing.contains(where: \.isAnyExtended) {
290+
context.syntax.remove(.extendedSyntax)
291+
}
292+
}
293+
294+
/// Apply the syntax options of a matching option changing group to the
295+
/// current set of options.
296+
private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
297+
if case .changeMatchingOptions(let seq) = group {
298+
applySyntaxOptions(of: seq)
299+
}
300+
}
301+
302+
/// Perform a recursive parse for the body of a group.
303+
mutating func parseGroupBody(
304+
start: Source.Position, _ kind: AST.Located<AST.Group.Kind>
305+
) throws -> AST.Group {
306+
context.recordGroup(kind.value)
307+
308+
let currentSyntax = context.syntax
309+
applySyntaxOptions(of: kind.value)
296310
defer {
297311
context.syntax = currentSyntax
298312
}
299313

300314
let child = try parseNode()
301-
// An implicit scoped group has already consumed its closing paren.
302-
if !kind.value.hasImplicitScope {
303-
try source.expect(")")
304-
}
315+
try source.expect(")")
305316
return .init(kind, child, loc(start))
306317
}
307318

@@ -393,6 +404,11 @@ extension Parser {
393404
}
394405

395406
if let atom = try source.lexAtom(context: context) {
407+
// If we have a change matching options atom, apply the syntax options. We
408+
// already take care of scoping syntax options within a group.
409+
if case .changeMatchingOptions(let opts) = atom.kind {
410+
applySyntaxOptions(of: opts)
411+
}
396412
// TODO: track source locations
397413
return .atom(atom)
398414
}

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ extension AST.Atom {
156156

157157
case .backtrackingDirective(let d): return "\(d)"
158158

159+
case .changeMatchingOptions(let opts):
160+
return "changeMatchingOptions<\(opts)>"
161+
159162
case .char, .scalar:
160163
fatalError("Unreachable")
161164
}
@@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable {
225228
extension AST.Group.Kind: _ASTPrintable {
226229
public var _dumpBase: String {
227230
switch self {
228-
case .capture: return "capture"
229-
case .namedCapture(let s): return "capture<\(s.value)>"
230-
case .balancedCapture(let b): return "balanced capture \(b)"
231-
case .nonCapture: return "nonCapture"
232-
case .nonCaptureReset: return "nonCaptureReset"
233-
case .atomicNonCapturing: return "atomicNonCapturing"
234-
case .lookahead: return "lookahead"
235-
case .negativeLookahead: return "negativeLookahead"
236-
case .nonAtomicLookahead: return "nonAtomicLookahead"
237-
case .lookbehind: return "lookbehind"
238-
case .negativeLookbehind: return "negativeLookbehind"
239-
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
240-
case .scriptRun: return "scriptRun"
241-
case .atomicScriptRun: return "atomicScriptRun"
242-
case .changeMatchingOptions(let seq, let isIsolated):
243-
return "changeMatchingOptions<\(seq), \(isIsolated)>"
231+
case .capture: return "capture"
232+
case .namedCapture(let s): return "capture<\(s.value)>"
233+
case .balancedCapture(let b): return "balanced capture \(b)"
234+
case .nonCapture: return "nonCapture"
235+
case .nonCaptureReset: return "nonCaptureReset"
236+
case .atomicNonCapturing: return "atomicNonCapturing"
237+
case .lookahead: return "lookahead"
238+
case .negativeLookahead: return "negativeLookahead"
239+
case .nonAtomicLookahead: return "nonAtomicLookahead"
240+
case .lookbehind: return "lookbehind"
241+
case .negativeLookbehind: return "negativeLookbehind"
242+
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
243+
case .scriptRun: return "scriptRun"
244+
case .atomicScriptRun: return "atomicScriptRun"
245+
case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>"
244246
}
245247
}
246248
}

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen {
3434
case let .symbolicReference(id):
3535
builder.buildUnresolvedReference(id: id)
3636

37+
case let .changeMatchingOptions(optionSequence):
38+
options.apply(optionSequence)
39+
3740
case let .unconverted(astAtom):
3841
if let consumer = try astAtom.generateConsumer(options) {
3942
builder.buildConsume(by: consumer)
@@ -336,7 +339,7 @@ extension Compiler.ByteCodeGen {
336339
case .capture, .namedCapture, .balancedCapture:
337340
throw Unreachable("These should produce a capture node")
338341

339-
case .changeMatchingOptions(let optionSequence, _):
342+
case .changeMatchingOptions(let optionSequence):
340343
options.apply(optionSequence)
341344
try emitNode(child)
342345

@@ -562,6 +565,9 @@ extension Compiler.ByteCodeGen {
562565
}
563566

564567
case let .capture(_, refId, child):
568+
options.beginScope()
569+
defer { options.endScope() }
570+
565571
let cap = builder.makeCapture(id: refId)
566572
switch child {
567573
case let .matcher(_, m):

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ extension DSLTree.Atom {
100100
// TODO: Should we handle?
101101
return nil
102102

103+
case .changeMatchingOptions:
104+
// TODO: Should we handle?
105+
return nil
106+
103107
case let .unconverted(a):
104108
return try a.generateConsumer(opts)
105109
}
@@ -178,7 +182,8 @@ extension AST.Atom {
178182
return nil
179183

180184
case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
181-
.backreference, .subpattern, .callout, .backtrackingDirective:
185+
.backreference, .subpattern, .callout, .backtrackingDirective,
186+
.changeMatchingOptions:
182187
// FIXME: implement
183188
return nil
184189
}

0 commit comments

Comments
 (0)