Skip to content

Commit 177f521

Browse files
committed
Add DSL support for backreferences.
Allow `capture` and `tryCapture` to assign the captured value to a `Reference`, which can then be used as a regex component later in the same scope. Initializing a `Reference` creates a unique identifier, and the compiler converts that identifier into an absolute backreference offset.

Example:

```swift
let regex = Regex {
  let a = Reference()
  let b = Reference()
  capture("abc", as: a)
  capture("def", as: b)
  a
  capture(b)
}
```
1 parent 8ed3fca commit 177f521

File tree

10 files changed

+449
-159
lines changed

10 files changed

+449
-159
lines changed

Sources/VariadicsGenerator/VariadicsGenerator.swift

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -538,52 +538,65 @@ struct VariadicsGenerator: ParsableCommand {
538538
output("""
539539
// MARK: - Non-builder capture arity \(arity)
540540
541-
public func capture<\(genericParams)>(_ component: R) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
542-
.init(node: .group(.capture, component.regex.root))
541+
public func capture<\(genericParams)>(
542+
_ component: R, as reference: Reference? = nil
543+
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
544+
.init(node: .group(.capture, component.regex.root, reference?.id))
543545
}
544546
545547
public func capture<\(genericParams), NewCapture>(
546-
_ component: R, transform: @escaping (Substring) -> NewCapture
548+
_ component: R,
549+
as reference: Reference? = nil,
550+
transform: @escaping (Substring) -> NewCapture
547551
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
548552
.init(node: .groupTransform(
549553
.capture,
550554
component.regex.root,
551555
CaptureTransform(resultType: NewCapture.self) {
552556
transform($0) as Any
553-
}))
557+
},
558+
reference?.id))
554559
}
555560
556561
public func tryCapture<\(genericParams), NewCapture>(
557-
_ component: R, transform: @escaping (Substring) throws -> NewCapture
562+
_ component: R,
563+
as reference: Reference? = nil,
564+
transform: @escaping (Substring) throws -> NewCapture
558565
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
559566
.init(node: .groupTransform(
560567
.capture,
561568
component.regex.root,
562569
CaptureTransform(resultType: NewCapture.self) {
563570
try transform($0) as Any
564-
}))
571+
},
572+
reference?.id))
565573
}
566574
567575
public func tryCapture<\(genericParams), NewCapture>(
568-
_ component: R, transform: @escaping (Substring) -> NewCapture?
576+
_ component: R,
577+
as reference: Reference? = nil,
578+
transform: @escaping (Substring) -> NewCapture?
569579
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
570580
.init(node: .groupTransform(
571581
.capture,
572582
component.regex.root,
573583
CaptureTransform(resultType: NewCapture.self) {
574584
transform($0) as Any?
575-
}))
585+
},
586+
reference?.id))
576587
}
577588
578589
// MARK: - Builder capture arity \(arity)
579590
580591
public func capture<\(genericParams)>(
592+
as reference: Reference? = nil,
581593
@RegexBuilder _ component: () -> R
582594
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
583-
.init(node: .group(.capture, component().regex.root))
595+
.init(node: .group(.capture, component().regex.root, reference?.id))
584596
}
585597
586598
public func capture<\(genericParams), NewCapture>(
599+
as reference: Reference? = nil,
587600
@RegexBuilder _ component: () -> R,
588601
transform: @escaping (Substring) -> NewCapture
589602
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -592,10 +605,12 @@ struct VariadicsGenerator: ParsableCommand {
592605
component().regex.root,
593606
CaptureTransform(resultType: NewCapture.self) {
594607
transform($0) as Any
595-
}))
608+
},
609+
reference?.id))
596610
}
597611
598612
public func tryCapture<\(genericParams), NewCapture>(
613+
as reference: Reference? = nil,
599614
@RegexBuilder _ component: () -> R,
600615
transform: @escaping (Substring) throws -> NewCapture
601616
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -604,10 +619,12 @@ struct VariadicsGenerator: ParsableCommand {
604619
component().regex.root,
605620
CaptureTransform(resultType: NewCapture.self) {
606621
try transform($0) as Any
607-
}))
622+
},
623+
reference?.id))
608624
}
609625
610626
public func tryCapture<\(genericParams), NewCapture>(
627+
as reference: Reference? = nil,
611628
@RegexBuilder _ component: () -> R,
612629
transform: @escaping (Substring) -> NewCapture?
613630
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -616,7 +633,8 @@ struct VariadicsGenerator: ParsableCommand {
616633
component().regex.root,
617634
CaptureTransform(resultType: NewCapture.self) {
618635
transform($0) as Any?
619-
}))
636+
},
637+
reference?.id))
620638
}
621639
622640
""")

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ extension Compiler {
44
struct ByteCodeGen {
55
var options: MatchingOptions
66
var builder = Program.Builder()
7+
var captureCount: Int = 0
8+
var referencedCaptureOffsets: [Reference.ID: Int] = [:]
79

810
mutating func finish(
911
) throws -> Program {
@@ -35,6 +37,12 @@ extension Compiler.ByteCodeGen {
3537
case let .backreference(ref):
3638
try emitBackreference(ref)
3739

40+
case let .symbolicReference(id):
41+
guard let offset = referencedCaptureOffsets[id] else {
42+
throw RegexCompilationError.uncapturedReference
43+
}
44+
try emitBackreference(.init(.absolute(offset+1), innerLoc: .fake))
45+
3846
case let .unconverted(astAtom):
3947
if let consumer = try astAtom.generateConsumer(options) {
4048
builder.buildConsume(by: consumer)
@@ -246,9 +254,24 @@ extension Compiler.ByteCodeGen {
246254
builder.label(success)
247255
}
248256

257+
/// Accounts for one more capture group and, when the capture is bound to a
/// reference, remembers which capture position that reference refers to.
///
/// - Parameter id: The identifier of the `Reference` attached to the capture,
///   or `nil` when the capture is not bound to a reference.
mutating func registerCapture(id: Reference.ID?) {
  // Every capture bumps the running count, whether or not it is referenced.
  // Deferring the increment means the offset stored below is the count
  // *before* this capture, i.e. the capture's zero-based position.
  defer { captureCount += 1 }

  guard let referenceID = id else { return }

  let previousOffset = referencedCaptureOffsets.updateValue(
    captureCount, forKey: referenceID)
  // Debug-only check that a reference is bound to at most one capture.
  assert(previousOffset == nil)
}
265+
249266
mutating func emitGroup(
250-
_ kind: AST.Group.Kind, _ child: DSLTree.Node
267+
_ kind: AST.Group.Kind,
268+
_ child: DSLTree.Node,
269+
_ referenceID: Reference.ID?
251270
) throws -> CaptureRegister? {
271+
guard kind.isCapturing || referenceID == nil else {
272+
throw Unreachable("Reference ID shouldn't exist for non-capturing groups")
273+
}
274+
252275
options.beginScope()
253276
defer { options.endScope() }
254277

@@ -263,6 +286,7 @@ extension Compiler.ByteCodeGen {
263286
throw Unreachable("TODO: reason")
264287

265288
case .capture, .namedCapture:
289+
registerCapture(id: referenceID)
266290
let cap = builder.makeCapture()
267291
builder.buildBeginCapture(cap)
268292
try emitNode(child)
@@ -496,8 +520,8 @@ extension Compiler.ByteCodeGen {
496520
try emitConcatenationComponent(child)
497521
}
498522

499-
case let .group(kind, child):
500-
_ = try emitGroup(kind, child)
523+
case let .group(kind, child, referenceID):
524+
_ = try emitGroup(kind, child, referenceID)
501525

502526
case .conditional:
503527
throw Unsupported("Conditionals")
@@ -521,8 +545,8 @@ extension Compiler.ByteCodeGen {
521545
case let .convertedRegexLiteral(n, _):
522546
try emitNode(n)
523547

524-
case let .groupTransform(kind, child, t):
525-
guard let cap = try emitGroup(kind, child) else {
548+
case let .groupTransform(kind, child, t, referenceID):
549+
guard let cap = try emitGroup(kind, child, referenceID) else {
526550
assertionFailure("""
527551
What does it mean to not have a capture to transform?
528552
""")

Sources/_StringProcessing/Compiler.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,15 @@ public func _compileRegex(
4343
return Executor(program: program)
4444
}
4545

46+
/// An error produced when compiling a regular expression.
public enum RegexCompilationError: Error, CustomStringConvertible {
  // TODO: Source location?

  /// A reference was used before any capture had been registered for it.
  case uncapturedReference

  /// A human-readable explanation of the error.
  public var description: String {
    let message: String
    switch self {
    case .uncapturedReference:
      message = "Found a reference used before it captured any match."
    }
    return message
  }
}

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ extension DSLTree.Atom {
8383
// TODO: Should we handle?
8484
return nil
8585

86+
case .symbolicReference:
87+
// TODO: Should we handle?
88+
return nil
89+
8690
case let .unconverted(a):
8791
return try a.generateConsumer(opts)
8892
}

Sources/_StringProcessing/Legacy/LegacyCompile.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ func compile(
5454
throw Unsupported("Unsupported: \(a)")
5555
}
5656

57-
case let .group(kind, child):
57+
case let .group(kind, child, _):
5858
switch kind {
5959
case .nonCapture:
6060
instructions.append(.beginGroup)
@@ -71,13 +71,13 @@ func compile(
7171
throw Unsupported("Unsupported group \(kind)")
7272
}
7373

74-
case let .groupTransform(kind, child, transform) where kind == .capture:
74+
case let .groupTransform(kind, child, transform, _) where kind == .capture:
7575
instructions.append(.beginCapture)
7676
try compileNode(child)
7777
instructions.append(.endCapture(transform: transform))
7878
return
7979

80-
case let .groupTransform(kind, _, _):
80+
case let .groupTransform(kind, _, _, _):
8181
throw Unsupported(
8282
"Unsupported group transform \(kind)")
8383

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,10 @@ extension PrettyPrinter {
8787
}
8888
}
8989

90-
case let .group(kind, child):
90+
case let .group(kind, child, referenceID):
  // Print the group header as `Group(<kind>)`, appending
  // `, referenceID: <id>` when the group is bound to a reference.
  let kind = kind._patternBase
  let refIDString = referenceID.map { ", referenceID: \($0)" } ?? ""
  // The closing parenthesis must follow the optional reference ID; without it
  // the header is unbalanced (`Group(<kind>`), unlike the pre-change output.
  printBlock("Group(\(kind)\(refIDString))") { printer in
    printer.printAsPattern(convertedFromAST: child)
  }
9596

@@ -124,6 +125,8 @@ extension PrettyPrinter {
124125
print("/* TOOD: assertions */")
125126
case .backreference:
126127
print("/* TOOD: backreferences */")
128+
case .symbolicReference:
129+
print("/* TOOD: symbolic backreferences */")
127130
}
128131

129132
case .trivia:

Sources/_StringProcessing/RegexDSL/DSL.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,23 @@ public func choiceOf<R: RegexProtocol>(
184184
) -> R {
185185
builder()
186186
}
187+
188+
// MARK: - Backreference
189+
190+
/// A handle that can be attached to a capture and later used as a regex
/// component that refers back to that capture.
///
/// Each `Reference` gets its own identifier when it is created.
public struct Reference: RegexProtocol {
  /// Identifier that distinguishes one `Reference` from another.
  struct ID: Hashable {
    /// The value the next `ID` will receive.
    // NOTE(review): this is unsynchronized static mutable state — confirm that
    // references are never created concurrently from multiple threads.
    private static var counter: Int = 0

    /// The numeric value backing this identifier.
    var base: Int

    /// Takes the current counter value, then advances the counter by one.
    init() {
      defer { ID.counter += 1 }
      base = ID.counter
    }
  }

  /// The identifier assigned to this reference at initialization.
  let id: ID

  /// Creates a reference with a freshly allocated identifier.
  public init() {
    id = ID()
  }

  /// A regex consisting of a single symbolic-reference atom carrying `id`.
  public var regex: Regex<Substring> {
    return Regex<Substring>(node: .atom(.symbolicReference(id)))
  }
}

Sources/_StringProcessing/RegexDSL/DSLTree.swift

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ extension DSLTree {
3030
case concatenation([Node])
3131

3232
/// (...)
33-
case group(AST.Group.Kind, Node)
33+
case group(AST.Group.Kind, Node, Reference.ID? = nil)
3434

3535
/// (?(cond) true-branch | false-branch)
3636
///
@@ -78,7 +78,8 @@ extension DSLTree {
7878
case groupTransform(
7979
AST.Group.Kind,
8080
Node,
81-
CaptureTransform)
81+
CaptureTransform,
82+
Reference.ID? = nil)
8283

8384
case consumer(_ConsumerInterface)
8485

@@ -116,6 +117,7 @@ extension DSLTree {
116117

117118
case assertion(AST.Atom.AssertionKind)
118119
case backreference(AST.Reference)
120+
case symbolicReference(Reference.ID)
119121

120122
case unconverted(AST.Atom)
121123
}
@@ -158,8 +160,8 @@ extension DSLTree.Node {
158160
// Treat this transparently
159161
return n.children
160162

161-
case let .group(_, n): return [n]
162-
case let .groupTransform(_, n, _): return [n]
163+
case let .group(_, n, _): return [n]
164+
case let .groupTransform(_, n, _, _): return [n]
163165
case let .quantification(_, _, n): return [n]
164166

165167
case let .conditional(_, t, f): return [t,f]
@@ -220,8 +222,8 @@ extension DSLTree {
220222
extension DSLTree.Node {
221223
var hasCapture: Bool {
222224
switch self {
223-
case let .group(k, _) where k.isCapturing,
224-
let .groupTransform(k, _, _) where k.isCapturing:
225+
case let .group(k, _, _) where k.isCapturing,
226+
let .groupTransform(k, _, _, _) where k.isCapturing:
225227
return true
226228
case let .convertedRegexLiteral(n, re):
227229
assert(n.hasCapture == re.hasCapture)
@@ -253,10 +255,10 @@ extension DSLTree.Node {
253255
case let .concatenation(children):
254256
return constructor.concatenating(children)
255257

256-
case let .group(kind, child):
258+
case let .group(kind, child, _):
257259
return constructor.grouping(child, as: kind)
258260

259-
case let .groupTransform(kind, child, transform):
261+
case let .groupTransform(kind, child, transform, _):
260262
return constructor.grouping(
261263
child, as: kind, withTransform: transform)
262264

0 commit comments

Comments
 (0)