Skip to content

Commit 8417aea

Browse files
committed
Add DSL support for backreferences.
Allow `capture` and `tryCapture` to assign the captured value to a `Reference`, which can then be used as a regex later in the same scope. Initializing a `Reference` creates a unique identifier, and the compiler converts that identifier into an absolute backreference offset.

-----

Example:

```swift
let regex = Regex {
  let a = Reference()
  let b = Reference()
  capture("abc", as: a)
  capture("def", as: b)
  a
  capture(b)
}
```
1 parent 5e2b77c commit 8417aea

File tree

10 files changed

+496
-163
lines changed

10 files changed

+496
-163
lines changed

Sources/VariadicsGenerator/VariadicsGenerator.swift

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -538,52 +538,65 @@ struct VariadicsGenerator: ParsableCommand {
538538
output("""
539539
// MARK: - Non-builder capture arity \(arity)
540540
541-
public func capture<\(genericParams)>(_ component: R) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
542-
.init(node: .group(.capture, component.regex.root))
541+
public func capture<\(genericParams)>(
542+
_ component: R, as reference: Reference? = nil
543+
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
544+
.init(node: .group(.capture, component.regex.root, reference?.id))
543545
}
544546
545547
public func capture<\(genericParams), NewCapture>(
546-
_ component: R, transform: @escaping (Substring) -> NewCapture
548+
_ component: R,
549+
as reference: Reference? = nil,
550+
transform: @escaping (Substring) -> NewCapture
547551
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
548552
.init(node: .groupTransform(
549553
.capture,
550554
component.regex.root,
551555
CaptureTransform(resultType: NewCapture.self) {
552556
transform($0) as Any
553-
}))
557+
},
558+
reference?.id))
554559
}
555560
556561
public func tryCapture<\(genericParams), NewCapture>(
557-
_ component: R, transform: @escaping (Substring) throws -> NewCapture
562+
_ component: R,
563+
as reference: Reference? = nil,
564+
transform: @escaping (Substring) throws -> NewCapture
558565
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
559566
.init(node: .groupTransform(
560567
.capture,
561568
component.regex.root,
562569
CaptureTransform(resultType: NewCapture.self) {
563570
try transform($0) as Any
564-
}))
571+
},
572+
reference?.id))
565573
}
566574
567575
public func tryCapture<\(genericParams), NewCapture>(
568-
_ component: R, transform: @escaping (Substring) -> NewCapture?
576+
_ component: R,
577+
as reference: Reference? = nil,
578+
transform: @escaping (Substring) -> NewCapture?
569579
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
570580
.init(node: .groupTransform(
571581
.capture,
572582
component.regex.root,
573583
CaptureTransform(resultType: NewCapture.self) {
574584
transform($0) as Any?
575-
}))
585+
},
586+
reference?.id))
576587
}
577588
578589
// MARK: - Builder capture arity \(arity)
579590
580591
public func capture<\(genericParams)>(
592+
as reference: Reference? = nil,
581593
@RegexBuilder _ component: () -> R
582594
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
583-
.init(node: .group(.capture, component().regex.root))
595+
.init(node: .group(.capture, component().regex.root, reference?.id))
584596
}
585597
586598
public func capture<\(genericParams), NewCapture>(
599+
as reference: Reference? = nil,
587600
@RegexBuilder _ component: () -> R,
588601
transform: @escaping (Substring) -> NewCapture
589602
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -592,10 +605,12 @@ struct VariadicsGenerator: ParsableCommand {
592605
component().regex.root,
593606
CaptureTransform(resultType: NewCapture.self) {
594607
transform($0) as Any
595-
}))
608+
},
609+
reference?.id))
596610
}
597611
598612
public func tryCapture<\(genericParams), NewCapture>(
613+
as reference: Reference? = nil,
599614
@RegexBuilder _ component: () -> R,
600615
transform: @escaping (Substring) throws -> NewCapture
601616
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -604,10 +619,12 @@ struct VariadicsGenerator: ParsableCommand {
604619
component().regex.root,
605620
CaptureTransform(resultType: NewCapture.self) {
606621
try transform($0) as Any
607-
}))
622+
},
623+
reference?.id))
608624
}
609625
610626
public func tryCapture<\(genericParams), NewCapture>(
627+
as reference: Reference? = nil,
611628
@RegexBuilder _ component: () -> R,
612629
transform: @escaping (Substring) -> NewCapture?
613630
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
@@ -616,7 +633,8 @@ struct VariadicsGenerator: ParsableCommand {
616633
component().regex.root,
617634
CaptureTransform(resultType: NewCapture.self) {
618635
transform($0) as Any?
619-
}))
636+
},
637+
reference?.id))
620638
}
621639
622640
""")

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@ extension Compiler {
44
struct ByteCodeGen {
55
var options: MatchingOptions
66
var builder = Program.Builder()
7+
var captureCount: Int = 0
8+
var unresolvedReferences: [Reference.ID: Set<InstructionAddress>] = [:]
9+
var captureOffsets: [Reference.ID: Int] = [:]
710

811
/// Finalizes code generation and returns the assembled program.
///
/// Symbolic references are resolved first, then the accept instruction is
/// appended, and finally the builder assembles the program.
///
/// - Throws: Whatever `resolveReferences()` or `builder.assemble()` throws.
mutating func finish() throws -> Program {
  // Placeholder backreferences have to be patched before the instruction
  // stream is sealed with `accept` and assembled.
  try resolveReferences()
  builder.buildAccept()
  let program = try builder.assemble()
  return program
}
@@ -31,6 +35,9 @@ extension Compiler.ByteCodeGen {
3135
case let .backreference(ref):
3236
try emitBackreference(ref)
3337

38+
case let .symbolicReference(id):
39+
emitUnresolvedReference(id: id)
40+
3441
case let .unconverted(astAtom):
3542
if let consumer = try astAtom.generateConsumer(options) {
3643
builder.buildConsume(by: consumer)
@@ -292,9 +299,41 @@ extension Compiler.ByteCodeGen {
292299
}
293300
}
294301

302+
/// Rewrites every instruction recorded in `unresolvedReferences` into a
/// backreference to the capture offset registered for the same reference ID.
///
/// - Throws: `RegexCompilationError.uncapturedReference` when a referenced ID
///   has no entry in `captureOffsets`.
mutating func resolveReferences() throws {
  for (referenceID, addresses) in unresolvedReferences {
    guard let captureOffset = captureOffsets[referenceID] else {
      throw RegexCompilationError.uncapturedReference
    }
    // Every use of the same reference resolves to the same instruction, so
    // build it once per reference ID.
    let resolved = Instruction(
      .backreference, .init(capture: .init(captureOffset)))
    for address in addresses {
      builder.instructions[address.rawValue] = resolved
    }
  }
}
313+
314+
/// Emits a placeholder backreference (to capture 0) and records its address
/// under `id` in `unresolvedReferences`, where `resolveReferences()` later
/// finds and overwrites it.
mutating func emitUnresolvedReference(id: Reference.ID) {
  builder.buildBackreference(.init(0))
  // The placeholder is the instruction that was just appended.
  let placeholderAddress = builder.lastInstructionAddress
  unresolvedReferences[id, default: []].insert(placeholderAddress)
}
318+
319+
/// Accounts for one more capture and, when `id` is non-nil, records the
/// current capture count as the offset associated with that reference.
///
/// In debug builds this asserts that `id` had no offset recorded before.
mutating func registerCapture(id: Reference.ID?) {
  // The count advances for every capture, with or without a reference.
  defer { captureCount += 1 }
  guard let id = id else { return }
  let previousOffset = captureOffsets.updateValue(captureCount, forKey: id)
  assert(previousOffset == nil)
}
327+
295328
mutating func emitGroup(
296-
_ kind: AST.Group.Kind, _ child: DSLTree.Node
329+
_ kind: AST.Group.Kind,
330+
_ child: DSLTree.Node,
331+
_ referenceID: Reference.ID?
297332
) throws -> CaptureRegister? {
333+
guard kind.isCapturing || referenceID == nil else {
334+
throw Unreachable("Reference ID shouldn't exist for non-capturing groups")
335+
}
336+
298337
options.beginScope()
299338
defer { options.endScope() }
300339

@@ -319,6 +358,7 @@ extension Compiler.ByteCodeGen {
319358
throw Unreachable("TODO: reason")
320359

321360
case .capture, .namedCapture:
361+
registerCapture(id: referenceID)
322362
let cap = builder.makeCapture()
323363
builder.buildBeginCapture(cap)
324364
try emitNode(child)
@@ -552,8 +592,8 @@ extension Compiler.ByteCodeGen {
552592
try emitConcatenationComponent(child)
553593
}
554594

555-
case let .group(kind, child):
556-
_ = try emitGroup(kind, child)
595+
case let .group(kind, child, referenceID):
596+
_ = try emitGroup(kind, child, referenceID)
557597

558598
case .conditional:
559599
throw Unsupported("Conditionals")
@@ -592,8 +632,8 @@ extension Compiler.ByteCodeGen {
592632
case let .convertedRegexLiteral(n, _):
593633
try emitNode(n)
594634

595-
case let .groupTransform(kind, child, t):
596-
guard let cap = try emitGroup(kind, child) else {
635+
case let .groupTransform(kind, child, t, referenceID):
636+
guard let cap = try emitGroup(kind, child, referenceID) else {
597637
assertionFailure("""
598638
What does it mean to not have a capture to transform?
599639
""")

Sources/_StringProcessing/Compiler.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,15 @@ public func _compileRegex(
4343
return Executor(program: program)
4444
}
4545

46+
/// An error produced when compiling a regular expression.
public enum RegexCompilationError: Error {
  // TODO: Source location?

  /// A reference was used but no capture was registered for it.
  case uncapturedReference
}

extension RegexCompilationError: CustomStringConvertible {
  /// A human-readable explanation of the error.
  public var description: String {
    switch self {
    case .uncapturedReference:
      return "Found a reference used before it captured any match."
    }
  }
}

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ extension DSLTree.Atom {
9393
// TODO: Should we handle?
9494
return nil
9595

96+
case .symbolicReference:
97+
// TODO: Should we handle?
98+
return nil
99+
96100
case let .unconverted(a):
97101
return try a.generateConsumer(opts)
98102
}

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ extension MEProgram.Builder {
6767
staticElements.forEach { elements.store($0) }
6868
}
6969

70+
/// The address of the most recently appended instruction, computed as
/// `instructions.endIndex - 1`.
// NOTE(review): with no instructions emitted yet this yields an address
// built from -1; presumably callers only read it right after building an
// instruction — confirm.
var lastInstructionAddress: InstructionAddress {
  let lastIndex = instructions.endIndex - 1
  return InstructionAddress(lastIndex)
}
73+
7074
public mutating func buildNop(_ r: StringRegister? = nil) {
7175
instructions.append(.init(.nop, .init(optionalString: r)))
7276
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,10 @@ extension PrettyPrinter {
8787
}
8888
}
8989

90-
case let .group(kind, child):
90+
case let .group(kind, child, referenceID):
9191
let kind = kind._patternBase
92-
printBlock("Group(\(kind))") { printer in
92+
// Mention the reference only when the group is bound to one.
let refIDString = referenceID.map { ", referenceID: \($0)" } ?? ""
// The label must close the parenthesis opened by "Group(", exactly as the
// pre-reference label "Group(\(kind))" did.
printBlock("Group(\(kind)\(refIDString))") { printer in
  printer.printAsPattern(convertedFromAST: child)
}
9596

@@ -123,7 +124,9 @@ extension PrettyPrinter {
123124
case .assertion:
124125
print("/* TODO: assertions */")
125126
case .backreference:
126-
print("/* TODO: backreferences */")
127+
// Placeholder output; backreferences are not pretty-printed yet.
print("/* TODO: backreferences */")
128+
case .symbolicReference:
129+
// Placeholder output; symbolic references are not pretty-printed yet.
print("/* TODO: symbolic references */")
127130
}
128131

129132
case .trivia:

Sources/_StringProcessing/RegexDSL/DSL.swift

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,24 @@ public func choiceOf<R: RegexProtocol>(
184184
) -> R {
185185
builder()
186186
}
187+
188+
// MARK: - Backreference
189+
190+
/// A handle that a capture can be bound to (via `as:`) and that can itself be
/// used as a regex component, where it stands for a symbolic reference to
/// that capture.
public struct Reference: RegexProtocol {
  /// Identifier distinguishing one `Reference` from another.
  struct ID: Hashable {
    /// The value handed to the next `ID` that gets created.
    // NOTE(review): plain static counter with no synchronization visible
    // here — confirm references are never created concurrently.
    private static var counter: Int = 0

    var base: Int

    /// Takes the current counter value, then advances the counter.
    init() {
      let next = ID.counter
      ID.counter = next + 1
      base = next
    }
  }

  /// Assigned once, when the reference is created.
  let id = ID()

  /// Creates a reference with a fresh identifier.
  public init() {}

  /// A regex consisting of a single symbolic-reference atom carrying `id`.
  public var regex: Regex<Substring> {
    Regex<Substring>(node: .atom(.symbolicReference(id)))
  }
}

Sources/_StringProcessing/RegexDSL/DSLTree.swift

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ extension DSLTree {
3030
case concatenation([Node])
3131

3232
/// (...)
33-
case group(AST.Group.Kind, Node)
33+
case group(AST.Group.Kind, Node, Reference.ID? = nil)
3434

3535
/// (?(cond) true-branch | false-branch)
3636
///
@@ -81,7 +81,8 @@ extension DSLTree {
8181
case groupTransform(
8282
AST.Group.Kind,
8383
Node,
84-
CaptureTransform)
84+
CaptureTransform,
85+
Reference.ID? = nil)
8586

8687
case consumer(_ConsumerInterface)
8788

@@ -119,6 +120,7 @@ extension DSLTree {
119120

120121
case assertion(AST.Atom.AssertionKind)
121122
case backreference(AST.Reference)
123+
case symbolicReference(Reference.ID)
122124

123125
case unconverted(AST.Atom)
124126
}
@@ -162,8 +164,8 @@ extension DSLTree.Node {
162164
// Treat this transparently
163165
return n.children
164166

165-
case let .group(_, n): return [n]
166-
case let .groupTransform(_, n, _): return [n]
167+
case let .group(_, n, _): return [n]
168+
case let .groupTransform(_, n, _, _): return [n]
167169
case let .quantification(_, _, n): return [n]
168170

169171
case let .conditional(_, t, f): return [t,f]
@@ -224,8 +226,8 @@ extension DSLTree {
224226
extension DSLTree.Node {
225227
var hasCapture: Bool {
226228
switch self {
227-
case let .group(k, _) where k.isCapturing,
228-
let .groupTransform(k, _, _) where k.isCapturing:
229+
case let .group(k, _, _) where k.isCapturing,
230+
let .groupTransform(k, _, _, _) where k.isCapturing:
229231
return true
230232
case let .convertedRegexLiteral(n, re):
231233
assert(n.hasCapture == re.hasCapture)
@@ -257,14 +259,14 @@ extension DSLTree.Node {
257259
case let .concatenation(children):
258260
return constructor.concatenating(children)
259261

260-
case let .group(kind, child):
262+
case let .group(kind, child, _):
261263
if let type = child.matcherCaptureType {
262264
return constructor.grouping(
263265
child, as: kind, withType: type)
264266
}
265267
return constructor.grouping(child, as: kind)
266268

267-
case let .groupTransform(kind, child, transform):
269+
case let .groupTransform(kind, child, transform, _):
268270
return constructor.grouping(
269271
child, as: kind, withTransform: transform)
270272

0 commit comments

Comments
 (0)