Skip to content

Add DSL support for backreferences. #148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions Sources/VariadicsGenerator/VariadicsGenerator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -538,52 +538,65 @@ struct VariadicsGenerator: ParsableCommand {
output("""
// MARK: - Non-builder capture arity \(arity)

public func capture<\(genericParams)>(_ component: R) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
.init(node: .group(.capture, component.regex.root))
public func capture<\(genericParams)>(
_ component: R, as reference: Reference? = nil
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
.init(node: .group(.capture, component.regex.root, reference?.id))
}

public func capture<\(genericParams), NewCapture>(
_ component: R, transform: @escaping (Substring) -> NewCapture
_ component: R,
as reference: Reference? = nil,
transform: @escaping (Substring) -> NewCapture
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
.init(node: .groupTransform(
.capture,
component.regex.root,
CaptureTransform(resultType: NewCapture.self) {
transform($0) as Any
}))
},
reference?.id))
}

public func tryCapture<\(genericParams), NewCapture>(
_ component: R, transform: @escaping (Substring) throws -> NewCapture
_ component: R,
as reference: Reference? = nil,
transform: @escaping (Substring) throws -> NewCapture
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
.init(node: .groupTransform(
.capture,
component.regex.root,
CaptureTransform(resultType: NewCapture.self) {
try transform($0) as Any
}))
},
reference?.id))
}

public func tryCapture<\(genericParams), NewCapture>(
_ component: R, transform: @escaping (Substring) -> NewCapture?
_ component: R,
as reference: Reference? = nil,
transform: @escaping (Substring) -> NewCapture?
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
.init(node: .groupTransform(
.capture,
component.regex.root,
CaptureTransform(resultType: NewCapture.self) {
transform($0) as Any?
}))
},
reference?.id))
}

// MARK: - Builder capture arity \(arity)

public func capture<\(genericParams)>(
as reference: Reference? = nil,
@RegexBuilder _ component: () -> R
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "W"))> \(whereClause) {
.init(node: .group(.capture, component().regex.root))
.init(node: .group(.capture, component().regex.root, reference?.id))
}

public func capture<\(genericParams), NewCapture>(
as reference: Reference? = nil,
@RegexBuilder _ component: () -> R,
transform: @escaping (Substring) -> NewCapture
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
Expand All @@ -592,10 +605,12 @@ struct VariadicsGenerator: ParsableCommand {
component().regex.root,
CaptureTransform(resultType: NewCapture.self) {
transform($0) as Any
}))
},
reference?.id))
}

public func tryCapture<\(genericParams), NewCapture>(
as reference: Reference? = nil,
@RegexBuilder _ component: () -> R,
transform: @escaping (Substring) throws -> NewCapture
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
Expand All @@ -604,10 +619,12 @@ struct VariadicsGenerator: ParsableCommand {
component().regex.root,
CaptureTransform(resultType: NewCapture.self) {
try transform($0) as Any
}))
},
reference?.id))
}

public func tryCapture<\(genericParams), NewCapture>(
as reference: Reference? = nil,
@RegexBuilder _ component: () -> R,
transform: @escaping (Substring) -> NewCapture?
) -> \(regexTypeName)<\(newMatchType(newCaptureType: "NewCapture"))> \(whereClause) {
Expand All @@ -616,7 +633,8 @@ struct VariadicsGenerator: ParsableCommand {
component().regex.root,
CaptureTransform(resultType: NewCapture.self) {
transform($0) as Any?
}))
},
reference?.id))
}

""")
Expand Down
23 changes: 16 additions & 7 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ extension Compiler.ByteCodeGen {
case let .backreference(ref):
try emitBackreference(ref)

case let .symbolicReference(id):
builder.buildUnresolvedReference(id: id)

case let .unconverted(astAtom):
if let consumer = try astAtom.generateConsumer(options) {
builder.buildConsume(by: consumer)
Expand Down Expand Up @@ -293,8 +296,14 @@ extension Compiler.ByteCodeGen {
}

mutating func emitGroup(
_ kind: AST.Group.Kind, _ child: DSLTree.Node
_ kind: AST.Group.Kind,
_ child: DSLTree.Node,
_ referenceID: Reference.ID?
) throws -> CaptureRegister? {
guard kind.isCapturing || referenceID == nil else {
throw Unreachable("Reference ID shouldn't exist for non-capturing groups")
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also very strongly considering splitting off the capturing concept into its own DSLTree node. But that's a future PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree. The current construction is a bit weird. Perhaps something like `case capture(DSLTree.Node, CaptureTransform?, ReferenceID?)`.


options.beginScope()
defer { options.endScope() }

Expand All @@ -303,7 +312,7 @@ extension Compiler.ByteCodeGen {
//
// FIXME: Unify with .groupTransform
if kind.isCapturing, case let .matcher(_, m) = child {
let cap = builder.makeCapture()
let cap = builder.makeCapture(id: referenceID)
emitMatcher(m, into: cap)
return cap
}
Expand All @@ -319,7 +328,7 @@ extension Compiler.ByteCodeGen {
throw Unreachable("TODO: reason")

case .capture, .namedCapture:
let cap = builder.makeCapture()
let cap = builder.makeCapture(id: referenceID)
builder.buildBeginCapture(cap)
try emitNode(child)
builder.buildEndCapture(cap)
Expand Down Expand Up @@ -552,8 +561,8 @@ extension Compiler.ByteCodeGen {
try emitConcatenationComponent(child)
}

case let .group(kind, child):
_ = try emitGroup(kind, child)
case let .group(kind, child, referenceID):
_ = try emitGroup(kind, child, referenceID)

case .conditional:
throw Unsupported("Conditionals")
Expand Down Expand Up @@ -592,8 +601,8 @@ extension Compiler.ByteCodeGen {
case let .convertedRegexLiteral(n, _):
try emitNode(n)

case let .groupTransform(kind, child, t):
guard let cap = try emitGroup(kind, child) else {
case let .groupTransform(kind, child, t, referenceID):
guard let cap = try emitGroup(kind, child, referenceID) else {
assertionFailure("""
What does it mean to not have a capture to transform?
""")
Expand Down
12 changes: 12 additions & 0 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,15 @@ public func _compileRegex(
return Executor(program: program)
}

/// An error produced when compiling a regular expression.
public enum RegexCompilationError: Error {
  // TODO: Source location?

  /// A symbolic reference was used, but no capture was registered for it by
  /// the time references were resolved.
  case uncapturedReference
}

extension RegexCompilationError: CustomStringConvertible {
  /// A human-readable explanation of the error.
  public var description: String {
    switch self {
    case .uncapturedReference:
      return "Found a reference used before it captured any match."
    }
  }
}
4 changes: 4 additions & 0 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ extension DSLTree.Atom {
// TODO: Should we handle?
return nil

case .symbolicReference:
// TODO: Should we handle?
return nil

case let .unconverted(a):
return try a.generateConsumer(opts)
}
Expand Down
42 changes: 41 additions & 1 deletion Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ extension MEProgram where Input.Element: Hashable {
// as we compile?
var captureStructure: CaptureStructure = .empty

// Symbolic reference resolution
var unresolvedReferences: [Reference.ID: [InstructionAddress]] = [:]
var captureOffsets: [Reference.ID: Int] = [:]
/// The number of captures made so far.
///
/// This is currently deduced from the capture register number rather than
/// tracked separately.
var captureCount: Int {
  return nextCaptureRegister.rawValue
}

public init() {}
}
}
Expand All @@ -67,6 +75,10 @@ extension MEProgram.Builder {
staticElements.forEach { elements.store($0) }
}

/// The address one before `instructions.endIndex`, i.e. the slot of the most
/// recently appended instruction.
var lastInstructionAddress: InstructionAddress {
  let lastIndex = instructions.endIndex - 1
  return InstructionAddress(lastIndex)
}

public mutating func buildNop(_ r: StringRegister? = nil) {
instructions.append(.init(.nop, .init(optionalString: r)))
}
Expand Down Expand Up @@ -262,9 +274,16 @@ extension MEProgram.Builder {
.init(.backreference, .init(capture: cap)))
}

/// Emits a placeholder backreference for the symbolic reference `id` and
/// records the placeholder's address under `id` in `unresolvedReferences`.
///
/// - Parameter id: The reference ID whose capture is not yet known.
public mutating func buildUnresolvedReference(id: Reference.ID) {
// The operand `0` is only a stand-in; `resolveReferences()` later overwrites
// every recorded address with a backreference to the real capture offset.
buildBackreference(.init(0))
// NOTE(review): assumes `buildBackreference` appends exactly one instruction,
// so `lastInstructionAddress` is the placeholder just emitted — confirm.
unresolvedReferences[id, default: []].append(lastInstructionAddress)
}

// TODO: Mutating because of fail address fixup, drop when
// that's removed
public mutating func assemble() throws -> MEProgram {
try resolveReferences()

// TODO: This will add a fail instruction at the end every
// time it's assembled. Better to do to the local instruction
// list copy, but that complicates logic. It's possible we
Expand Down Expand Up @@ -401,10 +420,31 @@ extension MEProgram.Builder {

}

// Symbolic reference helpers
fileprivate extension MEProgram.Builder {
  /// Replaces each instruction recorded in `unresolvedReferences` with a
  /// backreference to the capture offset registered under the same ID.
  ///
  /// - Throws: `RegexCompilationError.uncapturedReference` when an ID has
  ///   recorded uses but no entry in `captureOffsets`.
  mutating func resolveReferences() throws {
    for (referenceID, addresses) in unresolvedReferences {
      guard let captureOffset = captureOffsets[referenceID] else {
        throw RegexCompilationError.uncapturedReference
      }
      // Overwrite every use site of this reference in place.
      for address in addresses {
        instructions[address.rawValue] = Instruction(
          .backreference,
          .init(capture: .init(captureOffset)))
      }
    }
  }
}

// Register helpers
extension MEProgram.Builder {
public mutating func makeCapture() -> CaptureRegister {
/// Hands out the next capture register and, when `id` is non-nil, records the
/// capture's offset under that ID in `captureOffsets`.
///
/// - Parameter id: An optional reference ID to associate with this capture.
/// - Returns: The value of `nextCaptureRegister` as evaluated by the `return`
///   statement, before the deferred increment runs.
public mutating func makeCapture(id: Reference.ID?) -> CaptureRegister {
// Deferred so that the offset stored below and the register returned are
// both taken before the counter advances.
// NOTE(review): assumes `CaptureRegister` has value semantics — confirm.
defer { nextCaptureRegister.rawValue += 1 }
// Register the capture for later lookup via symbolic references.
if let id = id {
// `captureCount` reads `nextCaptureRegister.rawValue`, which has not been
// incremented yet at this point.
let preexistingValue = captureOffsets.updateValue(
captureCount, forKey: id)
// Debug-only check that the same reference ID was not registered before.
assert(preexistingValue == nil)
}
return nextCaptureRegister
}

Expand Down
9 changes: 6 additions & 3 deletions Sources/_StringProcessing/PrintAsPattern.swift
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,10 @@ extension PrettyPrinter {
}
}

case let .group(kind, child):
case let .group(kind, child, referenceID):
let kind = kind._patternBase
printBlock("Group(\(kind))") { printer in
let refIDString = referenceID.map { ", referenceID: \($0)" } ?? ""
printBlock("Group(\(kind)\(refIDString)") { printer in
printer.printAsPattern(convertedFromAST: child)
}

Expand Down Expand Up @@ -123,7 +124,9 @@ extension PrettyPrinter {
case .assertion:
print("/* TODO: assertions */")
case .backreference:
print("/* TODO: backreferences */")
print("/* TOOD: backreferences */")
case .symbolicReference:
print("/* TOOD: symbolic references */")
}

case .trivia:
Expand Down
22 changes: 22 additions & 0 deletions Sources/_StringProcessing/RegexDSL/DSL.swift
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,25 @@ public func choiceOf<R: RegexProtocol>(
) -> R {
builder()
}

// MARK: - Backreference

/// A symbolic reference to a capture.
///
/// Each `Reference` owns a fresh `ID`. Used as a regex component, it produces
/// a `.symbolicReference` atom carrying that ID.
public struct Reference: RegexProtocol {
  // FIXME: Public for prototypes.
  /// An identifier allocated from a shared static counter.
  public struct ID: Hashable, Equatable {
    /// The value the next `ID` will receive.
    // NOTE(review): unsynchronized static mutable state — confirm this is
    // only touched from a single thread.
    private static var counter: Int = 0

    /// The counter value this ID was assigned at creation.
    var base: Int

    /// Takes the current counter value and advances the counter by one.
    init() {
      let assigned = ID.counter
      ID.counter = assigned + 1
      base = assigned
    }
  }

  /// The identifier of this reference.
  let id = ID()

  /// Creates a reference with a newly allocated ID.
  public init() {}

  /// A regex consisting of a single symbolic-reference atom for `id`.
  public var regex: Regex<Substring> {
    let atom = DSLTree.Atom.symbolicReference(id)
    return .init(node: .atom(atom))
  }
}
18 changes: 10 additions & 8 deletions Sources/_StringProcessing/RegexDSL/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ extension DSLTree {
case concatenation([Node])

/// (...)
case group(AST.Group.Kind, Node)
case group(AST.Group.Kind, Node, Reference.ID? = nil)

/// (?(cond) true-branch | false-branch)
///
Expand Down Expand Up @@ -81,7 +81,8 @@ extension DSLTree {
case groupTransform(
AST.Group.Kind,
Node,
CaptureTransform)
CaptureTransform,
Reference.ID? = nil)

case consumer(_ConsumerInterface)

Expand Down Expand Up @@ -119,6 +120,7 @@ extension DSLTree {

case assertion(AST.Atom.AssertionKind)
case backreference(AST.Reference)
case symbolicReference(Reference.ID)

case unconverted(AST.Atom)
}
Expand Down Expand Up @@ -162,8 +164,8 @@ extension DSLTree.Node {
// Treat this transparently
return n.children

case let .group(_, n): return [n]
case let .groupTransform(_, n, _): return [n]
case let .group(_, n, _): return [n]
case let .groupTransform(_, n, _, _): return [n]
case let .quantification(_, _, n): return [n]

case let .conditional(_, t, f): return [t,f]
Expand Down Expand Up @@ -224,8 +226,8 @@ extension DSLTree {
extension DSLTree.Node {
var hasCapture: Bool {
switch self {
case let .group(k, _) where k.isCapturing,
let .groupTransform(k, _, _) where k.isCapturing:
case let .group(k, _, _) where k.isCapturing,
let .groupTransform(k, _, _, _) where k.isCapturing:
return true
case let .convertedRegexLiteral(n, re):
assert(n.hasCapture == re.hasCapture)
Expand Down Expand Up @@ -257,14 +259,14 @@ extension DSLTree.Node {
case let .concatenation(children):
return constructor.concatenating(children)

case let .group(kind, child):
case let .group(kind, child, _):
if let type = child.matcherCaptureType {
return constructor.grouping(
child, as: kind, withType: type)
}
return constructor.grouping(child, as: kind)

case let .groupTransform(kind, child, transform):
case let .groupTransform(kind, child, transform, _):
return constructor.grouping(
child, as: kind, withTransform: transform)

Expand Down
Loading