Skip to content

Commit 2021c05

Browse files
committed
Implement named backreferences
Use the CaptureList as the source of truth for which capture index a name corresponds to, and query it when emitting a named backreference.
1 parent 3e3f45f commit 2021c05

File tree

8 files changed

+94
-24
lines changed

8 files changed

+94
-24
lines changed

Sources/_RegexParser/Regex/Parse/CaptureList.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,21 @@ extension CaptureList {
4242
}
4343
}
4444

45+
extension CaptureList {
46+
/// Retrieve the capture index of a given named capture, or `nil` if there is
47+
/// no such capture.
48+
public func indexOfCapture(named name: String) -> Int? {
49+
// Capture names are guaranteed to be unique for literal ASTs by Sema.
50+
// The DSL tree does not use named references.
51+
captures.indices.first(where: { captures[$0].name == name })
52+
}
53+
54+
/// Whether the capture list has a given named capture.
55+
public func hasCapture(named name: String) -> Bool {
56+
indexOfCapture(named: name) != nil
57+
}
58+
}
59+
4560
// MARK: Generating from AST
4661

4762
extension AST.Node {

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ enum ParseError: Error, Hashable {
8686
case unsupported(String)
8787
case deprecatedUnicode(String)
8888
case invalidReference(Int)
89+
case invalidNamedReference(String)
8990
case duplicateNamedCapture(String)
9091
case invalidCharacterClassRangeOperand
9192
case invalidQuantifierRange(Int, Int)
@@ -211,6 +212,8 @@ extension ParseError: CustomStringConvertible {
211212
return "\(kind) is a deprecated Unicode property, and is not supported"
212213
case let .invalidReference(i):
213214
return "no capture numbered \(i)"
215+
case let .invalidNamedReference(name):
216+
return "no capture named '\(name)'"
214217
case let .duplicateNamedCapture(str):
215218
return "group named '\(str)' already exists"
216219
case let .invalidQuantifierRange(lhs, rhs):

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,20 @@ extension RegexValidator {
7272
}
7373

7474
func validateReference(_ ref: AST.Reference) throws {
75+
if let recLevel = ref.recursionLevel {
76+
throw error(.unsupported("recursion level"), at: recLevel.location)
77+
}
7578
switch ref.kind {
7679
case .absolute(let i):
7780
guard i <= captures.captures.count else {
7881
throw error(.invalidReference(i), at: ref.innerLoc)
7982
}
83+
case .named(let name):
84+
guard captures.hasCapture(named: name) else {
85+
throw error(.invalidNamedReference(name), at: ref.innerLoc)
86+
}
8087
case .relative:
8188
throw error(.unsupported("relative capture reference"), at: ref.innerLoc)
82-
case .named:
83-
// TODO: This could be implemented by querying the capture list for an
84-
// index.
85-
throw error(.unsupported("named capture reference"), at: ref.innerLoc)
86-
}
87-
if let recLevel = ref.recursionLevel {
88-
throw error(.unsupported("recursion level"), at: recLevel.location)
8989
}
9090
}
9191

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ extension Compiler {
55
var options: MatchingOptions
66
var builder = Program.Builder()
77

8+
init(options: MatchingOptions, captureList: CaptureList) {
9+
self.options = options
10+
self.builder.captureList = captureList
11+
}
12+
813
mutating func finish(
914
) throws -> Program {
1015
builder.buildAccept()
@@ -62,7 +67,9 @@ extension Compiler.ByteCodeGen {
6267
case .absolute(let i):
6368
// Backreferences number starting at 1
6469
builder.buildBackreference(.init(i-1))
65-
case .relative, .named:
70+
case .named(let name):
71+
try builder.buildNamedReference(name)
72+
case .relative:
6673
throw Unsupported("Backreference kind: \(ref)")
6774
}
6875
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ class Compiler {
2727

2828
__consuming func emit() throws -> Program {
2929
// TODO: Handle global options
30-
var codegen = ByteCodeGen(options: options)
31-
codegen.builder.captureList = tree.root._captureList
30+
var codegen = ByteCodeGen(
31+
options: options, captureList: tree.root._captureList
32+
)
3233
try codegen.emitNode(tree.root)
3334
let program = try codegen.finish()
3435
return program

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ extension MEProgram where Input.Element: Hashable {
4444
// Symbolic reference resolution
4545
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
4646
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
47-
var namedCaptureOffsets: [String: Int] = [:]
47+
4848
var captureCount: Int {
4949
// We currently deduce the capture count from the capture register number.
5050
nextCaptureRegister.rawValue
@@ -284,6 +284,13 @@ extension MEProgram.Builder {
284284
unresolvedReferences[id, default: []].append(lastInstructionAddress)
285285
}
286286

287+
mutating func buildNamedReference(_ name: String) throws {
288+
guard let index = captureList.indexOfCapture(named: name) else {
289+
throw RegexCompilationError.uncapturedReference
290+
}
291+
buildBackreference(.init(index))
292+
}
293+
287294
// TODO: Mutating because of fail address fixup, drop when
288295
// that's removed
289296
mutating func assemble() throws -> MEProgram {
@@ -456,9 +463,10 @@ extension MEProgram.Builder {
456463
assert(preexistingValue == nil)
457464
}
458465
if let name = name {
459-
// TODO: Reject duplicate capture names unless `(?J)`?
460-
namedCaptureOffsets.updateValue(captureCount, forKey: name)
466+
let index = captureList.indexOfCapture(named: name)
467+
assert(index == nextCaptureRegister.rawValue)
461468
}
469+
assert(nextCaptureRegister.rawValue < captureList.captures.count)
462470
return nextCaptureRegister
463471
}
464472

Tests/RegexTests/MatchTests.swift

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,23 +1142,32 @@ extension RegexTests {
11421142
}
11431143

11441144
func testMatchReferences() {
1145-
// TODO: Implement backreference/subpattern matching.
11461145
firstMatchTest(
11471146
#"(.)\1"#,
11481147
input: "112", match: "11")
11491148
firstMatchTest(
11501149
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
11511150
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
11521151

1152+
firstMatchTest(
1153+
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(?<a1>.)(?P=a1)"#,
1154+
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
1155+
11531156
firstMatchTest(
11541157
#"(.)\g001"#,
11551158
input: "112", match: "11")
11561159

1157-
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1158-
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true)
1159-
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
1160+
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba")
1161+
1162+
firstMatchTest(#"(?<a>.)(?<b>.)(?<c>.)\k<c>\k<a>\k<b>"#,
1163+
input: "xyzzxy", match: "xyzzxy")
11601164

11611165
firstMatchTest(#"\1(.)"#, input: "112", match: nil)
1166+
firstMatchTest(#"\k<a>(?<a>.)"#, input: "112", match: nil)
1167+
1168+
// TODO: Implement subpattern matching.
1169+
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1170+
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
11621171
}
11631172

11641173
func testMatchExamples() {

Tests/RegexTests/ParseTests.swift

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1242,16 +1242,37 @@ extension RegexTests {
12421242
parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported)
12431243
parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid)
12441244

1245-
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported)
1246-
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .unsupported)
1247-
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported)
1248-
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported)
1245+
parseTest(
1246+
#"(?<a>)\k<a>"#, concat(
1247+
namedCapture("a", empty()), backreference(.named("a"))
1248+
), captures: [.named("a")]
1249+
)
1250+
parseTest(
1251+
#"(?<a>)\k{a}"#, concat(
1252+
namedCapture("a", empty()), backreference(.named("a"))
1253+
), captures: [.named("a")]
1254+
)
1255+
parseTest(
1256+
#"(?<a>)\g{a}"#, concat(
1257+
namedCapture("a", empty()), backreference(.named("a"))
1258+
), captures: [.named("a")]
1259+
)
1260+
parseTest(
1261+
#"(?<a>)(?P=a)"#, concat(
1262+
namedCapture("a", empty()), backreference(.named("a"))
1263+
), captures: [.named("a")]
1264+
)
1265+
1266+
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid)
1267+
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .invalid)
1268+
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid)
1269+
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid)
12491270

12501271
// Oniguruma recursion levels.
12511272
parseTest(#"\k<bc-0>"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported)
12521273
parseTest(#"\k<a+0>"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported)
1253-
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid)
1254-
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid)
1274+
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported)
1275+
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported)
12551276
parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported)
12561277
parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported)
12571278
parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported)
@@ -2167,7 +2188,7 @@ extension RegexTests {
21672188
throwsError: .unsupported
21682189
)
21692190
parseWithDelimitersTest(
2170-
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported)
2191+
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid)
21712192
parseWithDelimitersTest(
21722193
#"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1),
21732194
throwsError: .unsupported
@@ -2815,6 +2836,12 @@ extension RegexTests {
28152836
diagnosticTest(#"(?:)()\2"#, .invalidReference(2))
28162837
diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2))
28172838

2839+
diagnosticTest(#"\k<a>"#, .invalidNamedReference("a"))
2840+
diagnosticTest(#"(?:)\k<a>"#, .invalidNamedReference("a"))
2841+
diagnosticTest(#"()\k<a>"#, .invalidNamedReference("a"))
2842+
diagnosticTest(#"()\k<a>()"#, .invalidNamedReference("a"))
2843+
diagnosticTest(#"(?<b>)\k<a>()"#, .invalidNamedReference("a"))
2844+
28182845
// MARK: Conditionals
28192846

28202847
diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))

0 commit comments

Comments
 (0)