Skip to content

Commit 7c956f6

Browse files
committed
Implement named backreferences
Use the CaptureList as the source of truth on which index a name corresponds to, and query it when emitting a named backreference.
1 parent: bd9bf23 · commit: 7c956f6

File tree

7 files changed

+90
-25
lines changed

7 files changed

+90
-25
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,7 @@ enum ParseError: Error, Hashable {
8686
case unsupported(String)
8787
case deprecatedUnicode(String)
8888
case invalidReference(Int)
89+
case invalidNamedReference(String)
8990
case duplicateNamedCapture(String)
9091
case invalidCharacterClassRangeOperand
9192
case invalidQuantifierRange(Int, Int)
@@ -211,6 +212,8 @@ extension ParseError: CustomStringConvertible {
211212
return "\(kind) is a deprecated Unicode property, and is not supported"
212213
case let .invalidReference(i):
213214
return "no capture numbered \(i)"
215+
case let .invalidNamedReference(name):
216+
return "no capture named '\(name)'"
214217
case let .duplicateNamedCapture(str):
215218
return "group named '\(str)' already exists"
216219
case let .invalidQuantifierRange(lhs, rhs):

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -72,20 +72,20 @@ extension RegexValidator {
7272
}
7373

7474
func validateReference(_ ref: AST.Reference) throws {
75+
if let recLevel = ref.recursionLevel {
76+
throw error(.unsupported("recursion level"), at: recLevel.location)
77+
}
7578
switch ref.kind {
7679
case .absolute(let i):
7780
guard i <= captures.captures.count else {
7881
throw error(.invalidReference(i), at: ref.innerLoc)
7982
}
83+
case .named(let name):
84+
guard captures.captures.contains(where: { $0.name == name }) else {
85+
throw error(.invalidNamedReference(name), at: ref.innerLoc)
86+
}
8087
case .relative:
8188
throw error(.unsupported("relative capture reference"), at: ref.innerLoc)
82-
case .named:
83-
// TODO: This could be implemented by querying the capture list for an
84-
// index.
85-
throw error(.unsupported("named capture reference"), at: ref.innerLoc)
86-
}
87-
if let recLevel = ref.recursionLevel {
88-
throw error(.unsupported("recursion level"), at: recLevel.location)
8989
}
9090
}
9191

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ extension Compiler {
55
var options: MatchingOptions
66
var builder = Program.Builder()
77

8+
init(options: MatchingOptions, captureList: CaptureList) {
9+
self.options = options
10+
self.builder.captureList = captureList
11+
}
12+
813
mutating func finish(
914
) throws -> Program {
1015
builder.buildAccept()
@@ -62,7 +67,9 @@ extension Compiler.ByteCodeGen {
6267
case .absolute(let i):
6368
// Backreferences number starting at 1
6469
builder.buildBackreference(.init(i-1))
65-
case .relative, .named:
70+
case .named(let name):
71+
try builder.buildNamedReference(name)
72+
case .relative:
6673
throw Unsupported("Backreference kind: \(ref)")
6774
}
6875
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,9 @@ class Compiler {
2727

2828
__consuming func emit() throws -> Program {
2929
// TODO: Handle global options
30-
var codegen = ByteCodeGen(options: options)
31-
codegen.builder.captureList = tree.root._captureList
30+
var codegen = ByteCodeGen(
31+
options: options, captureList: tree.root._captureList
32+
)
3233
try codegen.emitNode(tree.root)
3334
let program = try codegen.finish()
3435
return program

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -38,13 +38,24 @@ extension MEProgram where Input.Element: Hashable {
3838
// Special addresses or instructions
3939
var failAddressToken: AddressToken? = nil
4040

41-
var captureList = CaptureList()
41+
private(set) var namedCaptureOffsets: [String: Int] = [:]
42+
43+
var captureList = CaptureList() {
44+
didSet {
45+
// Named references are guaranteed to be unique for literal ASTs by
46+
// Sema. The DSL tree does not use named references.
47+
for (i, capture) in captureList.captures.enumerated() {
48+
guard let name = capture.name else { continue }
49+
namedCaptureOffsets[name] = i
50+
}
51+
}
52+
}
4253
var initialOptions = MatchingOptions()
4354

4455
// Symbolic reference resolution
4556
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
4657
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
47-
var namedCaptureOffsets: [String: Int] = [:]
58+
4859
var captureCount: Int {
4960
// We currently deduce the capture count from the capture register number.
5061
nextCaptureRegister.rawValue
@@ -284,6 +295,13 @@ extension MEProgram.Builder {
284295
unresolvedReferences[id, default: []].append(lastInstructionAddress)
285296
}
286297

298+
mutating func buildNamedReference(_ name: String) throws {
299+
guard let index = namedCaptureOffsets[name] else {
300+
throw RegexCompilationError.uncapturedReference
301+
}
302+
buildBackreference(.init(index))
303+
}
304+
287305
// TODO: Mutating because of fail address fixup, drop when
288306
// that's removed
289307
mutating func assemble() throws -> MEProgram {
@@ -456,9 +474,9 @@ extension MEProgram.Builder {
456474
assert(preexistingValue == nil)
457475
}
458476
if let name = name {
459-
// TODO: Reject duplicate capture names unless `(?J)`?
460-
namedCaptureOffsets.updateValue(captureCount, forKey: name)
477+
assert(namedCaptureOffsets[name] == nextCaptureRegister.rawValue)
461478
}
479+
assert(nextCaptureRegister.rawValue < captureList.captures.count)
462480
return nextCaptureRegister
463481
}
464482

Tests/RegexTests/MatchTests.swift

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1142,23 +1142,32 @@ extension RegexTests {
11421142
}
11431143

11441144
func testMatchReferences() {
1145-
// TODO: Implement backreference/subpattern matching.
11461145
firstMatchTest(
11471146
#"(.)\1"#,
11481147
input: "112", match: "11")
11491148
firstMatchTest(
11501149
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
11511150
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
11521151

1152+
firstMatchTest(
1153+
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(?<a1>.)(?P=a1)"#,
1154+
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
1155+
11531156
firstMatchTest(
11541157
#"(.)\g001"#,
11551158
input: "112", match: "11")
11561159

1157-
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1158-
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true)
1159-
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
1160+
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba")
1161+
1162+
firstMatchTest(#"(?<a>.)(?<b>.)(?<c>.)\k<c>\k<a>\k<b>"#,
1163+
input: "xyzzxy", match: "xyzzxy")
11601164

11611165
firstMatchTest(#"\1(.)"#, input: "112", match: nil)
1166+
firstMatchTest(#"\k<a>(?<a>.)"#, input: "112", match: nil)
1167+
1168+
// TODO: Implement subpattern matching.
1169+
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1170+
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
11621171
}
11631172

11641173
func testMatchExamples() {

Tests/RegexTests/ParseTests.swift

Lines changed: 34 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1242,16 +1242,37 @@ extension RegexTests {
12421242
parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported)
12431243
parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid)
12441244

1245-
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported)
1246-
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .unsupported)
1247-
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported)
1248-
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported)
1245+
parseTest(
1246+
#"(?<a>)\k<a>"#, concat(
1247+
namedCapture("a", empty()), backreference(.named("a"))
1248+
), captures: [.named("a")]
1249+
)
1250+
parseTest(
1251+
#"(?<a>)\k{a}"#, concat(
1252+
namedCapture("a", empty()), backreference(.named("a"))
1253+
), captures: [.named("a")]
1254+
)
1255+
parseTest(
1256+
#"(?<a>)\g{a}"#, concat(
1257+
namedCapture("a", empty()), backreference(.named("a"))
1258+
), captures: [.named("a")]
1259+
)
1260+
parseTest(
1261+
#"(?<a>)(?P=a)"#, concat(
1262+
namedCapture("a", empty()), backreference(.named("a"))
1263+
), captures: [.named("a")]
1264+
)
1265+
1266+
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid)
1267+
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .invalid)
1268+
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid)
1269+
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid)
12491270

12501271
// Oniguruma recursion levels.
12511272
parseTest(#"\k<bc-0>"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported)
12521273
parseTest(#"\k<a+0>"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported)
1253-
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid)
1254-
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid)
1274+
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported)
1275+
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported)
12551276
parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported)
12561277
parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported)
12571278
parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported)
@@ -2167,7 +2188,7 @@ extension RegexTests {
21672188
throwsError: .unsupported
21682189
)
21692190
parseWithDelimitersTest(
2170-
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported)
2191+
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid)
21712192
parseWithDelimitersTest(
21722193
#"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1),
21732194
throwsError: .unsupported
@@ -2811,6 +2832,12 @@ extension RegexTests {
28112832
diagnosticTest(#"(?:)()\2"#, .invalidReference(2))
28122833
diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2))
28132834

2835+
diagnosticTest(#"\k<a>"#, .invalidNamedReference("a"))
2836+
diagnosticTest(#"(?:)\k<a>"#, .invalidNamedReference("a"))
2837+
diagnosticTest(#"()\k<a>"#, .invalidNamedReference("a"))
2838+
diagnosticTest(#"()\k<a>()"#, .invalidNamedReference("a"))
2839+
diagnosticTest(#"(?<b>)\k<a>()"#, .invalidNamedReference("a"))
2840+
28142841
// MARK: Conditionals
28152842

28162843
diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))

0 commit comments

Comments
 (0)