Skip to content

Commit 3d44861

Browse files
authored
Merge pull request #444 from hamishknight/named-refs-5.7
2 parents e50feb0 + 50f6c96 commit 3d44861

File tree

11 files changed

+95
-29
lines changed

11 files changed

+95
-29
lines changed

Sources/_RegexParser/Regex/Parse/CaptureList.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,21 @@ extension CaptureList {
4242
}
4343
}
4444

45+
extension CaptureList {
  /// Looks up the position of a named capture within `captures`.
  ///
  /// - Parameter name: The capture name to search for.
  /// - Returns: The index of the first capture whose name equals `name`, or
  ///   `nil` if no capture in the list has that name.
  public func indexOfCapture(named name: String) -> Int? {
    // Sema guarantees capture names are unique for literal ASTs, and the DSL
    // tree does not use named references, so the first hit is sufficient.
    captures.firstIndex(where: { $0.name == name })
  }

  /// Reports whether the capture list contains a capture with the given name.
  ///
  /// - Parameter name: The capture name to search for.
  /// - Returns: `true` if some capture is named `name`, otherwise `false`.
  public func hasCapture(named name: String) -> Bool {
    captures.contains(where: { $0.name == name })
  }
}
59+
4560
// MARK: Generating from AST
4661

4762
extension AST.Node {

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ enum ParseError: Error, Hashable {
8686
case unsupported(String)
8787
case deprecatedUnicode(String)
8888
case invalidReference(Int)
89+
case invalidNamedReference(String)
8990
case duplicateNamedCapture(String)
9091
case invalidCharacterClassRangeOperand
9192
case invalidQuantifierRange(Int, Int)
@@ -211,6 +212,8 @@ extension ParseError: CustomStringConvertible {
211212
return "\(kind) is a deprecated Unicode property, and is not supported"
212213
case let .invalidReference(i):
213214
return "no capture numbered \(i)"
215+
case let .invalidNamedReference(name):
216+
return "no capture named '\(name)'"
214217
case let .duplicateNamedCapture(str):
215218
return "group named '\(str)' already exists"
216219
case let .invalidQuantifierRange(lhs, rhs):

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,20 @@ extension RegexValidator {
7272
}
7373

7474
func validateReference(_ ref: AST.Reference) throws {
75+
if let recLevel = ref.recursionLevel {
76+
throw error(.unsupported("recursion level"), at: recLevel.location)
77+
}
7578
switch ref.kind {
7679
case .absolute(let i):
7780
guard i <= captures.captures.count else {
7881
throw error(.invalidReference(i), at: ref.innerLoc)
7982
}
83+
case .named(let name):
84+
guard captures.hasCapture(named: name) else {
85+
throw error(.invalidNamedReference(name), at: ref.innerLoc)
86+
}
8087
case .relative:
8188
throw error(.unsupported("relative capture reference"), at: ref.innerLoc)
82-
case .named:
83-
// TODO: This could be implemented by querying the capture list for an
84-
// index.
85-
throw error(.unsupported("named capture reference"), at: ref.innerLoc)
86-
}
87-
if let recLevel = ref.recursionLevel {
88-
throw error(.unsupported("recursion level"), at: recLevel.location)
8989
}
9090
}
9191

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ extension Compiler {
55
var options: MatchingOptions
66
var builder = Program.Builder()
77

8+
/// Creates a code generator with the given matching options and hands the
/// capture list to the program builder up front.
///
/// - Parameters:
///   - options: The matching options stored on the generator.
///   - captureList: The capture list assigned to `builder.captureList`.
init(options: MatchingOptions, captureList: CaptureList) {
  self.options = options
  // `builder` already holds its default value; only its capture list is set.
  builder.captureList = captureList
}
12+
813
mutating func finish(
914
) throws -> Program {
1015
builder.buildAccept()
@@ -62,7 +67,9 @@ extension Compiler.ByteCodeGen {
6267
case .absolute(let i):
6368
// Backreferences number starting at 1
6469
builder.buildBackreference(.init(i-1))
65-
case .relative, .named:
70+
case .named(let name):
71+
try builder.buildNamedReference(name)
72+
case .relative:
6673
throw Unsupported("Backreference kind: \(ref)")
6774
}
6875
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ class Compiler {
2727

2828
__consuming func emit() throws -> Program {
2929
// TODO: Handle global options
30-
var codegen = ByteCodeGen(options: options)
31-
codegen.builder.captureList = tree.root._captureList
30+
var codegen = ByteCodeGen(
31+
options: options, captureList: tree.root._captureList
32+
)
3233
try codegen.emitNode(tree.root)
3334
let program = try codegen.finish()
3435
return program

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ extension MEProgram where Input.Element: Hashable {
4444
// Symbolic reference resolution
4545
var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:]
4646
var referencedCaptureOffsets: [ReferenceID: Int] = [:]
47-
var namedCaptureOffsets: [String: Int] = [:]
47+
4848
var captureCount: Int {
4949
// We currently deduce the capture count from the capture register number.
5050
nextCaptureRegister.rawValue
@@ -284,6 +284,13 @@ extension MEProgram.Builder {
284284
unresolvedReferences[id, default: []].append(lastInstructionAddress)
285285
}
286286

287+
/// Emits a backreference to the capture called `name`.
///
/// The name is resolved to an index through the builder's capture list, and
/// that index is then passed to `buildBackreference`.
///
/// - Parameter name: The name of the capture being referenced.
/// - Throws: `RegexCompilationError.uncapturedReference` if the capture list
///   has no capture with that name.
mutating func buildNamedReference(_ name: String) throws {
  guard let captureIndex = captureList.indexOfCapture(named: name) else {
    throw RegexCompilationError.uncapturedReference
  }
  buildBackreference(.init(captureIndex))
}
293+
287294
// TODO: Mutating because of fail address fixup, drop when
288295
// that's removed
289296
mutating func assemble() throws -> MEProgram {
@@ -359,7 +366,6 @@ extension MEProgram.Builder {
359366
registerInfo: regInfo,
360367
captureList: captureList,
361368
referencedCaptureOffsets: referencedCaptureOffsets,
362-
namedCaptureOffsets: namedCaptureOffsets,
363369
initialOptions: initialOptions)
364370
}
365371

@@ -456,9 +462,10 @@ extension MEProgram.Builder {
456462
assert(preexistingValue == nil)
457463
}
458464
if let name = name {
459-
// TODO: Reject duplicate capture names unless `(?J)`?
460-
namedCaptureOffsets.updateValue(captureCount, forKey: name)
465+
let index = captureList.indexOfCapture(named: name)
466+
assert(index == nextCaptureRegister.rawValue)
461467
}
468+
assert(nextCaptureRegister.rawValue < captureList.captures.count)
462469
return nextCaptureRegister
463470
}
464471

Sources/_StringProcessing/Engine/MECapture.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ extension Processor._StoredCapture: CustomStringConvertible {
145145
struct MECaptureList {
146146
var values: Array<Processor<String>._StoredCapture>
147147
var referencedCaptureOffsets: [ReferenceID: Int]
148-
var namedCaptureOffsets: [String: Int]
149148

150149
// func extract(from s: String) -> Array<Array<Substring>> {
151150
// caps.map { $0.map { s[$0] } }

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ struct MEProgram<Input: Collection> where Input.Element: Equatable {
3636

3737
let captureList: CaptureList
3838
let referencedCaptureOffsets: [ReferenceID: Int]
39-
let namedCaptureOffsets: [String: Int]
4039

4140
var initialOptions: MatchingOptions
4241
}

Sources/_StringProcessing/Executor.swift

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@ struct Executor {
3737

3838
let capList = MECaptureList(
3939
values: cpu.storedCaptures,
40-
referencedCaptureOffsets: engine.program.referencedCaptureOffsets,
41-
namedCaptureOffsets: engine.program.namedCaptureOffsets)
40+
referencedCaptureOffsets: engine.program.referencedCaptureOffsets)
4241

4342
let range = inputRange.lowerBound..<endIdx
4443
let caps = engine.program.captureList.createElements(capList, input)

Tests/RegexTests/MatchTests.swift

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,23 +1134,32 @@ extension RegexTests {
11341134
}
11351135

11361136
func testMatchReferences() {
1137-
// TODO: Implement backreference/subpattern matching.
11381137
firstMatchTest(
11391138
#"(.)\1"#,
11401139
input: "112", match: "11")
11411140
firstMatchTest(
11421141
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
11431142
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
11441143

1144+
firstMatchTest(
1145+
#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(?<a1>.)(?P=a1)"#,
1146+
input: "aaaaaaaaabbc", match: "aaaaaaaaabb")
1147+
11451148
firstMatchTest(
11461149
#"(.)\g001"#,
11471150
input: "112", match: "11")
11481151

1149-
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1150-
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true)
1151-
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
1152+
firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba")
1153+
1154+
firstMatchTest(#"(?<a>.)(?<b>.)(?<c>.)\k<c>\k<a>\k<b>"#,
1155+
input: "xyzzxy", match: "xyzzxy")
11521156

11531157
firstMatchTest(#"\1(.)"#, input: "112", match: nil)
1158+
firstMatchTest(#"\k<a>(?<a>.)"#, input: "112", match: nil)
1159+
1160+
// TODO: Implement subpattern matching.
1161+
firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
1162+
firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
11541163
}
11551164

11561165
func testMatchExamples() {

Tests/RegexTests/ParseTests.swift

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,16 +1231,37 @@ extension RegexTests {
12311231
parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported)
12321232
parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid)
12331233

1234-
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported)
1235-
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .unsupported)
1236-
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported)
1237-
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported)
1234+
parseTest(
1235+
#"(?<a>)\k<a>"#, concat(
1236+
namedCapture("a", empty()), backreference(.named("a"))
1237+
), captures: [.named("a")]
1238+
)
1239+
parseTest(
1240+
#"(?<a>)\k{a}"#, concat(
1241+
namedCapture("a", empty()), backreference(.named("a"))
1242+
), captures: [.named("a")]
1243+
)
1244+
parseTest(
1245+
#"(?<a>)\g{a}"#, concat(
1246+
namedCapture("a", empty()), backreference(.named("a"))
1247+
), captures: [.named("a")]
1248+
)
1249+
parseTest(
1250+
#"(?<a>)(?P=a)"#, concat(
1251+
namedCapture("a", empty()), backreference(.named("a"))
1252+
), captures: [.named("a")]
1253+
)
1254+
1255+
parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid)
1256+
parseTest(#"\k<bc>"#, backreference(.named("bc")), throwsError: .invalid)
1257+
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid)
1258+
parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid)
12381259

12391260
// Oniguruma recursion levels.
12401261
parseTest(#"\k<bc-0>"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported)
12411262
parseTest(#"\k<a+0>"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported)
1242-
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid)
1243-
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid)
1263+
parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported)
1264+
parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported)
12441265
parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported)
12451266
parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported)
12461267
parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported)
@@ -2137,7 +2158,7 @@ extension RegexTests {
21372158
throwsError: .unsupported
21382159
)
21392160
parseWithDelimitersTest(
2140-
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported)
2161+
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid)
21412162
parseWithDelimitersTest(
21422163
#"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1),
21432164
throwsError: .unsupported
@@ -2774,6 +2795,12 @@ extension RegexTests {
27742795
diagnosticTest(#"(?:)()\2"#, .invalidReference(2))
27752796
diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2))
27762797

2798+
diagnosticTest(#"\k<a>"#, .invalidNamedReference("a"))
2799+
diagnosticTest(#"(?:)\k<a>"#, .invalidNamedReference("a"))
2800+
diagnosticTest(#"()\k<a>"#, .invalidNamedReference("a"))
2801+
diagnosticTest(#"()\k<a>()"#, .invalidNamedReference("a"))
2802+
diagnosticTest(#"(?<b>)\k<a>()"#, .invalidNamedReference("a"))
2803+
27772804
// MARK: Conditionals
27782805

27792806
diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))

0 commit comments

Comments
 (0)