Skip to content

Commit 8f57563

Browse files
committed
Cache word indices
1 parent 9cb337f commit 8f57563

File tree

4 files changed

+60
-18
lines changed

4 files changed

+60
-18
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,13 @@ fileprivate extension Compiler.ByteCodeGen {
118118
// need to supply both a slice bounds and a per-search bounds.
119119
switch kind {
120120
case .startOfSubject:
121-
builder.buildAssert { (input, pos, subjectBounds) in
121+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
122122
pos == subjectBounds.lowerBound
123123
}
124124

125125
case .endOfSubjectBeforeNewline:
126-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
126+
builder.buildAssert { [semanticLevel = options.semanticLevel]
127+
(_, _, input, pos, subjectBounds) in
127128
if pos == subjectBounds.upperBound { return true }
128129
switch semanticLevel {
129130
case .graphemeCluster:
@@ -136,7 +137,7 @@ fileprivate extension Compiler.ByteCodeGen {
136137
}
137138

138139
case .endOfSubject:
139-
builder.buildAssert { (input, pos, subjectBounds) in
140+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
140141
pos == subjectBounds.upperBound
141142
}
142143

@@ -149,16 +150,16 @@ fileprivate extension Compiler.ByteCodeGen {
149150

150151
// FIXME: This needs to be based on `searchBounds`,
151152
// not the `subjectBounds` given as an argument here
152-
builder.buildAssert { (input, pos, subjectBounds) in false }
153+
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
153154

154155
case .textSegment:
155-
builder.buildAssert { (input, pos, _) in
156+
builder.buildAssert { (_, _, input, pos, _) in
156157
// FIXME: Grapheme or word based on options
157158
input.isOnGraphemeClusterBoundary(pos)
158159
}
159160

160161
case .notTextSegment:
161-
builder.buildAssert { (input, pos, _) in
162+
builder.buildAssert { (_, _, input, pos, _) in
162163
// FIXME: Grapheme or word based on options
163164
!input.isOnGraphemeClusterBoundary(pos)
164165
}
@@ -169,7 +170,8 @@ fileprivate extension Compiler.ByteCodeGen {
169170
// the DSL-based `.startOfLine` anchor should always match the start
170171
// of a line. Right now we don't distinguish between those anchors.
171172
if options.anchorsMatchNewlines {
172-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
173+
builder.buildAssert { [semanticLevel = options.semanticLevel]
174+
(_, _, input, pos, subjectBounds) in
173175
if pos == subjectBounds.lowerBound { return true }
174176
switch semanticLevel {
175177
case .graphemeCluster:
@@ -179,7 +181,7 @@ fileprivate extension Compiler.ByteCodeGen {
179181
}
180182
}
181183
} else {
182-
builder.buildAssert { (input, pos, subjectBounds) in
184+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
183185
pos == subjectBounds.lowerBound
184186
}
185187
}
@@ -190,7 +192,8 @@ fileprivate extension Compiler.ByteCodeGen {
190192
// the DSL-based `.endOfLine` anchor should always match the end
191193
// of a line. Right now we don't distinguish between those anchors.
192194
if options.anchorsMatchNewlines {
193-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
195+
builder.buildAssert { [semanticLevel = options.semanticLevel]
196+
(_, _, input, pos, subjectBounds) in
194197
if pos == subjectBounds.upperBound { return true }
195198
switch semanticLevel {
196199
case .graphemeCluster:
@@ -200,13 +203,14 @@ fileprivate extension Compiler.ByteCodeGen {
200203
}
201204
}
202205
} else {
203-
builder.buildAssert { (input, pos, subjectBounds) in
206+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
204207
pos == subjectBounds.upperBound
205208
}
206209
}
207210

208211
case .wordBoundary:
209-
builder.buildAssert { [options] (input, pos, subjectBounds) in
212+
builder.buildAssert { [options]
213+
(cache, maxIndex, input, pos, subjectBounds) in
210214
if options.usesSimpleUnicodeBoundaries {
211215
// TODO: How should we handle bounds?
212216
return _CharacterClassModel.word.isBoundary(
@@ -216,12 +220,13 @@ fileprivate extension Compiler.ByteCodeGen {
216220
with: options
217221
)
218222
} else {
219-
return input.isOnWordBoundary(at: pos)
223+
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
220224
}
221225
}
222226

223227
case .notWordBoundary:
224-
builder.buildAssert { [options] (input, pos, subjectBounds) in
228+
builder.buildAssert { [options]
229+
(cache, maxIndex, input, pos, subjectBounds) in
225230
if options.usesSimpleUnicodeBoundaries {
226231
// TODO: How should we handle bounds?
227232
return !_CharacterClassModel.word.isBoundary(
@@ -231,7 +236,7 @@ fileprivate extension Compiler.ByteCodeGen {
231236
with: options
232237
)
233238
} else {
234-
return !input.isOnWordBoundary(at: pos)
239+
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
235240
}
236241
}
237242
}

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ struct MEProgram {
1616

1717
typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
1818
typealias AssertionFunction =
19-
(Input, Input.Index, Range<Input.Index>) throws -> Bool
19+
(
20+
inout Set<String.Index>?,
21+
inout String.Index?,
22+
Input,
23+
Input.Index,
24+
Range<Input.Index>
25+
) throws -> Bool
2026
typealias TransformFunction =
2127
(Input, Processor._StoredCapture) throws -> Any?
2228
typealias MatcherFunction =

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ struct Processor {
7878

7979
var storedCaptures: Array<_StoredCapture>
8080

81+
var wordIndexCache: Set<String.Index>? = nil
82+
var wordIndexMaxIndex: String.Index? = nil
83+
8184
var state: State = .inProgress
8285

8386
var failureReason: Error? = nil
@@ -401,7 +404,13 @@ extension Processor {
401404
let reg = payload.assertion
402405
let assertion = registers[reg]
403406
do {
404-
guard try assertion(input, currentPosition, subjectBounds) else {
407+
guard try assertion(
408+
&wordIndexCache,
409+
&wordIndexMaxIndex,
410+
input,
411+
currentPosition,
412+
subjectBounds
413+
) else {
405414
signalFailure()
406415
return
407416
}

Sources/_StringProcessing/Unicode/WordBreaking.swift

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,42 @@
1313
import Swift
1414

1515
extension String {
16-
func isOnWordBoundary(at i: String.Index) -> Bool {
16+
func isOnWordBoundary(
17+
at i: String.Index,
18+
using cache: inout Set<String.Index>?,
19+
_ maxIndex: inout String.Index?
20+
) -> Bool {
1721
guard i != startIndex, i != endIndex else {
1822
return true
1923
}
2024

25+
// If our index is already in our cache, then this is obviously on a
26+
// boundary.
27+
if let cache = cache, cache.contains(i) {
28+
return true
29+
}
30+
31+
// If it's not in the cache AND our max index is larger than our index, it
32+
// means this index is never on a word boundary in our string. If our index
33+
// is larger than max index, we may need to still do work to determine if
34+
// i is on a boundary. If it's equal to max index, then it should've
35+
// taken the cache path.
36+
if let maxIndex = maxIndex, i < maxIndex {
37+
return false
38+
}
39+
2140
if #available(SwiftStdlib 5.7, *) {
2241
var indices: Set<String.Index> = []
23-
var j = startIndex
42+
var j = maxIndex ?? startIndex
2443

2544
while j < endIndex, j <= i {
2645
indices.insert(j)
2746
j = _wordIndex(after: j)
2847
}
2948

49+
cache = indices
50+
maxIndex = j
51+
3052
return indices.contains(i)
3153
} else {
3254
return false

0 commit comments

Comments
 (0)