Skip to content

Commit 8ff9441

Browse files
committed
Merge pull request swiftlang#530 from Azoy/words
Use word breaking SPI for \b
1 parent be98b28 commit 8ff9441

File tree

7 files changed

+199
-47
lines changed

7 files changed

+199
-47
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 53 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
115
@_implementationOnly import _RegexParser
216

317
extension Compiler {
@@ -107,12 +121,13 @@ fileprivate extension Compiler.ByteCodeGen {
107121
// need to supply both a slice bounds and a per-search bounds.
108122
switch kind {
109123
case .startOfSubject:
110-
builder.buildAssert { (input, pos, subjectBounds) in
124+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
111125
pos == subjectBounds.lowerBound
112126
}
113127

114128
case .endOfSubjectBeforeNewline:
115-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
129+
builder.buildAssert { [semanticLevel = options.semanticLevel]
130+
(_, _, input, pos, subjectBounds) in
116131
if pos == subjectBounds.upperBound { return true }
117132
switch semanticLevel {
118133
case .graphemeCluster:
@@ -125,7 +140,7 @@ fileprivate extension Compiler.ByteCodeGen {
125140
}
126141

127142
case .endOfSubject:
128-
builder.buildAssert { (input, pos, subjectBounds) in
143+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
129144
pos == subjectBounds.upperBound
130145
}
131146

@@ -138,16 +153,16 @@ fileprivate extension Compiler.ByteCodeGen {
138153

139154
// FIXME: This needs to be based on `searchBounds`,
140155
// not the `subjectBounds` given as an argument here
141-
builder.buildAssert { (input, pos, subjectBounds) in false }
156+
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
142157

143158
case .textSegment:
144-
builder.buildAssert { (input, pos, _) in
159+
builder.buildAssert { (_, _, input, pos, _) in
145160
// FIXME: Grapheme or word based on options
146161
input.isOnGraphemeClusterBoundary(pos)
147162
}
148163

149164
case .notTextSegment:
150-
builder.buildAssert { (input, pos, _) in
165+
builder.buildAssert { (_, _, input, pos, _) in
151166
// FIXME: Grapheme or word based on options
152167
!input.isOnGraphemeClusterBoundary(pos)
153168
}
@@ -158,7 +173,8 @@ fileprivate extension Compiler.ByteCodeGen {
158173
// the DSL-based `.startOfLine` anchor should always match the start
159174
// of a line. Right now we don't distinguish between those anchors.
160175
if options.anchorsMatchNewlines {
161-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
176+
builder.buildAssert { [semanticLevel = options.semanticLevel]
177+
(_, _, input, pos, subjectBounds) in
162178
if pos == subjectBounds.lowerBound { return true }
163179
switch semanticLevel {
164180
case .graphemeCluster:
@@ -168,7 +184,7 @@ fileprivate extension Compiler.ByteCodeGen {
168184
}
169185
}
170186
} else {
171-
builder.buildAssert { (input, pos, subjectBounds) in
187+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
172188
pos == subjectBounds.lowerBound
173189
}
174190
}
@@ -179,7 +195,8 @@ fileprivate extension Compiler.ByteCodeGen {
179195
// the DSL-based `.endOfLine` anchor should always match the end
180196
// of a line. Right now we don't distinguish between those anchors.
181197
if options.anchorsMatchNewlines {
182-
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
198+
builder.buildAssert { [semanticLevel = options.semanticLevel]
199+
(_, _, input, pos, subjectBounds) in
183200
if pos == subjectBounds.upperBound { return true }
184201
switch semanticLevel {
185202
case .graphemeCluster:
@@ -189,25 +206,41 @@ fileprivate extension Compiler.ByteCodeGen {
189206
}
190207
}
191208
} else {
192-
builder.buildAssert { (input, pos, subjectBounds) in
209+
builder.buildAssert { (_, _, input, pos, subjectBounds) in
193210
pos == subjectBounds.upperBound
194211
}
195212
}
196213

197214
case .wordBoundary:
198-
// TODO: May want to consider Unicode level
199-
builder.buildAssert { [options] (input, pos, subjectBounds) in
200-
// TODO: How should we handle bounds?
201-
_CharacterClassModel.word.isBoundary(
202-
input, at: pos, bounds: subjectBounds, with: options)
215+
builder.buildAssert { [options]
216+
(cache, maxIndex, input, pos, subjectBounds) in
217+
if options.usesSimpleUnicodeBoundaries {
218+
// TODO: How should we handle bounds?
219+
return _CharacterClassModel.word.isBoundary(
220+
input,
221+
at: pos,
222+
bounds: subjectBounds,
223+
with: options
224+
)
225+
} else {
226+
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
227+
}
203228
}
204229

205230
case .notWordBoundary:
206-
// TODO: May want to consider Unicode level
207-
builder.buildAssert { [options] (input, pos, subjectBounds) in
208-
// TODO: How should we handle bounds?
209-
!_CharacterClassModel.word.isBoundary(
210-
input, at: pos, bounds: subjectBounds, with: options)
231+
builder.buildAssert { [options]
232+
(cache, maxIndex, input, pos, subjectBounds) in
233+
if options.usesSimpleUnicodeBoundaries {
234+
// TODO: How should we handle bounds?
235+
return !_CharacterClassModel.word.isBoundary(
236+
input,
237+
at: pos,
238+
bounds: subjectBounds,
239+
with: options
240+
)
241+
} else {
242+
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
243+
}
211244
}
212245
}
213246
}

Sources/_StringProcessing/Engine/MEProgram.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ struct MEProgram {
1616

1717
typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
1818
typealias AssertionFunction =
19-
(Input, Input.Index, Range<Input.Index>) throws -> Bool
19+
(
20+
inout Set<String.Index>?,
21+
inout String.Index?,
22+
Input,
23+
Input.Index,
24+
Range<Input.Index>
25+
) throws -> Bool
2026
typealias TransformFunction =
2127
(Input, Processor._StoredCapture) throws -> Any?
2228
typealias MatcherFunction =

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ struct Processor {
7878

7979
var storedCaptures: Array<_StoredCapture>
8080

81+
var wordIndexCache: Set<String.Index>? = nil
82+
var wordIndexMaxIndex: String.Index? = nil
83+
8184
var state: State = .inProgress
8285

8386
var failureReason: Error? = nil
@@ -401,7 +404,13 @@ extension Processor {
401404
let reg = payload.assertion
402405
let assertion = registers[reg]
403406
do {
404-
guard try assertion(input, currentPosition, subjectBounds) else {
407+
guard try assertion(
408+
&wordIndexCache,
409+
&wordIndexMaxIndex,
410+
input,
411+
currentPosition,
412+
subjectBounds
413+
) else {
405414
signalFailure()
406415
return
407416
}

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {
311311
[.reluctantByDefault, .possessiveByDefault]
312312
}
313313

314+
// Uses level 2 Unicode word boundaries
315+
static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }
316+
314317
/// The default set of options.
315318
static var `default`: Self {
316-
[.graphemeClusterSemantics, .textSegmentGraphemeMode]
319+
[.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]
317320
}
318321
}
319322

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension String {
  /// Returns whether `i` lies on a word boundary of this string, memoizing
  /// the boundaries discovered so far in caller-owned storage.
  ///
  /// Boundaries are discovered by walking forward with `_wordIndex(after:)`
  /// and are accumulated across calls, so that revisiting an earlier position
  /// (for example after the matcher backtracks) yields the same answer as the
  /// first visit.
  ///
  /// - Parameters:
  ///   - i: The index to test.
  ///   - cache: Every boundary index recorded by previous calls for this
  ///     string, or `nil` if nothing has been scanned yet. Updated in place.
  ///   - maxIndex: The index at which the next forward scan resumes, or `nil`
  ///     if nothing has been scanned yet. Every boundary strictly below it is
  ///     already in `cache`. Updated in place.
  /// - Returns: `true` if `i` is `startIndex`, `endIndex`, or a boundary found
  ///   by the scan; `false` otherwise, including whenever the
  ///   `SwiftStdlib 5.7` availability check fails.
  func isOnWordBoundary(
    at i: String.Index,
    using cache: inout Set<String.Index>?,
    _ maxIndex: inout String.Index?
  ) -> Bool {
    // Both ends of the string are unconditionally treated as boundaries.
    guard i != startIndex, i != endIndex else {
      return true
    }

    // Anything already recorded in the cache is a known boundary.
    if let cache = cache, cache.contains(i) {
      return true
    }

    // Every boundary below `maxIndex` has been recorded, so an index in that
    // region that missed the cache is not a boundary. An index at or beyond
    // `maxIndex` has not been classified yet and falls through to the scan,
    // which resumes exactly at `maxIndex`.
    if let maxIndex = maxIndex, i < maxIndex {
      return false
    }

    if #available(SwiftStdlib 5.7, *) {
      // Extend the boundaries we already know about instead of starting from
      // an empty set. Replacing the cache with only the newly scanned indices
      // would drop earlier boundaries while `maxIndex` still claims that
      // region was fully scanned, turning later queries there into false
      // negatives.
      var indices = cache ?? []
      // Drop the caller's reference while we mutate so that the inserts below
      // are not forced to copy the whole set first.
      cache = nil
      var j = maxIndex ?? startIndex

      // NOTE(review): relies on `_wordIndex(after:)` returning the next word
      // boundary strictly after `j`; confirm against the stdlib SPI.
      while j < endIndex, j <= i {
        indices.insert(j)
        j = _wordIndex(after: j)
      }

      cache = indices
      maxIndex = j

      return indices.contains(i)
    } else {
      return false
    }
  }
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase {
271271
.ignoresCase(false)
272272
}
273273

274-
#if os(macOS)
275-
try XCTExpectFailure("Implement level 2 word boundaries") {
276-
try _testDSLCaptures(
277-
("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
278-
matchType: (Substring, Substring, Substring).self, ==) {
279-
Capture {
280-
OneOrMore(.word)
281-
Anchor.wordBoundary
282-
}
283-
OneOrMore(.any, .reluctant)
284-
"stop"
285-
" "
286-
287-
Capture {
288-
OneOrMore(.word)
289-
Anchor.wordBoundary
290-
}
291-
.wordBoundaryKind(.unicodeLevel1)
292-
OneOrMore(.any, .reluctant)
293-
"stop"
274+
try _testDSLCaptures(
275+
("can't stop won't stop", ("can't stop won't stop", "can't", "won't")),
276+
matchType: (Substring, Substring, Substring).self, ==) {
277+
Capture {
278+
OneOrMore(.word)
279+
Anchor.wordBoundary
294280
}
295-
}
296-
#endif
281+
OneOrMore(.any, .reluctant)
282+
"stop"
283+
" "
284+
285+
Capture {
286+
OneOrMore(.word)
287+
Anchor.wordBoundary
288+
}
289+
OneOrMore(.any, .reluctant)
290+
"stop"
291+
}
292+
293+
try _testDSLCaptures(
294+
("can't stop won't stop", ("can't stop won't stop", "can", "won")),
295+
matchType: (Substring, Substring, Substring).self, ==) {
296+
Capture {
297+
OneOrMore(.word)
298+
Anchor.wordBoundary
299+
}
300+
OneOrMore(.any, .reluctant)
301+
"stop"
302+
" "
303+
304+
Capture {
305+
OneOrMore(.word)
306+
Anchor.wordBoundary
307+
}
308+
.wordBoundaryKind(.unicodeLevel1)
309+
OneOrMore(.any, .reluctant)
310+
"stop"
311+
}
297312

298313
try _testDSLCaptures(
299314
("abcdef123", ("abcdef123", "a", "123")),

0 commit comments

Comments
 (0)