Skip to content

Commit 9cb337f

Browse files
committed
Use word breaking SPI for \b
1 parent 290ce10 commit 9cb337f

File tree

5 files changed

+131
-33
lines changed

5 files changed

+131
-33
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
115
@_implementationOnly import _RegexParser
216

317
extension Compiler {
@@ -192,19 +206,33 @@ fileprivate extension Compiler.ByteCodeGen {
192206
}
193207

194208
case .wordBoundary:
195-
// TODO: May want to consider Unicode level
196209
builder.buildAssert { [options] (input, pos, subjectBounds) in
197-
// TODO: How should we handle bounds?
198-
_CharacterClassModel.word.isBoundary(
199-
input, at: pos, bounds: subjectBounds, with: options)
210+
if options.usesSimpleUnicodeBoundaries {
211+
// TODO: How should we handle bounds?
212+
return _CharacterClassModel.word.isBoundary(
213+
input,
214+
at: pos,
215+
bounds: subjectBounds,
216+
with: options
217+
)
218+
} else {
219+
return input.isOnWordBoundary(at: pos)
220+
}
200221
}
201222

202223
case .notWordBoundary:
203-
// TODO: May want to consider Unicode level
204224
builder.buildAssert { [options] (input, pos, subjectBounds) in
205-
// TODO: How should we handle bounds?
206-
!_CharacterClassModel.word.isBoundary(
207-
input, at: pos, bounds: subjectBounds, with: options)
225+
if options.usesSimpleUnicodeBoundaries {
226+
// TODO: How should we handle bounds?
227+
return !_CharacterClassModel.word.isBoundary(
228+
input,
229+
at: pos,
230+
bounds: subjectBounds,
231+
with: options
232+
)
233+
} else {
234+
return !input.isOnWordBoundary(at: pos)
235+
}
208236
}
209237
}
210238
}

Sources/_StringProcessing/MatchingOptions.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,12 @@ extension MatchingOptions.Representation {
311311
[.reluctantByDefault, .possessiveByDefault]
312312
}
313313

314+
// Uses level 2 Unicode word boundaries
315+
static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }
316+
314317
/// The default set of options.
315318
static var `default`: Self {
316-
[.graphemeClusterSemantics, .textSegmentGraphemeMode]
319+
[.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]
317320
}
318321
}
319322

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension String {
  /// Reports whether `i` falls on a word boundary of this string.
  ///
  /// `startIndex` and `endIndex` are always treated as boundaries. Any other
  /// position is checked by stepping through the string with the standard
  /// library's `_wordIndex(after:)` SPI, starting at `startIndex`, and seeing
  /// whether the walk lands exactly on `i`.
  ///
  /// The walk always restarts from `startIndex`, so the cost of a call grows
  /// with the distance of `i` from the start of the string.
  ///
  /// - Parameter i: An index into this string.
  /// - Returns: `true` if `i` is the start, the end, or a position reached by
  ///   the word-by-word walk; `false` otherwise. Interior positions always
  ///   yield `false` when the `SwiftStdlib 5.7` availability check fails.
  func isOnWordBoundary(at i: String.Index) -> Bool {
    // The two ends of the string count as boundaries unconditionally.
    guard i != startIndex, i != endIndex else {
      return true
    }

    // The word-walking call below is gated on this availability check; when
    // it fails, no interior position is reported as a boundary.
    guard #available(SwiftStdlib 5.7, *) else {
      return false
    }

    // Advance one word at a time until reaching or passing `i`. Comparing the
    // cursor directly gives the same answer as collecting every visited index
    // in a `Set` and testing membership, without the allocation and hashing.
    // The `endIndex` bound keeps the walk from stepping past the string when
    // `i` lies beyond it.
    var cursor = startIndex
    while cursor < endIndex, cursor < i {
      cursor = _wordIndex(after: cursor)
    }

    return cursor == i
  }
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase {
271271
.ignoresCase(false)
272272
}
273273

274-
#if os(macOS)
275-
try XCTExpectFailure("Implement level 2 word boundaries") {
276-
try _testDSLCaptures(
277-
("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
278-
matchType: (Substring, Substring, Substring).self, ==) {
279-
Capture {
280-
OneOrMore(.word)
281-
Anchor.wordBoundary
282-
}
283-
OneOrMore(.any, .reluctant)
284-
"stop"
285-
" "
286-
287-
Capture {
288-
OneOrMore(.word)
289-
Anchor.wordBoundary
290-
}
291-
.wordBoundaryKind(.unicodeLevel1)
292-
OneOrMore(.any, .reluctant)
293-
"stop"
274+
try _testDSLCaptures(
275+
("can't stop won't stop", ("can't stop won't stop", "can't", "won't")),
276+
matchType: (Substring, Substring, Substring).self, ==) {
277+
Capture {
278+
OneOrMore(.word)
279+
Anchor.wordBoundary
294280
}
295-
}
296-
#endif
281+
OneOrMore(.any, .reluctant)
282+
"stop"
283+
" "
284+
285+
Capture {
286+
OneOrMore(.word)
287+
Anchor.wordBoundary
288+
}
289+
OneOrMore(.any, .reluctant)
290+
"stop"
291+
}
292+
293+
try _testDSLCaptures(
294+
("can't stop won't stop", ("can't stop won't stop", "can", "won")),
295+
matchType: (Substring, Substring, Substring).self, ==) {
296+
Capture {
297+
OneOrMore(.word)
298+
Anchor.wordBoundary
299+
}
300+
OneOrMore(.any, .reluctant)
301+
"stop"
302+
" "
303+
304+
Capture {
305+
OneOrMore(.word)
306+
Anchor.wordBoundary
307+
}
308+
.wordBoundaryKind(.unicodeLevel1)
309+
OneOrMore(.any, .reluctant)
310+
"stop"
311+
}
297312

298313
try _testDSLCaptures(
299314
("abcdef123", ("abcdef123", "a", "123")),

Tests/RegexTests/MatchTests.swift

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,23 @@ extension RegexTests {
10691069
("Sol Cafe", nil), xfail: true)
10701070
}
10711071

1072+
/// Exercises `\b` and `\B` in patterns that carry no option flags, on inputs
/// such as emoji (including skin-tone and flag sequences), currency and
/// number text, kana, Hangul and Han text next to Latin words, a CR-LF pair,
/// and punctuation inside words (`Re:Zero`, `can't`).
///
/// Each `firstMatchTest` call gives a pattern, an input, and the substring
/// the first match is expected to cover.
func testLevel2WordBoundaries() {
1073+
// MARK: Level 2 Word Boundaries
1074+
firstMatchTest(#"\b😊\b"#, input: "🔥😊👍", match: "😊")
1075+
firstMatchTest(#"\b👨🏽\b"#, input: "👩🏻👶🏿👨🏽🧑🏾👩🏼", match: "👨🏽")
1076+
firstMatchTest(#"\b🇺🇸\b"#, input: "🇨🇦🇺🇸🇲🇽", match: "🇺🇸")
1077+
firstMatchTest(#"\b.+\b"#, input: "€1 234,56", match: "€1 234,56")
1078+
firstMatchTest(#"〱\B㋞\Bツ"#, input: "〱㋞ツ", match: "〱㋞ツ")
1079+
firstMatchTest(#"\bhello\b"#, input: "hello〱㋞ツ", match: "hello")
1080+
firstMatchTest(#"\bChicago\b"#, input: "나는 Chicago에 산다", match: "Chicago")
1081+
firstMatchTest(#"\blove\b"#, input: "眼睛love食物", match: "love")
1082+
firstMatchTest(#"\b\u{d}\u{a}\b"#, input: "\u{d}\u{a}", match: "\u{d}\u{a}")
1083+
firstMatchTest(#"\bㅋㅋㅋ\b"#, input: "아니ㅋㅋㅋ네", match: "ㅋㅋㅋ")
1084+
firstMatchTest(#"Re\B\:\BZero"#, input: "Re:Zero Starting Life in Another World", match: "Re:Zero")
1085+
firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't")
1086+
firstMatchTest(#"\b÷\b"#, input: "3 ÷ 3 = 1", match: "÷")
1087+
}
1088+
10721089
func testMatchGroups() {
10731090
// MARK: Groups
10741091

@@ -1475,12 +1492,12 @@ extension RegexTests {
14751492
#"(?W)abcd\b.+"#,
14761493
("abcd ef", true),
14771494
("abcdef", false),
1478-
("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII
1495+
("abcdéf", false))
14791496
matchTest(
14801497
#"(?P)abcd\b.+"#,
14811498
("abcd ef", true),
14821499
("abcdef", false),
1483-
("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII
1500+
("abcdéf", false))
14841501

14851502
// 'S' ASCII-only spaces
14861503
matchTest(

0 commit comments

Comments
 (0)