Skip to content

[5.7] Use word breaking SPI for \b #549

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 53 additions & 20 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_spi(_Unicode)
import Swift

@_implementationOnly import _RegexParser

extension Compiler {
Expand Down Expand Up @@ -107,12 +121,13 @@ fileprivate extension Compiler.ByteCodeGen {
// need to supply both a slice bounds and a per-search bounds.
switch kind {
case .startOfSubject:
builder.buildAssert { (input, pos, subjectBounds) in
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}

case .endOfSubjectBeforeNewline:
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
Expand All @@ -125,7 +140,7 @@ fileprivate extension Compiler.ByteCodeGen {
}

case .endOfSubject:
builder.buildAssert { (input, pos, subjectBounds) in
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}

Expand All @@ -138,16 +153,16 @@ fileprivate extension Compiler.ByteCodeGen {

// FIXME: This needs to be based on `searchBounds`,
// not the `subjectBounds` given as an argument here
builder.buildAssert { (input, pos, subjectBounds) in false }
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }

case .textSegment:
builder.buildAssert { (input, pos, _) in
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
input.isOnGraphemeClusterBoundary(pos)
}

case .notTextSegment:
builder.buildAssert { (input, pos, _) in
builder.buildAssert { (_, _, input, pos, _) in
// FIXME: Grapheme or word based on options
!input.isOnGraphemeClusterBoundary(pos)
}
Expand All @@ -158,7 +173,8 @@ fileprivate extension Compiler.ByteCodeGen {
// the DSL-based `.startOfLine` anchor should always match the start
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.lowerBound { return true }
switch semanticLevel {
case .graphemeCluster:
Expand All @@ -168,7 +184,7 @@ fileprivate extension Compiler.ByteCodeGen {
}
}
} else {
builder.buildAssert { (input, pos, subjectBounds) in
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.lowerBound
}
}
Expand All @@ -179,7 +195,8 @@ fileprivate extension Compiler.ByteCodeGen {
// the DSL-based `.endOfLine` anchor should always match the end
// of a line. Right now we don't distinguish between those anchors.
if options.anchorsMatchNewlines {
builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in
builder.buildAssert { [semanticLevel = options.semanticLevel]
(_, _, input, pos, subjectBounds) in
if pos == subjectBounds.upperBound { return true }
switch semanticLevel {
case .graphemeCluster:
Expand All @@ -189,25 +206,41 @@ fileprivate extension Compiler.ByteCodeGen {
}
}
} else {
builder.buildAssert { (input, pos, subjectBounds) in
builder.buildAssert { (_, _, input, pos, subjectBounds) in
pos == subjectBounds.upperBound
}
}

case .wordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, subjectBounds) in
// TODO: How should we handle bounds?
_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: subjectBounds, with: options)
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return _CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}

case .notWordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { [options] (input, pos, subjectBounds) in
// TODO: How should we handle bounds?
!_CharacterClassModel.word.isBoundary(
input, at: pos, bounds: subjectBounds, with: options)
builder.buildAssert { [options]
(cache, maxIndex, input, pos, subjectBounds) in
if options.usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return !_CharacterClassModel.word.isBoundary(
input,
at: pos,
bounds: subjectBounds,
with: options
)
} else {
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
}
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion Sources/_StringProcessing/Engine/MEProgram.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@ struct MEProgram {

typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
typealias AssertionFunction =
(Input, Input.Index, Range<Input.Index>) throws -> Bool
(
inout Set<String.Index>?,
inout String.Index?,
Input,
Input.Index,
Range<Input.Index>
) throws -> Bool
typealias TransformFunction =
(Input, Processor._StoredCapture) throws -> Any?
typealias MatcherFunction =
Expand Down
11 changes: 10 additions & 1 deletion Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ struct Processor {

var storedCaptures: Array<_StoredCapture>

var wordIndexCache: Set<String.Index>? = nil
var wordIndexMaxIndex: String.Index? = nil

var state: State = .inProgress

var failureReason: Error? = nil
Expand Down Expand Up @@ -401,7 +404,13 @@ extension Processor {
let reg = payload.assertion
let assertion = registers[reg]
do {
guard try assertion(input, currentPosition, subjectBounds) else {
guard try assertion(
&wordIndexCache,
&wordIndexMaxIndex,
input,
currentPosition,
subjectBounds
) else {
signalFailure()
return
}
Expand Down
5 changes: 4 additions & 1 deletion Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,12 @@ extension MatchingOptions.Representation {
[.reluctantByDefault, .possessiveByDefault]
}

/// Uses level 2 Unicode word boundaries.
static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) }

/// The default set of options.
static var `default`: Self {
[.graphemeClusterSemantics, .textSegmentGraphemeMode]
[.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries]
}
}

Expand Down
57 changes: 57 additions & 0 deletions Sources/_StringProcessing/Unicode/WordBreaking.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_spi(_Unicode)
import Swift

extension String {
  /// Returns whether `i` is on a word boundary of this string, memoizing
  /// the boundaries discovered so far so repeated queries stay cheap.
  ///
  /// The caller owns the memoization state and must pass the same `cache`
  /// and `maxIndex` for every query against the same string.
  ///
  /// - Parameters:
  ///   - i: The index to test.
  ///   - cache: Every boundary index found by previous scans, or `nil` if
  ///     no scan has happened yet. Newly discovered boundaries are added to
  ///     it; previously discovered ones are kept.
  ///   - maxIndex: The position at which the next scan resumes. Every index
  ///     below it has already been classified (it is a boundary iff it is
  ///     in `cache`). `nil` means scanning starts at `startIndex`.
  /// - Returns: `true` if `i` is `startIndex`, `endIndex`, or one of the
  ///   indices produced by walking the string with `_wordIndex(after:)`;
  ///   otherwise `false`. Always `false` for interior indices when the
  ///   SwiftStdlib 5.7 word-breaking SPI is unavailable.
  func isOnWordBoundary(
    at i: String.Index,
    using cache: inout Set<String.Index>?,
    _ maxIndex: inout String.Index?
  ) -> Bool {
    // The two ends of the string are always boundaries.
    guard i != startIndex, i != endIndex else {
      return true
    }

    // If our index is already in our cache, then this is obviously on a
    // boundary.
    if let cache = cache, cache.contains(i) {
      return true
    }

    // If it's not in the cache AND it lies below the scanned region's upper
    // limit, the index was already classified as "not a boundary". This is
    // only sound because the cache below keeps *every* boundary found so far.
    // An index at or beyond `maxIndex` still needs scanning.
    if let maxIndex = maxIndex, i < maxIndex {
      return false
    }

    if #available(SwiftStdlib 5.7, *) {
      if cache == nil {
        cache = []
      }

      // Resume where the previous scan stopped instead of rescanning.
      var j = maxIndex ?? startIndex

      while j < endIndex, j <= i {
        // Accumulate into the existing cache. Replacing it would discard
        // boundaries found by earlier scans and make later queries at those
        // positions (e.g. after the matcher backtracks) report `false`.
        cache?.insert(j)
        j = _wordIndex(after: j)
      }

      maxIndex = j

      return cache?.contains(i) ?? false
    } else {
      return false
    }
  }
}
59 changes: 37 additions & 22 deletions Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase {
.ignoresCase(false)
}

#if os(macOS)
try XCTExpectFailure("Implement level 2 word boundaries") {
try _testDSLCaptures(
("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
matchType: (Substring, Substring, Substring).self, ==) {
Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
OneOrMore(.any, .reluctant)
"stop"
" "

Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
.wordBoundaryKind(.unicodeLevel1)
OneOrMore(.any, .reluctant)
"stop"
try _testDSLCaptures(
("can't stop won't stop", ("can't stop won't stop", "can't", "won't")),
matchType: (Substring, Substring, Substring).self, ==) {
Comment on lines +274 to +276
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI I'm disabling these again in #552 (originally #555)

Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
}
#endif
OneOrMore(.any, .reluctant)
"stop"
" "

Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
OneOrMore(.any, .reluctant)
"stop"
}

try _testDSLCaptures(
("can't stop won't stop", ("can't stop won't stop", "can", "won")),
matchType: (Substring, Substring, Substring).self, ==) {
Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
OneOrMore(.any, .reluctant)
"stop"
" "

Capture {
OneOrMore(.word)
Anchor.wordBoundary
}
.wordBoundaryKind(.unicodeLevel1)
OneOrMore(.any, .reluctant)
"stop"
}

try _testDSLCaptures(
("abcdef123", ("abcdef123", "a", "123")),
Expand Down
Loading