Skip to content

[stdlib] Implement String.WordView #42414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions stdlib/private/StdlibUnicodeUnittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ add_swift_target_library(swiftStdlibUnicodeUnittest ${SWIFT_STDLIB_LIBRARY_BUILD
Collation.swift
UnicodeScalarProperties.swift
GraphemeBreaking.swift
WordBreaking.swift

SWIFT_MODULE_DEPENDS StdlibUnittest
SWIFT_MODULE_DEPENDS_LINUX Glibc
Expand Down
66 changes: 66 additions & 0 deletions stdlib/private/StdlibUnicodeUnittest/WordBreaking.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Word break tests are currently only available on Darwin, awaiting a sensible
// file API...
#if _runtime(_ObjC)
import Foundation

/// Parses the text of a word break test file and appends one entry per test
/// line to `result`.
///
/// Only lines that begin with "÷" are treated as tests; all other lines are
/// skipped. On a test line, the text before the first "#" is split on spaces
/// into alternating break operators and hexadecimal scalar values.
///
/// - Parameters:
///   - data: The full text of the test file.
///   - result: Receives a `(string, words)` pair for each test line. `string`
///     contains every scalar on the line; `words` contains those scalars
///     grouped into the runs that are terminated by a "÷" operator.
func parseWordBreakTests(
  _ data: String,
  into result: inout [(String, [String])]
) {
  for line in data.split(separator: "\n") where line.hasPrefix("÷") {
    // Discard the trailing comment, then tokenize what remains.
    let testPortion = line.split(separator: "#").first!
    let tokens = testPortion.split(separator: " ")

    var fullString = ""
    var finishedWords: [String] = []
    var currentWord = ""

    // The token at position 0 is the leading operator and is not inspected.
    for (position, token) in tokens.enumerated().dropFirst() {
      if position.isMultiple(of: 2) {
        // Even positions hold a breaking operator. "÷" closes the word being
        // built; anything else (× in the test data) leaves it open.
        if token == "÷" {
          finishedWords.append(currentWord)
          currentWord = ""
        }
      } else {
        // Odd positions hold a scalar value written in hexadecimal.
        let scalar = Unicode.Scalar(UInt32(token, radix: 16)!)!

        fullString.unicodeScalars.append(scalar)
        currentWord.unicodeScalars.append(scalar)
      }
    }

    // Whatever follows the final "÷" is intentionally not recorded as a word.
    result.append((fullString, finishedWords))
  }
}

/// The parsed word break test cases.
///
/// Built on first access by handing the result of
/// `readInputFile("WordBreakTest.txt")` to `parseWordBreakTests(_:into:)`.
/// Each element pairs a test string with the list of words it is expected to
/// break into.
public let wordBreakTests: [(String, [String])] = {
  let contents = readInputFile("WordBreakTest.txt")

  var parsed: [(String, [String])] = []
  parseWordBreakTests(contents, into: &parsed)

  return parsed
}()
#endif
7 changes: 7 additions & 0 deletions stdlib/public/SwiftShims/UnicodeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);

//===----------------------------------------------------------------------===//
// Word Breaking
//===----------------------------------------------------------------------===//

// Takes a 32-bit scalar value and returns an 8-bit result; presumably this is
// the scalar's word break property.
// NOTE(review): the numeric encoding of the returned value is not visible in
// this header -- confirm it against the implementation and its Swift caller.
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getWordBreakProperty(__swift_uint32_t scalar);

//===----------------------------------------------------------------------===//
// Unicode.Scalar.Properties
//===----------------------------------------------------------------------===//
Expand Down
3 changes: 2 additions & 1 deletion stdlib/public/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ set(SWIFTLIB_ESSENTIAL
StringUTF16View.swift
StringUTF8View.swift
StringUTF8Validation.swift
StringWordBreaking.swift
Substring.swift
SwiftNativeNSArray.swift
TemporaryAllocation.swift
Expand All @@ -178,7 +179,7 @@ set(SWIFTLIB_ESSENTIAL
UnavailableStringAPIs.swift
UnicodeData.swift
UnicodeEncoding.swift
UnicodeGraphemeBreakProperty.swift
UnicodeBreakProperty.swift
UnicodeHelpers.swift
UnicodeParser.swift
UnicodeScalarProperties.swift
Expand Down
3 changes: 2 additions & 1 deletion stdlib/public/core/GroupInfo.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@
"StringUTF8View.swift",
"StringUTF8Validation.swift",
"StringUnicodeScalarView.swift",
"StringWordBreaking.swift",
"Substring.swift",
"Unicode.swift",
"UnicodeData.swift",
"UnicodeEncoding.swift",
"UnicodeGraphemeBreakProperty.swift",
"UnicodeBreakProperty.swift",
"UnicodeHelpers.swift",
"UnicodeParser.swift",
"UnicodeScalar.swift",
Expand Down
58 changes: 17 additions & 41 deletions stdlib/public/core/StringIndexValidation.swift
Original file line number Diff line number Diff line change
Expand Up @@ -300,47 +300,6 @@ extension _StringGuts {
scalarAlign(validateInclusiveSubscalarIndex(i, in: bounds)),
in: bounds)
}

/// Validates `range` and returns a range whose bounds have been rounded down
/// with `roundDownToNearestCharacter`.
///
/// When both bounds satisfy `isFastCharacterIndex`, the range is only
/// checked against `count` and is returned unchanged. Otherwise the range
/// goes through `validateSubscalarRange`, and each bound is scalar-aligned
/// and then rounded down.
internal func validateCharacterRange(
  _ range: Range<String.Index>
) -> Range<String.Index> {
  guard
    isFastCharacterIndex(range.lowerBound),
    isFastCharacterIndex(range.upperBound)
  else {
    // Slow path: full validation followed by alignment of both bounds.
    let checked = validateSubscalarRange(range)
    let lower = roundDownToNearestCharacter(scalarAlign(checked.lowerBound))
    let upper = roundDownToNearestCharacter(scalarAlign(checked.upperBound))
    return Range(_uncheckedBounds: (lower, upper))
  }

  // Fast path: only an upper bounds check is performed.
  _precondition(
    range.upperBound._encodedOffset <= count,
    "String index range is out of bounds")
  return range
}

/// Validates `range` against `bounds` and returns a range whose bounds have
/// been rounded down with `roundDownToNearestCharacter(_:in:)`.
///
/// When both bounds of `range` satisfy `isFastCharacterIndex`, the range is
/// only checked to lie within `bounds` and is returned unchanged. Otherwise
/// it goes through `validateSubscalarRange(_:in:)`, and each bound is
/// scalar-aligned and then rounded down within `bounds`.
internal func validateCharacterRange(
  _ range: Range<String.Index>,
  in bounds: Range<String.Index>
) -> Range<String.Index> {
  _internalInvariant(bounds.upperBound <= endIndex)

  guard
    isFastCharacterIndex(range.lowerBound),
    isFastCharacterIndex(range.upperBound)
  else {
    // Slow path: full validation followed by alignment of both bounds.
    let checked = validateSubscalarRange(range, in: bounds)
    let lower = roundDownToNearestCharacter(
      scalarAlign(checked.lowerBound), in: bounds)
    let upper = roundDownToNearestCharacter(
      scalarAlign(checked.upperBound), in: bounds)
    return Range(_uncheckedBounds: (lower, upper))
  }

  // Fast path: the range only has to sit inside `bounds`.
  _precondition(
    range.lowerBound >= bounds.lowerBound
      && range.upperBound <= bounds.upperBound,
    "String index range is out of bounds")
  return range
}
}

// Temporary additions to deal with binary compatibility issues with existing
Expand Down Expand Up @@ -439,3 +398,20 @@ extension _StringGuts {
scalarAlign(validateInclusiveSubscalarIndex_5_7(i)))
}
}

// Word index validation (String)
extension _StringGuts {
  /// Validates `i` with `validateSubscalarIndex`, scalar-aligns the result,
  /// and rounds it down with `roundDownToNearestWord`.
  internal func validateWordIndex(
    _ i: String.Index
  ) -> String.Index {
    let checked = validateSubscalarIndex(i)
    let aligned = scalarAlign(checked)
    return roundDownToNearestWord(aligned)
  }

  /// Same pipeline as `validateWordIndex(_:)`, except that the initial check
  /// is done by `validateInclusiveSubscalarIndex`.
  internal func validateInclusiveWordIndex(
    _ i: String.Index
  ) -> String.Index {
    let checked = validateInclusiveSubscalarIndex(i)
    let aligned = scalarAlign(checked)
    return roundDownToNearestWord(aligned)
  }
}
Loading