Skip to content

Commit 6c72aad

Browse files
authored
[stdlib] Implement String.WordView (#42414) (#59793)
* Implement String.WordView * Add isWordAligned bit * Hide WordView for now (also separate Index type) add bidirectional conformance Fix tests * Address comments from Karoy and Michael * Remove word view, use index methods * Address Karoy's comments aaa
1 parent 80790ae commit 6c72aad

File tree

14 files changed

+3148
-43
lines changed

14 files changed

+3148
-43
lines changed

stdlib/private/StdlibUnicodeUnittest/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ add_swift_target_library(swiftStdlibUnicodeUnittest ${SWIFT_STDLIB_LIBRARY_BUILD
77
Collation.swift
88
UnicodeScalarProperties.swift
99
GraphemeBreaking.swift
10+
WordBreaking.swift
1011

1112
SWIFT_MODULE_DEPENDS StdlibUnittest
1213
SWIFT_MODULE_DEPENDS_LINUX Glibc
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
// Word break tests are currently only available on Darwin, awaiting a sensible
14+
// file API...
15+
#if _runtime(_ObjC)
16+
import Foundation
17+
18+
/// Parses word break test data and appends one entry per test line to
/// `result`.
///
/// Only lines that start with "÷" are treated as tests. For such a line, the
/// text before the first "#" is split on spaces; after the leading operator,
/// the fields alternate between a hexadecimal scalar value and a break
/// operator ("÷" for a break, anything else for no break).
///
/// - Parameters:
///   - data: The full text of the test file.
///   - result: Receives `(string, words)` tuples, where `string` is built from
///     every scalar on the line and `words` holds the pieces between breaks.
func parseWordBreakTests(
  _ data: String,
  into result: inout [(String, [String])]
) {
  // Anything that does not begin with a break operator is not a test line.
  for line in data.split(separator: "\n") where line.hasPrefix("÷") {
    // Drop the trailing "#" commentary, then tokenize what is left.
    let fields = line.split(separator: "#").first!.split(separator: " ")

    var fullText = ""
    var segments: [String] = [""]

    // Position 0 is the leading operator; it never contributes a segment.
    for (position, field) in fields.enumerated().dropFirst() {
      if position % 2 != 0 {
        // Odd positions hold a scalar value written in hexadecimal.
        let value = UInt32(field, radix: 16)!
        let scalar = Unicode.Scalar(value)!

        fullText.unicodeScalars.append(scalar)
        segments[segments.endIndex - 1].unicodeScalars.append(scalar)
      } else if field == "÷" {
        // Even positions hold an operator; only "÷" opens a new segment.
        segments.append("")
      }
    }

    // Discard the last segment (the empty one opened by a trailing "÷").
    segments.removeLast()

    result.append((fullText, segments))
  }
}
56+
57+
/// The word break tests parsed from "WordBreakTest.txt".
///
/// Each element pairs a test string with the list of words it is expected to
/// break into, as produced by `parseWordBreakTests(_:into:)`.
public let wordBreakTests: [(String, [String])] = {
  var parsed: [(String, [String])] = []
  parseWordBreakTests(readInputFile("WordBreakTest.txt"), into: &parsed)
  return parsed
}()
66+
#endif

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);
6464
SWIFT_RUNTIME_STDLIB_INTERNAL
6565
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);
6666

67+
//===----------------------------------------------------------------------===//
68+
// Word Breaking
69+
//===----------------------------------------------------------------------===//
70+
71+
SWIFT_RUNTIME_STDLIB_INTERNAL
72+
__swift_uint8_t _swift_stdlib_getWordBreakProperty(__swift_uint32_t scalar);
73+
6774
//===----------------------------------------------------------------------===//
6875
// Unicode.Scalar.Properties
6976
//===----------------------------------------------------------------------===//

stdlib/public/core/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ set(SWIFTLIB_ESSENTIAL
170170
StringUTF16View.swift
171171
StringUTF8View.swift
172172
StringUTF8Validation.swift
173+
StringWordBreaking.swift
173174
Substring.swift
174175
SwiftNativeNSArray.swift
175176
TemporaryAllocation.swift
@@ -178,7 +179,7 @@ set(SWIFTLIB_ESSENTIAL
178179
UnavailableStringAPIs.swift
179180
UnicodeData.swift
180181
UnicodeEncoding.swift
181-
UnicodeGraphemeBreakProperty.swift
182+
UnicodeBreakProperty.swift
182183
UnicodeHelpers.swift
183184
UnicodeParser.swift
184185
UnicodeScalarProperties.swift

stdlib/public/core/GroupInfo.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,12 @@
4141
"StringUTF8View.swift",
4242
"StringUTF8Validation.swift",
4343
"StringUnicodeScalarView.swift",
44+
"StringWordBreaking.swift",
4445
"Substring.swift",
4546
"Unicode.swift",
4647
"UnicodeData.swift",
4748
"UnicodeEncoding.swift",
48-
"UnicodeGraphemeBreakProperty.swift",
49+
"UnicodeBreakProperty.swift",
4950
"UnicodeHelpers.swift",
5051
"UnicodeParser.swift",
5152
"UnicodeScalar.swift",

stdlib/public/core/StringIndexValidation.swift

Lines changed: 17 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -300,47 +300,6 @@ extension _StringGuts {
300300
scalarAlign(validateInclusiveSubscalarIndex(i, in: bounds)),
301301
in: bounds)
302302
}
303-
304-
/// Validates `range` and returns a range whose bounds have been rounded down
/// with `roundDownToNearestCharacter`.
///
/// When both bounds satisfy `isFastCharacterIndex`, the range is returned
/// unchanged after checking its upper bound against `count`.
internal func validateCharacterRange(
  _ range: Range<String.Index>
) -> Range<String.Index> {
  let lower = range.lowerBound
  let upper = range.upperBound

  guard isFastCharacterIndex(lower), isFastCharacterIndex(upper) else {
    // Slow path: validate, scalar-align, then round each bound down.
    let checked = validateSubscalarRange(range)
    let start = roundDownToNearestCharacter(scalarAlign(checked.lowerBound))
    let end = roundDownToNearestCharacter(scalarAlign(checked.upperBound))
    return Range(_uncheckedBounds: (start, end))
  }

  // Fast path: only the bounds check is needed.
  _precondition(upper._encodedOffset <= count,
    "String index range is out of bounds")
  return range
}
321-
322-
/// Validates `range` against `bounds` and returns a range whose bounds have
/// been rounded down with `roundDownToNearestCharacter(_:in:)`.
///
/// When both bounds of `range` satisfy `isFastCharacterIndex`, the range is
/// returned unchanged after checking that it lies within `bounds`.
internal func validateCharacterRange(
  _ range: Range<String.Index>,
  in bounds: Range<String.Index>
) -> Range<String.Index> {
  _internalInvariant(bounds.upperBound <= endIndex)

  let lower = range.lowerBound
  let upper = range.upperBound

  guard isFastCharacterIndex(lower), isFastCharacterIndex(upper) else {
    // Slow path: validate within `bounds`, scalar-align, then round each
    // bound down.
    let checked = validateSubscalarRange(range, in: bounds)
    let start = roundDownToNearestCharacter(
      scalarAlign(checked.lowerBound), in: bounds)
    let end = roundDownToNearestCharacter(
      scalarAlign(checked.upperBound), in: bounds)
    return Range(_uncheckedBounds: (start, end))
  }

  // Fast path: only the containment check is needed.
  _precondition(
    lower >= bounds.lowerBound && upper <= bounds.upperBound,
    "String index range is out of bounds")
  return range
}
344303
}
345304

346305
// Temporary additions to deal with binary compatibility issues with existing
@@ -439,3 +398,20 @@ extension _StringGuts {
439398
scalarAlign(validateInclusiveSubscalarIndex_5_7(i)))
440399
}
441400
}
401+
402+
// Word index validation (String)
403+
extension _StringGuts {
  /// Validates `i` with `validateSubscalarIndex`, scalar-aligns the result,
  /// and rounds it down with `roundDownToNearestWord`.
  internal func validateWordIndex(
    _ i: String.Index
  ) -> String.Index {
    let checked = validateSubscalarIndex(i)
    let aligned = scalarAlign(checked)
    return roundDownToNearestWord(aligned)
  }

  /// Validates `i` with `validateInclusiveSubscalarIndex`, scalar-aligns the
  /// result, and rounds it down with `roundDownToNearestWord`.
  internal func validateInclusiveWordIndex(
    _ i: String.Index
  ) -> String.Index {
    let checked = validateInclusiveSubscalarIndex(i)
    let aligned = scalarAlign(checked)
    return roundDownToNearestWord(aligned)
  }
}

0 commit comments

Comments
 (0)