Skip to content

Commit ec900f9

Browse files
committed
Add isWordAligned bit
1 parent 3b4b475 commit ec900f9

File tree

4 files changed

+106
-56
lines changed

4 files changed

+106
-56
lines changed

stdlib/public/core/StringIndex.swift

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import SwiftShims
1717
String's Index has the following layout:
1818

1919
┌──────────┬────────────────╥────────────────┬───────╥───────┐
20-
│ b63:b16  │     b15:b14    ║     b13:b8     │ b7:b4 ║ b3:b0 │
20+
│ b63:b16  │     b15:b14    ║     b13:b8     │ b7:b5 ║ b4:b0 │
2121
├──────────┼────────────────╫────────────────┼───────╫───────┤
2222
│ position │ transc. offset ║ grapheme cache │ rsvd ║ flags │
2323
└──────────┴────────────────╨────────────────┴───────╨───────┘
@@ -43,7 +43,7 @@ isn't frozen.
4343
looking back at scalars preceding the index. (Substrings that don't start on a
4444
`Character` boundary heavily rely on this.)
4545

46-
- reserved: 4 unused bits available for future flags etc. The meaning of each
46+
- reserved: 3 unused bits available for future flags etc. The meaning of each
4747
bit may change between stdlib versions. These must be set to zero if
4848
constructing an index in inlinable code.
4949

@@ -70,6 +70,11 @@ isn't frozen.
7070
If set, the position is known to be expressed in UTF-16 code units.
7171
(Introduced in Swift 5.7)
7272

73+
* b4: `_isWordAligned`
74+
75+
If set, the index is known to be on a Unicode word boundary.
76+
(Introduced in Swift 5.7)
77+
7378
Before Swift 5.7, bits b1, b2 and b3 used to be part of the resilient slice. See
7479
the notes on Character Alignment and Index Encoding below to see how this works.
7580

@@ -261,6 +266,9 @@ extension String.Index {
261266
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
262267
internal static var __utf16Bit: UInt64 { 0x8 }
263268

269+
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
270+
internal static var __wordAlignmentBit: UInt64 { 0x10 }
271+
264272
@_alwaysEmitIntoClient @inline(__always) // Swift 5.7
265273
internal static func __encodingBit(utf16: Bool) -> UInt64 {
266274
let utf16 = Int8(Builtin.zext_Int1_Int8(utf16._value))
@@ -365,10 +373,35 @@ extension String.Index {
365373
}
366374
}
367375

376+
// ### Word Alignment
377+
//
378+
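// An index is word aligned when it is known to be on a Unicode word boundary; marking an index as word aligned also sets its character and scalar alignment bits.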
379+
extension String.Index {
380+
@_alwaysEmitIntoClient // Swift 5.7
381+
@inline(__always)
382+
internal var _isWordAligned: Bool {
383+
_rawBits & Self.__wordAlignmentBit != 0
384+
}
385+
386+
@_alwaysEmitIntoClient // Swift 5.7
387+
@inline(__always)
388+
internal var _wordAligned: String.Index {
389+
let r = _rawBits
390+
| Self.__wordAlignmentBit
391+
| Self.__characterAlignmentBit
392+
| Self.__scalarAlignmentBit
393+
let idx = Self(r)
394+
idx._invariantCheck()
395+
return idx
396+
}
397+
}
398+
368399
extension String.Index {
369400
@_alwaysEmitIntoClient // Swift 5.7
370401
internal func _copyingAlignment(from index: Self) -> Self {
371-
let mask = Self.__scalarAlignmentBit | Self.__characterAlignmentBit
402+
let mask = Self.__scalarAlignmentBit
403+
| Self.__characterAlignmentBit
404+
| Self.__wordAlignmentBit
372405
return Self((_rawBits & ~mask) | (index._rawBits & mask))
373406
}
374407
}

stdlib/public/core/StringIndexValidation.swift

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ extension _StringGuts {
2121
internal func isFastCharacterIndex(_ i: String.Index) -> Bool {
2222
hasMatchingEncoding(i) && i._isCharacterAligned
2323
}
24+
25+
@_alwaysEmitIntoClient @inline(__always)
26+
internal func isFastWordIndex(_ i: String.Index) -> Bool {
27+
hasMatchingEncoding(i) && i._isWordAligned
28+
}
2429
}
2530

2631
// Subscalar index validation (UTF-8 & UTF-16 views)
@@ -444,13 +449,21 @@ extension _StringGuts {
444449
extension _StringGuts {
445450
@available(SwiftStdlib 5.7, *)
446451
internal func validateWordIndex(_ i: String.Index) -> String.Index {
447-
// TODO: Maybe fast word index bit?
452+
if isFastWordIndex(i) {
453+
_precondition(i._encodedOffset < count, "String index is out of bounds")
454+
return i
455+
}
456+
448457
return roundDownToNearestWord(scalarAlign(validateSubscalarIndex(i)))
449458
}
450459

451460
@available(SwiftStdlib 5.7, *)
452461
internal func validateInclusiveWordIndex(_ i: String.Index) -> String.Index {
453-
// TODO: Maybe fast word index bit?
462+
if isFastWordIndex(i) {
463+
_precondition(i._encodedOffset < count, "String index is out of bounds")
464+
return i
465+
}
466+
454467
return roundDownToNearestCharacter(
455468
scalarAlign(validateInclusiveSubscalarIndex(i))
456469
)

stdlib/public/core/StringWordBreaking.swift

Lines changed: 53 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,14 @@ extension _StringGuts {
1717
_internalInvariant(hasMatchingEncoding(i))
1818
_internalInvariant(i._encodedOffset <= count)
1919

20+
if _fastPath(i._isWordAligned) {
21+
return i
22+
}
23+
2024
let offset = i._encodedOffset
2125

2226
if offset == 0 || offset == count {
23-
return i
27+
return i._wordAligned
2428
}
2529

2630
return _slowRoundDownToNearestWord(i)
@@ -33,15 +37,15 @@ extension _StringGuts {
3337

3438
let offset = i._encodedOffset
3539
let start = offset &- words._uncheckedIndex(before: i)._encodedOffset
36-
let startIndex = String.Index(_encodedOffset: start)._scalarAligned
40+
let startIndex = String.Index(_encodedOffset: start)._wordAligned
3741
let stride = words._uncheckedIndex(after: startIndex)._encodedOffset
3842
_internalInvariant(offset <= start + stride, "Word breaking inconsistency")
3943

4044
if offset >= start + stride {
41-
return i
45+
return i._wordAligned
4246
}
4347

44-
let r = String.Index(_encodedOffset: start)._scalarAligned
48+
let r = String.Index(_encodedOffset: start)._wordAligned
4549
return markEncoding(r)
4650
}
4751
}
@@ -170,7 +174,7 @@ extension String._WordView {
170174

171175
// WB3a and WB3b
172176
case (.newlineCRLF, _),
173-
(_, .newlineCRLF):
177+
(_, .newlineCRLF):
174178
return true
175179

176180
// WB3c
@@ -183,8 +187,8 @@ extension String._WordView {
183187

184188
// WB4
185189
case (_, .format),
186-
(_, .extend),
187-
(_, .zwj):
190+
(_, .extend),
191+
(_, .zwj):
188192
if x != .format && x != .extend && x != .zwj {
189193
state.previousProperty = x
190194
}
@@ -212,9 +216,9 @@ extension String._WordView {
212216
switch (x, y) {
213217
// WB5
214218
case (.aLetter, .aLetter),
215-
(.aLetter, .hebrewLetter),
216-
(.hebrewLetter, .aLetter),
217-
(.hebrewLetter, .hebrewLetter):
219+
(.aLetter, .hebrewLetter),
220+
(.hebrewLetter, .aLetter),
221+
(.hebrewLetter, .hebrewLetter):
218222
return false
219223

220224
// WB6
@@ -276,12 +280,12 @@ extension String._WordView {
276280

277281
// WB9
278282
case (.aLetter, .numeric),
279-
(.hebrewLetter, .numeric):
283+
(.hebrewLetter, .numeric):
280284
return false
281285

282286
// WB10
283287
case (.numeric, .aLetter),
284-
(.numeric, .hebrewLetter):
288+
(.numeric, .hebrewLetter):
285289
return false
286290

287291
// WB11
@@ -314,17 +318,17 @@ extension String._WordView {
314318

315319
// WB13a
316320
case (.aLetter, .extendNumLet),
317-
(.hebrewLetter, .extendNumLet),
318-
(.numeric, .extendNumLet),
319-
(.katakana, .extendNumLet),
320-
(.extendNumLet, .extendNumLet):
321+
(.hebrewLetter, .extendNumLet),
322+
(.numeric, .extendNumLet),
323+
(.katakana, .extendNumLet),
324+
(.extendNumLet, .extendNumLet):
321325
return false
322326

323327
// WB13b
324328
case (.extendNumLet, .aLetter),
325-
(.extendNumLet, .hebrewLetter),
326-
(.extendNumLet, .numeric),
327-
(.extendNumLet, .katakana):
329+
(.extendNumLet, .hebrewLetter),
330+
(.extendNumLet, .numeric),
331+
(.extendNumLet, .katakana):
328332
return false
329333

330334
// WB15
@@ -370,7 +374,7 @@ extension String._WordView {
370374

371375
// WB3a and WB3b
372376
case (.newlineCRLF, _),
373-
(_, .newlineCRLF):
377+
(_, .newlineCRLF):
374378
return true
375379

376380
// WB3c
@@ -383,8 +387,8 @@ extension String._WordView {
383387

384388
// WB4
385389
case (.format, _),
386-
(.extend, _),
387-
(.zwj, _):
390+
(.extend, _),
391+
(.zwj, _):
388392
if y != .format && y != .extend && y != .zwj {
389393
state.previousProperty = y
390394
state.previousIndex = state.index
@@ -394,8 +398,8 @@ extension String._WordView {
394398

395399
// WB4
396400
case (_, .format),
397-
(_, .extend),
398-
(_, .zwj):
401+
(_, .extend),
402+
(_, .zwj):
399403
if state.previousProperty != nil {
400404
fallthrough
401405
}
@@ -426,18 +430,18 @@ extension String._WordView {
426430

427431
// WB5
428432
case (.aLetter, .aLetter),
429-
(.aLetter, .hebrewLetter),
430-
(.hebrewLetter, .aLetter),
431-
(.hebrewLetter, .hebrewLetter):
433+
(.aLetter, .hebrewLetter),
434+
(.hebrewLetter, .aLetter),
435+
(.hebrewLetter, .hebrewLetter):
432436
state.previousIndex = nil
433437
return false
434438

435439
// WB6
436440
case (.aLetter, .midLetter),
437-
(.hebrewLetter, .midLetter),
438-
(.aLetter, .midNumLet),
439-
(.hebrewLetter, .midNumLet),
440-
(.aLetter, .singleQuote):
441+
(.hebrewLetter, .midLetter),
442+
(.aLetter, .midNumLet),
443+
(.hebrewLetter, .midNumLet),
444+
(.aLetter, .singleQuote):
441445
if let constraint = state.constraint {
442446
if constraint.question == .requireAHLetter {
443447
state.constraint = nil
@@ -453,11 +457,11 @@ extension String._WordView {
453457

454458
// WB7
455459
case (.midLetter, .aLetter),
456-
(.midLetter, .hebrewLetter),
457-
(.midNumLet, .aLetter),
458-
(.midNumLet, .hebrewLetter),
459-
(.singleQuote, .aLetter),
460-
(.singleQuote, .hebrewLetter):
460+
(.midLetter, .hebrewLetter),
461+
(.midNumLet, .aLetter),
462+
(.midNumLet, .hebrewLetter),
463+
(.singleQuote, .aLetter),
464+
(.singleQuote, .hebrewLetter):
461465
state.constraint = (question: .requireAHLetter, index: state.index)
462466

463467
return false
@@ -495,28 +499,28 @@ extension String._WordView {
495499

496500
// WB9
497501
case (.aLetter, .numeric),
498-
(.hebrewLetter, .numeric):
502+
(.hebrewLetter, .numeric):
499503
state.previousIndex = nil
500504
return false
501505

502506
// WB10
503507
case (.numeric, .aLetter),
504-
(.numeric, .hebrewLetter):
508+
(.numeric, .hebrewLetter):
505509
state.previousIndex = nil
506510
return false
507511

508512
// WB11
509513
case (.midNum, .numeric),
510-
(.midNumLet, .numeric),
511-
(.singleQuote, .numeric):
514+
(.midNumLet, .numeric),
515+
(.singleQuote, .numeric):
512516
state.constraint = (question: .requireNumeric, index: state.index)
513517

514518
return false
515519

516520
// WB12
517521
case (.numeric, .midNum),
518-
(.numeric, .midNumLet),
519-
(.numeric, .singleQuote):
522+
(.numeric, .midNumLet),
523+
(.numeric, .singleQuote):
520524
if let constraint = state.constraint {
521525
if constraint.question == .requireNumeric {
522526
state.constraint = nil
@@ -537,18 +541,18 @@ extension String._WordView {
537541

538542
// WB13a
539543
case (.aLetter, .extendNumLet),
540-
(.hebrewLetter, .extendNumLet),
541-
(.numeric, .extendNumLet),
542-
(.katakana, .extendNumLet),
543-
(.extendNumLet, .extendNumLet):
544+
(.hebrewLetter, .extendNumLet),
545+
(.numeric, .extendNumLet),
546+
(.katakana, .extendNumLet),
547+
(.extendNumLet, .extendNumLet):
544548
state.previousIndex = nil
545549
return false
546550

547551
// WB13b
548552
case (.extendNumLet, .aLetter),
549-
(.extendNumLet, .hebrewLetter),
550-
(.extendNumLet, .numeric),
551-
(.extendNumLet, .katakana):
553+
(.extendNumLet, .hebrewLetter),
554+
(.extendNumLet, .numeric),
555+
(.extendNumLet, .katakana):
552556
state.previousIndex = nil
553557
return false
554558

stdlib/public/core/StringWordView.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ extension String._WordView: Collection {
5353
internal func _uncheckedIndex(after i: Index) -> Index {
5454
_internalInvariant(_guts.hasMatchingEncoding(i))
5555
_internalInvariant(i < endIndex)
56-
_internalInvariant(i._isScalarAligned)
56+
_internalInvariant(i._isWordAligned)
5757

5858
if _slowPath(_guts.isForeign) {
5959
return _foreignIndex(after: i)
@@ -72,7 +72,7 @@ extension String._WordView: Collection {
7272
}
7373
}
7474

75-
let nextIndex = String.Index(_encodedOffset: nextOffset)._scalarAligned
75+
let nextIndex = String.Index(_encodedOffset: nextOffset)._wordAligned
7676
return _guts.markEncoding(nextIndex)
7777
}
7878

0 commit comments

Comments
 (0)