Skip to content

Commit 2866b4a

Browse files
committed
[string] Fix small string implementation for big endian platforms
Exclusively store small strings in little-endian byte order. This will insert byte swaps when accessing small strings on big-endian platforms; however, these are usually extremely cheap. This approach means that the layout of the code points and count in memory will be the same on both big- and little-endian machines, simplifying future development. Prior to this change, this code was broken on big-endian machines because the memory layout was different (the count ended up in the middle of the string).
1 parent 5ff0bab commit 2866b4a

File tree

2 files changed

+69
-19
lines changed

2 files changed

+69
-19
lines changed

stdlib/public/core/SmallString.swift

Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,28 +30,79 @@ func unsupportedOn32bit() -> Never { _conditionallyUnreachable() }
3030
@_fixed_layout public struct _SmallUTF8String {}
3131

3232
#else
33+
//
34+
// The low byte of the first word (low) stores the first code unit. Up to 15
35+
// such code units are encodable, with the second-highest byte of the second
36+
// word (high) being the final code unit. The high byte of the final word
37+
// stores the count.
38+
//
39+
// The low and high values are automatically stored in little-endian byte order
40+
// by the _RawBitPattern struct, which reverses the byte order as needed to
41+
// convert between the little-endian storage format and the host byte order.
42+
// The memory layout of the _RawBitPattern struct will therefore be identical
43+
// on both big- and little-endian machines. The values of 'high' and 'low'
44+
// will also appear to be identical. Routines which build, manipulate and
45+
// convert small strings should therefore always assume little-endian byte
46+
// order.
47+
//
48+
// Storage layout:
49+
//
50+
// |0 1 2 3 4 5 6 7 8 9 a b c d e f| ← offset
51+
// |      low      |     high      | ← properties
52+
// |           string            | | ← encoded layout
53+
//  ↑                             ↑
54+
//  first (low) byte              count
55+
//
56+
// Examples:
57+
// ! o l l e H 6
58+
// |H e l l o ! ░ ░ ░ ░ ░ ░ ░ ░ ░|6| → low=0x0000216f6c6c6548 high=0x0600000000000000
59+
//
60+
// W , o l l e H 13 ! d l r o
61+
// |H e l l o ,   W o r l d ! ░ ░|d| → low=0x57202c6f6c6c6548 high=0x0d000021646c726f
62+
//
3363
@_fixed_layout
3464
@usableFromInline
3565
internal struct _SmallUTF8String {
66+
@_fixed_layout
3667
@usableFromInline
37-
typealias _RawBitPattern = (low: UInt, high: UInt)
38-
39-
//
40-
// TODO: pretty ASCII art.
41-
//
42-
// TODO: endianess awareness day
43-
//
44-
// The low byte of the first word stores the first code unit. There is up to
45-
// 15 such code units encodable, with the second-highest byte of the second
46-
// word being the final code unit. The high byte of the final word stores the
47-
// count.
48-
//
68+
internal struct _RawBitPattern: Equatable {
69+
// high and low are stored in little-endian byte order
70+
@usableFromInline
71+
internal var _storage: (low: UInt, high: UInt)
72+
73+
@inlinable
74+
var low: UInt {
75+
@inline(__always) get { return _storage.low.littleEndian }
76+
@inline(__always) set { _storage.low = newValue.littleEndian }
77+
}
78+
79+
@inlinable
80+
var high: UInt {
81+
@inline(__always) get { return _storage.high.littleEndian }
82+
@inline(__always) set { _storage.high = newValue.littleEndian }
83+
}
84+
85+
@inlinable
86+
@inline(__always)
87+
init(low l: UInt, high h: UInt) {
88+
// host byte order to little-endian byte order
89+
_storage.low = l.littleEndian
90+
_storage.high = h.littleEndian
91+
}
92+
93+
@inlinable
94+
@inline(__always)
95+
static func == (lhs: _RawBitPattern, rhs: _RawBitPattern) -> Bool {
96+
return lhs._storage == rhs._storage
97+
}
98+
}
99+
49100
@usableFromInline
50-
var _storage: _RawBitPattern = (0,0)
101+
var _storage: _RawBitPattern
51102
@inlinable
52103
@inline(__always)
53104
init() {
54-
self._storage = (0,0)
105+
self._storage = _RawBitPattern(low: 0, high: 0)
55106
}
56107
}
57108
#endif // 64-bit
@@ -150,7 +201,7 @@ extension _SmallUTF8String {
150201
}
151202
high |= (UInt(count) &<< (8*15))
152203
let low = _bytesToUInt(addr, lowCount)
153-
_storage = (low, high)
204+
_storage = _RawBitPattern(low: low, high: high)
154205

155206
// FIXME: support transcoding
156207
if !self.isASCII { return nil }
@@ -585,7 +636,8 @@ extension _SmallUTF8String {
585636
extension _SmallUTF8String {
586637
@inlinable
587638
@inline(__always)
588-
init(_rawBits: _RawBitPattern) {
639+
init(_rawBits: (low: UInt, high: UInt)) {
640+
self.init()
589641
self._storage.low = _rawBits.low
590642
self._storage.high = _rawBits.high
591643
_invariantCheck()
@@ -831,8 +883,6 @@ func _castBufPtr<A, B>(
831883

832884
extension UInt {
833885
// Fetches the `i`th byte, from least-significant to most-significant
834-
//
835-
// TODO: endianess awareness day
836886
@inlinable
837887
@inline(__always)
838888
func _uncheckedGetByte(at i: Int) -> UInt8 {

stdlib/public/core/StringGuts.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ extension _StringGuts {
365365
unsupportedOn32bit()
366366
#else
367367
return _SmallUTF8String(
368-
_rawBits: (_otherBits, _object.asSmallUTF8SecondWord))
368+
_rawBits: (low: _otherBits, high: _object.asSmallUTF8SecondWord))
369369
#endif
370370
}
371371
}

0 commit comments

Comments
 (0)