Skip to content

[string] Fix string implementation for big endian platforms #21077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 44 additions & 16 deletions stdlib/public/core/SmallString.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@
//
//===----------------------------------------------------------------------===//

// The code units in _SmallString are always stored in memory in the same order
// that they would be stored in an array. This means that on big-endian
// platforms the order of the bytes in storage is reversed compared to
// _StringObject whereas on little-endian platforms the order is the same.
//
// Memory layout:
//
// |0 1 2 3 4 5 6 7 8 9 A B C D E F| ← hexadecimal offset in bytes
// | _storage.0 | _storage.1 | ← raw bits
// | code units | | ← encoded layout
// ↑ ↑
// first (leftmost) code unit discriminator (incl. count)
//
@_fixed_layout @usableFromInline
internal struct _SmallString {
@usableFromInline
Expand Down Expand Up @@ -50,16 +63,18 @@ internal struct _SmallString {
@inlinable @inline(__always)
internal init(_ object: _StringObject) {
_internalInvariant(object.isSmall)
self.init(raw: object.rawBits)
// On big-endian platforms the byte order is the reverse of _StringObject.
let leading = object.rawBits.0.littleEndian
let trailing = object.rawBits.1.littleEndian
self.init(raw: (leading, trailing))
}

@inlinable @inline(__always)
internal init() {
self.init(raw: _StringObject(empty:()).rawBits)
self.init(_StringObject(empty:()))
}
}

// TODO
extension _SmallString {
@inlinable
internal static var capacity: Int {
Expand All @@ -72,9 +87,12 @@ extension _SmallString {
}
}

// Get an integer equivalent to the _StringObject.discriminatedObjectRawBits
// computed property.
@inlinable @inline(__always)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is worth a comment stating it's back to _StringObject's notion of a raw discriminated object. I can add that in a followup PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a brief comment. Happy to drop this additional commit if you'd rather do a comment follow up yourself.

internal var rawDiscriminatedObject: UInt64 {
return _storage.1
// Reverse the bytes on big-endian systems.
return _storage.1.littleEndian
}

@inlinable
Expand Down Expand Up @@ -107,7 +125,7 @@ extension _SmallString {
// usage: it always clears the discriminator and count (in case it's full)
@inlinable @inline(__always)
internal var zeroTerminatedRawCodeUnits: RawBitPattern {
let smallStringCodeUnitMask: UInt64 = 0x00FF_FFFF_FFFF_FFFF
let smallStringCodeUnitMask = ~UInt64(0xFF).bigEndian // zero last byte
return (self._storage.0, self._storage.1 & smallStringCodeUnitMask)
}

Expand Down Expand Up @@ -231,11 +249,12 @@ extension _SmallString {
_internalInvariant(count <= _SmallString.capacity)

let isASCII = (leading | trailing) & 0x8080_8080_8080_8080 == 0
let countAndDiscriminator = UInt64(truncatingIfNeeded: count) &<< 56
| _StringObject.Nibbles.small(isASCII: isASCII)
_internalInvariant(trailing & countAndDiscriminator == 0)
let discriminator = _StringObject.Nibbles
.small(withCount: count, isASCII: isASCII)
.littleEndian // reversed byte order on big-endian platforms
_internalInvariant(trailing & discriminator == 0)

self.init(raw: (leading, trailing | countAndDiscriminator))
self.init(raw: (leading, trailing | discriminator))
_internalInvariant(self.count == count)
}

Expand Down Expand Up @@ -295,23 +314,31 @@ extension _SmallString {
#endif

extension UInt64 {
// Fetches the `i`th byte, from least-significant to most-significant
//
// TODO: endianess awareness day
// Fetches the `i`th byte in memory order. On little-endian systems the byte
// at i=0 is the least significant byte (LSB) while on big-endian systems the
// byte at i=7 is the LSB.
@inlinable @inline(__always)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "left to right" mean, memory-order/host-endianess?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clarified the comment (memory-order).

internal func _uncheckedGetByte(at i: Int) -> UInt8 {
_internalInvariant(i >= 0 && i < MemoryLayout<UInt64>.stride)
#if _endian(big)
let shift = (7 - UInt64(truncatingIfNeeded: i)) &* 8
#else
let shift = UInt64(truncatingIfNeeded: i) &* 8
#endif
return UInt8(truncatingIfNeeded: (self &>> shift))
}

// Sets the `i`th byte, from least-significant to most-significant
//
// TODO: endianess awareness day
// Sets the `i`th byte in memory order. On little-endian systems the byte
// at i=0 is the least significant byte (LSB) while on big-endian systems the
// byte at i=7 is the LSB.
@inlinable @inline(__always)
internal mutating func _uncheckedSetByte(at i: Int, to value: UInt8) {
_internalInvariant(i >= 0 && i < MemoryLayout<UInt64>.stride)
#if _endian(big)
let shift = (7 - UInt64(truncatingIfNeeded: i)) &* 8
#else
let shift = UInt64(truncatingIfNeeded: i) &* 8
#endif
let valueMask: UInt64 = 0xFF &<< shift
self = (self & ~valueMask) | (UInt64(truncatingIfNeeded: value) &<< shift)
}
Expand All @@ -331,5 +358,6 @@ internal func _bytesToUInt64(
r = r | (UInt64(input[idx]) &<< shift)
shift = shift &+ 8
}
return r
// Convert from little-endian to host byte order.
return r.littleEndian
}
17 changes: 9 additions & 8 deletions stdlib/public/core/StringObject.swift
Original file line number Diff line number Diff line change
Expand Up @@ -516,11 +516,15 @@ extension _StringObject {

*/
extension _StringObject {
#if arch(i386) || arch(arm)
@inlinable @inline(__always)
internal init(_ small: _SmallString) {
// Small strings are encoded as _StringObjects in reverse byte order
// on big-endian platforms. This is to match the discriminator to the
// spare bits (the most significant nibble) in a pointer.
let word1 = small.rawBits.0.littleEndian
let word2 = small.rawBits.1.littleEndian
#if arch(i386) || arch(arm)
// On 32-bit, we need to unpack the small string.
let (word1, word2) = small.rawBits
let smallStringDiscriminatorAndCount: UInt64 = 0xFF00_0000_0000_0000

let leadingFour = Int(truncatingIfNeeded: word1)
Expand All @@ -532,15 +536,12 @@ extension _StringObject {
variant: .immortal(nextFour),
discriminator: smallDiscriminatorAndCount,
flags: trailingTwo)
_internalInvariant(isSmall)
}
#else
@inlinable @inline(__always)
internal init(_ small: _SmallString) {
self.init(rawValue: small.rawBits)
// On 64-bit, we copy the raw bits (to host byte order).
self.init(rawValue: (word1, word2))
#endif
_internalInvariant(isSmall)
}
#endif

@inlinable
internal static func getSmallCount(fromRaw x: UInt64) -> Int {
Expand Down