Skip to content

Commit a6a31b0

Browse files
authored
Initial Unicode module (#65)
Initial Unicode module
1 parent 2637691 commit a6a31b0

16 files changed

+260
-1
lines changed

Package.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ let package = Package(
1616
.library(
1717
name: "_MatchingEngine",
1818
targets: ["_MatchingEngine"]),
19+
.library(
20+
name: "_Unicode",
21+
targets: ["_Unicode"]),
1922
.executable(
2023
name: "VariadicsGenerator",
2124
targets: ["VariadicsGenerator"])
@@ -31,13 +34,16 @@ let package = Package(
3134
dependencies: ["_StringProcessing"]),
3235
.target(
3336
name: "_MatchingEngine",
34-
dependencies: []),
37+
dependencies: [/*"_Unicode"*/]),
3538
.testTarget(
3639
name: "MatchingEngineTests",
3740
dependencies: ["_MatchingEngine"]),
3841
.target(
3942
name: "_StringProcessing",
4043
dependencies: ["_MatchingEngine"]),
44+
.target(
45+
name: "_Unicode",
46+
dependencies: []),
4147
.testTarget(
4248
name: "RegexTests",
4349
dependencies: ["_StringProcessing"]),
@@ -59,6 +65,9 @@ let package = Package(
5965
.testTarget(
6066
name: "AlgorithmsTests",
6167
dependencies: ["Algorithms"]),
68+
.testTarget(
69+
name: "UnicodeTests",
70+
dependencies: ["_Unicode"]),
6271

6372
// MARK: Scripts
6473
.executableTarget(

Sources/_Unicode/CaseConversion.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/CharacterProps.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/Comparison.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/Decoding.swift

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
3+
Provide very low-level interfaces for scalar decoding.
4+
5+
These can be faster if we assume certain invariants are
6+
maintained. We assert, of course, because we're not monsters.
7+
8+
Thus they are unsafe in the following senses:
9+
10+
- They assume validly encoded contents, otherwise UB
11+
- They assume any pointers passed in will be live and valid
12+
during execution and not concurrently written to, otherwise UB
13+
- They assume any pointer passed in has sufficient bounds
14+
for decoding a scalar, otherwise UB.
15+
16+
String maintains these invariants for its in-memory storage.
17+
18+
*/
19+
20+
21+
// TODO: Design an "unsafe" and "assumingValid" API convention
22+
23+
/// Namespace for low-level UTF-8 scalar decoding that skips validation.
///
/// Every entry point assumes validly encoded UTF-8 and, for the buffer-based
/// overloads, that every code unit it touches is in bounds. Violations are
/// only asserted via `_internalInvariant`; they are not otherwise handled.
///
/// The members are `static` because this enum has no cases: no value of it
/// can exist, so instance methods could never be called.
enum UnsafeAssumingValidUTF8 {
  /// Decodes a scalar encoded as a single (ASCII) code unit.
  @inlinable @inline(__always)
  public static func decode(_ x: UInt8) -> Unicode.Scalar {
    _internalInvariant(UTF8.isASCII(x))
    return Unicode.Scalar(_unchecked: UInt32(x))
  }

  /// Decodes a scalar encoded as two code units: lead byte `x` followed by
  /// continuation byte `y`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 2)
    _internalInvariant(UTF8.isContinuation(y))
    let x = UInt32(x)
    // A two-byte lead carries 5 payload bits; each continuation carries 6.
    let value = ((x & 0b0001_1111) &<< 6) | continuationPayload(y)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a scalar encoded as three code units: lead byte `x` followed by
  /// continuation bytes `y` and `z`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 3)
    _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
    let x = UInt32(x)
    // A three-byte lead carries 4 payload bits.
    let value = ((x & 0b0000_1111) &<< 12)
      | (continuationPayload(y) &<< 6)
      | continuationPayload(z)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a scalar encoded as four code units: lead byte `x` followed by
  /// continuation bytes `y`, `z` and `w`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 4)
    _internalInvariant(
      UTF8.isContinuation(y) && UTF8.isContinuation(z)
      && UTF8.isContinuation(w))
    let x = UInt32(x)
    // A four-byte lead has the form 11110xxx, so bit 3 is always clear and
    // this 4-bit mask extracts the same value as a 3-bit one would.
    let value = ((x & 0b0000_1111) &<< 18)
      | (continuationPayload(y) &<< 12)
      | (continuationPayload(z) &<< 6)
      | continuationPayload(w)
    return Unicode.Scalar(_unchecked: value)
  }

  // Also, assuming we can load from those bounds...

  /// Decodes the scalar whose lead byte is at offset `i` of `utf8`.
  ///
  /// - Returns: The decoded scalar and the number of code units it occupies.
  @inlinable
  public static func decode(
    _ utf8: UnsafeByteBuffer, startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let cu0 = utf8[_unchecked: i]
    let len = scalarLength(cu0)
    switch len {
    case 1: return (decode(cu0), len)
    case 2: return (decode(cu0, utf8[_unchecked: i &+ 1]), len)
    case 3: return (decode(
      cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len)
    case 4:
      return (decode(
        cu0,
        utf8[_unchecked: i &+ 1],
        utf8[_unchecked: i &+ 2],
        utf8[_unchecked: i &+ 3]),
              len)
    default:
      fatalError("unreachable")//Builtin.unreachable()
    }
  }

  /// Decodes the scalar that ends immediately before offset `i` of `utf8`,
  /// i.e. whose last code unit is at `i - 1`.
  ///
  /// - Returns: The decoded scalar and the number of code units it occupies.
  @inlinable
  public static func decode(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let len = scalarLength(utf8, endingAt: i)
    let (scalar, scalarLen) = decode(utf8, startingAt: i &- len)
    _internalInvariant(len == scalarLen)
    return (scalar, len)
  }

  /// Returns the encoded length, in code units, of the scalar introduced by
  /// lead byte `x`. `x` must not be a continuation byte.
  @inlinable @inline(__always)
  public static func scalarLength(_ x: UInt8) -> Int {
    _internalInvariant(!UTF8.isContinuation(x))
    if UTF8.isASCII(x) { return 1 }
    // TODO(String micro-performance): check codegen
    // The count of leading one bits in a multi-byte lead is the length.
    return (~x).leadingZeroBitCount
  }

  /// Returns the encoded length of the scalar that ends immediately before
  /// offset `i`, found by walking backwards over continuation bytes.
  @inlinable @inline(__always)
  public static func scalarLength(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> Int {
    var len = 1
    while UTF8.isContinuation(utf8[_unchecked: i &- len]) {
      len &+= 1
    }
    _internalInvariant(len == scalarLength(utf8[i &- len]))
    return len
  }

  /// Returns the low six (payload) bits of continuation byte `x`.
  @inlinable @inline(__always)
  public static func continuationPayload(_ x: UInt8) -> UInt32 {
    return UInt32(x & 0x3F)
  }

  /// Returns the offset of the nearest non-continuation byte at or before
  /// `idx`. `idx == utf8.count` is returned unchanged.
  @inlinable
  public static func scalarAlign(
    _ utf8: UnsafeByteBuffer, _ idx: Int
  ) -> Int {
    guard _fastPath(idx != utf8.count) else { return idx }

    var i = idx
    while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) {
      i &-= 1
      _internalInvariant(i >= 0,
        "Malformed contents: starts with continuation byte")
    }
    return i
  }
}
144+
145+
// TODO: Validating versions that remove that aspect of
146+
// unsafety. Stdlib has stuff on _StringGuts that could be
147+
// at least partially refactored.
148+
149+
// TODO: Consider UTF-16 support, but that's normally best
150+
// handled as a transcoding concern.
151+
152+

Sources/_Unicode/Encodings.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
2+
// TODO

Sources/_Unicode/Formatting.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/Graphemes.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/NecessaryEvils.swift

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
3+
Pull in shims and other things so our code is closer to the
4+
stdlib's and vice versa. Long term, we'll want to address
5+
each of these
6+
7+
*/
8+
9+
/// Stand-in for the standard library's internal invariant check.
///
/// Forwards to `assert`, passing along the caller's source location so a
/// failure is reported where the invariant was stated rather than inside
/// this shim.
///
/// - Parameters:
///   - b: The condition that is expected to hold.
///   - s: Message to report if the condition is false.
///   - file: Source file of the call site; defaults to the caller's file.
///   - line: Source line of the call site; defaults to the caller's line.
func _internalInvariant(
  _ b: @autoclosure () -> Bool, _ s: String = "",
  file: StaticString = #file, line: UInt = #line
) {
  assert(b(), s, file: file, line: line)
}
14+
15+
// Don't use UnsafeRawBufferPointer for anything important
16+
/// A bare pointer-plus-count view over a run of bytes.
///
/// The buffer does not own or keep alive the memory it points at.
public struct UnsafeByteBuffer {
  /// Base address that byte offsets are measured from.
  var pointer: UnsafeRawPointer
  /// Exclusive upper bound for valid offsets, as enforced by `boundsCheck`.
  var count: Int

  /// Returns whether `idx` lies within `0..<count`.
  ///
  /// Negative offsets are rejected as well as offsets past the end, so a
  /// backwards walk that underflows is caught instead of being loaded from.
  func boundsCheck(_ idx: Int) -> Bool {
    idx >= 0 && idx < count
  }

  /// Loads the byte at `idx`, bounds-checked only by `assert`.
  subscript(_unchecked idx: Int) -> UInt8 {
    assert(boundsCheck(idx))
    return pointer.load(fromByteOffset: idx, as: UInt8.self)
  }

  /// Loads the byte at `idx`, trapping if `idx` is out of bounds.
  subscript(idx: Int) -> UInt8 {
    precondition(boundsCheck(idx))
    return self[_unchecked: idx]
  }
}
33+
34+
extension Unicode.Scalar {
  /// Creates a scalar from `v`, which the caller asserts is a valid Unicode
  /// scalar value.
  ///
  /// This shim goes through the failable initializer and force-unwraps the
  /// result, so an invalid value traps.
  init(_unchecked v: UInt32) {
    let converted: Unicode.Scalar? = Unicode.Scalar(v)
    self = converted!
  }
}
39+
40+
// TODO: This actually might be good module API fodder
41+
extension UTF16 {
  /// Combines a UTF-16 surrogate pair into the scalar it encodes.
  ///
  /// - Parameters:
  ///   - lead: The lead (high) surrogate code unit.
  ///   - trail: The trail (low) surrogate code unit.
  /// - Returns: The scalar `0x10000 + (lead payload << 10 | trail payload)`,
  ///   where each payload is the low ten bits of its code unit.
  //@inlinable @inline(__always)
  internal static func _decodeSurrogates(
    _ lead: CodeUnit,
    _ trail: CodeUnit
  ) -> Unicode.Scalar {
    _internalInvariant(isLeadSurrogate(lead))
    _internalInvariant(isTrailSurrogate(trail))
    let upperTen = UInt32(lead & 0x03ff) &<< 10
    let lowerTen = UInt32(trail & 0x03ff)
    let offset = upperTen | lowerTen
    return Unicode.Scalar(_unchecked: 0x10000 + offset)
  }
}

Sources/_Unicode/Normaliation.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/NumberParsing.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/ScalarProps.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/Transcoding.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
2+
// TODO

Sources/_Unicode/UCD.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Sources/_Unicode/Validation.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
// TODO
3+

Tests/UnicodeTests/Decoding.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
import XCTest
3+
import _Unicode
4+
5+
/// Test case for the `_Unicode` module; test methods are added in extensions.
class UnicodeTests: XCTestCase {
}

extension UnicodeTests {
  /// Placeholder for the unsafe decoding tests; it has no assertions yet.
  func testUnsafeDecoding() {
    // ...
  }
}

0 commit comments

Comments
 (0)