Skip to content

Commit 4455226

Browse files
author
Lance Parker
committed
fast/foreignNormalize functions
1 parent 5c2d6ac commit 4455226

File tree

3 files changed

+294
-1
lines changed

3 files changed

+294
-1
lines changed

stdlib/public/core/StringNormalization.swift

Lines changed: 255 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import SwiftShims
1414

1515
internal enum _Normalization {
16-
1716
// ICU's NFC unorm2 instance
1817
//
1918
// TODO(String performance): Should we cache one on TLS? Is this an expensive
@@ -102,3 +101,258 @@ internal func _tryNormalize(
102101
}
103102
return numericCast(count)
104103
}
104+
105+
/// Outcome of a single normalization step.
///
/// A step either writes normalized code units into the caller's output buffer
/// (`success`) or reports that the supplied buffers need to grow
/// (`bufferTooSmall`).
internal enum NormalizationResult {
  /// Code units were written; the payload says how many and where reading
  /// should resume.
  case success(SuccessResult)

  /// At least one buffer was too small; the payload carries the buffer sizes
  /// to use when retrying.
  case bufferTooSmall(BufferResizeRequest)

  /// Payload of `success`.
  struct SuccessResult {
    // Number of code units written to the output buffer.
    var amountFilled: Int
    // Index in the source string at which the next step should start reading.
    var nextReadPosition: String.Index
  }

  /// Payload of `bufferTooSmall`: one requested size per buffer.
  struct BufferResizeRequest {
    var newOutputBufferSize: Int
    var newPreNormalScratchBufferSize: Int
    var newPostNormalScratchBufferSize: Int
  }

  /// Builds a `bufferTooSmall` result whose requested sizes are derived from
  /// `count`: 9x for the output buffer, 1x for the pre-normalization scratch
  /// buffer and 3x for the post-normalization scratch buffer.
  ///
  /// NOTE(review): the 9x / 3x factors look like worst-case expansion bounds
  /// for normalization plus transcoding -- confirm against the callers.
  static func bufferTooSmall(count: Int) -> NormalizationResult {
    return .bufferTooSmall(
      BufferResizeRequest(
        newOutputBufferSize: count * 9,
        newPreNormalScratchBufferSize: count,
        newPostNormalScratchBufferSize: count * 3
      )
    )
  }

  /// Builds a `success` result from the number of code units written and the
  /// index at which reading should continue.
  static func success(
    amountFilled filled: Int, nextReadPosition index: String.Index
  ) -> NormalizationResult {
    return .success(
      SuccessResult(amountFilled: filled, nextReadPosition: index)
    )
  }
}
138+
139+
/// Unconditionally traps with a fixed message; never returns.
func unimplemented() -> Never {
  fatalError("Unimplemented function called")
}
140+
141+
/// Copies the leading run of `sourceBuffer` that passes the fast-path checks
/// into `outputBuffer` without invoking a normalizer.
///
/// Scalars are decoded one at a time. Copying stops at the first scalar that
/// either is not followed by a normalization boundary or is not an NFC
/// starter, when the source is exhausted, or when 4 or fewer code units of
/// output space remain.
///
/// - Parameters:
///   - sourceBuffer: UTF-8 code units to read from.
///   - outputBuffer: Destination for the copied UTF-8 code units.
/// - Returns: `(code units read, code units written)`; the two are asserted
///   to be equal on every iteration.
internal func fastFill(
  _ sourceBuffer: UnsafeBufferPointer<UInt8>,
  _ outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> (Int, Int) {
  // Quick check: is `scalar` both NFC and the start of a segment?
  @inline(__always) func isNFCStarter(_ scalar: Unicode.Scalar) -> Bool {
    // Fast-path: everything up through U+02FF is NFC and has a boundary
    // before it, so the property lookup can be skipped.
    guard scalar.value >= 0x300 else { return true }

    // Otherwise, consult the properties.
    return scalar._hasNormalizationBoundaryBefore && scalar._isNFCQCYes
  }

  // TODO: Additional fast-path: All CCC-ascending NFC_QC segments are NFC
  // TODO: Just freakin do normalization and don't bother with ICU

  // Only start a scalar while more than 4 code units of space remain, so the
  // longest (4-byte) UTF-8 encoding always fits.
  let writeLimit = outputBuffer.count - 4
  let readLimit = sourceBuffer.count
  var written = 0
  var consumed = 0

  while consumed < readLimit && written < writeLimit {
    // TODO: Slightly faster code-unit scan for latiny (<0xCC)

    // Check scalar-based fast-paths
    let (scalar, len) = _decodeScalar(sourceBuffer, startingAt: consumed)
    _internalInvariant(consumed &+ len <= readLimit)

    // Bail out unless a boundary follows this scalar AND the scalar itself is
    // an NFC starter; anything else has to go through full normalization.
    if _slowPath(
      !(sourceBuffer.hasNormalizationBoundary(before: consumed &+ len)
        && isNFCStarter(scalar))
    ) {
      break
    }
    consumed &+= len

    // Re-encode the scalar into the output, one code unit at a time.
    for codeUnit in UTF8.encode(scalar)._unsafelyUnwrappedUnchecked {
      outputBuffer[written] = codeUnit
      written &+= 1
    }

    _internalInvariant(consumed == written,
      "non-normalizing UTF-8 fast path should be 1-to-1 in code units")
  }

  return (consumed, written)
}
190+
191+
/// Transcodes the first normalization segment of `sourceBuffer` from UTF-8 to
/// UTF-16.
///
/// Decoding starts at offset 0 and stops at the first later scalar that has a
/// normalization boundary before it, or at the end of the source.
///
/// - Parameters:
///   - sourceBuffer: UTF-8 code units to read from.
///   - outputBuffer: Destination for the UTF-16 code units.
/// - Returns: `(UTF-8 code units read, UTF-16 code units written)`, or `nil`
///   if `outputBuffer` ran out of space. On `nil` the output buffer may
///   already hold a partial result.
internal func transcodeToUTF16(
  _ sourceBuffer: UnsafeBufferPointer<UInt8>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> (Int, Int)? {
  let capacity = outputBuffer.count
  let sourceEnd = sourceBuffer.count
  var consumed = 0
  var written = 0

  while consumed < sourceEnd {
    let (scalar, length) = _decodeScalar(sourceBuffer, startingAt: consumed)

    // A boundary before any scalar other than the first one ends the segment.
    if scalar._hasNormalizationBoundaryBefore && consumed != 0 {
      break
    }

    consumed += length

    for unit in scalar.utf16 {
      // Out of room: report failure so the caller can request larger buffers.
      guard written < capacity else { return nil }
      outputBuffer[written] = unit
      written += 1
    }
  }

  return (consumed, written)
}
221+
222+
/// Transcodes all of `sourceBuffer` from UTF-16 to UTF-8.
///
/// - Parameters:
///   - sourceBuffer: UTF-16 code units to read from.
///   - outputBuffer: Destination for the UTF-8 code units.
/// - Returns: `(UTF-16 code units read, UTF-8 code units written)`, or `nil`
///   if `outputBuffer` ran out of space. On `nil` the output buffer may
///   already hold a partial result.
internal func transcodeToUTF8(
  _ sourceBuffer: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> (Int, Int)? {
  let capacity = outputBuffer.count
  let sourceEnd = sourceBuffer.count
  var consumed = 0
  var written = 0

  while consumed < sourceEnd {
    let (scalar, length) = _decodeScalar(sourceBuffer, startingAt: consumed)
    // No normalization-boundary check is needed here: the input is a single
    // segment that is only being transcoded.

    consumed += length

    // Emit the scalar's UTF-8 code units; `false` means the output is full.
    let didFit: Bool = scalar.withUTF8CodeUnits { utf8 in
      for unit in utf8 {
        guard written < capacity else { return false }
        outputBuffer[written] = unit
        written += 1
      }
      return true
    }

    guard didFit else { return nil }
  }

  return (consumed, written)
}
253+
254+
/// Performs one normalization step on a string with fast UTF-8 storage,
/// starting at `readIndex`.
///
/// It first tries `fastFill`, which copies already-normalized content straight
/// into `outputBuffer`. If that copies nothing, the next segment is transcoded
/// to UTF-16 into `icuInputBuffer` and handed to `sharedNormalize`.
///
/// - Parameters:
///   - readIndex: Position in the string at which to start reading.
///   - guts: The string's storage; asserted to be fast UTF-8.
///   - outputBuffer: Receives the normalized UTF-8 code units.
///   - icuInputBuffer: UTF-16 scratch space for the segment before
///     normalization.
///   - icuOutputBuffer: UTF-16 scratch space for the segment after
///     normalization.
/// - Returns: `.success` with the amount written and the next read position,
///   or `.bufferTooSmall` when a buffer could not hold the data.
internal func fastNormalize(
  readIndex: String.Index,
  guts: _StringGuts,
  outputBuffer: UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  _internalInvariant(guts.isFastUTF8)
  return guts.withFastUTF8 { sourceBuffer in
    let start = readIndex.encodedOffset
    // Code units left to process from `readIndex` to the end of the storage.
    let sourceCount = sourceBuffer.count - start
    let remaining = UnsafeBufferPointer(rebasing: sourceBuffer[start...])

    // Fast path: copy whatever can be emitted without normalizing.
    let (fastRead, fastFilled) = fastFill(remaining, outputBuffer)
    if fastFilled > 0 {
      let resumeIndex = readIndex.encoded(offsetBy: fastRead)
      _internalInvariant(guts.isOnUnicodeScalarBoundary(resumeIndex))

      return .success(amountFilled: fastFilled, nextReadPosition: resumeIndex)
    }

    // Slow path: move the next segment into the UTF-16 scratch buffer.
    guard let segment = transcodeToUTF16(remaining, into: icuInputBuffer) else {
      return .bufferTooSmall(count: sourceBuffer.count)
    }
    let (slowRead, slowFilled) = segment

    let resumeIndex = readIndex.encoded(offsetBy: slowRead)
    _internalInvariant(guts.isOnUnicodeScalarBoundary(resumeIndex))

    // Hand only the filled prefix of the scratch buffer to the normalizer.
    let filledInput = UnsafeBufferPointer(rebasing: icuInputBuffer[..<slowFilled])
    return sharedNormalize(
      sourceCount, resumeIndex, outputBuffer, filledInput, icuOutputBuffer
    )
  }
}
288+
289+
/// Performs one normalization step on a string without fast UTF-8 storage,
/// starting at `readIndex`.
///
/// The next segment is gathered as UTF-16 into `icuInputBuffer` via
/// `foreignFill` and then handed to `sharedNormalize`.
///
/// - Parameters:
///   - readIndex: Position in the string at which to start reading.
///   - guts: The string's storage.
///   - outputBuffer: Receives the normalized UTF-8 code units.
///   - icuInputBuffer: UTF-16 scratch space for the segment before
///     normalization.
///   - icuOutputBuffer: UTF-16 scratch space for the segment after
///     normalization.
/// - Returns: `.success` with the amount written and the next read position,
///   or `.bufferTooSmall` when a buffer could not hold the data.
internal func foreignNormalize(
  readIndex: String.Index,
  guts: _StringGuts,
  outputBuffer: UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  // Code units left to process from `readIndex` to the end of the string.
  let sourceCount = guts.count - readIndex.encodedOffset

  guard let segment = foreignFill(readIndex, guts, into: icuInputBuffer) else {
    return .bufferTooSmall(count: guts.count)
  }
  let (read, filled) = segment

  let resumeIndex = readIndex.encoded(offsetBy: read)
  _internalInvariant(guts.isOnUnicodeScalarBoundary(resumeIndex))

  // Hand only the filled prefix of the scratch buffer to the normalizer.
  let filledInput = UnsafeBufferPointer(rebasing: icuInputBuffer[..<filled])
  return sharedNormalize(
    sourceCount, resumeIndex, outputBuffer, filledInput, icuOutputBuffer
  )
}
309+
310+
/// Copies the normalization segment that starts at `readIndex` out of `guts`
/// as UTF-16 code units.
///
/// Scalars are read with `foreignErrorCorrectedScalar(startingAt:)`. Reading
/// stops at the first scalar after `readIndex` that has a normalization
/// boundary before it, or at the end of the string.
///
/// - Parameters:
///   - readIndex: Position at which the segment starts.
///   - guts: The string's storage.
///   - outputBuffer: Destination for the UTF-16 code units.
/// - Returns: `(encoded offset advanced from readIndex, UTF-16 code units
///   written)`, or `nil` if `outputBuffer` ran out of space. On `nil` the
///   output buffer may already hold a partial result.
func foreignFill(
  _ readIndex: String.Index,
  _ guts: _StringGuts,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> (Int, Int)? {
  let capacity = outputBuffer.count
  let end = guts.endIndex
  var cursor = readIndex
  var written = 0

  while cursor != end {
    let (scalar, length) = guts.foreignErrorCorrectedScalar(startingAt: cursor)

    // A boundary before any scalar other than the first one ends the segment.
    if scalar._hasNormalizationBoundaryBefore && cursor != readIndex {
      break
    }

    cursor = cursor.encoded(offsetBy: length)

    for unit in scalar.utf16 {
      // Out of room: report failure so the caller can request larger buffers.
      guard written < capacity else { return nil }
      outputBuffer[written] = unit
      written += 1
    }
  }

  return (cursor.encodedOffset - readIndex.encodedOffset, written)
}
338+
339+
/// Common tail of `fastNormalize` and `foreignNormalize`: normalizes one
/// UTF-16 segment and transcodes the result to UTF-8.
///
/// - Parameters:
///   - sourceCount: Count passed to `.bufferTooSmall(count:)` when a buffer
///     turns out to be too small.
///   - nextIndex: Read position to report on success.
///   - outputBuffer: Receives the normalized UTF-8 code units.
///   - icuInputBuffer: The UTF-16 segment to normalize.
///   - icuOutputBuffer: UTF-16 scratch space that receives the normalized
///     segment.
/// - Returns: `.success` with the number of UTF-8 code units written and
///   `nextIndex`, or `.bufferTooSmall` if `_tryNormalize` or the UTF-8
///   transcoding returned `nil`.
private func sharedNormalize(
  _ sourceCount: Int,
  _ nextIndex: String.Index,
  _ outputBuffer: UnsafeMutableBufferPointer<UInt8>,
  _ icuInputBuffer: UnsafeBufferPointer<UInt16>,
  _ icuOutputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  guard let normalizedCount = _tryNormalize(icuInputBuffer, into: icuOutputBuffer) else {
    return .bufferTooSmall(count: sourceCount)
  }

  // Only the first `normalizedCount` code units of the scratch buffer are
  // transcoded.
  let normalizedUnits = UnsafeBufferPointer<UInt16>(
    rebasing: icuOutputBuffer[..<normalizedCount]
  )
  guard let transcodeCounts = transcodeToUTF8(normalizedUnits, into: outputBuffer) else {
    return .bufferTooSmall(count: sourceCount)
  }

  // `.1` is the number of UTF-8 code units written; the read count is unused.
  return .success(amountFilled: transcodeCounts.1, nextReadPosition: nextIndex)
}

stdlib/public/core/UnicodeHelpers.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ internal func _isTrailingSurrogate(_ cu: UInt16) -> Bool {
2323
return cu & _surrogateMask == _trailingSurrogateBias
2424
}
2525
/// Returns whether `cu`, masked with `_surrogateMask`, equals
/// `_leadingSurrogateBias`.
@inline(__always)
@usableFromInline
internal func _isLeadingSurrogate(_ cu: UInt16) -> Bool {
  let maskedBits = cu & _surrogateMask
  return maskedBits == _leadingSurrogateBias
}
@@ -84,6 +85,23 @@ internal func _decodeUTF8(
8485
return Unicode.Scalar(_unchecked: value)
8586
}
8687

88+
/// Decodes the scalar that starts at offset `i` of a UTF-16 buffer.
///
/// A code unit that is the last one in the buffer, or that is not a leading
/// surrogate, is returned as a one-unit scalar. Otherwise it is combined with
/// the following code unit as a surrogate pair.
///
/// No validation is performed: the scalar is built with the unchecked
/// initializer, and the unit after a leading surrogate is not checked to be
/// a trailing surrogate.
///
/// - Parameters:
///   - utf16: The buffer to decode from.
///   - i: Offset of the first code unit of the scalar.
/// - Returns: The decoded scalar and the number of code units it occupies
///   (1 or 2).
@inlinable
internal func _decodeScalar(
  _ utf16: UnsafeBufferPointer<UInt16>, startingAt i: Int
) -> (Unicode.Scalar, scalarLength: Int) {
  let lead = utf16[i]

  // Decode a pair only when a following unit exists and `lead` starts one.
  guard i + 1 < utf16.count && _isLeadingSurrogate(lead) else {
    return (Unicode.Scalar(_unchecked: UInt32(lead)), 1)
  }

  let trail = utf16[i + 1]
  let combined = _decodeSurrogatePair(leading: lead, trailing: trail)
  return (Unicode.Scalar(_unchecked: combined), 2)
}
104+
87105
@inlinable
88106
internal func _decodeScalar(
89107
_ utf8: UnsafeBufferPointer<UInt8>, startingAt i: Int
@@ -148,6 +166,7 @@ internal func _continuationPayload(_ x: UInt8) -> UInt32 {
148166
}
149167

150168
@inline(__always)
169+
@usableFromInline
151170
internal func _decodeSurrogatePair(
152171
leading high: UInt16, trailing low: UInt16
153172
) -> UInt32 {

validation-test/stdlib/String.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2216,8 +2216,28 @@ StringTests.test("NormalizationCheck") {
22162216
let str = "\u{0336}\u{0344}\u{0357}\u{0343}\u{0314}\u{0351}\u{0340}\u{0300}\u{0340}\u{0360}\u{0314}\u{0357}\u{0315}\u{0301}\u{0344}a"
22172217
let nfcCodeUnits = str._nfcCodeUnits
22182218
let expectedCodeUnits: [UInt8] = [0xCC, 0xB6, 0xCC, 0x88, 0xCC, 0x81, 0xCD, 0x97, 0xCC, 0x93, 0xCC, 0x94, 0xCD, 0x91, 0xCC, 0x80, 0xCC, 0x80, 0xCC, 0x80, 0xCC, 0x94, 0xCD, 0x97, 0xCC, 0x81, 0xCC, 0x88, 0xCC, 0x81, 0xCC, 0x95, 0xCD, 0xA0, 0x61]
2219+
2220+
expectEqual(expectedCodeUnits, nfcCodeUnits)
2221+
}
2222+
2223+
// Same input and expected bytes as "NormalizationCheck", but the string is
// routed through NSSlowString before `_nfcCodeUnits` is read.
StringTests.test("NormalizationCheck/Opaque") {
  let source = "\u{0336}\u{0344}\u{0357}\u{0343}\u{0314}\u{0351}\u{0340}\u{0300}\u{0340}\u{0360}\u{0314}\u{0357}\u{0315}\u{0301}\u{0344}a"
  let opaque = NSSlowString(string: source) as String
  let actual = opaque._nfcCodeUnits
  let expected: [UInt8] = [
    0xCC, 0xB6, 0xCC, 0x88, 0xCC, 0x81, 0xCD, 0x97, 0xCC, 0x93, 0xCC, 0x94,
    0xCD, 0x91, 0xCC, 0x80, 0xCC, 0x80, 0xCC, 0x80, 0xCC, 0x94, 0xCD, 0x97,
    0xCC, 0x81, 0xCC, 0x88, 0xCC, 0x81, 0xCC, 0x95, 0xCD, 0xA0, 0x61,
  ]

  expectEqual(expected, actual)
}
2231+
2232+
// `_nfcCodeUnits` of an ASCII string must equal its UTF-8 bytes, both for the
// literal itself and for the same text wrapped in NSSlowString.
StringTests.test("ABC") {
  let source = "abcdefg"
  let nativeUnits = source._nfcCodeUnits
  let expected = Array(source.utf8)

  expectEqual(expected, nativeUnits)

  let opaque = NSSlowString(string: source) as String
  expectEqual(expected, opaque._nfcCodeUnits)
}
22222242

22232243
runAllTests()

0 commit comments

Comments
 (0)