Skip to content

Commit 022c973

Browse files
authored
Merge pull request #10002 from milseman/i_dream_of_grapheme
[stdlib] Grapheme break fast-paths for Cyrillic, Arabic, Hangul
2 parents 99d77dd + 44cccba commit 022c973

File tree

4 files changed

+73
-15
lines changed

4 files changed

+73
-15
lines changed

benchmark/single-source/StringWalk.swift

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ let utf16 = emoji + "the quick brown fox" + String(emoji.reversed() as Array<Cha
6969
let japanese = "今回のアップデートでSwiftに大幅な改良が施され、安定していてしかも直感的に使うことができるAppleプラットフォーム向けプログラミング言語になりました。"
7070
let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。"
7171
let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다."
72+
let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр"
7273

7374
// A workload that's mostly Latin characters, with occasional emoji
7475
// interspersed. Common for tweets.
@@ -269,3 +270,33 @@ public func run_StringWalk_korean_characters_Backwards(_ N: Int) {
269270
}
270271
}
271272

273+
274+
@inline(never)
275+
public func run_StringWalk_russian_unicodeScalars(_ N: Int) {
276+
for _ in 1...unicodeScalarsMultiplier*N {
277+
count_unicodeScalars(russian.unicodeScalars)
278+
}
279+
}
280+
281+
@inline(never)
282+
public func run_StringWalk_russian_unicodeScalars_Backwards(_ N: Int) {
283+
for _ in 1...unicodeScalarsMultiplier*N {
284+
count_unicodeScalars_rev(russian.unicodeScalars.reversed())
285+
}
286+
}
287+
288+
289+
@inline(never)
290+
public func run_StringWalk_russian_characters(_ N: Int) {
291+
for _ in 1...charactersMultiplier*N {
292+
count_characters(russian.characters)
293+
}
294+
}
295+
296+
@inline(never)
297+
public func run_StringWalk_russian_characters_Backwards(_ N: Int) {
298+
for _ in 1...charactersMultiplier*N {
299+
count_characters_rev(russian.characters.reversed())
300+
}
301+
}
302+

benchmark/single-source/StringWalk.swift.gyb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ let utf16 = emoji + "the quick brown fox" + String(emoji.reversed() as Array<Cha
7070
let japanese = "今回のアップデートでSwiftに大幅な改良が施され、安定していてしかも直感的に使うことができるAppleプラットフォーム向けプログラミング言語になりました。"
7171
let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。"
7272
let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다."
73+
let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр"
7374

7475
// A workload that's mostly Latin characters, with occasional emoji
7576
// interspersed. Common for tweets.
@@ -90,7 +91,7 @@ let baseMultiplier = 10_000
9091
let unicodeScalarsMultiplier = baseMultiplier
9192
let charactersMultiplier = baseMultiplier / 5
9293

93-
% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean"]:
94+
% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean", "russian"]:
9495
% for Kind in ["unicodeScalars", "characters"]:
9596

9697
@inline(never)

benchmark/utils/main.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ addTo(&stringTests, "StringWalk_korean_characters", run_StringWalk_korean_charac
531531
addTo(&stringTests, "StringWalk_korean_characters_Backwards", run_StringWalk_korean_characters_Backwards)
532532
addTo(&stringTests, "StringWalk_korean_unicodeScalars", run_StringWalk_korean_unicodeScalars)
533533
addTo(&stringTests, "StringWalk_korean_unicodeScalars_Backwards", run_StringWalk_korean_unicodeScalars_Backwards)
534+
addTo(&stringTests, "StringWalk_russian_characters", run_StringWalk_russian_characters)
535+
addTo(&stringTests, "StringWalk_russian_characters_Backwards", run_StringWalk_russian_characters_Backwards)
536+
addTo(&stringTests, "StringWalk_russian_unicodeScalars", run_StringWalk_russian_unicodeScalars)
537+
addTo(&stringTests, "StringWalk_russian_unicodeScalars_Backwards", run_StringWalk_russian_unicodeScalars_Backwards)
534538
addTo(&stringTests, "StringWalk_tweet_characters", run_StringWalk_tweet_characters)
535539
addTo(&stringTests, "StringWalk_tweet_characters_Backwards", run_StringWalk_tweet_characters_Backwards)
536540
addTo(&stringTests, "StringWalk_tweet_unicodeScalars", run_StringWalk_tweet_unicodeScalars)

stdlib/public/core/StringCharacterView.swift

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -297,33 +297,55 @@ extension String.CharacterView : BidirectionalCollection {
297297
internal static func _internalExtraCheckGraphemeBreakBetween(
298298
_ lhs: UInt16, _ rhs: UInt16
299299
) -> Bool {
300+
_sanityCheck(
301+
lhs != _CR || rhs != _LF,
302+
"CR-LF special case handled by _quickCheckGraphemeBreakBetween")
303+
300304
// Whether the given scalar, when it appears paired with another scalar
301305
// satisfying this property, has a grapheme break between it and the other
302306
// scalar.
303307
func hasBreakWhenPaired(_ x: UInt16) -> Bool {
304-
// TODO: This doesn't generate optimal code, tune/re-write at a lower level.
305-
308+
// TODO: This doesn't generate optimal code, tune/re-write at a lower
309+
// level.
310+
//
311+
// NOTE: Order of case ranges affects codegen, and thus performance. All
312+
// things being equal, keep existing order below.
313+
switch x {
306314
// Unified CJK Han ideographs, common and some supplemental, amongst
307315
// others:
308316
// 0x3400-0xA4CF
309-
if 0x3400 <= x && x <= 0xa4cf {
310-
return true
311-
}
317+
case 0x3400...0xa4cf: return true
318+
// TODO: CJK punctuation
312319

320+
// Repeat sub-0x300 check; this is beneficial for common cases of Latin
321+
// characters embedded within non-Latin script (e.g. newlines, spaces,
322+
// proper nouns and/or jargon, punctuation).
313323
//
324+
// NOTE: CR-LF special case has already been checked.
325+
case 0x0000...0x02ff: return true
326+
327+
// TODO: general punctuation
328+
314329
// Non-combining kana:
315330
// 0x3041-0x3096
316331
// 0x30A1-0x30FA
317-
//
318-
// TODO: may be faster to verify whether only 3099 and 309A don't have
319-
// this property, and compare not-equal rather than using two ranges.
320-
if 0x3041 <= x && x <= 0x3096 || 0x30a1 <= x && x <= 0x30fa {
321-
return true
322-
}
332+
case 0x3041...0x3096: return true
333+
case 0x30a1...0x30fa: return true
334+
335+
// Non-combining modern (and some archaic) Cyrillic:
336+
// 0x0400-0x0482 (first half of Cyrillic block)
337+
case 0x0400...0x0482: return true
323338

324-
// TODO: sub-300 check would also be valuable, e.g. when breaking at the
325-
// boundary between English embedded in Chinese.
326-
return false
339+
// Modern Arabic, excluding extenders and prependers:
340+
// 0x061D-0x064A
341+
case 0x061d...0x064a: return true
342+
343+
// Precomposed Hangul syllables:
344+
// 0xAC00-0xD7AF
345+
case 0xac00...0xd7af: return true
346+
347+
default: return false
348+
}
327349
}
328350
return hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs)
329351
}

0 commit comments

Comments
 (0)