Skip to content

[String] Grapheme fast paths for punctuation: 5-8x speedup. #10648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 176 additions & 15 deletions benchmark/single-source/StringWalk.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,11 @@
// scripts/generate_harness/generate_harness.py to regenerate this file.
////////////////////////////////////////////////////////////////////////////////


// Test String subscript performance.
//
// Subscript has a slow path that initializes a global variable:
// Swift._cocoaStringSubscript.addressor. Global optimization would
// normally hoist the initializer outside the inner loop (over
// unicodeScalars), forcing the initializer to be called on each
// lap. However, now that the cocoa code is properly marked "slowPath",
// no hoisting should occur.
// Test String iteration performance over a variety of workloads, languages,
// and symbols.
//

import TestsUtils

var count: Int = 0
Expand Down Expand Up @@ -70,6 +66,8 @@ let japanese = "今回のアップデートでSwiftに大幅な改良が施さ
let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。"
let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다."
let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр"
// Short workloads that wrap text in non-ASCII punctuation:
// - `punctuated`: Latin words with general punctuation scalars
//   (U+201C, U+2010, U+2026, U+201D).
// - `punctuatedJapanese`: Japanese text with CJK punctuation scalars
//   (U+300C, U+300E, U+3001, U+3002, U+300F, U+300D).
let punctuated = "\u{201c}Hello\u{2010}world\u{2026}\u{201d}"
let punctuatedJapanese = "\u{300c}\u{300e}今日は\u{3001}世界\u{3002}\u{300f}\u{300d}"

// A workload that's mostly Latin characters, with occasional emoji
// interspersed. Common for tweets.
Expand All @@ -91,7 +89,6 @@ let unicodeScalarsMultiplier = baseMultiplier
let charactersMultiplier = baseMultiplier / 5



@inline(never)
public func run_StringWalk_ascii_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -177,7 +174,6 @@ public func run_CharIndexing_ascii_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_utf16_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -263,7 +259,6 @@ public func run_CharIndexing_utf16_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_tweet_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -349,7 +344,6 @@ public func run_CharIndexing_tweet_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_japanese_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -435,7 +429,6 @@ public func run_CharIndexing_japanese_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_chinese_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -521,7 +514,6 @@ public func run_CharIndexing_chinese_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_korean_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -607,7 +599,6 @@ public func run_CharIndexing_korean_unicodeScalars_Backwards(_ N: Int) {




@inline(never)
public func run_StringWalk_russian_unicodeScalars(_ N: Int) {
for _ in 1...unicodeScalarsMultiplier*N {
Expand Down Expand Up @@ -692,3 +683,173 @@ public func run_CharIndexing_russian_unicodeScalars_Backwards(_ N: Int) {




/// Benchmark entry point: hands `punctuated.unicodeScalars` to
/// `count_unicodeScalars`, repeated `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuated_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    count_unicodeScalars(punctuated.unicodeScalars)
  }
}

/// Benchmark entry point: hands the reversed unicode scalar view of
/// `punctuated` to `count_unicodeScalars_rev`, repeated
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuated_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    // The reversed view is rebuilt on every lap, as part of the measured work.
    count_unicodeScalars_rev(punctuated.unicodeScalars.reversed())
  }
}




/// Benchmark entry point: hands `punctuated.characters` to
/// `count_characters`, repeated `charactersMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuated_characters(_ N: Int) {
  let iterations = charactersMultiplier * N
  for _ in 1...iterations {
    count_characters(punctuated.characters)
  }
}

/// Benchmark entry point: hands the reversed character view of `punctuated`
/// to `count_characters_rev`, repeated `charactersMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuated_characters_Backwards(_ N: Int) {
  let iterations = charactersMultiplier * N
  for _ in 1...iterations {
    // The reversed view is rebuilt on every lap, as part of the measured work.
    count_characters_rev(punctuated.characters.reversed())
  }
}




// The elements of `punctuated` gathered into an array (presumably `Character`
// values; each element is asked for `.unicodeScalars` by its users). Iterated
// by the CharIteration_punctuated_* and CharIndexing_punctuated_* benchmarks.
let punctuatedCharacters = Array(punctuated)

/// Benchmark entry point: for every element of `punctuatedCharacters`,
/// iterates its unicode scalars front to back and ORs each scalar value into
/// the global `count`. The whole pass runs `unicodeScalarsMultiplier * N`
/// times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIteration_punctuated_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedCharacters {
      for scalar in character.unicodeScalars {
        // Folding into a global gives the loop an observable result.
        count |= Int(scalar.value)
      }
    }
  }
}

/// Benchmark entry point: for every element of `punctuatedCharacters`,
/// iterates its unicode scalars in reverse order and ORs each scalar value
/// into the global `count`. The whole pass runs
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIteration_punctuated_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedCharacters {
      for scalar in character.unicodeScalars.reversed() {
        // Folding into a global gives the loop an observable result.
        count |= Int(scalar.value)
      }
    }
  }
}

/// Benchmark entry point: for every element of `punctuatedCharacters`,
/// subscripts its unicode scalar view at each index in forward order and ORs
/// the scalar value into the global `count`. The whole pass runs
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIndexing_punctuated_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedCharacters {
      let scalars = character.unicodeScalars
      for index in scalars.indices {
        // Index-based access (rather than iteration) is the point here.
        count |= Int(scalars[index].value)
      }
    }
  }
}

/// Benchmark entry point: for every element of `punctuatedCharacters`,
/// subscripts its unicode scalar view at each index in reverse order and ORs
/// the scalar value into the global `count`. The whole pass runs
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIndexing_punctuated_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedCharacters {
      let scalars = character.unicodeScalars
      for index in scalars.indices.reversed() {
        // Index-based access (rather than iteration) is the point here.
        count |= Int(scalars[index].value)
      }
    }
  }
}




/// Benchmark entry point: hands `punctuatedJapanese.unicodeScalars` to
/// `count_unicodeScalars`, repeated `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuatedJapanese_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    count_unicodeScalars(punctuatedJapanese.unicodeScalars)
  }
}

/// Benchmark entry point: hands the reversed unicode scalar view of
/// `punctuatedJapanese` to `count_unicodeScalars_rev`, repeated
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    // The reversed view is rebuilt on every lap, as part of the measured work.
    count_unicodeScalars_rev(punctuatedJapanese.unicodeScalars.reversed())
  }
}




/// Benchmark entry point: hands `punctuatedJapanese.characters` to
/// `count_characters`, repeated `charactersMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuatedJapanese_characters(_ N: Int) {
  let iterations = charactersMultiplier * N
  for _ in 1...iterations {
    count_characters(punctuatedJapanese.characters)
  }
}

/// Benchmark entry point: hands the reversed character view of
/// `punctuatedJapanese` to `count_characters_rev`, repeated
/// `charactersMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_StringWalk_punctuatedJapanese_characters_Backwards(_ N: Int) {
  let iterations = charactersMultiplier * N
  for _ in 1...iterations {
    // The reversed view is rebuilt on every lap, as part of the measured work.
    count_characters_rev(punctuatedJapanese.characters.reversed())
  }
}




// The elements of `punctuatedJapanese` gathered into an array (presumably
// `Character` values; each element is asked for `.unicodeScalars` by its
// users). Iterated by the CharIteration_punctuatedJapanese_* and
// CharIndexing_punctuatedJapanese_* benchmarks.
let punctuatedJapaneseCharacters = Array(punctuatedJapanese)

/// Benchmark entry point: for every element of
/// `punctuatedJapaneseCharacters`, iterates its unicode scalars front to back
/// and ORs each scalar value into the global `count`. The whole pass runs
/// `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIteration_punctuatedJapanese_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedJapaneseCharacters {
      for scalar in character.unicodeScalars {
        // Folding into a global gives the loop an observable result.
        count |= Int(scalar.value)
      }
    }
  }
}

/// Benchmark entry point: for every element of
/// `punctuatedJapaneseCharacters`, iterates its unicode scalars in reverse
/// order and ORs each scalar value into the global `count`. The whole pass
/// runs `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIteration_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedJapaneseCharacters {
      for scalar in character.unicodeScalars.reversed() {
        // Folding into a global gives the loop an observable result.
        count |= Int(scalar.value)
      }
    }
  }
}

/// Benchmark entry point: for every element of
/// `punctuatedJapaneseCharacters`, subscripts its unicode scalar view at each
/// index in forward order and ORs the scalar value into the global `count`.
/// The whole pass runs `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIndexing_punctuatedJapanese_unicodeScalars(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedJapaneseCharacters {
      let scalars = character.unicodeScalars
      for index in scalars.indices {
        // Index-based access (rather than iteration) is the point here.
        count |= Int(scalars[index].value)
      }
    }
  }
}

/// Benchmark entry point: for every element of
/// `punctuatedJapaneseCharacters`, subscripts its unicode scalar view at each
/// index in reverse order and ORs the scalar value into the global `count`.
/// The whole pass runs `unicodeScalarsMultiplier * N` times.
///
/// - Parameter N: Scale factor for the repetition count. The closed range
///   below needs at least one iteration, so `N` is expected to be positive.
@inline(never)
public func run_CharIndexing_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) {
  let iterations = unicodeScalarsMultiplier * N
  for _ in 1...iterations {
    for character in punctuatedJapaneseCharacters {
      let scalars = character.unicodeScalars
      for index in scalars.indices.reversed() {
        // Index-based access (rather than iteration) is the point here.
        count |= Int(scalars[index].value)
      }
    }
  }
}



17 changes: 7 additions & 10 deletions benchmark/single-source/StringWalk.swift.gyb
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,11 @@
// scripts/generate_harness/generate_harness.py to regenerate this file.
////////////////////////////////////////////////////////////////////////////////


// Test String subscript performance.
//
// Subscript has a slow path that initializes a global variable:
// Swift._cocoaStringSubscript.addressor. Global optimization would
// normally hoist the initializer outside the inner loop (over
// unicodeScalars), forcing the initializer to be called on each
// lap. However, now that the cocoa code is properly marked "slowPath",
// no hoisting should occur.
// Test String iteration performance over a variety of workloads, languages,
// and symbols.
//

import TestsUtils

var count: Int = 0
Expand Down Expand Up @@ -71,6 +67,8 @@ let japanese = "今回のアップデートでSwiftに大幅な改良が施さ
let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。"
let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다."
let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр"
// Short workloads that wrap text in non-ASCII punctuation:
// - `punctuated`: Latin words with general punctuation scalars
//   (U+201C, U+2010, U+2026, U+201D).
// - `punctuatedJapanese`: Japanese text with CJK punctuation scalars
//   (U+300C, U+300E, U+3001, U+3002, U+300F, U+300D).
let punctuated = "\u{201c}Hello\u{2010}world\u{2026}\u{201d}"
let punctuatedJapanese = "\u{300c}\u{300e}今日は\u{3001}世界\u{3002}\u{300f}\u{300d}"

// A workload that's mostly Latin characters, with occasional emoji
// interspersed. Common for tweets.
Expand All @@ -91,8 +89,7 @@ let baseMultiplier = 10_000
let unicodeScalarsMultiplier = baseMultiplier
let charactersMultiplier = baseMultiplier / 5

% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean", "russian"]:

% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean", "russian", "punctuated", "punctuatedJapanese"]:
% for Kind in ["unicodeScalars", "characters"]:

@inline(never)
Expand Down
16 changes: 16 additions & 0 deletions benchmark/utils/main.swift
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ addTo(&precommitTests, "CharIndexing_japanese_unicodeScalars", run_CharIndexing_
addTo(&precommitTests, "CharIndexing_japanese_unicodeScalars_Backwards", run_CharIndexing_japanese_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIndexing_korean_unicodeScalars", run_CharIndexing_korean_unicodeScalars)
addTo(&precommitTests, "CharIndexing_korean_unicodeScalars_Backwards", run_CharIndexing_korean_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIndexing_punctuatedJapanese_unicodeScalars", run_CharIndexing_punctuatedJapanese_unicodeScalars)
addTo(&precommitTests, "CharIndexing_punctuatedJapanese_unicodeScalars_Backwards", run_CharIndexing_punctuatedJapanese_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIndexing_punctuated_unicodeScalars", run_CharIndexing_punctuated_unicodeScalars)
addTo(&precommitTests, "CharIndexing_punctuated_unicodeScalars_Backwards", run_CharIndexing_punctuated_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIndexing_russian_unicodeScalars", run_CharIndexing_russian_unicodeScalars)
addTo(&precommitTests, "CharIndexing_russian_unicodeScalars_Backwards", run_CharIndexing_russian_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIndexing_tweet_unicodeScalars", run_CharIndexing_tweet_unicodeScalars)
Expand All @@ -192,6 +196,10 @@ addTo(&precommitTests, "CharIteration_japanese_unicodeScalars", run_CharIteratio
addTo(&precommitTests, "CharIteration_japanese_unicodeScalars_Backwards", run_CharIteration_japanese_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIteration_korean_unicodeScalars", run_CharIteration_korean_unicodeScalars)
addTo(&precommitTests, "CharIteration_korean_unicodeScalars_Backwards", run_CharIteration_korean_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIteration_punctuatedJapanese_unicodeScalars", run_CharIteration_punctuatedJapanese_unicodeScalars)
addTo(&precommitTests, "CharIteration_punctuatedJapanese_unicodeScalars_Backwards", run_CharIteration_punctuatedJapanese_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIteration_punctuated_unicodeScalars", run_CharIteration_punctuated_unicodeScalars)
addTo(&precommitTests, "CharIteration_punctuated_unicodeScalars_Backwards", run_CharIteration_punctuated_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIteration_russian_unicodeScalars", run_CharIteration_russian_unicodeScalars)
addTo(&precommitTests, "CharIteration_russian_unicodeScalars_Backwards", run_CharIteration_russian_unicodeScalars_Backwards)
addTo(&precommitTests, "CharIteration_tweet_unicodeScalars", run_CharIteration_tweet_unicodeScalars)
Expand Down Expand Up @@ -563,6 +571,14 @@ addTo(&stringTests, "StringWalk_korean_characters", run_StringWalk_korean_charac
addTo(&stringTests, "StringWalk_korean_characters_Backwards", run_StringWalk_korean_characters_Backwards)
addTo(&stringTests, "StringWalk_korean_unicodeScalars", run_StringWalk_korean_unicodeScalars)
addTo(&stringTests, "StringWalk_korean_unicodeScalars_Backwards", run_StringWalk_korean_unicodeScalars_Backwards)
addTo(&stringTests, "StringWalk_punctuatedJapanese_characters", run_StringWalk_punctuatedJapanese_characters)
addTo(&stringTests, "StringWalk_punctuatedJapanese_characters_Backwards", run_StringWalk_punctuatedJapanese_characters_Backwards)
addTo(&stringTests, "StringWalk_punctuatedJapanese_unicodeScalars", run_StringWalk_punctuatedJapanese_unicodeScalars)
addTo(&stringTests, "StringWalk_punctuatedJapanese_unicodeScalars_Backwards", run_StringWalk_punctuatedJapanese_unicodeScalars_Backwards)
addTo(&stringTests, "StringWalk_punctuated_characters", run_StringWalk_punctuated_characters)
addTo(&stringTests, "StringWalk_punctuated_characters_Backwards", run_StringWalk_punctuated_characters_Backwards)
addTo(&stringTests, "StringWalk_punctuated_unicodeScalars", run_StringWalk_punctuated_unicodeScalars)
addTo(&stringTests, "StringWalk_punctuated_unicodeScalars_Backwards", run_StringWalk_punctuated_unicodeScalars_Backwards)
addTo(&stringTests, "StringWalk_russian_characters", run_StringWalk_russian_characters)
addTo(&stringTests, "StringWalk_russian_characters_Backwards", run_StringWalk_russian_characters_Backwards)
addTo(&stringTests, "StringWalk_russian_unicodeScalars", run_StringWalk_russian_unicodeScalars)
Expand Down
11 changes: 8 additions & 3 deletions stdlib/public/core/StringCharacterView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,6 @@ extension String.CharacterView : BidirectionalCollection {
// others:
// 0x3400-0xA4CF
case 0x3400...0xa4cf: return true
// TODO: CJK punctuation

// Repeat sub-300 check, this is beneficial for common cases of Latin
// characters embedded within non-Latin script (e.g. newlines, spaces,
Expand All @@ -324,8 +323,6 @@ extension String.CharacterView : BidirectionalCollection {
// NOTE: CR-LF special case has already been checked.
case 0x0000...0x02ff: return true

// TODO: general punctuation

// Non-combining kana:
// 0x3041-0x3096
// 0x30A1-0x30FA
Expand All @@ -344,6 +341,14 @@ extension String.CharacterView : BidirectionalCollection {
// 0xAC00–0xD7AF
case 0xac00...0xd7af: return true

// Common general use punctuation, excluding extenders:
// 0x2010-0x2029
case 0x2010...0x2029: return true

// CJK punctuation characters, excluding extenders:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by "extenders"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The characters (i.e. scalars) that have the "extend" property for the purposes of grapheme breaking. Such characters usually* don't have a grapheme break before them. E.g. 0x302A.

See:
http://unicode.org/reports/tr29/#GB9
http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

// 0x3000-0x3029
case 0x3000...0x3029: return true

default: return false
}
}
Expand Down