Skip to content

Commit b43bfa8

Browse files
committed
[benchmark] Add ChaCha20-based performance benchmark
This patch adds a benchmark to the Swift benchmark suite based on the ChaCha20 encryption algorithm. As Swift evolves it is important that it tackles more and more features and possible use cases. One of these great use-cases is low-level CPU intensive code, and cryptographic algorithms are a really important test-bed. This benchmark therefore provides a real-world test case for Swift's optimiser. My ideal outcome here is that Swift should be able to perform as well at this benchmark as a naive equivalent C implementation.
1 parent 4f4557c commit b43bfa8

File tree

3 files changed

+364
-0
lines changed

3 files changed

+364
-0
lines changed

benchmark/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ set(SWIFT_BENCH_MODULES
5151
single-source/CSVParsing
5252
single-source/Calculator
5353
single-source/CaptureProp
54+
single-source/ChaCha
5455
single-source/ChainedFilterMap
5556
single-source/CharacterLiteralsLarge
5657
single-source/CharacterLiteralsSmall

benchmark/single-source/ChaCha.swift

Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
//===--- ChaCha.swift -----------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014-2019 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import TestsUtils
14+
15+
/// Namespace for the ChaCha20 stream cipher exercised by this benchmark.
///
/// The benchmark is meant to probe two things:
///
/// 1. How well Swift optimises low-level bit-twiddling code.
/// 2. How well Swift optimises generic code over contiguous data structures.
///
/// In principle both the initialisation of the ChaCha20 state and the xor of
/// the keystream into the plaintext should be vectorisable.
enum ChaCha20 { }

extension ChaCha20 {
    /// Xors the ChaCha20 keystream into `bytes`, in place.
    ///
    /// One keystream block is generated per loop iteration and the block
    /// counter is bumped after each block, until every byte of `bytes` has
    /// been visited. An empty `bytes` still validates `key` and `nonce` (the
    /// state is constructed) but produces no keystream.
    ///
    /// - Parameters:
    ///   - bytes: The buffer to transform in place.
    ///   - key: The key; `ChaChaState` requires exactly 32 bytes.
    ///   - nonce: The nonce; `ChaChaState` requires exactly 12 bytes.
    ///   - initialCounter: Block counter used for the first keystream block.
    public static func encrypt<Key: Collection, Nonce: Collection, Bytes: MutableCollection>(
        bytes: inout Bytes,
        key: Key,
        nonce: Nonce,
        initialCounter: UInt32 = 0
    ) where Bytes.Element == UInt8, Key.Element == UInt8, Nonce.Element == UInt8 {
        var state = ChaChaState(key: key, nonce: nonce, counter: initialCounter)
        var cursor = bytes.startIndex

        while cursor < bytes.endIndex {
            // `xorBytes` advances `cursor` past the bytes it consumed.
            state.block().xorBytes(bytes: &bytes, at: &cursor)
            state.incrementCounter()
        }
    }
}
36+
37+
38+
/// The sixteen 32-bit words of ChaCha20 state, stored as a flat tuple.
typealias BackingState = (
    UInt32, UInt32, UInt32, UInt32,
    UInt32, UInt32, UInt32, UInt32,
    UInt32, UInt32, UInt32, UInt32,
    UInt32, UInt32, UInt32, UInt32
)

struct ChaChaState {
    /// ChaCha20 keeps sixteen 32-bit words of state, conventionally viewed
    /// as a 4x4 matrix; words 0-3 form the first row, 4-7 the second, etc.
    var _state: BackingState

    /// Builds the initial ChaCha20 state.
    ///
    /// The state is laid out as follows:
    ///
    /// - Words 0-3 are the constants 0x61707865, 0x3320646e, 0x79622d32,
    ///   0x6b206574.
    /// - Words 4-11 are the 256-bit key read as eight little-endian 32-bit
    ///   integers.
    /// - Word 12 is the block counter. Each block is 64 bytes, so a 32-bit
    ///   counter covers 256 gigabytes of data.
    /// - Words 13-15 are the 96-bit nonce read as three little-endian 32-bit
    ///   integers. A nonce should never be repeated for the same key.
    ///
    /// Stops with `fatalError` unless `key` has exactly 32 bytes and `nonce`
    /// has exactly 12 bytes.
    init<Key: Collection, Nonce: Collection>(key: Key, nonce: Nonce, counter: UInt32) where Key.Element == UInt8, Nonce.Element == UInt8 {
        if key.count != 32 || nonce.count != 12 {
            fatalError("Invalid key or nonce length.")
        }

        var keyWords = CollectionOf32BitLittleEndianIntegers(key).makeIterator()
        var nonceWords = CollectionOf32BitLittleEndianIntegers(nonce).makeIterator()

        // The force unwraps cannot fail: the lengths were checked above, so
        // the key yields exactly eight words and the nonce exactly three.
        // Tuple elements are evaluated left to right, which keeps the words
        // in input order.
        self._state = (
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
            keyWords.next()!, keyWords.next()!, keyWords.next()!, keyWords.next()!,
            keyWords.next()!, keyWords.next()!, keyWords.next()!, keyWords.next()!,
            counter, nonceWords.next()!, nonceWords.next()!, nonceWords.next()!
        )
    }

    /// Advances the block counter (word 12) by one, wrapping on overflow.
    ///
    /// Bumping the counter in place avoids rebuilding the whole state for
    /// every successive keystream block.
    mutating func incrementCounter() {
        _state.12 = _state.12 &+ 1
    }

    /// Adds `otherState` into this state word by word, wrapping on overflow.
    private mutating func add(_ otherState: ChaChaState) {
        let addend = otherState._state
        _state.0 &+= addend.0
        _state.1 &+= addend.1
        _state.2 &+= addend.2
        _state.3 &+= addend.3
        _state.4 &+= addend.4
        _state.5 &+= addend.5
        _state.6 &+= addend.6
        _state.7 &+= addend.7
        _state.8 &+= addend.8
        _state.9 &+= addend.9
        _state.10 &+= addend.10
        _state.11 &+= addend.11
        _state.12 &+= addend.12
        _state.13 &+= addend.13
        _state.14 &+= addend.14
        _state.15 &+= addend.15
    }

    /// Applies the quarter round to each of the four matrix columns:
    /// (0, 4, 8, 12), (1, 5, 9, 13), (2, 6, 10, 14), (3, 7, 11, 15).
    private mutating func columnRound() {
        Self.quarterRound(a: &_state.0, b: &_state.4, c: &_state.8, d: &_state.12)
        Self.quarterRound(a: &_state.1, b: &_state.5, c: &_state.9, d: &_state.13)
        Self.quarterRound(a: &_state.2, b: &_state.6, c: &_state.10, d: &_state.14)
        Self.quarterRound(a: &_state.3, b: &_state.7, c: &_state.11, d: &_state.15)
    }

    /// Applies the quarter round to each of the four matrix diagonals:
    /// (0, 5, 10, 15), (1, 6, 11, 12), (2, 7, 8, 13), (3, 4, 9, 14).
    private mutating func diagonalRound() {
        Self.quarterRound(a: &_state.0, b: &_state.5, c: &_state.10, d: &_state.15)
        Self.quarterRound(a: &_state.1, b: &_state.6, c: &_state.11, d: &_state.12)
        Self.quarterRound(a: &_state.2, b: &_state.7, c: &_state.8, d: &_state.13)
        Self.quarterRound(a: &_state.3, b: &_state.4, c: &_state.9, d: &_state.14)
    }
}
155+
156+
extension ChaChaState {
    /// The ChaCha quarter round over four state words.
    ///
    /// This follows the definition in RFC 7539, with one spelling
    /// difference: additions use `&+=` because they are meant to wrap
    /// modulo 2^32 rather than trap on overflow. `<<<` is a left rotation.
    static func quarterRound(a: inout UInt32, b: inout UInt32, c: inout UInt32, d: inout UInt32) {
        a &+= b
        d = (d ^ a) <<< 16

        c &+= d
        b = (b ^ c) <<< 12

        a &+= b
        d = (d ^ a) <<< 8

        c &+= d
        b = (b ^ c) <<< 7
    }
}
166+
167+
extension ChaChaState {
    /// Runs the ChaCha20 block function on this state and returns the
    /// resulting keystream block. The receiver itself is not modified.
    func block() -> ChaChaKeystreamBlock {
        // Work on a copy: the untouched input words are needed again below,
        // and copying is cheaper than building the state a second time.
        var working = self

        // Ten double rounds (twenty rounds in total), each made of one
        // column round followed by one diagonal round.
        for _ in 1...10 {
            working.columnRound()
            working.diagonalRound()
        }

        // The original input words are added to the scrambled words.
        working.add(self)

        return ChaChaKeystreamBlock(working)
    }
}
184+
185+
186+
/// The output of the ChaCha block function for one particular state.
///
/// A keystream block supports a different, narrower set of operations than
/// `ChaChaState`, so it gets its own more constrained type.
struct ChaChaKeystreamBlock {
    var _state: BackingState

    init(_ state: ChaChaState) {
        self._state = state._state
    }

    /// Xors this keystream block into `bytes`, starting at `index`.
    ///
    /// Each of the sixteen words is applied in order, least significant byte
    /// first. `index` is advanced past every byte consumed, and the function
    /// returns early as soon as `index` reaches `bytes.endIndex`. This is the
    /// hook used for fast in-place encryption.
    ///
    /// The sixteen steps are deliberately written out one by one: part of the
    /// point of the benchmark is to see how well the compiler optimises this.
    func xorBytes<Bytes: MutableCollection>(bytes: inout Bytes, at index: inout Bytes.Index) where Bytes.Element == UInt8 {
        let words = _state

        // Applies one keystream word and reports whether input remains.
        func apply(_ word: UInt32) -> Bool {
            word.xorLittleEndianBytes(bytes: &bytes, at: &index)
            return index != bytes.endIndex
        }

        guard apply(words.0) else { return }
        guard apply(words.1) else { return }
        guard apply(words.2) else { return }
        guard apply(words.3) else { return }
        guard apply(words.4) else { return }
        guard apply(words.5) else { return }
        guard apply(words.6) else { return }
        guard apply(words.7) else { return }
        guard apply(words.8) else { return }
        guard apply(words.9) else { return }
        guard apply(words.10) else { return }
        guard apply(words.11) else { return }
        guard apply(words.12) else { return }
        guard apply(words.13) else { return }
        guard apply(words.14) else { return }

        // Last word: nothing follows, so no end-of-input check is needed.
        words.15.xorLittleEndianBytes(bytes: &bytes, at: &index)
    }
}
236+
237+
238+
/// Left-rotation operator: `x <<< n` rotates the bits of `x` left by `n`.
infix operator <<<: BitwiseShiftPrecedence

/// In-place left rotation: `x <<<= n` is `x = x <<< n`.
infix operator <<<=: AssignmentPrecedence

extension FixedWidthInteger {
    /// Returns this value with its bit pattern rotated left by `distance`.
    ///
    /// The rotation is carried out on the unsigned bit pattern, so it is a
    /// true bit rotation for signed types as well (a plain `>>` on a signed
    /// value would smear the sign bit into the result). `distance` is
    /// reduced modulo `Self.bitWidth`; a negative distance therefore rotates
    /// to the right.
    ///
    /// - Parameter distance: Number of bit positions to rotate by.
    /// - Returns: The rotated value; `self` is left untouched.
    func leftRotate(_ distance: Int) -> Self {
        let width = Self.bitWidth

        // Normalise into 0..<width. The remainder lies in -(width-1)...(width-1),
        // so adding `width` cannot overflow.
        let shift = ((distance % width) + width) % width
        guard shift != 0 else { return self }

        let bits = Magnitude(truncatingIfNeeded: self)
        let rotated = (bits << shift) | (bits >> (width - shift))
        return Self(truncatingIfNeeded: rotated)
    }

    /// Rotates this value left by `distance` in place.
    ///
    /// See `leftRotate(_:)` for how `distance` is interpreted.
    mutating func rotatedLeft(_ distance: Int) {
        self = leftRotate(distance)
    }

    /// Returns `lhs` rotated left by `rhs` bit positions.
    static func <<<(lhs: Self, rhs: Int) -> Self {
        return lhs.leftRotate(rhs)
    }

    /// Rotates `lhs` left by `rhs` bit positions in place.
    static func <<<=(lhs: inout Self, rhs: Int) {
        lhs.rotatedLeft(rhs)
    }
}
260+
261+
262+
/// A view over a collection of bytes that presents every group of four
/// consecutive bytes as one little-endian `UInt32`.
struct CollectionOf32BitLittleEndianIntegers<BaseCollection: Collection> where BaseCollection.Element == UInt8 {
    var baseCollection: BaseCollection

    /// Wraps `baseCollection`, whose length must be a multiple of four bytes.
    init(_ baseCollection: BaseCollection) {
        precondition(baseCollection.count.isMultiple(of: 4))
        self.baseCollection = baseCollection
    }
}

extension CollectionOf32BitLittleEndianIntegers: Collection {
    typealias Element = UInt32

    /// A position in the view: the base-collection index of the first byte
    /// of a 32-bit word.
    struct Index {
        var baseIndex: BaseCollection.Index

        init(_ baseIndex: BaseCollection.Index) {
            self.baseIndex = baseIndex
        }
    }

    var startIndex: Index {
        Index(baseCollection.startIndex)
    }

    var endIndex: Index {
        Index(baseCollection.endIndex)
    }

    /// Steps one word forward, i.e. four bytes in the base collection.
    func index(after index: Index) -> Index {
        Index(baseCollection.index(index.baseIndex, offsetBy: 4))
    }

    /// Assembles the word at `index` from four consecutive bytes, with the
    /// first byte as the least significant one.
    subscript(_ index: Index) -> UInt32 {
        var cursor = index.baseIndex
        var word: UInt32 = 0

        for byteNumber in 0..<4 {
            word |= UInt32(baseCollection[cursor]) << (8 * byteNumber)
            baseCollection.formIndex(after: &cursor)
        }

        return word
    }
}

extension CollectionOf32BitLittleEndianIntegers.Index: Equatable {
    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex == rhs.baseIndex
    }
}

// Ordering is delegated entirely to the underlying base index.
extension CollectionOf32BitLittleEndianIntegers.Index: Comparable {
    static func <(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex < rhs.baseIndex
    }

    static func <=(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex <= rhs.baseIndex
    }

    static func >(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex > rhs.baseIndex
    }

    static func >=(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex >= rhs.baseIndex
    }
}
330+
331+
332+
extension UInt32 {
    /// Xors up to four bytes of `bytes` with the bytes of this word, least
    /// significant byte first.
    ///
    /// Starts at `index` and advances it past every byte touched. Stops
    /// early, leaving `index` equal to `bytes.endIndex`, if the collection
    /// runs out before all four bytes have been applied.
    func xorLittleEndianBytes<Bytes: MutableCollection>(bytes: inout Bytes, at index: inout Bytes.Index) where Bytes.Element == UInt8 {
        for shift in stride(from: 0, to: 32, by: 8) {
            guard index < bytes.endIndex else { return }

            // Truncation keeps exactly the low eight bits of the shifted word.
            bytes[index] ^= UInt8(truncatingIfNeeded: self >> shift)
            bytes.formIndex(after: &index)
        }
    }
}
343+
344+
345+
/// Registration record for the ChaCha20 benchmark.
public let ChaCha = BenchmarkInfo(
  name: "ChaCha",
  runFunction: run_ChaCha,
  tags: [.runtime, .cpubench])

/// Runs the ChaCha20 benchmark: transforms one fixed 30720-byte buffer in
/// place `N` times with a fixed key and nonce.
///
/// The same buffer is fed back in on every iteration, and its first byte is
/// passed to `blackHole` after each pass.
///
/// - Parameter N: Number of passes. Values below 1 perform no work.
@inline(never)
public func run_ChaCha(_ N: Int) {
  // `1...N` is an invalid range (and traps) when N < 1, so bail out first.
  guard N >= 1 else { return }

  var plaintext = Array(repeating: UInt8(0), count: 30720) // Chosen for CI runtime
  let key = Array(repeating: UInt8(1), count: 32)
  let nonce = Array(repeating: UInt8(2), count: 12)

  for _ in 1...N {
    ChaCha20.encrypt(bytes: &plaintext, key: key, nonce: nonce)
    // The buffer is never empty, so `first` is always present.
    blackHole(plaintext.first!)
  }
}

benchmark/utils/main.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import CString
3939
import CSVParsing
4040
import Calculator
4141
import CaptureProp
42+
import ChaCha
4243
import ChainedFilterMap
4344
import CharacterLiteralsLarge
4445
import CharacterLiteralsSmall
@@ -216,6 +217,7 @@ registerBenchmark(CString)
216217
registerBenchmark(CSVParsing)
217218
registerBenchmark(Calculator)
218219
registerBenchmark(CaptureProp)
220+
registerBenchmark(ChaCha)
219221
registerBenchmark(ChainedFilterMap)
220222
registerBenchmark(CharacterLiteralsLarge)
221223
registerBenchmark(CharacterLiteralsSmall)

0 commit comments

Comments
 (0)