Skip to content

Commit 9565300

Browse files
authored
Merge pull request #27699 from Lukasa/cb-chacha-benchmark
[benchmark] Add ChaCha20-based performance benchmark
2 parents 282d0de + b43bfa8 commit 9565300

File tree

3 files changed

+364
-0
lines changed

3 files changed

+364
-0
lines changed

benchmark/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ set(SWIFT_BENCH_MODULES
5151
single-source/CSVParsing
5252
single-source/Calculator
5353
single-source/CaptureProp
54+
single-source/ChaCha
5455
single-source/ChainedFilterMap
5556
single-source/CharacterLiteralsLarge
5657
single-source/CharacterLiteralsSmall

benchmark/single-source/ChaCha.swift

Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
//===--- ChaCha.swift -----------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014-2019 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import TestsUtils
14+
15+
/// A namespace for the ChaCha20 stream cipher used by this benchmark.
///
/// The benchmark is designed to exercise two things:
///
/// 1. How well Swift optimises low-level bit twiddling code.
/// 2. How well Swift optimises generic code over contiguous data structures.
///
/// In principle both setting up the ChaCha20 state and xoring the keystream
/// into the plaintext should be vectorisable.
enum ChaCha20 { }

extension ChaCha20 {
    /// Encrypts `bytes` in place by xoring them with the ChaCha20 keystream.
    ///
    /// One keystream block is generated per loop iteration and the block
    /// counter is advanced afterwards, until the end of `bytes` is reached.
    ///
    /// - Parameters:
    ///   - bytes: The buffer to transform in place.
    ///   - key: The cipher key. `ChaChaState` aborts unless it is 32 bytes long.
    ///   - nonce: The nonce. `ChaChaState` aborts unless it is 12 bytes long.
    ///   - initialCounter: The block counter used for the first keystream block.
    public static func encrypt<Key: Collection, Nonce: Collection, Bytes: MutableCollection>(
        bytes: inout Bytes,
        key: Key,
        nonce: Nonce,
        initialCounter: UInt32 = 0
    ) where Bytes.Element == UInt8, Key.Element == UInt8, Nonce.Element == UInt8 {
        var state = ChaChaState(key: key, nonce: nonce, counter: initialCounter)
        var position = bytes.startIndex

        while position < bytes.endIndex {
            // `xorBytes` advances `position` past every byte it consumed.
            state.block().xorBytes(bytes: &bytes, at: &position)
            state.incrementCounter()
        }
    }
}
36+
37+
38+
/// The sixteen 32-bit words that make up a ChaCha20 state.
typealias BackingState = (UInt32, UInt32, UInt32, UInt32,
                          UInt32, UInt32, UInt32, UInt32,
                          UInt32, UInt32, UInt32, UInt32,
                          UInt32, UInt32, UInt32, UInt32)

struct ChaChaState {
    /// The ChaCha20 algorithm keeps 16 32-bit integers as its state. They are
    /// traditionally laid out as a 4x4 matrix, and we follow that convention.
    var _state: BackingState

    /// Creates a ChaChaState.
    ///
    /// The inputs to ChaCha20 are:
    ///
    /// - A 256-bit key, read as eight consecutive 32-bit little-endian integers.
    /// - A 96-bit nonce, read as three consecutive 32-bit little-endian integers.
    /// - A 32-bit block counter.
    ///
    /// The process is terminated if `key` is not 32 bytes or `nonce` is not
    /// 12 bytes long.
    init<Key: Collection, Nonce: Collection>(key: Key, nonce: Nonce, counter: UInt32) where Key.Element == UInt8, Nonce.Element == UInt8 {
        if key.count != 32 || nonce.count != 12 {
            fatalError("Invalid key or nonce length.")
        }

        // The force unwraps below cannot fail: the lengths checked above
        // guarantee eight key words and three nonce words.
        var keyWords = CollectionOf32BitLittleEndianIntegers(key).makeIterator()
        var nonceWords = CollectionOf32BitLittleEndianIntegers(nonce).makeIterator()

        self._state = (
            // Words 0-3 are fixed constants.
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
            // Words 4-11 are the key, taken in 4-byte little-endian chunks.
            keyWords.next()!, keyWords.next()!, keyWords.next()!, keyWords.next()!,
            keyWords.next()!, keyWords.next()!, keyWords.next()!, keyWords.next()!,
            // Word 12 is the block counter. Each block is 64 bytes, so a
            // 32-bit counter covers 256 gigabytes of data.
            counter,
            // Words 13-15 are the nonce, which should never be repeated for
            // the same key. Word 13 holds its first 32 bits, word 15 its last.
            nonceWords.next()!, nonceWords.next()!, nonceWords.next()!
        )
    }

    /// Advances the block counter (word 12) by one, wrapping on overflow.
    ///
    /// Bumping the counter directly avoids the expensive construction of a
    /// fresh state for every subsequent block of keystream.
    mutating func incrementCounter() {
        _state.12 = _state.12 &+ 1
    }

    /// Adds every word of `otherState` to the matching word of this state,
    /// wrapping on overflow.
    private mutating func add(_ otherState: ChaChaState) {
        let lhs = _state
        let rhs = otherState._state
        _state = (
            lhs.0 &+ rhs.0, lhs.1 &+ rhs.1, lhs.2 &+ rhs.2, lhs.3 &+ rhs.3,
            lhs.4 &+ rhs.4, lhs.5 &+ rhs.5, lhs.6 &+ rhs.6, lhs.7 &+ rhs.7,
            lhs.8 &+ rhs.8, lhs.9 &+ rhs.9, lhs.10 &+ rhs.10, lhs.11 &+ rhs.11,
            lhs.12 &+ rhs.12, lhs.13 &+ rhs.13, lhs.14 &+ rhs.14, lhs.15 &+ rhs.15
        )
    }

    /// Applies a quarter round to each column of the state matrix:
    ///
    /// 1. QUARTERROUND ( 0, 4, 8,12)
    /// 2. QUARTERROUND ( 1, 5, 9,13)
    /// 3. QUARTERROUND ( 2, 6,10,14)
    /// 4. QUARTERROUND ( 3, 7,11,15)
    private mutating func columnRound() {
        Self.quarterRound(a: &_state.0, b: &_state.4, c: &_state.8, d: &_state.12)
        Self.quarterRound(a: &_state.1, b: &_state.5, c: &_state.9, d: &_state.13)
        Self.quarterRound(a: &_state.2, b: &_state.6, c: &_state.10, d: &_state.14)
        Self.quarterRound(a: &_state.3, b: &_state.7, c: &_state.11, d: &_state.15)
    }

    /// Applies a quarter round to each diagonal of the state matrix:
    ///
    /// 5. QUARTERROUND ( 0, 5,10,15)
    /// 6. QUARTERROUND ( 1, 6,11,12)
    /// 7. QUARTERROUND ( 2, 7, 8,13)
    /// 8. QUARTERROUND ( 3, 4, 9,14)
    private mutating func diagonalRound() {
        Self.quarterRound(a: &_state.0, b: &_state.5, c: &_state.10, d: &_state.15)
        Self.quarterRound(a: &_state.1, b: &_state.6, c: &_state.11, d: &_state.12)
        Self.quarterRound(a: &_state.2, b: &_state.7, c: &_state.8, d: &_state.13)
        Self.quarterRound(a: &_state.3, b: &_state.4, c: &_state.9, d: &_state.14)
    }
}
155+
156+
extension ChaChaState {
    /// The ChaCha quarter round, applied in place to four state words.
    ///
    /// This follows the definition in RFC 7539, with one difference: the
    /// additions use the wrapping `&+` because arithmetic modulo 2^32 is
    /// the intended behaviour.
    static func quarterRound(a: inout UInt32, b: inout UInt32, c: inout UInt32, d: inout UInt32) {
        a = a &+ b
        d = (d ^ a) <<< 16

        c = c &+ d
        b = (b ^ c) <<< 12

        a = a &+ b
        d = (d ^ a) <<< 8

        c = c &+ d
        b = (b ^ c) <<< 7
    }
}
166+
167+
extension ChaChaState {
    /// Runs the ChaCha20 block function on this state and returns the
    /// resulting keystream block. The receiver itself is left untouched.
    func block() -> ChaChaKeystreamBlock {
        // Work on a copy: the untouched input words are needed again below,
        // and copying is cheaper than initializing a second state.
        var working = self

        // Ten double rounds (twenty rounds in total), each made of one
        // column round followed by one diagonal round.
        var doubleRoundsLeft = 10
        while doubleRoundsLeft > 0 {
            working.columnRound()
            working.diagonalRound()
            doubleRoundsLeft -= 1
        }

        // Finally, the original input words are added to the output words.
        working.add(self)

        return ChaChaKeystreamBlock(working)
    }
}
184+
185+
186+
/// The output of the ChaCha block function for one ChaCha state.
///
/// A keystream block supports a different, narrower set of operations than
/// a `ChaChaState`, so it gets its own type.
struct ChaChaKeystreamBlock {
    var _state: BackingState

    /// Wraps the words of an already-processed `ChaChaState`.
    init(_ state: ChaChaState) {
        _state = state._state
    }

    /// Xors this keystream block into `bytes`, starting at `index`.
    ///
    /// The sixteen words are applied in order, each in little-endian byte
    /// order. `index` is advanced past every byte that was consumed, and the
    /// method stops as soon as it reaches `bytes.endIndex`. This is the hook
    /// for fast, in-place encryption.
    func xorBytes<Bytes: MutableCollection>(bytes: inout Bytes, at index: inout Bytes.Index) where Bytes.Element == UInt8 {
        // Deliberately kept unrolled: part of what is being measured is how
        // well the compiler optimises this straight-line sequence.
        let words = _state

        /// Xors one word into the buffer; returns `false` once the buffer is exhausted.
        func consume(_ word: UInt32) -> Bool {
            word.xorLittleEndianBytes(bytes: &bytes, at: &index)
            return index != bytes.endIndex
        }

        guard consume(words.0) else { return }
        guard consume(words.1) else { return }
        guard consume(words.2) else { return }
        guard consume(words.3) else { return }
        guard consume(words.4) else { return }
        guard consume(words.5) else { return }
        guard consume(words.6) else { return }
        guard consume(words.7) else { return }
        guard consume(words.8) else { return }
        guard consume(words.9) else { return }
        guard consume(words.10) else { return }
        guard consume(words.11) else { return }
        guard consume(words.12) else { return }
        guard consume(words.13) else { return }
        guard consume(words.14) else { return }
        // Nothing follows the last word, so no end-of-buffer check is needed.
        words.15.xorLittleEndianBytes(bytes: &bytes, at: &index)
    }
}
236+
237+
238+
// Rotate-left operator: `x <<< n` rotates the bits of `x` left by `n` places.
infix operator <<<: BitwiseShiftPrecedence

// In-place rotate-left operator: `x <<<= n` replaces `x` with `x <<< n`.
infix operator <<<=: AssignmentPrecedence


extension FixedWidthInteger {
    /// Returns this value with its bit pattern rotated left by `distance` bits.
    ///
    /// The rotation is performed on the unsigned bit pattern, so it is also
    /// correct for signed types (where a plain `>>` would sign-extend and
    /// corrupt the result). `distance` is reduced modulo `Self.bitWidth`;
    /// a negative distance therefore rotates to the right.
    ///
    /// - Parameter distance: The number of bit positions to rotate by.
    /// - Returns: The rotated value.
    func leftRotate(_ distance: Int) -> Self {
        let width = Self.bitWidth

        // Fold the distance into 0..<width so neither shift below overshifts.
        let shift = ((distance % width) + width) % width
        guard shift != 0 else { return self }

        // Magnitude is the unsigned type of the same width: shifting it right
        // fills with zeros rather than copies of the sign bit.
        let bits = Magnitude(truncatingIfNeeded: self)
        let rotated = (bits << shift) | (bits >> (width - shift))
        return Self(truncatingIfNeeded: rotated)
    }

    /// Rotates this value left by `distance` bits in place.
    ///
    /// See `leftRotate(_:)` for how `distance` is interpreted.
    mutating func rotatedLeft(_ distance: Int) {
        self = self.leftRotate(distance)
    }

    /// Returns `lhs` rotated left by `rhs` bits.
    static func <<<(lhs: Self, rhs: Int) -> Self {
        return lhs.leftRotate(rhs)
    }

    /// Rotates `lhs` left by `rhs` bits in place.
    static func <<<=(lhs: inout Self, rhs: Int) {
        lhs.rotatedLeft(rhs)
    }
}
260+
261+
262+
/// A view over a collection of bytes that presents it as a collection of
/// 32-bit integers, each assembled from four consecutive bytes in
/// little-endian order.
struct CollectionOf32BitLittleEndianIntegers<BaseCollection: Collection> where BaseCollection.Element == UInt8 {
    /// The underlying bytes.
    var baseCollection: BaseCollection

    /// Wraps `baseCollection`, whose length must be a multiple of four bytes.
    init(_ baseCollection: BaseCollection) {
        precondition(baseCollection.count.isMultiple(of: 4))
        self.baseCollection = baseCollection
    }
}

extension CollectionOf32BitLittleEndianIntegers: Collection {
    typealias Element = UInt32

    /// A position in the view, stored as the base-collection index of the
    /// first byte of a word.
    struct Index {
        var baseIndex: BaseCollection.Index

        init(_ baseIndex: BaseCollection.Index) {
            self.baseIndex = baseIndex
        }
    }

    var startIndex: Index {
        Index(baseCollection.startIndex)
    }

    var endIndex: Index {
        Index(baseCollection.endIndex)
    }

    /// Steps over one whole word, i.e. four bytes of the base collection.
    func index(after position: Index) -> Index {
        let nextBase = baseCollection.index(position.baseIndex, offsetBy: 4)
        return Index(nextBase)
    }

    /// Reads the four bytes starting at `position` as a little-endian word.
    subscript(_ position: Index) -> UInt32 {
        var cursor = position.baseIndex
        var word: UInt32 = 0

        for byteNumber in 0..<4 {
            // Byte 0 is the least significant; each later byte lands 8 bits higher.
            word |= UInt32(baseCollection[cursor]) << (byteNumber * 8)
            baseCollection.formIndex(after: &cursor)
        }

        return word
    }
}

// Indices are equal and ordered exactly as their underlying base indices are.
extension CollectionOf32BitLittleEndianIntegers.Index: Comparable {
    static func ==(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex == rhs.baseIndex
    }

    static func <(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex < rhs.baseIndex
    }

    static func <=(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex <= rhs.baseIndex
    }

    static func >(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex > rhs.baseIndex
    }

    static func >=(lhs: Self, rhs: Self) -> Bool {
        lhs.baseIndex >= rhs.baseIndex
    }
}
330+
331+
332+
extension UInt32 {
    /// Xors the bytes of this word, least significant byte first, into
    /// `bytes` starting at `index`.
    ///
    /// At most four bytes are touched; fewer if the end of `bytes` is reached
    /// first. On return `index` points just past the last byte modified.
    func xorLittleEndianBytes<Bytes: MutableCollection>(bytes: inout Bytes, at index: inout Bytes.Index) where Bytes.Element == UInt8 {
        // The low byte of `remaining` is always the next byte to apply.
        var remaining = self

        for _ in 0..<4 {
            guard index < bytes.endIndex else { return }
            bytes[index] ^= UInt8(truncatingIfNeeded: remaining)
            bytes.formIndex(after: &index)
            remaining >>= 8
        }
    }
}
343+
344+
345+
/// Registration record for the ChaCha benchmark.
public let ChaCha = BenchmarkInfo(
    name: "ChaCha",
    runFunction: run_ChaCha,
    tags: [.runtime, .cpubench])


/// Encrypts a fixed 30720-byte buffer in place `N` times with ChaCha20.
///
/// The buffer is not reset between iterations, so each pass re-encrypts the
/// output of the previous one.
///
/// - Parameter N: The number of encryption passes to run. Values below one
///   run no passes.
@inline(never)
public func run_ChaCha(_ N: Int) {
    // `1...N` traps when N < 1, so bail out before forming the range.
    guard N >= 1 else { return }

    var plaintext = Array(repeating: UInt8(0), count: 30720) // Chosen for CI runtime
    let key = Array(repeating: UInt8(1), count: 32)
    let nonce = Array(repeating: UInt8(2), count: 12)

    for _ in 1...N {
        ChaCha20.encrypt(bytes: &plaintext, key: key, nonce: nonce)
        // Feed a byte of the result to blackHole so the encryption is not optimised away.
        blackHole(plaintext.first!)
    }
}

benchmark/utils/main.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import CString
3939
import CSVParsing
4040
import Calculator
4141
import CaptureProp
42+
import ChaCha
4243
import ChainedFilterMap
4344
import CharacterLiteralsLarge
4445
import CharacterLiteralsSmall
@@ -216,6 +217,7 @@ registerBenchmark(CString)
216217
registerBenchmark(CSVParsing)
217218
registerBenchmark(Calculator)
218219
registerBenchmark(CaptureProp)
220+
registerBenchmark(ChaCha)
219221
registerBenchmark(ChainedFilterMap)
220222
registerBenchmark(CharacterLiteralsLarge)
221223
registerBenchmark(CharacterLiteralsSmall)

0 commit comments

Comments
 (0)