Skip to content

Commit c70f4ab

Browse files
martinwilli authored and herbertx committed
crypto: poly1305 - Add a SSE2 SIMD variant for x86_64
Implements an x86_64 assembler driver for the Poly1305 authenticator. This single block variant holds the 130-bit integer in 5 32-bit words, but uses SSE to do two multiplications/additions in parallel.

When calling updates with small blocks, the overhead for kernel_fpu_begin()/kernel_fpu_end() negates the performance gain. We therefore use the poly1305-generic fallback for small updates.

For large messages, throughput increases by ~5-10% compared to poly1305-generic:

testing speed of poly1305 (poly1305-generic)
test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 4080026 opers/sec, 391682496 bytes/sec
test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 6221094 opers/sec, 597225024 bytes/sec
test 2 ( 96 byte blocks, 96 bytes per update, 1 updates): 9609750 opers/sec, 922536057 bytes/sec
test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1459379 opers/sec, 420301267 bytes/sec
test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2115179 opers/sec, 609171609 bytes/sec
test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3729874 opers/sec, 1074203856 bytes/sec
test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 593000 opers/sec, 626208000 bytes/sec
test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1081536 opers/sec, 1142102332 bytes/sec
test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 302077 opers/sec, 628320576 bytes/sec
test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 554384 opers/sec, 1153120176 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 278715 opers/sec, 1150536345 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 140202 opers/sec, 1153022070 bytes/sec

testing speed of poly1305 (poly1305-simd)
test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 3790063 opers/sec, 363846076 bytes/sec
test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 5913378 opers/sec, 567684355 bytes/sec
test 2 ( 96 byte blocks, 96 bytes per update, 1 updates): 9352574 opers/sec, 897847104 bytes/sec
test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1362145 opers/sec, 392297990 bytes/sec
test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2007075 opers/sec, 578037628 bytes/sec
test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3709811 opers/sec, 1068425798 bytes/sec
test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 566272 opers/sec, 597984182 bytes/sec
test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1111657 opers/sec, 1173910108 bytes/sec
test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 288857 opers/sec, 600823808 bytes/sec
test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 590746 opers/sec, 1228751888 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 301825 opers/sec, 1245936902 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 153075 opers/sec, 1258896201 bytes/sec

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 2546f81 commit c70f4ab

File tree

4 files changed

+413
-0
lines changed

4 files changed

+413
-0
lines changed

arch/x86/crypto/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
3131
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
3232
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
3333
obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
34+
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
3435

3536
# These modules require assembler to support AVX.
3637
ifeq ($(avx_supported),yes)
@@ -85,6 +86,7 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
8586
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
8687
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
8788
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
89+
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
8890
ifeq ($(avx2_supported),yes)
8991
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
9092
endif
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
/*
2+
* Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
3+
*
4+
* Copyright (C) 2015 Martin Willi
5+
*
6+
* This program is free software; you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License as published by
8+
* the Free Software Foundation; either version 2 of the License, or
9+
* (at your option) any later version.
10+
*/
11+
12+
#include <linux/linkage.h>
13+
14+
/*
 * ANMASK holds 0x3ffffff (the low 26 bits set) in the low dword of each
 * 64-bit lane. poly1305_block_sse2 only ever reads it (one aligned load
 * into the mask register), so it lives in a read-only, mergeable section
 * of 16-byte entities instead of writable .data. The 16-byte alignment
 * is required by the aligned (movdqa) load.
 */
.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
.align 16
ANMASK:	.octa 0x0000000003ffffff0000000003ffffff

.text

/* Accumulator h: five 32-bit limbs stored at the address in %rdi. */
#define h0	0x00(%rdi)
#define h1	0x04(%rdi)
#define h2	0x08(%rdi)
#define h3	0x0c(%rdi)
#define h4	0x10(%rdi)

/* Key r: five 32-bit limbs stored at the address in %rdx. */
#define r0	0x00(%rdx)
#define r1	0x04(%rdx)
#define r2	0x08(%rdx)
#define r3	0x0c(%rdx)
#define r4	0x10(%rdx)

/*
 * s1..s4 = r1..r4 * 5, kept in a 16-byte scratch area at the top of the
 * stack. These operands are only valid after the function has reserved
 * that area with "sub $0x10,%rsp" and filled it in.
 */
#define s1	0x00(%rsp)
#define s2	0x04(%rsp)
#define s3	0x08(%rsp)
#define s4	0x0c(%rsp)

/* Pointer to the current 16-byte input block. */
#define m	%rsi

/*
 * Accumulator limbs packed two per register, one limb in the low dword
 * of each 64-bit lane: h01 = [0, h1, 0, h0], h23 = [0, h3, 0, h2],
 * h44 = [0, h4, 0, h4].
 */
#define h01	%xmm0
#define h23	%xmm1
#define h44	%xmm2

/* SSE scratch registers. */
#define t1	%xmm3
#define t2	%xmm4
#define t3	%xmm5
#define t4	%xmm6

/* Holds ANMASK for the whole call. */
#define mask	%xmm7

/*
 * 64-bit sums of partial products, one per output limb. d4 is %r12,
 * which is callee-saved and therefore pushed/popped by the function.
 */
#define d0	%r8
#define d1	%r9
#define d2	%r10
#define d3	%r11
#define d4	%r12
49+
50+
/*
 * poly1305_block_sse2 - absorb 16-byte blocks into the Poly1305 accumulator
 *
 * For every block: h = (h + m + 2^128) * r, partially reduced so that each
 * of the five limbs of h fits in 32 bits again (26-bit limbs plus a small
 * carry left in h1).
 *
 * In:       %rdi  accumulator h[5], 32-bit limbs, updated in place
 *           %rsi  input, block count * 16 bytes (no alignment needed)
 *           %rdx  key r[5], 32-bit limbs, read only
 *           %rcx  block count; zero returns immediately
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, %xmm0-%xmm7, flags
 * Saved:    %rbx, %r12 (pushed and restored)
 * Stack:    two pushes plus a 16-byte scratch area for s1..s4
 *
 * NOTE(review): the full 64-bit %rcx is used as the loop counter; confirm
 * that the C prototype passes a 64-bit count or that the upper half is
 * guaranteed to be zero.
 */
ENTRY(poly1305_block_sse2)
	# %rdi: Accumulator h[5]
	# %rsi: 16 byte input block m
	# %rdx: Poly1305 key r[5]
	# %rcx: Block count

	# This single block variant tries to improve performance by doing two
	# multiplications in parallel using SSE instructions. There is quite
	# some quadword packing involved, hence the speedup is marginal.

	# Nothing to do for a zero block count. Without this check the
	# dec/jnz loop below would process one block, wrap the counter and
	# keep running far beyond the input buffer.
	test		%rcx,%rcx
	jz		.Lout

	push		%rbx
	push		%r12
	sub		$0x10,%rsp

	# s1..s4 = r1..r4 * 5
	# (the 32-bit load zero-extends into %rax, so a 64-bit base/index
	# gives the same 32-bit result without an address-size prefix)
	mov		r1,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s1
	mov		r2,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s2
	mov		r3,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s3
	mov		r4,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s4

	movdqa		ANMASK(%rip),mask

.Ldoblock:
	# h01 = [0, h1, 0, h0]
	# h23 = [0, h3, 0, h2]
	# h44 = [0, h4, 0, h4]
	movd		h0,h01
	movd		h1,t1
	movd		h2,h23
	movd		h3,t2
	movd		h4,h44
	punpcklqdq	t1,h01
	punpcklqdq	t2,h23
	punpcklqdq	h44,h44

	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
	movd		0x00(m),t1
	movd		0x03(m),t2
	psrld		$2,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h01
	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),t1
	movd		0x09(m),t2
	psrld		$4,t1
	psrld		$6,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h23
	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
	mov		0x0c(m),%eax
	shr		$8,%eax
	or		$0x01000000,%eax
	movd		%eax,t1
	pshufd		$0xc4,t1,t1
	paddd		t1,h44

	# t1[0] = h0 * r0 + h2 * s3
	# t1[1] = h1 * s4 + h3 * s2
	movd		r0,t1
	movd		s4,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		s3,t2
	movd		s2,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r1 + h2 * s4
	# t2[1] = h1 * r0 + h3 * s3
	movd		r1,t2
	movd		r0,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		s4,t3
	movd		s3,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s1
	# t3[1] = h4 * s2
	movd		s1,t3
	movd		s2,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d0 = t1[0] + t1[1] + t3[0]
	# d1 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d0
	psrldq		$8,t1
	movq		t1,d1

	# t1[0] = h0 * r2 + h2 * r0
	# t1[1] = h1 * r1 + h3 * s4
	movd		r2,t1
	movd		r1,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r0,t2
	movd		s4,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r3 + h2 * r1
	# t2[1] = h1 * r2 + h3 * r0
	movd		r3,t2
	movd		r2,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		r1,t3
	movd		r0,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s3
	# t3[1] = h4 * s4
	movd		s3,t3
	movd		s4,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d2 = t1[0] + t1[1] + t3[0]
	# d3 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d2
	psrldq		$8,t1
	movq		t1,d3

	# t1[0] = h0 * r4 + h2 * r2
	# t1[1] = h1 * r3 + h3 * r1
	movd		r4,t1
	movd		r3,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r2,t2
	movd		r1,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t3[0] = h4 * r0
	movd		r0,t3
	pmuludq		h44,t3
	# d4 = t1[0] + t1[1] + t3[0]
	movdqa		t1,t4
	psrldq		$8,t4
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d4

	# Carry propagation: each dN keeps its low 26 bits as limb N and
	# hands the rest to the next limb. h0 stays in %ebx until the end
	# because the wrap-around carry from d4 is still to be added.

	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	# (only the low 32 bits of the product are used, exactly as with
	# the 32-bit addressing form)
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%eax
	add		%eax,%ebx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%ebx,%eax
	shr		$26,%eax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	add		$0x10,m
	dec		%rcx
	jnz		.Ldoblock

	add		$0x10,%rsp
	pop		%r12
	pop		%rbx
.Lout:
	ret
ENDPROC(poly1305_block_sse2)

0 commit comments

Comments
 (0)