
Commit 0487cca

qsn authored and herbertx committed
crypto: aesni - make non-AVX AES-GCM work with any aadlen
This is the first step to make the aesni AES-GCM implementation
generic. The current code was written for rfc4106, so it handles
only some specific sizes of associated data.

Signed-off-by: Sabrina Dubroca <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent f4857f4 commit 0487cca
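For orientation, a minimal C model of what the new AAD path computes (a sketch with hypothetical names, not part of the patch; GHASH's byte-reflection and GF(2^128) multiply are elided): whole 16-byte blocks of associated data are folded into the running hash, and any leftover bytes are zero-padded into one final block.

#include <stdint.h>
#include <string.h>

/* Toy stand-in for GHASH_MUL: the asm XORs the block into the running
 * hash and then multiplies by the hash key H in GF(2^128); only the
 * XOR is modelled here. */
static void ghash_absorb(uint8_t hash[16], const uint8_t block[16])
{
        for (int i = 0; i < 16; i++)
                hash[i] ^= block[i];
        /* ... GF(2^128) multiply by H would follow here ... */
}

/* What _get_AAD_blocks/_rest8/_rest4/_rest0 compute, for any aadlen;
 * the old code only handled the aadlen values used by rfc4106. */
static void hash_aad(uint8_t hash[16], const uint8_t *aad, size_t aadlen)
{
        uint8_t last[16];

        while (aadlen >= 16) {          /* _get_AAD_blocks loop */
                ghash_absorb(hash, aad);
                aad += 16;
                aadlen -= 16;
        }
        if (aadlen) {                   /* _get_AAD_rest8/_rest4/_rest0 */
                memset(last, 0, sizeof(last));
                memcpy(last, aad, aadlen);
                ghash_absorb(hash, last);
        }
}

The asm cannot memcpy an arbitrary tail, so it gathers the leftover bytes with 8-byte and 4-byte loads (safe only because at least 4 bytes of ICV follow the AAD in memory) and then repairs the layout with a shuffle mask from aad_shift_arr, as the hunks below show.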

File tree: 1 file changed, +132 −37 lines changed


arch/x86/crypto/aesni-intel_asm.S

Lines changed: 132 additions & 37 deletions
@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
             .octa 0x00000000000000000000000000000000
 
+.section	.rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+        .octa     0xffffffffffffffffffffffffffffffff
+        .octa     0xffffffffffffffffffffffffffffff0C
+        .octa     0xffffffffffffffffffffffffffff0D0C
+        .octa     0xffffffffffffffffffffffffff0E0D0C
+        .octa     0xffffffffffffffffffffffff0F0E0D0C
+        .octa     0xffffffffffffffffffffff0C0B0A0908
+        .octa     0xffffffffffffffffffff0D0C0B0A0908
+        .octa     0xffffffffffffffff0E0D0C0B0A0908
+        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
+        .octa     0xffffffffffffff0C0B0A090807060504
+        .octa     0xffffffffffff0D0C0B0A090807060504
+        .octa     0xffffffffff0E0D0C0B0A090807060504
+        .octa     0xffffffff0F0E0D0C0B0A090807060504
+        .octa     0xffffff0C0B0A09080706050403020100
+        .octa     0xffff0D0C0B0A09080706050403020100
+        .octa     0xff0E0D0C0B0A09080706050403020100
+        .octa     0x0F0E0D0C0B0A09080706050403020100
+
 
 .text
 
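A note on the table just added: aad_shift_arr holds 17 shuffle masks of 16 bytes each (hence .size 272), running from 0 leftover bytes (all lanes zeroed) up to 16 (the identity permutation); the code below indexes it with 16 * (leftover byte count) via salq $4. .octa constants are stored least-significant byte first, so the low byte of each constant is mask lane 0. Under pshufb, a mask lane with its high bit set produces zero, and any other lane selects the source byte it indexes; that is what left-aligns the tail and zero-pads the rest. A scalar model of this, with hypothetical values for illustration:

#include <assert.h>
#include <stdint.h>

/* Scalar model of PSHUFB: lane i of the result is zero if mask[i] has
 * its high bit set, else it is source byte mask[i] & 0x0f. */
static void pshufb(uint8_t dst[16], const uint8_t src[16],
                   const uint8_t mask[16])
{
        for (int i = 0; i < 16; i++)
                dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
}

int main(void)
{
        /* Entry for 3 leftover bytes,
         * .octa 0xffffffffffffffffffffffffff0E0D0C, low byte first. */
        const uint8_t mask3[16] = { 0x0c, 0x0d, 0x0e, 0xff, 0xff, 0xff,
                                    0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                    0xff, 0xff, 0xff, 0xff };
        uint8_t reg[16] = { 0 }, out[16];

        /* The 4-byte tail read leaves the three AAD bytes (plus one
         * byte of over-read garbage) in lanes 12..15. */
        reg[12] = 0xA0; reg[13] = 0xA1; reg[14] = 0xA2; reg[15] = 0x99;

        pshufb(out, reg, mask3);
        assert(out[0] == 0xA0 && out[1] == 0xA1 && out[2] == 0xA2);
        for (int i = 3; i < 16; i++)    /* garbage gone, zero padding */
                assert(out[i] == 0);
        return 0;
}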
@@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 	mov	   arg8, %r12		# %r12 = aadLen
 	mov	   %r12, %r11
 	pxor	   %xmm\i, %xmm\i
+	pxor	   \XMM2, \XMM2
 
-_get_AAD_loop\num_initial_blocks\operation:
-	movd	   (%r10), \TMP1
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
+	cmp	   $16, %r11
+	jl	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+	movdqu	   (%r10), %xmm\i
+	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   %xmm\i, \XMM2
+	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+	add	   $16, %r10
+	sub	   $16, %r12
+	sub	   $16, %r11
+	cmp	   $16, %r11
+	jge	   _get_AAD_blocks\num_initial_blocks\operation
+
+	movdqu	   \XMM2, %xmm\i
+	cmp	   $0, %r11
+	je	   _get_AAD_done\num_initial_blocks\operation
+
+	pxor	   %xmm\i,%xmm\i
+
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some CT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+	cmp	   $4, %r11
+	jle	   _get_AAD_rest4\num_initial_blocks\operation
+	movq	   (%r10), \TMP1
+	add	   $8, %r10
+	sub	   $8, %r11
+	pslldq	   $8, \TMP1
+	psrldq	   $8, %xmm\i
 	pxor	   \TMP1, %xmm\i
+	jmp	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+	cmp	   $0, %r11
+	jle	   _get_AAD_rest0\num_initial_blocks\operation
+	mov	   (%r10), %eax
+	movq	   %rax, \TMP1
 	add	   $4, %r10
-	sub	   $4, %r12
-	jne	   _get_AAD_loop\num_initial_blocks\operation
-
-	cmp	   $16, %r11
-	je	   _get_AAD_loop2_done\num_initial_blocks\operation
-
-	mov	   $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+	sub	   $4, %r10
+	pslldq	   $12, \TMP1
 	psrldq	   $4, %xmm\i
-	sub	   $4, %r12
-	cmp	   %r11, %r12
-	jne	   _get_AAD_loop2\num_initial_blocks\operation
-
-_get_AAD_loop2_done\num_initial_blocks\operation:
+	pxor	   \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq	   %r12, %r11
+	salq	   $4, %r11
+	movdqu	   aad_shift_arr(%r11), \TMP1
+	PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
 	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   \XMM2, %xmm\i
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
-
-        # start AES for num_initial_blocks blocks
+	# start AES for num_initial_blocks blocks
 
 	mov	   %arg5, %rax # %rax = *Y0
 	movdqu	   (%rax), \XMM0 # XMM0 = Y0
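The _get_AAD_rest8/_rest4 sequence above is worth spelling out: each chunk is loaded into the high lanes while the register is shifted right, so after the gather the r leftover bytes sit wherever the load pattern put them (lanes 12.. for r up to 4, lanes 8.. for r up to 8, and so on), followed by over-read garbage. A C model of just that gather (hypothetical helper, not part of the patch; the caller must guarantee at least 4 readable bytes past the tail, which the ICV provides):

#include <stdint.h>
#include <string.h>

/* Mirror of _get_AAD_rest8/_rest4 for r (0 <= r < 16) leftover bytes.
 * Deliberately over-reads src the way the asm does. */
static void gather_tail(uint8_t reg[16], const uint8_t *src, int r)
{
        memset(reg, 0, 16);
        while (r > 4) {                         /* _get_AAD_rest8     */
                memmove(reg, reg + 8, 8);       /* psrldq $8, %xmm\i  */
                memset(reg + 8, 0, 8);
                for (int i = 0; i < 8; i++)     /* movq/pslldq $8/pxor */
                        reg[8 + i] ^= src[i];
                src += 8;
                r -= 8;
        }
        if (r > 0) {                            /* _get_AAD_rest4     */
                memmove(reg, reg + 4, 12);      /* psrldq $4, %xmm\i  */
                memset(reg + 12, 0, 4);
                for (int i = 0; i < 4; i++)     /* 4B load/pslldq $12 */
                        reg[12 + i] ^= src[i];
        }
}

Feeding the result through the aad_shift_arr mask for r then yields the tail left-aligned and zero-padded, ready for the byte-reflect and GHASH_MUL at _get_AAD_rest_final.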
@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
 	# prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
 	# apply GHASH on num_initial_blocks blocks
 
 .if \i == 5
@@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 	mov	   arg8, %r12		# %r12 = aadLen
 	mov	   %r12, %r11
 	pxor	   %xmm\i, %xmm\i
-_get_AAD_loop\num_initial_blocks\operation:
-	movd	   (%r10), \TMP1
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
+	pxor	   \XMM2, \XMM2
+
+	cmp	   $16, %r11
+	jl	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+	movdqu	   (%r10), %xmm\i
+	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   %xmm\i, \XMM2
+	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+	add	   $16, %r10
+	sub	   $16, %r12
+	sub	   $16, %r11
+	cmp	   $16, %r11
+	jge	   _get_AAD_blocks\num_initial_blocks\operation
+
+	movdqu	   \XMM2, %xmm\i
+	cmp	   $0, %r11
+	je	   _get_AAD_done\num_initial_blocks\operation
+
+	pxor	   %xmm\i,%xmm\i
+
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some PT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+	cmp	   $4, %r11
+	jle	   _get_AAD_rest4\num_initial_blocks\operation
+	movq	   (%r10), \TMP1
+	add	   $8, %r10
+	sub	   $8, %r11
+	pslldq	   $8, \TMP1
+	psrldq	   $8, %xmm\i
 	pxor	   \TMP1, %xmm\i
+	jmp	   _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+	cmp	   $0, %r11
+	jle	   _get_AAD_rest0\num_initial_blocks\operation
+	mov	   (%r10), %eax
+	movq	   %rax, \TMP1
 	add	   $4, %r10
-	sub	   $4, %r12
-	jne	   _get_AAD_loop\num_initial_blocks\operation
-	cmp	   $16, %r11
-	je	   _get_AAD_loop2_done\num_initial_blocks\operation
-	mov	   $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+	sub	   $4, %r10
+	pslldq	   $12, \TMP1
 	psrldq	   $4, %xmm\i
-	sub	   $4, %r12
-	cmp	   %r11, %r12
-	jne	   _get_AAD_loop2\num_initial_blocks\operation
-_get_AAD_loop2_done\num_initial_blocks\operation:
+	pxor	   \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq	   %r12, %r11
+	salq	   $4, %r11
+	movdqu	   aad_shift_arr(%r11), \TMP1
+	PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
 	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+	pxor	   \XMM2, %xmm\i
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
 	xor	   %r11, %r11 # initialise the data pointer offset as zero
-
-        # start AES for num_initial_blocks blocks
+	# start AES for num_initial_blocks blocks
 
 	mov	   %arg5, %rax # %rax = *Y0
 	movdqu	   (%rax), \XMM0 # XMM0 = Y0
@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
 	# prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
 	# apply GHASH on num_initial_blocks blocks
 
 .if \i == 5
