@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
             .octa 0x00000000000000000000000000000000
 
+        .section .rodata
+        .align   16
+        .type    aad_shift_arr, @object
+        .size    aad_shift_arr, 272
+aad_shift_arr:
+        .octa    0xffffffffffffffffffffffffffffffff
+        .octa    0xffffffffffffffffffffffffffffff0C
+        .octa    0xffffffffffffffffffffffffffff0D0C
+        .octa    0xffffffffffffffffffffffffff0E0D0C
+        .octa    0xffffffffffffffffffffffff0F0E0D0C
+        .octa    0xffffffffffffffffffffff0C0B0A0908
+        .octa    0xffffffffffffffffffff0D0C0B0A0908
+        .octa    0xffffffffffffffffff0E0D0C0B0A0908
+        .octa    0xffffffffffffffff0F0E0D0C0B0A0908
+        .octa    0xffffffffffffff0C0B0A090807060504
+        .octa    0xffffffffffff0D0C0B0A090807060504
+        .octa    0xffffffffff0E0D0C0B0A090807060504
+        .octa    0xffffffff0F0E0D0C0B0A090807060504
+        .octa    0xffffff0C0B0A09080706050403020100
+        .octa    0xffff0D0C0B0A09080706050403020100
+        .octa    0xff0E0D0C0B0A09080706050403020100
+        .octa    0x0F0E0D0C0B0A09080706050403020100
+
 
 .text
 
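The table added above holds 17 PSHUFB control masks of 16 bytes each (hence .size 272), one per possible remaining AAD length 0..16; the code further down selects an entry at byte offset len * 16 (the salq $4). In a PSHUFB control byte, a set top bit (0xff here) zeroes that destination lane, and a value 0x00..0x0f copies the named source lane. Below is a minimal user-space sketch, not the kernel code, of what the len == 3 entry does, assuming the three valid tail bytes sit in lanes 12..14, which is where the 4-byte read path below leaves them; the other entries only differ in which source lanes they name, because the 8-byte/4-byte read sequence parks the tail at different offsets.

/*
 * Illustrative sketch (not kernel code): apply the len == 3 entry of
 * aad_shift_arr with PSHUFB to drop over-read garbage and left-justify
 * the valid tail bytes.
 */
#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>                 /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
    /* .octa 0xffffffffffffffffffffffffff0E0D0C, written out byte by byte */
    static const uint8_t mask3[16] = {
        0x0C, 0x0D, 0x0E, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    };

    uint8_t reg[16] = { 0 };
    reg[12] = 0xA0; reg[13] = 0xA1; reg[14] = 0xA2;   /* 3 valid tail bytes */
    reg[15] = 0xEE;                                   /* over-read garbage  */

    __m128i v = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)reg),
                                 _mm_loadu_si128((const __m128i *)mask3));

    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, v);
    for (int i = 0; i < 16; i++)
        printf("%02x ", out[i]);       /* prints: a0 a1 a2 00 00 ... 00 */
    printf("\n");
    return 0;
}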
@@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         mov        arg8, %r12           # %r12 = aadLen
         mov        %r12, %r11
         pxor       %xmm\i, %xmm\i
+        pxor       \XMM2, \XMM2
 
-_get_AAD_loop\num_initial_blocks\operation:
-        movd       (%r10), \TMP1
-        pslldq     $12, \TMP1
-        psrldq     $4, %xmm\i
+        cmp        $16, %r11
+        jl         _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+        movdqu     (%r10), %xmm\i
+        PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       %xmm\i, \XMM2
+        GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        add        $16, %r10
+        sub        $16, %r12
+        sub        $16, %r11
+        cmp        $16, %r11
+        jge        _get_AAD_blocks\num_initial_blocks\operation
+
+        movdqu     \XMM2, %xmm\i
+        cmp        $0, %r11
+        je         _get_AAD_done\num_initial_blocks\operation
+
+        pxor       %xmm\i,%xmm\i
+
+        /* read the last <16B of AAD. since we have at least 4B of
+        data right after the AAD (the ICV, and maybe some CT), we can
+        read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+        cmp        $4, %r11
+        jle        _get_AAD_rest4\num_initial_blocks\operation
+        movq       (%r10), \TMP1
+        add        $8, %r10
+        sub        $8, %r11
+        pslldq     $8, \TMP1
+        psrldq     $8, %xmm\i
         pxor       \TMP1, %xmm\i
+        jmp        _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+        cmp        $0, %r11
+        jle        _get_AAD_rest0\num_initial_blocks\operation
+        mov        (%r10), %eax
+        movq       %rax, \TMP1
         add        $4, %r10
-        sub        $4, %r12
-        jne        _get_AAD_loop\num_initial_blocks\operation
-
-        cmp        $16, %r11
-        je         _get_AAD_loop2_done\num_initial_blocks\operation
-
-        mov        $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+        sub        $4, %r10
+        pslldq     $12, \TMP1
         psrldq     $4, %xmm\i
-        sub        $4, %r12
-        cmp        %r11, %r12
-        jne        _get_AAD_loop2\num_initial_blocks\operation
-
-_get_AAD_loop2_done\num_initial_blocks\operation:
+        pxor       \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+        /* finalize: shift out the extra bytes we read, and align
+        left. since pslldq can only shift by an immediate, we use
+        vpshufb and an array of shuffle masks */
+        movq       %r12, %r11
+        salq       $4, %r11
+        movdqu     aad_shift_arr(%r11), \TMP1
+        PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       \XMM2, %xmm\i
+        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
         xor        %r11, %r11 # initialise the data pointer offset as zero
-
-        # start AES for num_initial_blocks blocks
+        # start AES for num_initial_blocks blocks
 
         mov        %arg5, %rax                      # %rax = *Y0
         movdqu     (%rax), \XMM0                    # XMM0 = Y0
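For reference, here is a hedged C-intrinsics sketch, not the kernel code, of the tail-read pattern that the _get_AAD_rest8/_get_AAD_rest4 paths above implement: each 8-byte or 4-byte load is shifted into the top of an XMM register while the accumulator is shifted down, and the two are XORed together, mirroring the movq/pslldq/psrldq/pxor sequence. It assumes, as the kernel comment does, that reading a few bytes past the AAD is safe; the result still needs the aad_shift_arr compaction and the byte reflection.

/*
 * Illustrative sketch of the _get_AAD_rest8/_get_AAD_rest4 pattern with
 * SSE2 intrinsics.  The remaining-length counter is allowed to go
 * negative, exactly as %r11 does in the assembly above.
 */
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>          /* SSE2 */

static __m128i gather_aad_tail(const uint8_t *p, long rem /* 1..15 */)
{
    __m128i acc = _mm_setzero_si128();

    while (rem > 4) {                           /* movq (%r10), \TMP1 path  */
        uint64_t q;
        memcpy(&q, p, 8);                       /* 8-byte load, may over-read */
        acc = _mm_xor_si128(_mm_srli_si128(acc, 8),                 /* psrldq $8  */
                  _mm_slli_si128(_mm_cvtsi64_si128((long long)q), 8)); /* pslldq $8 */
        p   += 8;
        rem -= 8;                               /* may go negative, like %r11 */
    }
    if (rem > 0) {                              /* mov (%r10), %eax path    */
        uint32_t d;
        memcpy(&d, p, 4);                       /* 4-byte load, may over-read */
        acc = _mm_xor_si128(_mm_srli_si128(acc, 4),                 /* psrldq $4  */
                  _mm_slli_si128(_mm_cvtsi32_si128((int)d), 12));   /* pslldq $12 */
    }
    /* still needs the aad_shift_arr compaction and the byte reflection */
    return acc;
}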
@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
         # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5
@@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         mov        arg8, %r12           # %r12 = aadLen
         mov        %r12, %r11
         pxor       %xmm\i, %xmm\i
-_get_AAD_loop\num_initial_blocks\operation:
-        movd       (%r10), \TMP1
-        pslldq     $12, \TMP1
-        psrldq     $4, %xmm\i
+        pxor       \XMM2, \XMM2
+
+        cmp        $16, %r11
+        jl         _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+        movdqu     (%r10), %xmm\i
+        PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       %xmm\i, \XMM2
+        GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        add        $16, %r10
+        sub        $16, %r12
+        sub        $16, %r11
+        cmp        $16, %r11
+        jge        _get_AAD_blocks\num_initial_blocks\operation
+
+        movdqu     \XMM2, %xmm\i
+        cmp        $0, %r11
+        je         _get_AAD_done\num_initial_blocks\operation
+
+        pxor       %xmm\i,%xmm\i
+
+        /* read the last <16B of AAD. since we have at least 4B of
+        data right after the AAD (the ICV, and maybe some PT), we can
+        read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+        cmp        $4, %r11
+        jle        _get_AAD_rest4\num_initial_blocks\operation
+        movq       (%r10), \TMP1
+        add        $8, %r10
+        sub        $8, %r11
+        pslldq     $8, \TMP1
+        psrldq     $8, %xmm\i
         pxor       \TMP1, %xmm\i
+        jmp        _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+        cmp        $0, %r11
+        jle        _get_AAD_rest0\num_initial_blocks\operation
+        mov        (%r10), %eax
+        movq       %rax, \TMP1
         add        $4, %r10
-        sub        $4, %r12
-        jne        _get_AAD_loop\num_initial_blocks\operation
-        cmp        $16, %r11
-        je         _get_AAD_loop2_done\num_initial_blocks\operation
-        mov        $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
+        sub        $4, %r10
+        pslldq     $12, \TMP1
         psrldq     $4, %xmm\i
-        sub        $4, %r12
-        cmp        %r11, %r12
-        jne        _get_AAD_loop2\num_initial_blocks\operation
-_get_AAD_loop2_done\num_initial_blocks\operation:
+        pxor       \TMP1, %xmm\i
+_get_AAD_rest0\num_initial_blocks\operation:
+        /* finalize: shift out the extra bytes we read, and align
+        left. since pslldq can only shift by an immediate, we use
+        vpshufb and an array of shuffle masks */
+        movq       %r12, %r11
+        salq       $4, %r11
+        movdqu     aad_shift_arr(%r11), \TMP1
+        PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       \XMM2, %xmm\i
+        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
         xor        %r11, %r11 # initialise the data pointer offset as zero
-
-        # start AES for num_initial_blocks blocks
+        # start AES for num_initial_blocks blocks
 
         mov        %arg5, %rax                      # %rax = *Y0
         movdqu     (%rax), \XMM0                    # XMM0 = Y0
@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
         # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5