Skip to content

Commit 5b3da65

Browse files
Ard Biesheuvel authored and herbertx committed
crypto: arm64/crct10dif-ce - yield NEON after every block of input
Avoid excessive scheduling delays under a preemptible kernel by yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 4e530fb commit 5b3da65

File tree

1 file changed

+28
-4
lines changed

1 file changed

+28
-4
lines changed

arch/arm64/crypto/crct10dif-ce-core.S

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,13 +74,19 @@
7474
.text
7575
.cpu generic+crypto
7676

77-
arg1_low32 .req w0
78-
arg2 .req x1
79-
arg3 .req x2
77+
arg1_low32 .req w19
78+
arg2 .req x20
79+
arg3 .req x21
8080

8181
vzr .req v13
8282

8383
ENTRY(crc_t10dif_pmull)
84+
frame_push 3, 128
85+
86+
mov arg1_low32, w0
87+
mov arg2, x1
88+
mov arg3, x2
89+
8490
movi vzr.16b, #0 // init zero register
8591

8692
// adjust the 16-bit initial_crc value, scale it to 32 bits
@@ -175,8 +181,25 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
175181
subs arg3, arg3, #128
176182

177183
// check if there is another 64B in the buffer to be able to fold
178-
b.ge _fold_64_B_loop
184+
b.lt _fold_64_B_end
185+
186+
if_will_cond_yield_neon
187+
stp q0, q1, [sp, #.Lframe_local_offset]
188+
stp q2, q3, [sp, #.Lframe_local_offset + 32]
189+
stp q4, q5, [sp, #.Lframe_local_offset + 64]
190+
stp q6, q7, [sp, #.Lframe_local_offset + 96]
191+
do_cond_yield_neon
192+
ldp q0, q1, [sp, #.Lframe_local_offset]
193+
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
194+
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
195+
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
196+
ldr_l q10, rk3, x8
197+
movi vzr.16b, #0 // init zero register
198+
endif_yield_neon
199+
200+
b _fold_64_B_loop
179201

202+
_fold_64_B_end:
180203
// at this point, the buffer pointer is pointing at the last y Bytes
181204
// of the buffer the 64B of folded data is in 4 of the vector
182205
// registers: v0, v1, v2, v3
@@ -304,6 +327,7 @@ _barrett:
304327
_cleanup:
305328
// scale the result back to 16 bits
306329
lsr x0, x0, #16
330+
frame_pop
307331
ret
308332

309333
_less_than_128:

0 commit comments

Comments
 (0)