Skip to content

Commit 7edc86c

Browse files
Ard Biesheuvel authored and herbertx committed
crypto: arm64/sha3-ce - yield NEON after every block of input
Avoid excessive scheduling delays under a preemptible kernel by conditionally yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 5b3da65 commit 7edc86c

File tree

1 file changed

+50
-27
lines changed

1 file changed

+50
-27
lines changed

arch/arm64/crypto/sha3-ce-core.S

Lines changed: 50 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -41,23 +41,30 @@
4141
*/
4242
.text
4343
ENTRY(sha3_ce_transform)
44-
/* load state */
45-
add x8, x0, #32
46-
ld1 { v0.1d- v3.1d}, [x0]
44+
frame_push 4
45+
46+
mov x19, x0
47+
mov x20, x1
48+
mov x21, x2
49+
mov x22, x3
50+
51+
0: /* load state */
52+
add x8, x19, #32
53+
ld1 { v0.1d- v3.1d}, [x19]
4754
ld1 { v4.1d- v7.1d}, [x8], #32
4855
ld1 { v8.1d-v11.1d}, [x8], #32
4956
ld1 {v12.1d-v15.1d}, [x8], #32
5057
ld1 {v16.1d-v19.1d}, [x8], #32
5158
ld1 {v20.1d-v23.1d}, [x8], #32
5259
ld1 {v24.1d}, [x8]
5360

54-
0: sub w2, w2, #1
61+
1: sub w21, w21, #1
5562
mov w8, #24
5663
adr_l x9, .Lsha3_rcon
5764

5865
/* load input */
59-
ld1 {v25.8b-v28.8b}, [x1], #32
60-
ld1 {v29.8b-v31.8b}, [x1], #24
66+
ld1 {v25.8b-v28.8b}, [x20], #32
67+
ld1 {v29.8b-v31.8b}, [x20], #24
6168
eor v0.8b, v0.8b, v25.8b
6269
eor v1.8b, v1.8b, v26.8b
6370
eor v2.8b, v2.8b, v27.8b
@@ -66,45 +73,45 @@ ENTRY(sha3_ce_transform)
6673
eor v5.8b, v5.8b, v30.8b
6774
eor v6.8b, v6.8b, v31.8b
6875

69-
tbnz x3, #6, 2f // SHA3-512
76+
tbnz x22, #6, 3f // SHA3-512
7077

71-
ld1 {v25.8b-v28.8b}, [x1], #32
72-
ld1 {v29.8b-v30.8b}, [x1], #16
78+
ld1 {v25.8b-v28.8b}, [x20], #32
79+
ld1 {v29.8b-v30.8b}, [x20], #16
7380
eor v7.8b, v7.8b, v25.8b
7481
eor v8.8b, v8.8b, v26.8b
7582
eor v9.8b, v9.8b, v27.8b
7683
eor v10.8b, v10.8b, v28.8b
7784
eor v11.8b, v11.8b, v29.8b
7885
eor v12.8b, v12.8b, v30.8b
7986

80-
tbnz x3, #4, 1f // SHA3-384 or SHA3-224
87+
tbnz x22, #4, 2f // SHA3-384 or SHA3-224
8188

8289
// SHA3-256
83-
ld1 {v25.8b-v28.8b}, [x1], #32
90+
ld1 {v25.8b-v28.8b}, [x20], #32
8491
eor v13.8b, v13.8b, v25.8b
8592
eor v14.8b, v14.8b, v26.8b
8693
eor v15.8b, v15.8b, v27.8b
8794
eor v16.8b, v16.8b, v28.8b
88-
b 3f
95+
b 4f
8996

90-
1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
97+
2: tbz x22, #2, 4f // bit 2 cleared? SHA-384
9198

9299
// SHA3-224
93-
ld1 {v25.8b-v28.8b}, [x1], #32
94-
ld1 {v29.8b}, [x1], #8
100+
ld1 {v25.8b-v28.8b}, [x20], #32
101+
ld1 {v29.8b}, [x20], #8
95102
eor v13.8b, v13.8b, v25.8b
96103
eor v14.8b, v14.8b, v26.8b
97104
eor v15.8b, v15.8b, v27.8b
98105
eor v16.8b, v16.8b, v28.8b
99106
eor v17.8b, v17.8b, v29.8b
100-
b 3f
107+
b 4f
101108

102109
// SHA3-512
103-
2: ld1 {v25.8b-v26.8b}, [x1], #16
110+
3: ld1 {v25.8b-v26.8b}, [x20], #16
104111
eor v7.8b, v7.8b, v25.8b
105112
eor v8.8b, v8.8b, v26.8b
106113

107-
3: sub w8, w8, #1
114+
4: sub w8, w8, #1
108115

109116
eor3 v29.16b, v4.16b, v9.16b, v14.16b
110117
eor3 v26.16b, v1.16b, v6.16b, v11.16b
@@ -183,17 +190,33 @@ ENTRY(sha3_ce_transform)
183190

184191
eor v0.16b, v0.16b, v31.16b
185192

186-
cbnz w8, 3b
187-
cbnz w2, 0b
193+
cbnz w8, 4b
194+
cbz w21, 5f
195+
196+
if_will_cond_yield_neon
197+
add x8, x19, #32
198+
st1 { v0.1d- v3.1d}, [x19]
199+
st1 { v4.1d- v7.1d}, [x8], #32
200+
st1 { v8.1d-v11.1d}, [x8], #32
201+
st1 {v12.1d-v15.1d}, [x8], #32
202+
st1 {v16.1d-v19.1d}, [x8], #32
203+
st1 {v20.1d-v23.1d}, [x8], #32
204+
st1 {v24.1d}, [x8]
205+
do_cond_yield_neon
206+
b 0b
207+
endif_yield_neon
208+
209+
b 1b
188210

189211
/* save state */
190-
st1 { v0.1d- v3.1d}, [x0], #32
191-
st1 { v4.1d- v7.1d}, [x0], #32
192-
st1 { v8.1d-v11.1d}, [x0], #32
193-
st1 {v12.1d-v15.1d}, [x0], #32
194-
st1 {v16.1d-v19.1d}, [x0], #32
195-
st1 {v20.1d-v23.1d}, [x0], #32
196-
st1 {v24.1d}, [x0]
212+
5: st1 { v0.1d- v3.1d}, [x19], #32
213+
st1 { v4.1d- v7.1d}, [x19], #32
214+
st1 { v8.1d-v11.1d}, [x19], #32
215+
st1 {v12.1d-v15.1d}, [x19], #32
216+
st1 {v16.1d-v19.1d}, [x19], #32
217+
st1 {v20.1d-v23.1d}, [x19], #32
218+
st1 {v24.1d}, [x19]
219+
frame_pop
197220
ret
198221
ENDPROC(sha3_ce_transform)
199222

0 commit comments

Comments
 (0)