Skip to content

Commit 4e530fb

Browse files
Ard Biesheuvel authored and herbertx committed
crypto: arm64/crc32-ce - yield NEON after every block of input
Avoid excessive scheduling delays under a preemptible kernel by yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 7c50136 commit 4e530fb

File tree

1 file changed

+30
-10
lines changed

1 file changed

+30
-10
lines changed

arch/arm64/crypto/crc32-ce-core.S

Lines changed: 30 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -100,9 +100,10 @@
100100
dCONSTANT .req d0
101101
qCONSTANT .req q0
102102

103-
BUF .req x0
104-
LEN .req x1
105-
CRC .req x2
103+
BUF .req x19
104+
LEN .req x20
105+
CRC .req x21
106+
CONST .req x22
106107

107108
vzr .req v9
108109

@@ -123,7 +124,14 @@ ENTRY(crc32_pmull_le)
123124
ENTRY(crc32c_pmull_le)
124125
adr_l x3, .Lcrc32c_constants
125126

126-
0: bic LEN, LEN, #15
127+
0: frame_push 4, 64
128+
129+
mov BUF, x0
130+
mov LEN, x1
131+
mov CRC, x2
132+
mov CONST, x3
133+
134+
bic LEN, LEN, #15
127135
ld1 {v1.16b-v4.16b}, [BUF], #0x40
128136
movi vzr.16b, #0
129137
fmov dCONSTANT, CRC
@@ -132,7 +140,7 @@ ENTRY(crc32c_pmull_le)
132140
cmp LEN, #0x40
133141
b.lt less_64
134142

135-
ldr qCONSTANT, [x3]
143+
ldr qCONSTANT, [CONST]
136144

137145
loop_64: /* 64 bytes Full cache line folding */
138146
sub LEN, LEN, #0x40
@@ -162,10 +170,21 @@ loop_64: /* 64 bytes Full cache line folding */
162170
eor v4.16b, v4.16b, v8.16b
163171

164172
cmp LEN, #0x40
165-
b.ge loop_64
173+
b.lt less_64
174+
175+
if_will_cond_yield_neon
176+
stp q1, q2, [sp, #.Lframe_local_offset]
177+
stp q3, q4, [sp, #.Lframe_local_offset + 32]
178+
do_cond_yield_neon
179+
ldp q1, q2, [sp, #.Lframe_local_offset]
180+
ldp q3, q4, [sp, #.Lframe_local_offset + 32]
181+
ldr qCONSTANT, [CONST]
182+
movi vzr.16b, #0
183+
endif_yield_neon
184+
b loop_64
166185

167186
less_64: /* Folding cache line into 128bit */
168-
ldr qCONSTANT, [x3, #16]
187+
ldr qCONSTANT, [CONST, #16]
169188

170189
pmull2 v5.1q, v1.2d, vCONSTANT.2d
171190
pmull v1.1q, v1.1d, vCONSTANT.1d
@@ -204,16 +223,16 @@ fold_64:
204223
eor v1.16b, v1.16b, v2.16b
205224

206225
/* final 32-bit fold */
207-
ldr dCONSTANT, [x3, #32]
208-
ldr d3, [x3, #40]
226+
ldr dCONSTANT, [CONST, #32]
227+
ldr d3, [CONST, #40]
209228

210229
ext v2.16b, v1.16b, vzr.16b, #4
211230
and v1.16b, v1.16b, v3.16b
212231
pmull v1.1q, v1.1d, vCONSTANT.1d
213232
eor v1.16b, v1.16b, v2.16b
214233

215234
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
216-
ldr qCONSTANT, [x3, #48]
235+
ldr qCONSTANT, [CONST, #48]
217236

218237
and v2.16b, v1.16b, v3.16b
219238
ext v2.16b, vzr.16b, v2.16b, #8
@@ -223,6 +242,7 @@ fold_64:
223242
eor v1.16b, v1.16b, v2.16b
224243
mov w0, v1.s[1]
225244

245+
frame_pop
226246
ret
227247
ENDPROC(crc32_pmull_le)
228248
ENDPROC(crc32c_pmull_le)

0 commit comments

Comments
 (0)