|
74 | 74 | .text
|
75 | 75 | .cpu generic+crypto
|
76 | 76 |
|
77 |
| - arg1_low32 .req w0 |
78 |
| - arg2 .req x1 |
79 |
| - arg3 .req x2 |
| 77 | + arg1_low32 .req w19 |
| 78 | + arg2 .req x20 |
| 79 | + arg3 .req x21 |
80 | 80 |
|
81 | 81 | vzr .req v13
|
82 | 82 |
|
83 | 83 | ENTRY(crc_t10dif_pmull)
|
| 84 | + frame_push 3, 128 |
| 85 | + |
| 86 | + mov arg1_low32, w0 |
| 87 | + mov arg2, x1 |
| 88 | + mov arg3, x2 |
| 89 | + |
84 | 90 | movi vzr.16b, #0 // init zero register
|
85 | 91 |
|
86 | 92 | // adjust the 16-bit initial_crc value, scale it to 32 bits
|
@@ -175,8 +181,25 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
|
175 | 181 | subs arg3, arg3, #128
|
176 | 182 |
|
177 | 183 | // check if there is another 64B in the buffer to be able to fold
|
178 |
| - b.ge _fold_64_B_loop |
| 184 | + b.lt _fold_64_B_end |
| 185 | + |
| 186 | + if_will_cond_yield_neon |
| 187 | + stp q0, q1, [sp, #.Lframe_local_offset] |
| 188 | + stp q2, q3, [sp, #.Lframe_local_offset + 32] |
| 189 | + stp q4, q5, [sp, #.Lframe_local_offset + 64] |
| 190 | + stp q6, q7, [sp, #.Lframe_local_offset + 96] |
| 191 | + do_cond_yield_neon |
| 192 | + ldp q0, q1, [sp, #.Lframe_local_offset] |
| 193 | + ldp q2, q3, [sp, #.Lframe_local_offset + 32] |
| 194 | + ldp q4, q5, [sp, #.Lframe_local_offset + 64] |
| 195 | + ldp q6, q7, [sp, #.Lframe_local_offset + 96] |
| 196 | + ldr_l q10, rk3, x8 |
| 197 | + movi vzr.16b, #0 // init zero register |
| 198 | + endif_yield_neon |
| 199 | + |
| 200 | + b _fold_64_B_loop |
179 | 201 |
|
| 202 | +_fold_64_B_end: |
180 | 203 | // at this point, the buffer pointer is pointing at the last y Bytes
|
181 | 204 | // of the buffer; the 64B of folded data is in 4 of the vector
|
182 | 205 | // registers: v0, v1, v2, v3
|
@@ -304,6 +327,7 @@ _barrett:
|
304 | 327 | _cleanup:
|
305 | 328 | // scale the result back to 16 bits
|
306 | 329 | lsr x0, x0, #16
|
| 330 | + frame_pop |
307 | 331 | ret
|
308 | 332 |
|
309 | 333 | _less_than_128:
|
|
0 commit comments