@@ -100,9 +100,10 @@
 	dCONSTANT	.req	d0
 	qCONSTANT	.req	q0
 
-	BUF		.req	x0
-	LEN		.req	x1
-	CRC		.req	x2
+	BUF		.req	x19
+	LEN		.req	x20
+	CRC		.req	x21
+	CONST		.req	x22
 
 	vzr		.req	v9
 
@@ -123,7 +124,14 @@ ENTRY(crc32_pmull_le)
 ENTRY(crc32c_pmull_le)
 	adr_l		x3, .Lcrc32c_constants
 
-0:	bic		LEN, LEN, #15
+0:	frame_push	4, 64
+
+	mov		BUF, x0
+	mov		LEN, x1
+	mov		CRC, x2
+	mov		CONST, x3
+
+	bic		LEN, LEN, #15
 	ld1		{v1.16b-v4.16b}, [BUF], #0x40
 	movi		vzr.16b, #0
 	fmov		dCONSTANT, CRC
@@ -132,7 +140,7 @@ ENTRY(crc32c_pmull_le)
 	cmp		LEN, #0x40
 	b.lt		less_64
 
-	ldr		qCONSTANT, [x3]
+	ldr		qCONSTANT, [CONST]
 
 loop_64:		/* 64 bytes Full cache line folding */
 	sub		LEN, LEN, #0x40
@@ -162,10 +170,21 @@ loop_64:		/* 64 bytes Full cache line folding */
 	eor		v4.16b, v4.16b, v8.16b
 
 	cmp		LEN, #0x40
-	b.ge		loop_64
+	b.lt		less_64
+
+	if_will_cond_yield_neon
+	stp		q1, q2, [sp, #.Lframe_local_offset]
+	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	do_cond_yield_neon
+	ldp		q1, q2, [sp, #.Lframe_local_offset]
+	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	ldr		qCONSTANT, [CONST]
+	movi		vzr.16b, #0
+	endif_yield_neon
+	b		loop_64
 
 less_64:		/* Folding cache line into 128bit */
-	ldr		qCONSTANT, [x3, #16]
+	ldr		qCONSTANT, [CONST, #16]
 
 	pmull2		v5.1q, v1.2d, vCONSTANT.2d
 	pmull		v1.1q, v1.1d, vCONSTANT.1d
@@ -204,16 +223,16 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* final 32-bit fold */
-	ldr		dCONSTANT, [x3, #32]
-	ldr		d3, [x3, #40]
+	ldr		dCONSTANT, [CONST, #32]
+	ldr		d3, [CONST, #40]
 
 	ext		v2.16b, v1.16b, vzr.16b, #4
 	and		v1.16b, v1.16b, v3.16b
 	pmull		v1.1q, v1.1d, vCONSTANT.1d
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-	ldr		qCONSTANT, [x3, #48]
+	ldr		qCONSTANT, [CONST, #48]
 
 	and		v2.16b, v1.16b, v3.16b
 	ext		v2.16b, vzr.16b, v2.16b, #8
@@ -223,6 +242,7 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 	mov		w0, v1.s[1]
 
+	frame_pop
 	ret
 ENDPROC(crc32_pmull_le)
 ENDPROC(crc32c_pmull_le)
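
For reference: frame_push/frame_pop and the if_will_cond_yield_neon / do_cond_yield_neon / endif_yield_neon triplet used above are the arm64 NEON-yield helpers from arch/arm64/include/asm/assembler.h. The sketch below shows the general shape of the pattern this patch applies, under the assumption that the macros behave as they are used in the hunks above; neon_loop_sketch and its block-processing step are hypothetical placeholders, not part of the patch. It also illustrates why BUF/LEN/CRC/CONST move from x0-x3 into x19-x22: only callee-saved registers are guaranteed to survive the calls made by a yield.

/*
 * Minimal sketch of the cond-yield pattern, assuming the macro
 * semantics used in the diff above. neon_loop_sketch and the
 * block-processing step are hypothetical, not part of this patch.
 */
ENTRY(neon_loop_sketch)
	frame_push	2, 32		// save x19-x20 plus 32 bytes of
					// local stack for NEON spills

	mov		x19, x0		// copy args into callee-saved
	mov		x20, x1		// registers: x0-x17 may be
					// clobbered across a yield

0:	/* ... process one block, accumulating state in q0-q1 ... */
	subs		x20, x20, #1
	b.eq		1f

	if_will_cond_yield_neon		// body runs only when a
					// reschedule is pending
	stp		q0, q1, [sp, #.Lframe_local_offset]	// spill live NEON state
	do_cond_yield_neon		// release and re-acquire the NEON
					// unit, allowing preemption
	ldp		q0, q1, [sp, #.Lframe_local_offset]	// restore NEON state
	endif_yield_neon
	b		0b

1:	frame_pop			// restore x19-x20 and the frame
	ret
ENDPROC(neon_loop_sketch)

The stp/ldp pairs mirror the q1-q4 spills in the patch: after do_cond_yield_neon the NEON register file cannot be assumed to hold anything, which is also why the patch reloads qCONSTANT and re-zeroes vzr before branching back to loop_64.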