 
#include "../assembly.h"
 
-#define L(l) .L ## l
-
//
// __arm_sc_memcpy / __arm_sc_memmove
//
@@ -52,26 +50,26 @@
   The loop tail is handled by always copying 64 bytes from the end.
*/
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
  add srcend1, src, count
  add dstend1, dstin, count
  cmp count, 128
-  b.hi L(copy_long)
+  b.hi 7f // copy_long
  cmp count, 32
-  b.hi L(copy32_128)
+  b.hi 4f // copy32_128
 
  /* Small copies: 0..32 bytes. */
  cmp count, 16
-  b.lo L(copy16)
+  b.lo 0f // copy16
  ldp A_l, A_h, [src]
  ldp D_l, D_h, [srcend1, -16]
  stp A_l, A_h, [dstin]
  stp D_l, D_h, [dstend1, -16]
  ret
 
  /* Copy 8-15 bytes. */
-L(copy16):
-  tbz count, 3, L(copy8)
+0: // copy16
+  tbz count, 3, 1f // copy8
  ldr A_l, [src]
  ldr A_h, [srcend1, -8]
  str A_l, [dstin]
@@ -80,36 +78,36 @@ L(copy16):
 
  .p2align 3
  /* Copy 4-7 bytes. */
-L(copy8):
-  tbz count, 2, L(copy4)
+1: // copy8
+  tbz count, 2, 2f // copy4
  ldr A_lw, [src]
  ldr B_lw, [srcend1, -4]
  str A_lw, [dstin]
  str B_lw, [dstend1, -4]
  ret
 
  /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
-  cbz count, L(copy0)
+2: // copy4
+  cbz count, 3f // copy0
  lsr tmp1, count, 1
  ldrb A_lw, [src]
  ldrb C_lw, [srcend1, -1]
  ldrb B_lw, [src, tmp1]
  strb A_lw, [dstin]
  strb B_lw, [dstin, tmp1]
  strb C_lw, [dstend1, -1]
-L(copy0):
+3: // copy0
  ret
 
  .p2align 4
  /* Medium copies: 33..128 bytes. */
-L(copy32_128):
+4: // copy32_128
  ldp A_l, A_h, [src]
  ldp B_l, B_h, [src, 16]
  ldp C_l, C_h, [srcend1, -32]
  ldp D_l, D_h, [srcend1, -16]
  cmp count, 64
-  b.hi L(copy128)
+  b.hi 5f // copy128
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstend1, -32]
@@ -118,16 +116,16 @@ L(copy32_128):
 
  .p2align 4
  /* Copy 65..128 bytes. */
-L(copy128):
+5: // copy128
  ldp E_l, E_h, [src, 32]
  ldp F_l, F_h, [src, 48]
  cmp count, 96
-  b.ls L(copy96)
+  b.ls 6f // copy96
  ldp G_l, G_h, [srcend1, -64]
  ldp H_l, H_h, [srcend1, -48]
  stp G_l, G_h, [dstend1, -64]
  stp H_l, H_h, [dstend1, -48]
-L(copy96):
+6: // copy96
  stp A_l, A_h, [dstin]
  stp B_l, B_h, [dstin, 16]
  stp E_l, E_h, [dstin, 32]
@@ -138,12 +136,12 @@ L(copy96):
 
  .p2align 4
  /* Copy more than 128 bytes. */
-L(copy_long):
+7: // copy_long
  /* Use backwards copy if there is an overlap. */
  sub tmp1, dstin, src
-  cbz tmp1, L(copy0)
+  cbz tmp1, 3b // copy0
  cmp tmp1, count
-  b.lo L(copy_long_backwards)
+  b.lo 10f // copy_long_backwards
 
  /* Copy 16 bytes and then align dst to 16-byte alignment. */
 
@@ -158,8 +156,8 @@ L(copy_long):
  ldp C_l, C_h, [src, 48]
  ldp D_l, D_h, [src, 64]!
  subs count, count, 128 + 16 /* Test and readjust count. */
-  b.ls L(copy64_from_end)
-L(loop64):
+  b.ls 9f // copy64_from_end
+8: // loop64
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [src, 16]
  stp B_l, B_h, [dst, 32]
@@ -169,10 +167,10 @@ L(loop64):
  stp D_l, D_h, [dst, 64]!
  ldp D_l, D_h, [src, 64]!
  subs count, count, 64
-  b.hi L(loop64)
+  b.hi 8b // loop64
 
  /* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
+9: // copy64_from_end
  ldp E_l, E_h, [srcend1, -64]
  stp A_l, A_h, [dst, 16]
  ldp A_l, A_h, [srcend1, -48]
@@ -191,7 +189,7 @@ L(copy64_from_end):
 
  /* Large backwards copy for overlapping copies.
     Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
+10: // copy_long_backwards
  ldp D_l, D_h, [srcend1, -16]
  and tmp1, dstend1, 15
  sub srcend1, srcend1, tmp1
@@ -203,9 +201,9 @@ L(copy_long_backwards):
  ldp D_l, D_h, [srcend1, -64]!
  sub dstend1, dstend1, tmp1
  subs count, count, 128
-  b.ls L(copy64_from_start)
+  b.ls 12f // copy64_from_start
 
-L(loop64_backwards):
+11: // loop64_backwards
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [srcend1, -16]
  stp B_l, B_h, [dstend1, -32]
@@ -215,10 +213,10 @@ L(loop64_backwards):
  stp D_l, D_h, [dstend1, -64]!
  ldp D_l, D_h, [srcend1, -64]!
  subs count, count, 64
-  b.hi L(loop64_backwards)
+  b.hi 11b // loop64_backwards
 
  /* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
+12: // copy64_from_start
  ldp G_l, G_h, [src, 48]
  stp A_l, A_h, [dstend1, -16]
  ldp A_l, A_h, [src, 32]
@@ -232,7 +230,7 @@ L(copy64_from_start):
  stp B_l, B_h, [dstin, 16]
  stp C_l, C_h, [dstin]
  ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
 
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 
@@ -250,7 +248,7 @@ DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
#define dstend2 x4
#define zva_val x5
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
@@ -263,9 +261,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
  add dstend2, dstin, count
 
  cmp count, 96
-  b.hi L(set_long)
+  b.hi 7f // set_long
  cmp count, 16
-  b.hs L(set_medium)
+  b.hs 4f // set_medium
  mov val, v0.D[0]
 
  /* Set 0..15 bytes. */
@@ -285,38 +283,38 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
3: ret
 
  /* Set 17..96 bytes. */
-L(set_medium):
+4: // set_medium
  str q0, [dstin]
-  tbnz count, 6, L(set96)
+  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
-  tbz count, 5, 1f
+  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
-1: ret
+5: ret
 
  .p2align 4
  /* Set 64..96 bytes.  Write 64 bytes from the start and
     32 bytes from the end. */
-L(set96):
+6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret
 
  .p2align 4
-L(set_long):
+7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
  cmp count, 160
  ccmp valw, 0, 0, hs
-  b.ne L(no_zva)
+  b.ne 9f // no_zva
 
#ifndef SKIP_ZVA_CHECK
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
-  b.ne L(no_zva)
+  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
@@ -325,27 +323,27 @@ L(set_long):
  sub count, count, 128 /* Adjust count and bias for loop. */
 
  .p2align 4
-L(zva_loop):
+8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
-  b.hi L(zva_loop)
+  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
 
-L(no_zva):
+9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
+10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
-  b.hi L(no_zva_loop)
+  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+END_COMPILERRT_FUNCTION(__arm_sc_memset)
 
#endif // __aarch64__
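For reference, the replacement labels use the GNU assembler's numeric local labels: a label such as "0:" may be defined any number of times, a branch written "0f" resolves to the nearest following definition, and "0b" to the nearest preceding one, which is why the patch keeps the old L() names as trailing comments. A minimal standalone sketch of that convention, using a hypothetical symbol name local_label_demo that is not part of this patch:

    .text
    .globl local_label_demo   // hypothetical demo symbol, not in the patch
local_label_demo:
    mov x0, 4                 // loop counter
0:                            // numeric local label (cf. "0: // copy16")
    subs x0, x0, 1            // decrement and set flags
    b.hi 0b                   // "0b" = nearest preceding "0:" (loop back)
    cbz x0, 1f                // "1f" = nearest following "1:" (skip the nop)
    nop
1:
    ret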