Commit ef2af7f

[AArch64][SME] Make use of Arm Optimised Routines in compiler-rt (#99326)
A number of streaming-compatible versions of standard C functions were added to compiler-rt; however, there are already optimised versions of most of these in libc that are valid in streaming-SVE mode. This patch replaces the implementations of __arm_sc_mem* with these versions where possible.
1 parent 45c0dec commit ef2af7f
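
For context, a minimal sketch (not part of this commit) of where these routines come into play: in code built for SME, calls to the mem* functions made from streaming or streaming-compatible functions are lowered by the compiler to the __arm_sc_* support routines provided by compiler-rt, because the regular libc implementations are not guaranteed to be safe in streaming-SVE mode. Assuming a recent clang with SME enabled (e.g. -march=armv9-a+sme):

#include <string.h>

/* The compiler cannot assume plain memcpy is streaming-safe here, so it emits
   a call to the SME ABI support routine __arm_sc_memcpy, which this patch now
   backs with the optimised AOR implementation. */
void copy_blob(void *dst, const void *src, size_t n) __arm_streaming_compatible {
  memcpy(dst, src, n);
}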

File tree: 3 files changed (+345 / -76 lines)

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -571,7 +571,7 @@ set(aarch64_SOURCES

if (COMPILER_RT_HAS_AARCH64_SME)
  if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
+    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
    message(STATUS "AArch64 SME ABI routines enabled")
    set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
  else()
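
(As context for the set_source_files_properties line kept above: sme-libc-routines.c remains built with -fno-builtin, which stops the compiler from recognising its plain C loops as mem* idioms and turning them back into libc calls. A hypothetical sketch, with a function name that is illustrative and not taken from the source:)

void *naive_memset(void *dest, int c, unsigned long n) {
  unsigned char *d = (unsigned char *)dest;
  for (unsigned long i = 0; i < n; i++)
    d[i] = (unsigned char)c; /* without -fno-builtin the compiler may rewrite
                                this loop as a call to memset itself */
  return dest;
}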
compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S

Lines changed: 344 additions & 0 deletions
@@ -0,0 +1,344 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend1 x4
#define dstend1 x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend1
#define tmp1 x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add srcend1, src, count
        add dstend1, dstin, count
        cmp count, 128
        b.hi L(copy_long)
        cmp count, 32
        b.hi L(copy32_128)

        /* Small copies: 0..32 bytes. */
        cmp count, 16
        b.lo L(copy16)
        ldp A_l, A_h, [src]
        ldp D_l, D_h, [srcend1, -16]
        stp A_l, A_h, [dstin]
        stp D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes. */
L(copy16):
        tbz count, 3, L(copy8)
        ldr A_l, [src]
        ldr A_h, [srcend1, -8]
        str A_l, [dstin]
        str A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
L(copy8):
        tbz count, 2, L(copy4)
        ldr A_lw, [src]
        ldr B_lw, [srcend1, -4]
        str A_lw, [dstin]
        str B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz count, L(copy0)
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb C_lw, [srcend1, -1]
        ldrb B_lw, [src, tmp1]
        strb A_lw, [dstin]
        strb B_lw, [dstin, tmp1]
        strb C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp A_l, A_h, [src]
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [srcend1, -32]
        ldp D_l, D_h, [srcend1, -16]
        cmp count, 64
        b.hi L(copy128)
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstend1, -32]
        stp D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
L(copy128):
        ldp E_l, E_h, [src, 32]
        ldp F_l, F_h, [src, 48]
        cmp count, 96
        b.ls L(copy96)
        ldp G_l, G_h, [srcend1, -64]
        ldp H_l, H_h, [srcend1, -48]
        stp G_l, G_h, [dstend1, -64]
        stp H_l, H_h, [dstend1, -48]
L(copy96):
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp E_l, E_h, [dstin, 32]
        stp F_l, F_h, [dstin, 48]
        stp C_l, C_h, [dstend1, -32]
        stp D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
        sub tmp1, dstin, src
        cbz tmp1, L(copy0)
        cmp tmp1, count
        b.lo L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp D_l, D_h, [src]
        and tmp1, dstin, 15
        bic dst, dstin, 15
        sub src, src, tmp1
        add count, count, tmp1 /* Count is now 16 too large. */
        ldp A_l, A_h, [src, 16]
        stp D_l, D_h, [dstin]
        ldp B_l, B_h, [src, 32]
        ldp C_l, C_h, [src, 48]
        ldp D_l, D_h, [src, 64]!
        subs count, count, 128 + 16 /* Test and readjust count. */
        b.ls L(copy64_from_end)
L(loop64):
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [src, 16]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [src, 32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [src, 48]
        stp D_l, D_h, [dst, 64]!
        ldp D_l, D_h, [src, 64]!
        subs count, count, 64
        b.hi L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp E_l, E_h, [srcend1, -64]
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [srcend1, -48]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [srcend1, -32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [srcend1, -16]
        stp D_l, D_h, [dst, 64]
        stp E_l, E_h, [dstend1, -64]
        stp A_l, A_h, [dstend1, -48]
        stp B_l, B_h, [dstend1, -32]
        stp C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
        ldp D_l, D_h, [srcend1, -16]
        and tmp1, dstend1, 15
        sub srcend1, srcend1, tmp1
        sub count, count, tmp1
        ldp A_l, A_h, [srcend1, -16]
        stp D_l, D_h, [dstend1, -16]
        ldp B_l, B_h, [srcend1, -32]
        ldp C_l, C_h, [srcend1, -48]
        ldp D_l, D_h, [srcend1, -64]!
        sub dstend1, dstend1, tmp1
        subs count, count, 128
        b.ls L(copy64_from_start)

L(loop64_backwards):
        stp A_l, A_h, [dstend1, -16]
        ldp A_l, A_h, [srcend1, -16]
        stp B_l, B_h, [dstend1, -32]
        ldp B_l, B_h, [srcend1, -32]
        stp C_l, C_h, [dstend1, -48]
        ldp C_l, C_h, [srcend1, -48]
        stp D_l, D_h, [dstend1, -64]!
        ldp D_l, D_h, [srcend1, -64]!
        subs count, count, 64
        b.hi L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp G_l, G_h, [src, 48]
        stp A_l, A_h, [dstend1, -16]
        ldp A_l, A_h, [src, 32]
        stp B_l, B_h, [dstend1, -32]
        ldp B_l, B_h, [src, 16]
        stp C_l, C_h, [dstend1, -48]
        ldp C_l, C_h, [src]
        stp D_l, D_h, [dstend1, -64]
        stp G_l, G_h, [dstin, 48]
        stp A_l, A_h, [dstin, 32]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)


//
// __arm_sc_memset
//

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
        dup v0.16B, valw
        add dstend2, dstin, count

        cmp count, 96
        b.hi L(set_long)
        cmp count, 16
        b.hs L(set_medium)
        mov val, v0.D[0]

        /* Set 0..15 bytes. */
        tbz count, 3, 1f
        str val, [dstin]
        str val, [dstend2, -8]
        ret
        nop
1:      tbz count, 2, 2f
        str valw, [dstin]
        str valw, [dstend2, -4]
        ret
2:      cbz count, 3f
        strb valw, [dstin]
        tbz count, 1, 3f
        strh valw, [dstend2, -2]
3:      ret

        /* Set 17..96 bytes. */
L(set_medium):
        str q0, [dstin]
        tbnz count, 6, L(set96)
        str q0, [dstend2, -16]
        tbz count, 5, 1f
        str q0, [dstin, 16]
        str q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
L(set96):
        str q0, [dstin, 16]
        stp q0, q0, [dstin, 32]
        stp q0, q0, [dstend2, -32]
        ret

        .p2align 4
L(set_long):
        and valw, valw, 255
        bic dst, dstin, 15
        str q0, [dstin]
        cmp count, 160
        ccmp valw, 0, 0, hs
        b.ne L(no_zva)

#ifndef SKIP_ZVA_CHECK
        mrs zva_val, dczid_el0
        and zva_val, zva_val, 31
        cmp zva_val, 4 /* ZVA size is 64 bytes. */
        b.ne L(no_zva)
#endif
        str q0, [dst, 16]
        stp q0, q0, [dst, 32]
        bic dst, dst, 63
        sub count, dstend2, dst /* Count is now 64 too large. */
        sub count, count, 128 /* Adjust count and bias for loop. */

        .p2align 4
L(zva_loop):
        add dst, dst, 64
        dc zva, dst
        subs count, count, 64
        b.hi L(zva_loop)
        stp q0, q0, [dstend2, -64]
        stp q0, q0, [dstend2, -32]
        ret

L(no_zva):
        sub count, dstend2, dst /* Count is 16 too large. */
        sub dst, dst, 16 /* Dst is biased by -32. */
        sub count, count, 64 + 16 /* Adjust count and bias for loop. */
L(no_zva_loop):
        stp q0, q0, [dst, 32]
        stp q0, q0, [dst, 64]!
        subs count, count, 64
        b.hi L(no_zva_loop)
        stp q0, q0, [dstend2, -64]
        stp q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)

#endif // __aarch64__
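
The block comment at the top of __arm_sc_memcpy above describes the copy strategy. The following C rendering is an illustrative sketch of that dispatch for non-overlapping buffers (names and structure are mine, not the source's), included only to make the size ranges easier to follow:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Small and medium sizes copy a fixed-size chunk from the start and another
   from the end (the two may overlap in the middle), so one path covers a
   whole size range with no length loop.  Large copies align the destination,
   run a 64-byte-per-iteration loop, and finish by copying the last 64 bytes
   from the end rather than computing an exact remainder.  The real routine
   also picks a backwards variant when the destination overlaps the source;
   that is omitted here. */
static void sketch_memcpy(unsigned char *dst, const unsigned char *src,
                          size_t n) {
  if (n <= 32) {
    if (n >= 16) {
      memcpy(dst, src, 16);
      memcpy(dst + n - 16, src + n - 16, 16);
    } else {
      for (size_t i = 0; i < n; i++) /* stands in for the 0..15-byte cases */
        dst[i] = src[i];
    }
    return;
  }
  if (n <= 128) { /* copy 32 (or 64) bytes from each end */
    size_t half = n <= 64 ? 32 : 64;
    memcpy(dst, src, half);
    memcpy(dst + n - half, src + n - half, half);
    return;
  }
  size_t adv = 16 - ((uintptr_t)dst & 15);
  memcpy(dst, src, 16); /* copy a head chunk, then advance to 16-byte alignment */
  dst += adv; src += adv; n -= adv;
  while (n > 64) { /* stands in for the software-pipelined 64-byte loop */
    memcpy(dst, src, 64);
    dst += 64; src += 64; n -= 64;
  }
  memcpy(dst + n - 64, src + n - 64, 64); /* tail: last 64 bytes from the end */
}

Similarly, __arm_sc_memset only takes the DC ZVA path for large zero-fills. A hedged sketch of that gate (again illustrative, not from the source): DC ZVA zeroes a whole block at a time, so it is used only when the fill value is zero, the region is at least 160 bytes, and DCZID_EL0 reports 64-byte blocks with zeroing permitted, which is what the dczid_el0 check guarded by SKIP_ZVA_CHECK tests.

#include <stdint.h>

static int sketch_can_use_dc_zva(uint64_t count, uint8_t value) {
  uint64_t dczid;
  __asm__("mrs %0, dczid_el0" : "=r"(dczid)); /* AArch64 only */
  /* Low 4 bits: log2 of the block size in 4-byte words; bit 4: ZVA prohibited.
     (dczid & 31) == 4 therefore means 64-byte blocks with zeroing allowed. */
  return value == 0 && count >= 160 && (dczid & 31) == 4;
}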
