
Commit 75c3ff8

[compiler-rt][AArch64] Provide basic implementations of SME memcpy/memmove in case of strictly aligned memory access (llvm#138250)
The existing implementations, written in assembly, use unaligned accesses for performance. They are not compatible with strictly aligned configurations, i.e. builds with `-mno-unaligned-access`: if the functions are used in such a configuration, an exception is raised on the unaligned memory accesses. This patch reintroduces vanilla (plain C) implementations of these functions for use in strictly aligned configurations. The code is largely based on llvm#77496.
1 parent 3108cbd commit 75c3ff8
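
For context on why dedicated routines exist at all: code compiled as streaming or streaming-compatible cannot, in general, call the regular libc memory functions, so the compiler emits calls to the `__arm_sc_*` support routines that compiler-rt provides. Below is a minimal caller sketch, assuming a recent Clang targeting AArch64 with SME attribute support; the `copy_block` wrapper is hypothetical and only illustrates how such a call site looks.

#include <stddef.h>

// Declaration matching the routine this commit provides a C fallback for.
void *__arm_sc_memcpy(void *__restrict dest, const void *__restrict src,
                      size_t n) __arm_streaming_compatible;

// Hypothetical streaming-compatible helper: inside such code the ordinary
// libc memcpy may not be safe to call, so copies go through the
// streaming-compatible support routine instead.
void copy_block(void *dst, const void *src, size_t n)
    __arm_streaming_compatible {
  __arm_sc_memcpy(dst, src, n);
}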

File tree

7 files changed: +395 -141 lines changed

compiler-rt/cmake/builtin-config-ix.cmake

Lines changed: 18 additions & 0 deletions
@@ -50,6 +50,24 @@ void foo(void) __arm_streaming_compatible {
 }
 ")
 
+builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_UNALIGNED
+"
+void foo() {
+#ifndef __ARM_FEATURE_UNALIGNED
+#error \"Unaligned accesses unsupported\"
+#endif
+}
+")
+
+builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_FP
+"
+void foo() {
+#ifndef __ARM_FP
+#error \"No floating-point support\"
+#endif
+}
+")
+
 check_include_files("sys/auxv.h" COMPILER_RT_HAS_AUXV)
 
 if(ANDROID)
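
These probes key off standard ACLE feature-test macros. As a stand-alone illustration (not part of the commit; the messages are purely illustrative), the same checks can be observed directly in C by compiling with and without `-mno-unaligned-access`:

#include <stdio.h>

int main(void) {
#ifdef __ARM_FEATURE_UNALIGNED
  puts("unaligned accesses permitted: assembly-optimized SME routines are usable");
#else
  puts("strict alignment (e.g. -mno-unaligned-access): fall back to C memcpy/memmove");
#endif
#ifdef __ARM_FP
  puts("hardware floating point available: optimized memset/memchr are usable");
#else
  puts("no hardware floating point: fall back to C memset/memchr");
#endif
  return 0;
}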

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 10 additions & 2 deletions
@@ -600,9 +600,17 @@ if (COMPILER_RT_HAS_AARCH64_SME)
   set_source_files_properties(aarch64/arm_apple_sme_abi.s PROPERTIES COMPILE_FLAGS -march=armv8a+sme)
   message(STATUS "AArch64 Apple SME ABI routines enabled")
 elseif (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
-  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
+  if(COMPILER_RT_HAS_ARM_UNALIGNED AND COMPILER_RT_HAS_ARM_FP)
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-opt-memset-memchr.S aarch64/sme-libc-opt-memcpy-memmove.S)
+  elseif(COMPILER_RT_HAS_ARM_UNALIGNED)
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-opt-memcpy-memmove.S)
+    message(WARNING "AArch64 SME ABI assembly-optimized memset/memchr disabled: target does not have hardware floating-point support.")
+  else()
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c)
+    message(WARNING "AArch64 SME ABI assembly-optimized routines disabled: target does not support unaligned accesses.")
+  endif()
   message(STATUS "AArch64 SME ABI routines enabled")
-  set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
+  set_source_files_properties(aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
 else()
   if(COMPILER_RT_DISABLE_AARCH64_FMV)
     message(WARNING "AArch64 SME ABI routines require function multiversioning support.")
compiler-rt/lib/builtins/aarch64/sme-libc-memcpy-memmove.c

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains basic implementations of Scalable Matrix Extension (SME)
+/// compatible memcpy and memmove functions to be used when their assembly-
+/// optimized counterparts can't.
+///
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = srcp[i];
+  return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  while (n > 0) {
+    --n;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+extern void *__arm_sc_memcpy(void *__restrict dest, const void *__restrict src,
+                             size_t n) __arm_streaming_compatible {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+extern void *__arm_sc_memmove(void *dest, const void *src,
+                              size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy(dest, src, n);
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
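
An aside on the memmove logic above (not part of the commit): the overlap test picks the forward copy when the destination lies below the source and the reverse copy otherwise, because a forward byte copy would clobber source bytes that have not yet been read. A small stand-alone demonstration using standard `memmove`, which follows the same direction rule:

#include <assert.h>
#include <string.h>

int main(void) {
  // dest = buf + 1 overlaps [src, src + 5): a naive forward byte copy would
  // overwrite buf[1] before reading it and smear 'a' across the buffer.
  // Copying backwards (as __arm_sc_memcpy_rev does) preserves the data.
  char buf[] = "abcdef";
  memmove(buf + 1, buf, 5);
  assert(strcmp(buf, "aabcde") == 0);
  return 0;
}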
compiler-rt/lib/builtins/aarch64/sme-libc-memset-memchr.c

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains basic implementations of Scalable Matrix Extension (SME)
+/// compatible memset and memchr functions to be used when their assembly-
+/// optimized counterparts can't.
+///
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+extern void *__arm_sc_memset(void *dest, int c,
+                             size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = c8;
+
+  return dest;
+}
+
+extern const void *__arm_sc_memchr(const void *src, int c,
+                                   size_t n) __arm_streaming_compatible {
+  const unsigned char *srcp = (const unsigned char *)src;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    if (srcp[i] == c8)
+      return &srcp[i];
+
+  return NULL;
+}

compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S renamed to compiler-rt/lib/builtins/aarch64/sme-libc-opt-memcpy-memmove.S

Lines changed: 14 additions & 115 deletions
@@ -1,8 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// Routines taken from libc/AOR_v20.02/string/aarch64
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains assembly-optimized implementations of Scalable Matrix
+/// Extension (SME) compatible memcpy and memmove functions.
+///
+/// These implementations depend on unaligned access support.
+///
+/// Routines taken from libc/AOR_v20.02/string/aarch64.
+///
+//===----------------------------------------------------------------------===//
 
 #include "../assembly.h"
 

@@ -234,116 +246,3 @@ END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
 
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 
-// This version uses FP registers. Use this only on targets with them
-#if (defined(__aarch64__) && __ARM_FP != 0) || defined(__arm64ec__)
-//
-// __arm_sc_memset
-//
-
-#define dstin    x0
-#define val      x1
-#define valw     w1
-#define count    x2
-#define dst      x3
-#define dstend2  x4
-#define zva_val  x5
-
-DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
-#ifdef __ARM_FEATURE_SVE
-  mov z0.b, valw
-#else
-  bfi valw, valw, #8, #8
-  bfi valw, valw, #16, #16
-  bfi val, val, #32, #32
-  fmov d0, val
-  fmov v0.d[1], val
-#endif
-  add dstend2, dstin, count
-
-  cmp count, 96
-  b.hi 7f // set_long
-  cmp count, 16
-  b.hs 4f // set_medium
-  mov val, v0.D[0]
-
-  /* Set 0..15 bytes. */
-  tbz count, 3, 1f
-  str val, [dstin]
-  str val, [dstend2, -8]
-  ret
-  nop
-1: tbz count, 2, 2f
-  str valw, [dstin]
-  str valw, [dstend2, -4]
-  ret
-2: cbz count, 3f
-  strb valw, [dstin]
-  tbz count, 1, 3f
-  strh valw, [dstend2, -2]
-3: ret
-
-  /* Set 17..96 bytes. */
-4: // set_medium
-  str q0, [dstin]
-  tbnz count, 6, 6f // set96
-  str q0, [dstend2, -16]
-  tbz count, 5, 5f
-  str q0, [dstin, 16]
-  str q0, [dstend2, -32]
-5: ret
-
-  .p2align 4
-  /* Set 64..96 bytes. Write 64 bytes from the start and
-     32 bytes from the end. */
-6: // set96
-  str q0, [dstin, 16]
-  stp q0, q0, [dstin, 32]
-  stp q0, q0, [dstend2, -32]
-  ret
-
-  .p2align 4
-7: // set_long
-  and valw, valw, 255
-  bic dst, dstin, 15
-  str q0, [dstin]
-  cmp count, 160
-  ccmp valw, 0, 0, hs
-  b.ne 9f // no_zva
-
-#ifndef SKIP_ZVA_CHECK
-  mrs zva_val, dczid_el0
-  and zva_val, zva_val, 31
-  cmp zva_val, 4 /* ZVA size is 64 bytes. */
-  b.ne 9f // no_zva
-#endif
-  str q0, [dst, 16]
-  stp q0, q0, [dst, 32]
-  bic dst, dst, 63
-  sub count, dstend2, dst /* Count is now 64 too large. */
-  sub count, count, 128 /* Adjust count and bias for loop. */
-
-  .p2align 4
-8: // zva_loop
-  add dst, dst, 64
-  dc zva, dst
-  subs count, count, 64
-  b.hi 8b // zva_loop
-  stp q0, q0, [dstend2, -64]
-  stp q0, q0, [dstend2, -32]
-  ret
-
-9: // no_zva
-  sub count, dstend2, dst /* Count is 16 too large. */
-  sub dst, dst, 16 /* Dst is biased by -32. */
-  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-10: // no_zva_loop
-  stp q0, q0, [dst, 32]
-  stp q0, q0, [dst, 64]!
-  subs count, count, 64
-  b.hi 10b // no_zva_loop
-  stp q0, q0, [dstend2, -64]
-  stp q0, q0, [dstend2, -32]
-  ret
-END_COMPILERRT_FUNCTION(__arm_sc_memset)
-
-#endif // __aarch64__