
[compiler-rt][AArch64] Provide basic implementations of SME memcpy/memmove in case of strictly aligned memory access #138250

Merged (2 commits, Jun 3, 2025)
18 changes: 18 additions & 0 deletions compiler-rt/cmake/builtin-config-ix.cmake
@@ -50,6 +50,24 @@ void foo(void) __arm_streaming_compatible {
}
")

builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_UNALIGNED
"
void foo() {
#ifndef __ARM_FEATURE_UNALIGNED
#error \"Unaligned accesses unsupported\"
#endif
}
")

builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_FP
"
void foo() {
#ifndef __ARM_FP
#error \"No floating-point support\"
#endif
}
")

check_include_files("sys/auxv.h" COMPILER_RT_HAS_AUXV)

if(ANDROID)
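
For reference, each check above compiles a tiny translation unit and treats a preprocessor #error as "feature absent". A standalone probe in the same spirit (hypothetical, not part of the patch) is sketched below; on clang targeting AArch64, a strict-alignment option such as -mstrict-align typically leaves __ARM_FEATURE_UNALIGNED undefined, and -mgeneral-regs-only typically leaves __ARM_FP undefined.

/* probe.c (illustrative only): manual version of the two CMake checks.
   Expected behaviour, assuming clang targeting AArch64:
     clang --target=aarch64-linux-gnu -c probe.c                      -> compiles
     clang --target=aarch64-linux-gnu -mstrict-align -c probe.c       -> first #error fires
     clang --target=aarch64-linux-gnu -mgeneral-regs-only -c probe.c  -> second #error fires */
#ifndef __ARM_FEATURE_UNALIGNED
#error "Unaligned accesses unsupported"
#endif
#ifndef __ARM_FP
#error "No floating-point support"
#endif
int main(void) { return 0; }
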
12 changes: 10 additions & 2 deletions compiler-rt/lib/builtins/CMakeLists.txt
@@ -600,9 +600,17 @@ if (COMPILER_RT_HAS_AARCH64_SME)
set_source_files_properties(aarch64/arm_apple_sme_abi.s PROPERTIES COMPILE_FLAGS -march=armv8a+sme)
message(STATUS "AArch64 Apple SME ABI routines enabled")
elseif (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
Member Author commented: The change inadvertently removed files from the build. I will fix it shortly.

if(COMPILER_RT_HAS_ARM_UNALIGNED AND COMPILER_RT_HAS_ARM_FP)
list(APPEND aarch64_SOURCES aarch64/sme-libc-opt-memset-memchr.S aarch64/sme-libc-opt-memcpy-memmove.S)
elseif(COMPILER_RT_HAS_ARM_UNALIGNED)
list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-opt-memcpy-memmove.S)
message(WARNING "AArch64 SME ABI assembly-optimized memset/memchr disabled: target does not have hardware floating-point support.")
else()
list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c)
message(WARNING "AArch64 SME ABI assembly-optimized routines disabled: target does not support unaligned accesses.")
endif()
message(STATUS "AArch64 SME ABI routines enabled")
set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
set_source_files_properties(aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
else()
if(COMPILER_RT_DISABLE_AARCH64_FMV)
message(WARNING "AArch64 SME ABI routines require function multiversioning support.")
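
For orientation (not part of the patch): the __arm_sc_* routines selected above exist because streaming and streaming-compatible functions cannot safely call the ordinary, SIMD-based libc string routines. As I understand the SME ABI, the compiler lowers a plain memcpy call in such a function to __arm_sc_memcpy, so one of the implementations chosen by the CMake logic above must always be available. A minimal hypothetical caller:

/* Illustrative only: inside a streaming-compatible function the memcpy
   call below is expected to be routed to __arm_sc_memcpy rather than
   to the regular libc entry point (requires an SME-aware toolchain). */
#include <string.h>

void copy_block(void *dst, const void *src, size_t n)
    __arm_streaming_compatible {
  memcpy(dst, src, n);
}
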
55 changes: 55 additions & 0 deletions compiler-rt/lib/builtins/aarch64/sme-libc-memcpy-memmove.c
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains basic implementations of Scalable Matrix Extension (SME)
/// compatible memcpy and memmove functions to be used when their assembly-
/// optimized counterparts can't be.
///
//===----------------------------------------------------------------------===//

#include <stddef.h>

static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

for (size_t i = 0; i < n; ++i)
destp[i] = srcp[i];
return dest;
}

static void *__arm_sc_memcpy_rev(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

while (n > 0) {
--n;
destp[n] = srcp[n];
}
return dest;
}

extern void *__arm_sc_memcpy(void *__restrict dest, const void *__restrict src,
size_t n) __arm_streaming_compatible {
return __arm_sc_memcpy_fwd(dest, src, n);
}

extern void *__arm_sc_memmove(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

if ((srcp > (destp + n)) || (destp > (srcp + n)))
return __arm_sc_memcpy(dest, src, n);
if (srcp > destp)
return __arm_sc_memcpy_fwd(dest, src, n);
return __arm_sc_memcpy_rev(dest, src, n);
}
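
As a quick sanity check of the overlap handling in __arm_sc_memmove above, a small hypothetical test (prototype simplified; the real declaration also carries __arm_streaming_compatible): when the destination overlaps and sits above the source, srcp > destp is false, so the reverse copy runs, which is exactly what a right shift of the buffer requires.

#include <string.h>

extern void *__arm_sc_memmove(void *dest, const void *src, size_t n);

int overlap_test(void) { /* hypothetical test, illustrative only */
  char buf[8] = {'a', 'b', 'c', 'd', 'e'};
  /* Shift "abcde" right by two bytes; dest overlaps the tail of src,
     so the copy must run back-to-front to avoid clobbering input. */
  __arm_sc_memmove(buf + 2, buf, 5);
  return memcmp(buf, "ababcde", 8) == 0; /* expect 1 */
}
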
37 changes: 37 additions & 0 deletions compiler-rt/lib/builtins/aarch64/sme-libc-memset-memchr.c
@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains basic implementations of Scalable Matrix Extension (SME)
/// compatible memset and memchr functions to be used when their assembly-
/// optimized counterparts can't be.
///
//===----------------------------------------------------------------------===//

#include <stddef.h>

extern void *__arm_sc_memset(void *dest, int c,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
unsigned char c8 = (unsigned char)c;
for (size_t i = 0; i < n; ++i)
destp[i] = c8;

return dest;
}

extern const void *__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible {
const unsigned char *srcp = (const unsigned char *)src;
unsigned char c8 = (unsigned char)c;
for (size_t i = 0; i < n; ++i)
if (srcp[i] == c8)
return &srcp[i];

return NULL;
}
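
Similarly, a small hypothetical check for the C memset/memchr pair above (prototypes simplified in the same way); note that this __arm_sc_memchr returns const void *, unlike libc's memchr:

#include <stddef.h>

extern void *__arm_sc_memset(void *dest, int c, size_t n);
extern const void *__arm_sc_memchr(const void *src, int c, size_t n);

int memset_memchr_test(void) { /* hypothetical test, illustrative only */
  unsigned char buf[16];
  __arm_sc_memset(buf, 0xAB, sizeof buf); /* fill with 0xAB */
  buf[9] = 0x42;
  /* The first 0x42 is at offset 9; 0x7F is absent, so expect NULL. */
  return __arm_sc_memchr(buf, 0x42, sizeof buf) == &buf[9] &&
         __arm_sc_memchr(buf, 0x7F, sizeof buf) == NULL;
}
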
@@ -1,8 +1,20 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memcpy and memmove functions.
///
/// These implementations depend on unaligned access support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

@@ -234,116 +246,3 @@ END_COMPILERRT_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)

// This version uses FP registers. Use this only on targets with them
#if (defined(__aarch64__) && __ARM_FP != 0) || defined(__arm64ec__)
//
// __arm_sc_memset
//

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
mov z0.b, valw
#else
bfi valw, valw, #8, #8
bfi valw, valw, #16, #16
bfi val, val, #32, #32
fmov d0, val
fmov v0.d[1], val
#endif
add dstend2, dstin, count

cmp count, 96
b.hi 7f // set_long
cmp count, 16
b.hs 4f // set_medium
mov val, v0.D[0]

/* Set 0..15 bytes. */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend2, -8]
ret
nop
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend2, -4]
ret
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend2, -2]
3: ret

/* Set 17..96 bytes. */
4: // set_medium
str q0, [dstin]
tbnz count, 6, 6f // set96
str q0, [dstend2, -16]
tbz count, 5, 5f
str q0, [dstin, 16]
str q0, [dstend2, -32]
5: ret

.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
6: // set96
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend2, -32]
ret

.p2align 4
7: // set_long
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 160
ccmp valw, 0, 0, hs
b.ne 9f // no_zva

#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne 9f // no_zva
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
sub count, dstend2, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */

.p2align 4
8: // zva_loop
add dst, dst, 64
dc zva, dst
subs count, count, 64
b.hi 8b // zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret

9: // no_zva
sub count, dstend2, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
10: // no_zva_loop
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.hi 10b // no_zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

#endif // __aarch64__
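
Aside on the zeroing fast path in the memset assembly removed here (the CMake changes above add a separate assembly-optimized memset/memchr source): the mrs zva_val, dczid_el0 / and 31 / cmp 4 sequence only takes the DC ZVA path when DCZID_EL0 reports a 64-byte zero block and does not prohibit zeroing. A hypothetical C rendering of that gate, using AArch64 inline assembly (DCZID_EL0 is readable at EL0):

#include <stdint.h>

/* Hypothetical helper mirroring the assembly's ZVA gate.  DCZID_EL0
   bits [3:0] give the block size as log2(words), so a value of 4 means
   4 << 4 == 64 bytes; bit 4 (DZP), if set, prohibits DC ZVA.  Masking
   with 31 and comparing against 4 therefore means "64-byte blocks and
   zeroing allowed", matching the and/cmp pair in the assembly. */
static int dc_zva_64_usable(void) {
  uint64_t dczid;
  __asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
  return (dczid & 31) == 4;
}
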