[AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. #77496

Merged: 7 commits, Jan 22, 2024
8 changes: 5 additions & 3 deletions compiler-rt/cmake/builtin-config-ix.cmake
@@ -35,10 +35,12 @@ asm(\".arch armv8-a+lse\");
asm(\"cas w0, w1, [x2]\");
")

builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME
builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
Collaborator: Can you combine this with COMPILER_RT_HAS_ASM_SME and just have a single COMPILER_RT_HAS_AARCH64_SME? That way we don't need to have two variables, given that you're AND'ing the result later on anyway.

Contributor Author: Done.

"
asm(\".arch armv9-a+sme\");
asm(\"smstart\");
void foo(void) __arm_streaming_compatible {
asm(\".arch armv9-a+sme\");
asm(\"smstart\");
}
")

if(ANDROID)
5 changes: 3 additions & 2 deletions compiler-rt/lib/builtins/CMakeLists.txt
@@ -560,9 +560,10 @@ set(aarch64_SOURCES
aarch64/fp_mode.c
)

if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
message(STATUS "AArch64 SME ABI routines enabled")
set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
else()
message(STATUS "AArch64 SME ABI routines disabled")
endif()
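A note on the new -fno-builtin flag above (a hedged aside, not part of the patch): without it, clang's loop-idiom recognition can rewrite the plain byte loops in sme-libc-routines.c into calls to the regular libc memcpy/memset, which are not guaranteed to be safe in streaming mode; -fno-builtin stops those calls from being introduced. A minimal C sketch of the kind of loop affected (the function name is made up for illustration):

#include <stddef.h>

/* Illustrative only: without -fno-builtin the optimizer may replace this
   byte loop with a call to the libc memset, which must not be introduced
   into streaming-compatible code. */
static void *fill_bytes(void *dest, int c, size_t n) {
  unsigned char *d = (unsigned char *)dest;
  for (size_t i = 0; i < n; ++i)
    d[i] = (unsigned char)c;
  return dest;
}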
87 changes: 87 additions & 0 deletions compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -0,0 +1,87 @@
#include <stdlib.h>
Contributor @peterwaller-arm (Jan 25, 2024): I've proposed #79454 to switch this out for stddef.h, which is provided by the compiler resource headers rather than the libc (since this is inside builtins).


// WARNING: When building the scalar versions of these functions you need to
// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
// from recognising a loop idiom and planting calls to memcpy!
Comment on lines +3 to +5

Collaborator: There is a comment here that seems important, but doesn't seem to be addressed by your patch?

Contributor Author: Done.


static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;
for (size_t i = 0; i < n; ++i)
destp[i] = srcp[i];

return dest;
}

// If dest and src overlap then behaviour is undefined, hence we can add the
// restrict keywords here. This also matches the definition of the libc memcpy
// according to the man page.
void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
size_t n) __arm_streaming_compatible {
return __arm_sc_memcpy_fwd(dest, src, n);
}

void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
unsigned char c8 = (unsigned char)c;
for (size_t i = 0; i < n; ++i)
destp[i] = c8;
Member: drop braces, ++i

Contributor Author: Done.


return dest;
}

static void *__arm_sc_memcpy_rev(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;
// TODO: Improve performance by copying larger chunks in reverse, or by
// using SVE.
while (n > 0) {
--n;
destp[n] = srcp[n];
}
return dest;
}

// Semantically a memmove is equivalent to the following:
// 1. Copy the entire contents of src to a temporary array that does not
// overlap with src or dest.
// 2. Copy the contents of the temporary array into dest.
void *__arm_sc_memmove(void *dest, const void *src,
size_t n) __arm_streaming_compatible {
unsigned char *destp = (unsigned char *)dest;
const unsigned char *srcp = (const unsigned char *)src;

// If src and dest don't overlap then just invoke memcpy
if ((srcp > (destp + n)) || (destp > (srcp + n)))
return __arm_sc_memcpy_fwd(dest, src, n);

// Overlap case 1:
// src: Low | -> | High
// dest: Low | -> | High
// Here src is always ahead of dest at a higher address. If we first read a
// chunk of data from src we can safely write the same chunk to dest without
// corrupting future reads of src.
if (srcp > destp)
return __arm_sc_memcpy_fwd(dest, src, n);

// Overlap case 2:
// src: Low | -> | High
// dest: Low | -> | High
// While we're in the overlap region we're always corrupting future reads of
// src when writing to dest. An efficient way to do this is to copy the data
// in reverse by starting at the highest address.
return __arm_sc_memcpy_rev(dest, src, n);
}

const void *__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible {
const unsigned char *srcp = (const unsigned char *)src;
unsigned char c8 = (unsigned char)c;
for (size_t i = 0; i < n; ++i)
if (srcp[i] == c8)
return &srcp[i];

return NULL;
}
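As a usage illustration (a sketch assuming the compiler targets SME and follows the AArch64 SME ABI; it is not part of the patch): inside streaming-compatible code the compiler is expected to emit calls to these __arm_sc_* entry points instead of the plain libc routines, roughly as in:

#include <string.h>

/* Hypothetical caller: within a streaming-compatible function, a memcpy
   call may be lowered to __arm_sc_memcpy, since the libc implementation
   is not guaranteed to be valid in streaming mode. */
void copy_block(void *dst, const void *src, size_t n) __arm_streaming_compatible {
  memcpy(dst, src, n);
}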
120 changes: 120 additions & 0 deletions compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -0,0 +1,120 @@
// REQUIRES: aarch64-target-arch, aarch64-sme-available
// RUN: %clangxx_builtins %s %librt -o %t && %run %t

#include <cassert>
#include <initializer_list>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

extern "C" {
void *__arm_sc_memcpy(void *, const void *, size_t);
void *__arm_sc_memset(void *, int, size_t);
void *__arm_sc_memmove(void *, const void *, size_t);
void *__arm_sc_memchr(const void *, int, size_t);
}

template <unsigned N> class Memory {
public:
uint8_t ptr[N];
unsigned size;

Memory(unsigned stride = 0) {
size = N;
if (stride == 0)
return;
for (unsigned i = 0; i < N; i++)
ptr[i] = i * stride;
}

void assert_equal(const Memory &other) {
assert(N == other.size);
assert(memcmp(ptr, other.ptr, N) == 0);
}

void assert_equal(std::initializer_list<uint8_t> s) {
assert(N == s.size());
auto it = s.begin();
for (unsigned i = 0; i < N; ++i)
assert(ptr[i] == *it++);
}

void assert_elemt_equal_at(unsigned I, uint8_t elem) {
assert(ptr[I] == elem);
}
};

int main() {

// Testing memcpy from src to dst.
{
Memory<8> src(1);
Memory<8> dst;
if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8))
abort();
dst.assert_equal(src);
dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7});
}

// Testing memcpy from src to dst with pointer offset.
{
Memory<8> src(1);
Memory<8> dst(1);
if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6))
abort();
dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
}

// Testing memchr.
{
Memory<8> src(4);
for (unsigned i = 0; i < 8; ++i) {
uint8_t e = src.ptr[i];
uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8);
if (!elem)
abort();
src.assert_elemt_equal_at(elem - src.ptr, *elem);
for (unsigned i = 0; i < 8; ++i)
assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) ==
memchr(src.ptr, src.ptr[i], 8));
}
}

// Testing memset.
{
Memory<8> array;
if (!__arm_sc_memset(array.ptr, 2, 8))
abort();
array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2});
}

// Testing memset with pointer offset.
{
Memory<8> array(1);
if (!__arm_sc_memset(array.ptr + 1, 2, 6))
abort();
array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
}

// Testing memmove with a simple non-overlap case.
{
Memory<8> src(1);
Memory<8> dst(1);
if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6))
abort();
dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
}

// Testing memmove with overlapping pointers, dst > src and dst < src.
{
Memory<8> srcdst(1);
if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6))
abort();
srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6))
abort();
srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7});
}

return 0;
}
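One property the tests above do not exercise (sketched here as a hypothetical extra check, not part of the patch): like memchr, __arm_sc_memchr returns NULL when the byte does not occur in the first n bytes. The buffer contents and byte value below are made up; the declaration mirrors the one used by the test file:

#include <assert.h>
#include <string.h>

void *__arm_sc_memchr(const void *, int, size_t);

/* Hypothetical check: a byte value (42) that never occurs in the buffer
   should yield NULL from both routines. */
void check_memchr_absent(void) {
  unsigned char buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  assert(__arm_sc_memchr(buf, 42, 8) == NULL);
  assert(memchr(buf, 42, 8) == NULL);
}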
3 changes: 3 additions & 0 deletions compiler-rt/test/lit.common.cfg.py
@@ -454,6 +454,9 @@ def get_ios_commands_dir():
if config.has_lld:
config.available_features.add("lld-available")

if config.aarch64_sme:
config.available_features.add("aarch64-sme-available")

if config.use_lld:
config.available_features.add("lld")

1 change: 1 addition & 0 deletions compiler-rt/test/lit.common.configured.in
@@ -50,6 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@)
set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@)
set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@)
set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@)
set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@)
# True iff the test suite supports ignoring the test compiler's runtime library path
# and using `config.compiler_rt_libdir` instead. This only matters when the runtime
# library paths differ.
1 change: 1 addition & 0 deletions compiler-rt/unittests/lit.common.unit.configured.in
@@ -7,6 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@"
config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@")
config.aarch64_sme = "@COMPILER_RT_HAS_AARCH64_SME@"
config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@
config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
config.host_arch = "@HOST_ARCH@"