-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. #77496
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2db7921
7e2df5b
2504e89
5324c3f
d31dcb1
5d696f7
942a0d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#include <stdlib.h> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've proposed #79454 to switch this out for |
||
|
||
// WARNING: When building the scalar versions of these functions you need to | ||
// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang | ||
// from recognising a loop idiom and planting calls to memcpy! | ||
Comment on lines
+3
to
+5
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a comment here that seems important, but doesn't seem to be addressed by your patch? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
|
||
// Copies n bytes from src to dest one byte at a time, starting at the lowest
// address and working upwards. Returns dest.
static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
                                 size_t n) __arm_streaming_compatible {
  unsigned char *out = (unsigned char *)dest;
  const unsigned char *in = (const unsigned char *)src;
  const unsigned char *const in_end = in + n;

  // Walk both cursors in lock step until the source range is exhausted.
  while (in != in_end)
    *out++ = *in++;

  return dest;
}
|
||
// memcpy has undefined behaviour when dest and src overlap, which is what
// makes the restrict qualifiers legitimate here. They also mirror the libc
// memcpy prototype as given in the man page.
void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
                      size_t n) __arm_streaming_compatible {
  // All of the work is delegated to the low-to-high byte copier.
  void *const result = __arm_sc_memcpy_fwd(dest, src, n);
  return result;
}
|
||
// Stores the byte value of c into each of the first n bytes starting at dest, | ||
// one byte at a time, and returns dest. | ||
void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { | ||
unsigned char *destp = (unsigned char *)dest; | ||
// c arrives as an int; it is narrowed to unsigned char before being stored. | ||
unsigned char c8 = (unsigned char)c; | ||
for (size_t i = 0; i < n; ++i) | ||
destp[i] = c8; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. drop braces, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
|
||
return dest; | ||
} | ||
|
||
// Copies n bytes from src to dest one byte at a time, starting at the highest
// address and working downwards. Returns dest.
static void *__arm_sc_memcpy_rev(void *dest, const void *src,
                                 size_t n) __arm_streaming_compatible {
  unsigned char *out = (unsigned char *)dest;
  const unsigned char *in = (const unsigned char *)src;
  // TODO: Improve performance by copying larger chunks in reverse, or by
  // using SVE.
  for (size_t remaining = n; remaining != 0; --remaining)
    out[remaining - 1] = in[remaining - 1];
  return dest;
}
|
||
// Semantically a memmove is equivalent to the following:
//   1. Copy the entire contents of src to a temporary array that does not
//      overlap with src or dest.
//   2. Copy the contents of the temporary array into dest.
// No temporary is actually used: the copy direction is chosen so that source
// bytes are never overwritten before they have been read. Returns dest.
void *__arm_sc_memmove(void *dest, const void *src,
                       size_t n) __arm_streaming_compatible {
  unsigned char *destp = (unsigned char *)dest;
  const unsigned char *srcp = (const unsigned char *)src;

  // Moving a region onto itself changes nothing, so skip the copy entirely.
  if (destp == srcp)
    return dest;

  // A forward (low-to-high) copy is safe in two situations.
  //
  // 1. dest starts below src, whether or not the regions overlap:
  //       src:  Low     |   ->   |     High
  //      dest:  Low  |   ->   |        High
  //    Here src is always ahead of dest at a higher address. If we first read
  //    a chunk of data from src we can safely write the same chunk to dest
  //    without corrupting future reads of src.
  //
  // 2. dest starts at or beyond the end of src, i.e. the regions are
  //    disjoint. Regions that merely touch count as disjoint.
  if ((srcp > destp) || (destp >= (srcp + n)))
    return __arm_sc_memcpy_fwd(dest, src, n);

  // Remaining case, dest starts inside src:
  //       src:  Low  |   ->   |        High
  //      dest:  Low     |   ->   |     High
  // While we're in the overlap region a forward copy would corrupt future
  // reads of src when writing to dest. Copy the data in reverse instead,
  // starting at the highest address.
  return __arm_sc_memcpy_rev(dest, src, n);
}
|
||
// Scans the first n bytes at src for the first byte equal to c (after c has
// been converted to unsigned char). Returns a pointer to that byte, or NULL
// when none of the n bytes match.
const void *__arm_sc_memchr(const void *src, int c,
                            size_t n) __arm_streaming_compatible {
  const unsigned char needle = (unsigned char)c;
  const unsigned char *cursor = (const unsigned char *)src;

  for (size_t remaining = n; remaining != 0; --remaining, ++cursor) {
    if (*cursor == needle)
      return cursor;
  }

  return NULL;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
// REQUIRES: aarch64-target-arch, aarch64-sme-available | ||
// RUN: %clangxx_builtins %s %librt -o %t && %run %t | ||
|
||
#include <cassert> | ||
#include <initializer_list> | ||
#include <stdint.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
|
||
// Prototypes of the routines under test; their definitions live outside this
// translation unit.
extern "C" {
void *__arm_sc_memcpy(void *dest, const void *src, size_t n);
void *__arm_sc_memset(void *dest, int c, size_t n);
void *__arm_sc_memmove(void *dest, const void *src, size_t n);
// Declared as returning a pointer-to-const so that the prototype agrees with
// the C definition of the routine.
const void *__arm_sc_memchr(const void *src, int c, size_t n);
}
|
||
// Fixed-size byte buffer used as a test fixture, bundled with the assertions
// the tests make about its contents.
template <unsigned N> class Memory {
public:
  uint8_t ptr[N];
  unsigned size;

  // With a non-zero stride, element i is set to i * stride (narrowed to
  // uint8_t). With the default stride of 0 the bytes are left uninitialised.
  Memory(unsigned stride = 0) : size(N) {
    if (stride != 0) {
      for (unsigned idx = 0; idx != N; ++idx)
        ptr[idx] = idx * stride;
    }
  }

  // Asserts that other has the same length and identical contents.
  void assert_equal(const Memory &other) {
    assert(other.size == N);
    assert(0 == memcmp(other.ptr, ptr, N));
  }

  // Asserts that the buffer holds exactly the listed bytes, in order.
  void assert_equal(std::initializer_list<uint8_t> s) {
    assert(s.size() == N);
    auto expected = s.begin();
    for (unsigned idx = 0; idx != N; ++idx)
      assert(expected[idx] == ptr[idx]);
  }

  // Asserts that the byte at index I equals elem.
  void assert_elemt_equal_at(unsigned I, uint8_t elem) {
    assert(elem == ptr[I]);
  }
};
|
||
int main() { | ||
|
||
// Testing memcpy from src to dst. | ||
{ | ||
Memory<8> src(1); | ||
Memory<8> dst; | ||
if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8)) | ||
abort(); | ||
dst.assert_equal(src); | ||
dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7}); | ||
} | ||
|
||
// Testing memcpy from src to dst with pointer offset. | ||
{ | ||
Memory<8> src(1); | ||
Memory<8> dst(1); | ||
if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6)) | ||
abort(); | ||
dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); | ||
} | ||
|
||
// Testing memchr. | ||
{ | ||
Memory<8> src(4); | ||
for (unsigned i = 0; i < 8; ++i) { | ||
uint8_t e = src.ptr[i]; | ||
uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8); | ||
if (!elem) | ||
abort(); | ||
src.assert_elemt_equal_at(elem - src.ptr, *elem); | ||
for (unsigned i = 0; i < 8; ++i) | ||
assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) == | ||
memchr(src.ptr, src.ptr[i], 8)); | ||
} | ||
} | ||
|
||
// Testing memset. | ||
{ | ||
Memory<8> array; | ||
if (!__arm_sc_memset(array.ptr, 2, 8)) | ||
abort(); | ||
array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2}); | ||
} | ||
|
||
// Testing memset with pointer offset. | ||
{ | ||
Memory<8> array(1); | ||
if (!__arm_sc_memset(array.ptr + 1, 2, 6)) | ||
abort(); | ||
array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7}); | ||
} | ||
|
||
// Testing memmove with a simple non-overlap case. | ||
{ | ||
Memory<8> src(1); | ||
Memory<8> dst(1); | ||
if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6)) | ||
abort(); | ||
dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); | ||
} | ||
|
||
// Testing memove with overlap pointers dst > src, dst < src. | ||
{ | ||
Memory<8> srcdst(1); | ||
if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6)) | ||
abort(); | ||
srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); | ||
if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6)) | ||
abort(); | ||
srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7}); | ||
} | ||
|
||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you combine this with `COMPILER_RT_HAS_ASM_SME` and just have a single `COMPILER_RT_HAS_AARCH64_SME`? That way we don't need to have two variables given that you're AND'ing the result later on anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.